# Data Encoding 

## Importing the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw German credit dataset from the local data directory.
df = pd.read_csv(filepath_or_buffer="data/german_credit_data.csv")

In [3]:
# Show the freshly loaded dataframe as this cell's output.
df

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad
...,...,...,...,...,...,...,...,...,...,...,...
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad


# Data Cleaning

The first column of this dataframe (`Unnamed: 0`) is just a copy of the row index. The dataframe already provides that index, so we can drop the column:

In [4]:
# 'Unnamed: 0' merely duplicates the row index; `columns=` already selects the
# column axis, so no separate axis argument is needed.
df = df.drop(columns='Unnamed: 0')

Filling missing values with a global constant:

In [5]:
# Treat an absent account entry as its own category by filling it with the
# constant label 'Missing'.
for account_col in ('Saving accounts', 'Checking account'):
    df[account_col] = df[account_col].fillna('Missing')

The feature 'Sex' has only 2 values. Which value should be considered the base case when binary encoding is used?

In [6]:
# Share of each sex in the dataset, used to decide which value becomes the
# base case of the binary encoding.
total = df.shape[0]

# Look the counts up by label. value_counts() is ordered by frequency, so
# indexing it with 0 / 1 only returns the male / female counts by coincidence,
# and integer keys on a string-labelled Series are deprecated positional access.
sex_counts = df["Sex"].value_counts()
male_count = sex_counts.get("male", 0)
female_count = sex_counts.get("female", 0)

print(f"{100 * male_count / total}% of instances describe males while {100 * female_count / total}% of instances describe females.")

69.0% of instances describe males while 31.0% of instances describe females.


Male will be the base case, as females are less common.

Using One-Hot-Encoding:

In [7]:
# One-hot encode the string-valued features. The target column is held out so
# that it is not expanded into indicator columns of its own.
# dtype is pinned explicitly: pandas >= 2.0 returns boolean indicators by
# default, which would yield True/False instead of the 0/1 values expected
# downstream. np.uint8 matches the default of earlier pandas versions.
tempDf = pd.get_dummies(df.drop(columns='Risk'), drop_first=False, dtype=np.uint8)

# Re-attach the (not yet encoded) target as the last column.
tempDf['Risk'] = df['Risk']

df = tempDf

Treating male as the base case, we drop `Sex_male` so that the remaining `Sex_female` column encodes females as 1:

In [8]:
# Keep only the Sex_female indicator; the dropped Sex_male column is implied
# by Sex_female == 0.
df = df.drop(columns='Sex_male')

Encoding our target variable:

In [9]:
# Encode the target with one explicit mapping: 'bad' -> 0, 'good' -> 1.
# map() builds the numeric column in a single pass instead of relying on
# chained replace() calls implicitly downcasting an object column to integers,
# a behaviour that recent pandas versions deprecate.
risk_codes = {'bad': 0, 'good': 1}
encoded_risk = df['Risk'].map(risk_codes)

# map() turns any label that is missing from risk_codes into NaN; stop with a
# clear message rather than silently continuing with a partially encoded target.
unmapped = encoded_risk.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected 'Risk' labels: "
        + repr(sorted(df.loc[unmapped, 'Risk'].astype(str).unique()))
    )

df['Risk'] = encoded_risk.astype('int64')

In [10]:
# Show the fully encoded dataframe as this cell's output.
df

Unnamed: 0,Age,Job,Credit amount,Duration,Sex_female,Housing_free,Housing_own,Housing_rent,Saving accounts_Missing,Saving accounts_little,...,Checking account_rich,Purpose_business,Purpose_car,Purpose_domestic appliances,Purpose_education,Purpose_furniture/equipment,Purpose_radio/TV,Purpose_repairs,Purpose_vacation/others,Risk
0,67,2,1169,6,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
1,22,2,5951,48,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,49,1,2096,12,0,0,1,0,0,1,...,0,0,0,0,1,0,0,0,0,1
3,45,2,7882,42,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,53,2,4870,24,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,31,1,1736,12,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
996,40,3,3857,30,0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
997,38,2,804,12,0,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,1
998,23,2,1845,45,0,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [11]:
# Persist the encoded dataset; the row index is omitted from the file.
df.to_csv(path_or_buf='data/encoded_dataset.csv', index=False)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Write each split to its own CSV file, omitting the row index.
for split_frame, csv_path in ((train, 'data/train.csv'), (test, 'data/test.csv')):
    split_frame.to_csv(csv_path, index=False)