## Read and Preprocess UCI German Credit Data

### Define imports

In [12]:
import numpy as np
import pandas as pd
import os

### Define constant variables

In [15]:
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
data_file_path = './data/german.data.csv'
column_names = ['Checking_Account','Duration_In_Months', 'Credit_History',
               'Purpose', 'Credit_Amount', 'Savings_Account',
                'Employment_Duration','Installment_Rate', 'Marital_Sex_Status', 
                'Other_Debtors_Guarantors','Present_Residence', 'Property_Owned', 
                'Age_In_Years','Other_Installments', 'Housing', 
                'Existing_Credits', 'Job_Type', 'Number_Dependents', 
                'Telephone', 'Is_Foregn_Worker','Is_Credit_worthy']

numeric_feature_names = ['Present_Residence',
 'Duration_In_Months',
 'Installment_Rate',
 'Age_In_Years',
 'Number_Dependents',
 'Credit_Amount']

dependent_variable_name = 'Is_Credit_worthy' 


### Get the categorical features

In [16]:
def getCategoricalColumns(all_column_names=column_names, dependent_variable_name=dependent_variable_name):
    categorical_feature_names = list(set(column_names) - set(numeric_feature_names))
    categorical_feature_names.remove(dependent_variable_name)
    return categorical_feature_names

def printListItems(msg, l):
    print(msg)
    print('[')
    for s in l:
        print("\t{}".format(s))
    print(']')
categorical_feature_names = getCategoricalColumns()        
printListItems("categorical_feature_names:", categorical_feature_names)

categorical_feature_names:
[
	Other_Installments
	Employment_Duration
	Purpose
	Marital_Sex_Status
	Telephone
	Existing_Credits
	Savings_Account
	Housing
	Is_Foregn_Worker
	Other_Debtors_Guarantors
	Property_Owned
	Credit_History
	Checking_Account
	Job_Type
]


### Get the German UCI Credit data
 - Firstly check to see if the data is locally cached to disc at the path: './data/german.data.csv'
 - Else get it from the url location: 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

In [26]:
def getGermanCreditData():
    if not os.path.exists(data_file_path):
        raw_credit_data = pd.read_csv(data_url, sep=' ', names=column_names)
        raw_credit_data.to_csv(data_file_path)
        print("Data was sourced from url..")
    else:
        raw_credit_data = pd.read_csv(data_file_path, index_col=0)
        print("Data already exists..")
    return raw_credit_data

raw_credit_data = getGermanCreditData()


Data already exists..


### Use OneHotEncoding to encode the categorical features
 - We will use the Pandas pd.get_dummies() function to do this

In [28]:
credit_data_onehotencoded = pd.get_dummies(raw_credit_data, columns=categorical_feature_names)
credit_data_onehotencoded.head(2)

Unnamed: 0,Duration_In_Months,Credit_Amount,Installment_Rate,Present_Residence,Age_In_Years,Number_Dependents,Is_Credit_worthy,Other_Installments_A141,Other_Installments_A142,Other_Installments_A143,...,Credit_History_A33,Credit_History_A34,Checking_Account_A11,Checking_Account_A12,Checking_Account_A13,Checking_Account_A14,Job_Type_A171,Job_Type_A172,Job_Type_A173,Job_Type_A174
0,6,1169,4,4,67,1,1,0,0,1,...,0,1,1,0,0,0,0,0,1,0
1,48,5951,2,2,22,1,2,0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [4]:
raw_credit_data.head(5)
#list(raw_credit_data.iloc[0])
#len(column_names)
#len(raw_credit_data.columns)

Unnamed: 0,Checking_Account,Duration_In_Months,Credit_History,Purpose,Credit_Amount,Savings_Account,Employment_Duration,Installment_Rate,Marital_Sex_Status,Other_Debtors_Guarantors,...,Property_Owned,Age_In_Years,Other_Installments,Housing,Existing_Credits,Job_Type,Number_Dependents,Telephone,Is_Foregn_Worker,Is_Credit_worthy
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [43]:
numeric_feature_name

['Age_In_Years',
 'Present_Residence',
 'Number_Dependents',
 'Credit_Amount',
 'Installment_Rate',
 'Duration_In_Months']