## Load Libraries

In [18]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Own Libraries

In [19]:
import log_regression # Simple regression model

## Data Exploration

In [20]:

# Load dataset
cc_data = pd.read_csv("data/cc_approvals.data", header=None)

In [21]:
# Inspect data
print(cc_data.head(5))

  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +


In [22]:
column_names = ["Gender",
                "Age",
                "Debt",
                "Married",
                "BankCustomer",
                "EducationLevel",
                "Ethnicity",
                "YearsEmployed",
                "PriorDefault",
                "Employed",
                "CreditScore",
                "DriversLicense",
                "Citizen",
                "ZipCode",
                "Income",
                "ApprovalStatus"]

# Add column names
cc_data.columns = column_names

cc_data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [23]:
cc_data.shape

(690, 16)

## Feature Engineering

### Select Features

In [24]:
#define mask to filter only for values we are interested in
mask = ['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore',
        'DriversLicense', 'Citizen', 'Income'] 

# apply mask
X = cc_data[mask]

y = cc_data["ApprovalStatus"]

In [25]:
X.nunique()

Gender              3
Age               350
Debt              215
Married             4
BankCustomer        4
YearsEmployed     132
PriorDefault        2
Employed            2
CreditScore        23
DriversLicense      2
Citizen             3
Income            240
dtype: int64

In [26]:
X.dtypes

Gender             object
Age                object
Debt              float64
Married            object
BankCustomer       object
YearsEmployed     float64
PriorDefault       object
Employed           object
CreditScore         int64
DriversLicense     object
Citizen            object
Income              int64
dtype: object

In [27]:
# Replace '?' with NaN
X = X.replace('?', np.NaN)

In [28]:
X.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
Income             0
dtype: int64

In [29]:
# Drop rows with missing values in X and the same rows in y
X = X.dropna(axis=0)
y = y[X.index]

In [30]:
print(X.shape)
print(y.shape)

(660, 12)
(660,)


### Mapping Categorical Features

In [31]:
# looking at the unique values of the columns Citizen, DriverLicense, PriorDefault, Married, BankCustomer
print(f'Unique values of Citizen: {X["Citizen"].unique()}')
print(f'Unique values of DriverLicense: {X["DriversLicense"].unique()}')
print(f'Unique values of PriorDefault: {X["PriorDefault"].unique()}')
print(f'Unique values of Married: {X["Married"].unique()}')
print(f'Unique values of BankCustomer: {X["BankCustomer"].unique()}')

Unique values of Citizen: ['g' 's' 'p']
Unique values of DriverLicense: ['f' 't']
Unique values of PriorDefault: ['t' 'f']
Unique values of Married: ['u' 'y' 'l']
Unique values of BankCustomer: ['g' 'p' 'gg']


In [32]:
# Mapping in X
# Gender: 'a' --> 1, 'b' --> 0
d = {'a': 1, 'b': 0}
X.loc[:, 'Gender'] = X.loc[:, 'Gender'].map(d)

# Employed:
d = {'t': 1, 'f': 0}
X.loc[:, 'Employed'] = X.loc[:, 'Employed'].map(d)

# Citizen: 
d = {'g': 2, 's': 1, 'p': 0}
X.loc[:, 'Citizen'] = X.loc[:, 'Citizen'].map(d)

# DriverLicense: 
d = {'t': 1, 'f': 0}
X.loc[:, 'DriversLicense'] = X.loc[:, 'DriversLicense'].map(d)

# PriorDefault:
d = {'t': 1, 'f': 0}
X.loc[:, 'PriorDefault'] = X.loc[:, 'PriorDefault'].map(d)

# Married
d = {'u': 2, 'y': 1, 'l': 0}
X.loc[:, 'Married'] = X.loc[:, 'Married'].map(d)

# BankCustomer:
d = {'g': 2, 'p': 1, 'gg': 0}
X.loc[:, 'BankCustomer'] = X.loc[:, 'BankCustomer'].map(d)



In [33]:
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,Income
0,0,30.83,0.0,2,2,1.25,1,1,1,0,2,0
1,1,58.67,4.46,2,2,3.04,1,1,6,0,2,560
2,1,24.5,0.5,2,2,1.5,1,0,0,0,2,824
3,0,27.83,1.54,2,2,3.75,1,1,5,1,2,3
4,0,20.17,5.625,2,2,1.71,1,0,0,0,1,0


In [34]:
# Mapping in y
# approval status from "+" to 1 and "-" to 0
d = {'+': 1, '-': 0}
y = y.map(d)

## Modelling

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Logistic Regression

In [36]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
f1_lr = f1_score(y_test, y_valid_pred_lr)
f1_lr

0.8431372549019607

### XGBoost

In [38]:
# Building a model with XGBoost

# all dtypes in X must be numeric
X_train = X_train.astype('float')
X_test = X_test.astype('float')

# Instantiate the XGBClassifier: xg_cl
xg_cl = XGBClassifier(objective='binary:logistic', n_estimators=150, seed=42)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)


In [39]:

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the f1 score
f1_score(y_test, preds)


0.8082901554404144

In [40]:

# in more detail
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.89      0.81      0.85       127
           1       0.76      0.86      0.81        91

    accuracy                           0.83       218
   macro avg       0.83      0.83      0.83       218
weighted avg       0.84      0.83      0.83       218

