## Load Libraries

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pickle


### Own Libraries

In [2]:
# none for now

## Data Exploration

In [3]:
# Load dataset
cc_data = pd.read_csv("data/cc_approvals.data", header=None)

In [4]:
# Inspect data
print(cc_data.head(5))

  0      1      2  3  4  5  6     7  8  9   10 11 12     13   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  f  g  00202    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  f  g  00043  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  f  g  00280  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  t  g  00100    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  f  s  00120    0  +


In [5]:
column_names = ["Gender",
                "Age",
                "Debt",
                "Married",
                "BankCustomer",
                "EducationLevel",
                "Ethnicity",
                "YearsEmployed",
                "PriorDefault",
                "Employed",
                "CreditScore",
                "DriversLicense",
                "Citizen",
                "ZipCode",
                "Income",
                "ApprovalStatus"]

# Add column names
cc_data.columns = column_names

cc_data.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [6]:
cc_data.shape

(690, 16)

In [7]:
cc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   EducationLevel  690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    object 
 9   Employed        690 non-null    object 
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    object 
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    object 
 14  Income          690 non-null    int64  
 15  ApprovalStatus  690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [8]:
cc_data.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


## Data Cleaning

In [9]:
cc_data.nunique()

Gender              3
Age               350
Debt              215
Married             4
BankCustomer        4
EducationLevel     15
Ethnicity          10
YearsEmployed     132
PriorDefault        2
Employed            2
CreditScore        23
DriversLicense      2
Citizen             3
ZipCode           171
Income            240
ApprovalStatus      2
dtype: int64

In [10]:
# Replace '?' with NaN
cc_data = cc_data.replace('?', np.NaN)

In [11]:
cc_data.loc[:, "Age"] = pd.to_numeric(cc_data.loc[:, "Age"])

In [12]:
cc_data.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

## Feature Engineering

### Select Features and split dataset

In [13]:
# Drop ZipCode that will not have predictive value for us
cc_data = cc_data.drop(["ZipCode"], axis=1)

In [14]:
#define mask to filter only for values we are interested in (Final version: Everything but ApprovalStatus)
mask = ['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault',
        'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'Income'] 

# apply mask
X = cc_data[mask]

y = cc_data["ApprovalStatus"]

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Feature Engineering

In [16]:
# Replace missing values in numeric category/categories by mean
X_train["Age"] = X_train["Age"].fillna(X_train["Age"].mean())
X_test["Age"] = X_test["Age"].fillna(X_test["Age"].mean())

In [17]:
# Replace non-numerical values by the most common feature
for col in X_train.columns:
    # Check if the column is of object type
    if X_train[col].dtypes == "object":
        # Impute with the most frequent value
        X_train = X_train.fillna(X_train[col].value_counts().index[0])
        X_test = X_test.fillna(X_test[col].value_counts().index[0])

In [18]:
X_train.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
Income            0
dtype: int64

In [19]:
X_test.isnull().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
Income            0
dtype: int64

In [20]:
print(X_train.shape)
print(X_test.shape)

(462, 14)
(228, 14)


### Mapping Categorical Features

In [21]:
# looking at the unique values of the non-categorical values
print(f'Unique values of Gender: {X_train["Gender"].unique()}')
print(f'Unique values of Married: {X_train["Married"].unique()}')
print(f'Unique values of BankCustomer: {X_train["BankCustomer"].unique()}')
print(f'Unique values of EducationLevel: {X_train["EducationLevel"].unique()}')
print(f'Unique values of Ethnicity: {X_train["Ethnicity"].unique()}')
print(f'Unique values of PriorDefault: {X_train["PriorDefault"].unique()}')
print(f'Unique values of Employed: {X_train["Employed"].unique()}')
print(f'Unique values of DriverLicense: {X_train["DriversLicense"].unique()}')
print(f'Unique values of Citizen: {X_train["Citizen"].unique()}')

Unique values of Gender: ['a' 'b']
Unique values of Married: ['y' 'u' 'b' 'l']
Unique values of BankCustomer: ['p' 'g' 'b' 'gg']
Unique values of EducationLevel: ['i' 'm' 'c' 'e' 'w' 'ff' 'k' 'aa' 'q' 'x' 'r' 'cc' 'b' 'd' 'j']
Unique values of Ethnicity: ['bb' 'v' 'h' 'ff' 'z' 'dd' 'n' 'b' 'j' 'o']
Unique values of PriorDefault: ['f' 't']
Unique values of Employed: ['f' 't']
Unique values of DriverLicense: ['f' 't']
Unique values of Citizen: ['g' 's' 'p']


### Export clean dataset for charts in streamlit

In [22]:
# Create Dataframe for output
Clean_train = pd.concat([X_train, y_train], axis=1)
Clean_test  = pd.concat([X_test, y_test], axis=1)
output_df = pd.concat([Clean_train, Clean_test], ignore_index=True)

# Write to csv
output_df.to_csv('data/cc_clean.csv', index=False)

In [23]:
categorical_columns = ['Gender', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'PriorDefault',
        'Employed', 'DriversLicense', 'Citizen']

# converting all the categorical columns to numeric
col_mapper = {}
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(X_train.loc[:, col])
    class_names = le.classes_
    X_train.loc[:, col] = le.transform(X_train.loc[:, col])
    # saving encoder for each column to be able to inverse-transform later
    col_mapper.update({col: le})

for col in categorical_columns:
    le = LabelEncoder()
    le.fit(X_test.loc[:, col])
    class_names = le.classes_
    X_test.loc[:, col] = le.transform(X_test.loc[:, col])

In [24]:
#minmax_columns = ['CreditScore']
# reduces f1, so not used.


#for col in minmax_columns:
#    mm_scaler = MinMaxScaler()
#    X_train.loc[:, col] = mm_scaler.fit_transform(X_train.loc[:, col].values.reshape(-1, 1))
#    # saving encoder for each column to be able to inverse-transform later
#    col_mapper.update({col: mm_scaler})
 


In [25]:
X_test.dtypes

Gender             object
Age               float64
Debt              float64
Married            object
BankCustomer       object
EducationLevel     object
Ethnicity          object
YearsEmployed     float64
PriorDefault       object
Employed           object
CreditScore         int64
DriversLicense     object
Citizen            object
Income              int64
dtype: object

In [26]:
X_test.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,Income
286,0,31.428416,1.5,1,0,6,3,0.0,0,1,2,1,0,105
511,0,46.0,4.0,1,0,8,5,0.0,1,0,0,0,0,960
257,1,20.0,0.0,1,0,4,7,0.5,0,0,0,0,0,0
336,1,47.33,6.5,1,0,2,7,1.0,0,0,0,1,0,228
318,1,19.17,0.0,2,2,10,1,0.0,0,0,0,1,2,1


In [27]:
X.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,Income
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,560
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,824
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,0


In [28]:
# Mapping in y
# approval status from "+" to 1 and "-" to 0
d = {'+': 1, '-': 0}
y_train = y_train.map(d)
y_test = y_test.map(d)

## Modelling

### Logistic Regression

In [29]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

In [30]:
y_valid_pred_lr = lr.predict(X_test)

In [31]:
f1_lr = f1_score(y_test, y_valid_pred_lr)
f1_lr

0.8347826086956522

### XGBoost

In [32]:
# Building a model with XGBoost

# all dtypes in X must be numeric
X_train = X_train.astype('float')
X_test = X_test.astype('float')

# Instantiate the XGBClassifier: xg_cl
xg_cl = XGBClassifier(objective='binary:logistic', n_estimators=150, seed=42)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)


In [33]:

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the f1 score
f1_xb = f1_score(y_test, preds)
f1_xb


0.7894736842105263

In [34]:

# in more detail
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.87      0.72      0.79       125
           1       0.72      0.87      0.79       103

    accuracy                           0.79       228
   macro avg       0.80      0.80      0.79       228
weighted avg       0.80      0.79      0.79       228



In [35]:
# Select best model
if f1_lr > f1_xb:
    print(f"We select Logistic Regression with its f1 of {f1_lr:2f} over XGBoost with f1 of {f1_xb:2f}")
    model = lr
else:
    print(f"We select XGBoost with its f1 of {f1_xb:2f} over Logistic Regression with f1 of {f1_lr:2f}")
    model = xg_cl

We select Logistic Regression with its f1 of 0.834783 over XGBoost with f1 of 0.789474


In [36]:
# pickling model
pickler = open("cc_model.pkl", "wb")
pickle.dump(model, pickler)
pickler.close()

In [37]:
# pickling encoder
pickler = open("cc_label_encoders.pkl", "wb")
pickle.dump(col_mapper, pickler)
pickler.close()