## Import packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
# All feature names and values have been changed to meaningless symbols to protect confidentiality of the data. 
# This blog gives us a good overview of the probable features: 
# http://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html 

column_names = ['Male', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', "YearsEmployed",
               'PriorDefault', 'Employed', 'CreditScore', 'DriverLicense', 'Citizen', 'ZipCode', 'Income', 'Approved']
cc_apps = pd.read_csv('cc_approvals.data', header=None, names=column_names)

display(cc_apps.head())

# Summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print('\n')

# DataFrame information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

cc_apps = cc_apps.drop(['DriverLicense', 'ZipCode'], axis=1) 

Unnamed: 0,Male,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriverLicense,Citizen,ZipCode,Income,Approved
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


             Debt  YearsEmployed  CreditScore         Income
count  690.000000     690.000000    690.00000     690.000000
mean     4.758725       2.223406      2.40000    1017.385507
std      4.978163       3.346513      4.86294    5210.102598
min      0.000000       0.000000      0.00000       0.000000
25%      1.000000       0.165000      0.00000       0.000000
50%      2.750000       1.000000      0.00000       5.000000
75%      7.207500       2.625000      3.00000     395.500000
max     28.000000      28.500000     67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Male            690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   EducationLevel  690 non-

In [3]:
# The missing values in the dataset are shown as '?'
cc_apps.isin(['?']).sum()

Male              12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
Citizen            0
Income             0
Approved           0
dtype: int64

 ## Split the dataset into train and test sets¶

In [4]:
# Split into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.3, random_state=42)

## Deal with the missing values

In [5]:
# Replace the '?'s with NaN in the train and test sets
cc_apps_train = cc_apps_train.replace('?', np.nan)
cc_apps_test = cc_apps_test.replace('?', np.nan)

# Impute the missing values with mean imputation
cc_apps_train.fillna(cc_apps_train.mean(), inplace=True)
cc_apps_test.fillna(cc_apps_train.mean(), inplace=True)

print(cc_apps_train.isnull().sum())  # There are still some missing values, all of which are non-numeric data.
print(cc_apps_test.isnull().sum())   # There are still some missing values, all of which are non-numeric data.

Male              9
Age               6
Debt              0
Married           6
BankCustomer      6
EducationLevel    7
Ethnicity         7
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
Citizen           0
Income            0
Approved          0
dtype: int64
Male              3
Age               6
Debt              0
Married           0
BankCustomer      0
EducationLevel    2
Ethnicity         2
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
Citizen           0
Income            0
Approved          0
dtype: int64


In [6]:
# Impute the non-numeric missing values with the most frequent values
for col in cc_apps_train:
    if  cc_apps_train[col].dtypes == 'object':
        # Impute with the most frequent value
        cc_apps_train = cc_apps_train.fillna(cc_apps_train[col].value_counts().index[0])
        cc_apps_test = cc_apps_test.fillna(cc_apps_train[col].value_counts().index[0])

print(cc_apps_train.isnull().sum())
print(cc_apps_test.isnull().sum())

Male              0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
Citizen           0
Income            0
Approved          0
dtype: int64
Male              0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
Citizen           0
Income            0
Approved          0
dtype: int64


## Convert the non-numeric data into numeric

In [7]:
# Convert the non-numeric data into numeric
cc_apps_train = pd.get_dummies(cc_apps_train)
cc_apps_test = pd.get_dummies(cc_apps_test)

# Reindex the columns of the test set aligning with the train set
cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

## Scale the features

In [8]:
X_train, y_train = cc_apps_train.iloc[:, :-1].values, cc_apps_train.iloc[:, [-1]].values
X_test, y_test = cc_apps_test.iloc[:, :-1].values, cc_apps_test.iloc[:, [-1]].values

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0,1))
scaledX_train = scaler.fit_transform(X_train)
scaledX_test = scaler.transform(X_test)

##  Fit the logistic regression model

In [9]:
# Instantiate a LogisticRegression classifier 
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(scaledX_train, y_train)

LogisticRegression()

## Make predictions and evaluate performance¶

In [10]:
# Predict instances from the test set with logreg model
y_pred = logreg.predict(scaledX_test)

# Accuracy score of the logreg model 
print(f'Accuracy of the logistic regression classifier: {logreg.score(scaledX_test, y_test)*100}%')

# Confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))

Accuracy of the logistic regression classifier: 100.0%
[[ 97   0]
 [  0 110]]


# Grid search

In [11]:
# Define the grid of values for tol(Tolerance for stopping criteria) and max_iter (Maximum number of iterations) 
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

param_grid = {'tol':tol, 'max_iter':max_iter}

# Instantiate the GridSearchCV 
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

In [12]:
# Fit grid_model to the data
grid_model_result = grid_model.fit(scaledX_train, y_train)

best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print(f'Best: {best_score} using {best_params}')

# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print(f'Accuracy of logistic regression classifier: {best_model.score(scaledX_test,y_test)*100}%')

Best: 1.0 using {'max_iter': 100, 'tol': 0.01}
Accuracy of logistic regression classifier: 100.0%
