In [1]:
# Import modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load data
cc_apps = pd.read_csv('Data/crx.data', header=None)

# Adding column names
columns = ['Male','Age','Debt','Married','BankCustomer','EducationLevel',
           'Ethnicity','YearsEmployed','PriorDefault','Employed',
           'CreditScore','DriversLicense','Citizen','ZipCode','Income','Approved']

# Preserving original
cc_orig = cc_apps.copy()

# Changing column names
cc_apps.columns = columns

cc_apps.sample(5)
cc_apps.shape

(690, 16)

In [3]:
# Replace '?' with np.nan

cc_apps.replace(to_replace='?', value=np.nan, inplace=True)

In [4]:
# Checking for missing values

cc_apps.isnull().sum()

Male              12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
Approved           0
dtype: int64

In [5]:
# Examining numeric and non-numeric columns
non_numeric_columns = []
numeric_columns = []

for col in cc_apps.columns:
    if cc_apps[col].dtype == 'object' and col != 'Approved':
        non_numeric_columns.append(col)
    elif cc_apps[col].dtype in ['int','float']:
        numeric_columns.append(col)

In [6]:
# Convert Age to numeric column

cc_apps['Age'] = cc_apps['Age'].astype(float)
print(cc_apps['Age'].dtype)

numeric_columns.insert(0,'Age')
non_numeric_columns.remove('Age')

float64


In [7]:
numeric_columns

['Age', 'Debt', 'YearsEmployed', 'CreditScore', 'Income']

In [8]:
non_numeric_columns

['Male',
 'Married',
 'BankCustomer',
 'EducationLevel',
 'Ethnicity',
 'PriorDefault',
 'Employed',
 'DriversLicense',
 'Citizen',
 'ZipCode']

In [9]:
# Impute missing values: Define column transformer

from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

column_trans1 = make_column_transformer(
    (SimpleImputer(), numeric_columns),
    (SimpleImputer(missing_values=np.nan, strategy='most_frequent'), non_numeric_columns),
    remainder='passthrough')

In [10]:
# Forming features df 

X = cc_apps.drop('Approved', axis='columns')
X.head()

Unnamed: 0,Male,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0


In [11]:
# Impute missing values: Fit&Transform column transformer

X_imp_arr = column_trans1.fit_transform(X)
X_imp_df = pd.DataFrame(X_imp_arr, columns = numeric_columns+non_numeric_columns)
X_imp_df.drop('ZipCode', axis='columns', inplace=True)
non_numeric_columns.remove('ZipCode')
X_imp_df.head()

Unnamed: 0,Age,Debt,YearsEmployed,CreditScore,Income,Male,Married,BankCustomer,EducationLevel,Ethnicity,PriorDefault,Employed,DriversLicense,Citizen
0,30.83,0.0,1.25,1,0,b,u,g,w,v,t,t,f,g
1,58.67,4.46,3.04,6,560,a,u,g,q,h,t,t,f,g
2,24.5,0.5,1.5,0,824,a,u,g,q,h,t,f,f,g
3,27.83,1.54,3.75,5,3,b,u,g,w,v,t,t,t,g
4,20.17,5.625,1.71,0,0,b,u,g,w,v,t,f,f,s


In [12]:
# Check for missing values after imputation

X_imp_df.isnull().sum()

Age               0
Debt              0
YearsEmployed     0
CreditScore       0
Income            0
Male              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
PriorDefault      0
Employed          0
DriversLicense    0
Citizen           0
dtype: int64

In [13]:
# Encode target categorical variable

from sklearn.preprocessing import LabelEncoder
y = LabelEncoder().fit_transform(cc_apps['Approved'])

In [14]:
# Preprocessing
# Instantiate LabelEncoder
le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in non_numeric_columns:
    # Fit and train LabelEncoder 
    X_imp_df[col]=le.fit_transform(X_imp_df[col])

In [15]:
X_imp_df.head()

Unnamed: 0,Age,Debt,YearsEmployed,CreditScore,Income,Male,Married,BankCustomer,EducationLevel,Ethnicity,PriorDefault,Employed,DriversLicense,Citizen
0,30.83,0.0,1.25,1,0,1,1,0,12,7,1,1,0,0
1,58.67,4.46,3.04,6,560,0,1,0,10,3,1,1,0,0
2,24.5,0.5,1.5,0,824,0,1,0,10,3,1,0,0,0
3,27.83,1.54,3.75,5,3,1,1,0,12,7,1,1,1,0
4,20.17,5.625,1.71,0,0,1,1,0,12,7,1,0,0,2


In [16]:
# Splitting data into train and test sets:

# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop DriversLicense feature and convert the DataFrame to a NumPy array
X_imp_df.drop(['DriversLicense'], axis=1, inplace=True)

# Creating a copy of the df before converting to an array
cc_apps_after_preprocessing = X_imp_df.copy()

# Form features array
X = X_imp_df.values

In [17]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

In [18]:
# Scaling data

# Import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [19]:
# Fitting a Logistic Regression to training data (which has been rescaled)

# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression(solver='lbfgs')

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [20]:
# Making predictions and evaluating performance

# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
print(confusion_matrix(y_test, y_pred))

Accuracy of logistic regression classifier:  0.8421052631578947
[[94  9]
 [27 98]]


In [21]:
# Hyperparameter tuning to seek better model performance

# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define grid parameters
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]


# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict([('tol', tol), ('max_iter', max_iter)])
param_grid

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}

In [22]:
# Find the best performing model

# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best score: %f using hyperparameters: %s" % (best_score, best_params))

Best score: 0.850725 using hyperparameters: {'max_iter': 100, 'tol': 0.01}


---