Commercial banks receive _a lot_ of applications for credit cards. Many of them get rejected for many reasons, like high loan balances, low income levels, or too many inquiries on an individual's credit report, for example. Manually analyzing these applications is mundane, error-prone, and time-consuming (and time is money!). Luckily, this task can be automated with the power of machine learning and pretty much every commercial bank does so nowadays. In this notebook, we will build an automatic credit card approval predictor using machine learning techniques, just like real banks do.

![Credit card being held in hand](credit_card.jpg)

You have been provided with a small subset of the credit card applications a bank receives. The dataset has been loaded as a Pandas DataFrame for you. You will start from there. 

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

# Load dataset
cc_apps = pd.read_csv("cc_approvals.data", header=None)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [3]:
# Drop features 11 + 13
cc_apps = cc_apps.drop([11, 13], axis=1)

In [4]:
# Check data types - mostly look right, except column 1 should be numeric
cc_apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  12      690 non-null    object 
 12  14      690 non-null    int64  
 13  15      690 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 75.6+ KB


In [5]:
# Split data into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

In [6]:
# Check to see if there are nulls - there are none
cc_apps.isnull().count()

0     690
1     690
2     690
3     690
4     690
5     690
6     690
7     690
8     690
9     690
10    690
12    690
14    690
15    690
dtype: int64

In [7]:
# Check for odd values - see ?s instead of NaNs
cc_apps.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,12,14,15
670,b,47.17,5.835,u,g,w,v,5.5,f,f,0,g,150,-
671,b,25.83,12.835,u,g,cc,v,0.5,f,f,0,g,2,-
672,a,50.25,0.835,u,g,aa,v,0.5,f,f,0,g,117,-
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,g,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,g,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,g,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,g,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,g,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,g,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,g,0,-


In [8]:
# Replace the ?s in the data with NaN
cc_apps_train_nans_replaced = cc_apps_train.replace("?", np.NaN)
cc_apps_test_nans_replaced = cc_apps_test.replace("?", np.NaN)

In [9]:
# Check data types - mostly look right, except column 1 should be numeric
cc_apps_train_nans_replaced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 382 to 102
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       454 non-null    object 
 1   1       457 non-null    object 
 2   2       462 non-null    float64
 3   3       456 non-null    object 
 4   4       456 non-null    object 
 5   5       455 non-null    object 
 6   6       455 non-null    object 
 7   7       462 non-null    float64
 8   8       462 non-null    object 
 9   9       462 non-null    object 
 10  10      462 non-null    int64  
 11  12      462 non-null    object 
 12  14      462 non-null    int64  
 13  15      462 non-null    object 
dtypes: float64(2), int64(2), object(10)
memory usage: 54.1+ KB


In [10]:
# Change column 1 to data type float
cc_apps_train_nans_replaced = cc_apps_train_nans_replaced.astype({1: float})
cc_apps_test_nans_replaced = cc_apps_test_nans_replaced.astype({1: float})
cc_apps_train_nans_replaced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 382 to 102
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       454 non-null    object 
 1   1       457 non-null    float64
 2   2       462 non-null    float64
 3   3       456 non-null    object 
 4   4       456 non-null    object 
 5   5       455 non-null    object 
 6   6       455 non-null    object 
 7   7       462 non-null    float64
 8   8       462 non-null    object 
 9   9       462 non-null    object 
 10  10      462 non-null    int64  
 11  12      462 non-null    object 
 12  14      462 non-null    int64  
 13  15      462 non-null    object 
dtypes: float64(3), int64(2), object(9)
memory usage: 54.1+ KB


In [34]:
cc_apps_train_nans_replaced[1].fillna(cc_apps_train_nans_replaced[1].mean())

382    24.33
137    33.58
346    32.25
326    30.17
33     36.75
       ...  
71     34.83
106    28.75
270    37.58
435    19.00
102    18.67
Name: 1, Length: 462, dtype: float64

In [11]:
# Impute missing values for numeric columns
cc_apps_train_imputed = cc_apps_train_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())
cc_apps_test_imputed = cc_apps_test_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())

# Impute missing values for object-type columns
for col in cc_apps_train_imputed.columns:
    if cc_apps_train_imputed[col].dtypes == "object":
        cc_apps_train_imputed.fillna(
            cc_apps_train_imputed[col].value_counts().index[0],inplace=True
        )
        cc_apps_test_imputed.fillna(
            cc_apps_train_imputed[col].value_counts().index[0],inplace=True
        )

  cc_apps_train_imputed = cc_apps_train_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())
  cc_apps_test_imputed = cc_apps_test_nans_replaced.fillna(cc_apps_train_nans_replaced.mean())


In [12]:
# Convert categorical features into numeric
cc_apps_train_cat_encoding = pd.get_dummies(cc_apps_train_imputed)
cc_apps_test_cat_encoding = pd.get_dummies(cc_apps_test_imputed)

In [13]:
# Checks

# 204
cc_apps_train_cat_encoding[cc_apps_train_cat_encoding["15_-"] == 0].count()

# 258
cc_apps_train_cat_encoding[cc_apps_train_cat_encoding["15_-"] == 1].count()

# 103
cc_apps_test_cat_encoding[cc_apps_test_cat_encoding["15_-"] == 0].count()

# 125
cc_apps_test_cat_encoding[cc_apps_test_cat_encoding["15_-"] == 1].count()

1       125
2       125
7       125
10      125
14      125
0_a     125
0_b     125
3_l     125
3_u     125
3_y     125
4_g     125
4_gg    125
4_p     125
5_aa    125
5_b     125
5_c     125
5_cc    125
5_d     125
5_e     125
5_ff    125
5_i     125
5_j     125
5_k     125
5_m     125
5_q     125
5_r     125
5_w     125
5_x     125
6_b     125
6_bb    125
6_dd    125
6_ff    125
6_h     125
6_j     125
6_n     125
6_v     125
6_z     125
8_f     125
8_t     125
9_f     125
9_t     125
12_g    125
12_p    125
12_s    125
15_+    125
15_-    125
dtype: int64

In [14]:
# Reindex
cc_apps_test_cat_encoding = cc_apps_test_cat_encoding.reindex(
    columns=cc_apps_train_cat_encoding.columns, fill_value=0
)

In [15]:
# Put into separate variables
X_train, y_train = (
    cc_apps_train_cat_encoding.iloc[:, :-1].values,
    cc_apps_train_cat_encoding.iloc[:, [-1]].values,
)
X_test, y_test = (
    cc_apps_test_cat_encoding.iloc[:, :-1].values,
    cc_apps_test_cat_encoding.iloc[:, [-1]].values,
)

In [16]:
# Rescale
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

In [17]:
# LogisticRegression
logreg = LogisticRegression()
logreg.fit(rescaledX_train, y_train)
y_pred = logreg.predict(rescaledX_test)

  y = column_or_1d(y, warn=True)


In [19]:
# Confusion matrix of the logreg model
confusion_matrix(y_test, y_pred)

array([[103,   0],
       [  0, 125]], dtype=int64)

In [20]:
# GridSearchCV
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]
param_grid = {"tol":tol, 
              "max_iter":max_iter}

grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)
grid_model_result = grid_model.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [22]:
# Results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 1.000000 using {'max_iter': 100, 'tol': 0.01}


In [23]:
# Extract best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
print(
    "Accuracy of logistic regression classifier: ",
    best_model.score(rescaledX_test, y_test),
)

Accuracy of logistic regression classifier:  1.0
