# Bank Account Assessment Project Part 2
Using Scikit-Learn and model evaluation

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Render figures at a higher resolution (400 DPI) for sharper plots.
plt.rcParams.update({"figure.dpi": 400})

In [5]:
# Load the cleaned dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Chapter_1_cleaned_data.csv")

In [6]:
# Class balance check for the response variable.
# The two classes do not have the same number of samples, although the
# split is still treated as acceptably balanced here.
# For strongly imbalanced data, one of these techniques should be applied:
# undersampling, oversampling, or sample weighting.

# Fraction of positive samples (mean of the 0/1 response), then a blank line.
print(df["default payment next month"].mean(), "\n")
# Number of rows (counted via the ID column) in each response class.
df["ID"].groupby(df["default payment next month"]).count()

0.2217971797179718 



default payment next month
0    20750
1     5914
Name: ID, dtype: int64

In [7]:
from sklearn.linear_model import LogisticRegression

In [14]:
# Instantiate a logistic regression estimator with no arguments,
# i.e. using the library's default hyperparameters.
my_lr = LogisticRegression()

In [17]:
# Collect the estimator's hyperparameters as a dict and display them.
params = my_lr.get_params(deep=True)
params

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [27]:
# Build a second estimator with its hyperparameters spelled out explicitly.
# The values match the defaults displayed by get_params() for the default
# estimator. Note: solver="warn" is not a valid solver name in current
# scikit-learn releases (it was only a placeholder in old versions), so the
# actual default, "lbfgs", is passed instead.
my_new_lr = LogisticRegression(
    C=1.0,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=100,
    multi_class="auto",
    n_jobs=None,
    penalty="l2",
    random_state=None,
    solver="lbfgs",
    tol=0.0001,
    verbose=0,
    warm_start=False,
)

In [28]:
# Change the regularization strength and the solver on the existing
# estimator, then display the resulting hyperparameters.
my_new_lr.set_params(C=0.1, solver="liblinear")
new_params = my_new_lr.get_params()
new_params

{'C': 0.1,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [29]:
# Feature matrix: the first 10 EDUCATION values as a single-column 2-D array
# (one row per sample, one column for the single feature).
X = df[["EDUCATION"]].head(10).to_numpy()
X

array([[2],
       [2],
       [2],
       [2],
       [2],
       [1],
       [1],
       [2],
       [3],
       [3]], dtype=int64)

In [30]:
# Response variable: the first 10 values of "default payment next month",
# aligned row-for-row with the features selected above.
Y = df["default payment next month"].head(10).to_numpy()
Y

array([1, 1, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [32]:
# Train the model on the 10-sample features/response pair; once this cell
# has run the estimator is fitted. Then display its hyperparameters.
my_new_lr.fit(X=X, y=Y)
my_new_lr.get_params(deep=True)

{'C': 0.1,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [33]:
# With the model trained, take the next 10 EDUCATION values (rows 10-19)
# as unseen features to test it on.
new_X = df[["EDUCATION"]].iloc[10:20].to_numpy()
new_X

array([[3],
       [1],
       [2],
       [2],
       [1],
       [3],
       [1],
       [1],
       [1],
       [3]], dtype=int64)

In [34]:
# Ask the trained model for class predictions on the held-out features.
my_new_lr.predict(X=new_X)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [35]:
# True responses for the same rows (10-19), to compare with the predictions.
df["default payment next month"].iloc[10:20].to_numpy()

# Comparing both arrays, the model gets 8 of the 10 answers right (80%),
# but it never predicts class 1, so it misses every actual default.
# That would be unacceptable if detecting those cases were critical.

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0], dtype=int64)