In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel("credit_card_payment.xls", first_sheet = "Sheet1")

In [3]:
df.shape

(30000, 25)

In [4]:
x = df.iloc[:,1:-1].values 
y = df.iloc[:,-1].values

In [5]:
y.shape

(30000,)

In [6]:
x[:1]

array([[20000, 'Male', 2, 1, 24, 2, 2, -1, -1, -2, -2, 3913, 3102, 689,
        0, 0, 0, 0, 689, 0, 0, 0, 0]], dtype=object)

In [7]:
np.delete(x,1,1)

array([[20000, 2, 1, ..., 0, 0, 0],
       [120000, 2, 2, ..., 1000, 0, 2000],
       [90000, 2, 2, ..., 1000, 1000, 5000],
       ...,
       [30000, 2, 2, ..., 4200, 2000, 3100],
       [80000, 3, 1, ..., 1926, 52964, 1804],
       [50000, 2, 1, ..., 1000, 1000, 1000]], dtype=object)

In [8]:
x = np.delete(x,1,1)

In [9]:
x

array([[20000, 2, 1, ..., 0, 0, 0],
       [120000, 2, 2, ..., 1000, 0, 2000],
       [90000, 2, 2, ..., 1000, 1000, 5000],
       ...,
       [30000, 2, 2, ..., 4200, 2000, 3100],
       [80000, 3, 1, ..., 1926, 52964, 1804],
       [50000, 2, 1, ..., 1000, 1000, 1000]], dtype=object)

In [10]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [11]:
x_test.shape

(6000, 22)

In [12]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score
confusion_matrix(y_train,lr.predict(x_train))

array([[18739,     2],
       [ 5259,     0]], dtype=int64)

In [16]:
lr.score(x_train,y_train)

0.7807916666666667

In [17]:
lr.score(x_test,y_test)

0.7705

In [18]:
lr.predict_proba(x_train)[,1]

array([[0.75359535, 0.24640465],
       [0.6394955 , 0.3605045 ],
       [0.60100934, 0.39899066],
       ...,
       [0.69306869, 0.30693131],
       [0.73829977, 0.26170023],
       [0.85770878, 0.14229122]])

In [22]:
prob = lr.predict_proba(x_train)[:,1]

In [24]:
for t in np.arange(0,1,0.05):
    class_labels = prob >= t
    print("Threshold is", t)
    print(confusion_matrix(y_train,class_labels.astype(int)))
    print()

Threshold is 0.0
[[    0 18741]
 [    0  5259]]

Threshold is 0.05
[[ 1098 17643]
 [   88  5171]]

Threshold is 0.1
[[ 2540 16201]
 [  266  4993]]

Threshold is 0.15000000000000002
[[ 4723 14018]
 [  599  4660]]

Threshold is 0.2
[[ 7501 11240]
 [ 1114  4145]]

Threshold is 0.25
[[10633  8108]
 [ 1844  3415]]

Threshold is 0.30000000000000004
[[14033  4708]
 [ 2876  2383]]

Threshold is 0.35000000000000003
[[16936  1805]
 [ 4167  1092]]

Threshold is 0.4
[[18630   111]
 [ 5147   112]]

Threshold is 0.45
[[18737     4]
 [ 5258     1]]

Threshold is 0.5
[[18739     2]
 [ 5259     0]]

Threshold is 0.55
[[18740     1]
 [ 5259     0]]

Threshold is 0.6000000000000001
[[18740     1]
 [ 5259     0]]

Threshold is 0.65
[[18741     0]
 [ 5259     0]]

Threshold is 0.7000000000000001
[[18741     0]
 [ 5259     0]]

Threshold is 0.75
[[18741     0]
 [ 5259     0]]

Threshold is 0.8
[[18741     0]
 [ 5259     0]]

Threshold is 0.8500000000000001
[[18741     0]
 [ 5259     0]]

Threshold is 0.9
[[

In [47]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(criterion='entropy',n_estimators=100,max_features=10,max_depth=10)
rf_classifier.fit(x_train,y_train)
rf_classifier.score(x_train,y_train)

0.852625

In [51]:
from sklearn.model_selection import GridSearchCV
forest = RandomForestClassifier()
param_grid = {
    "min_samples_leaf": np.arange(1,10,1),
    "max_features": [3,5,7,10],
    "max_depth": [5,4,3,None],
    "criterion":['gini','entropy']
}
grid = GridSearchCV (estimator=forest, param_grid=param_grid, cv=5)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'max_features': [3, 5, 7, 10], 'max_depth': [5, 4, 3, None], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [52]:
print(grid.best_estimator_,grid.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features=10, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False) 0.822875


In [53]:
confusion_matrix(y_train,rf_classifier.predict(x_train))

array([[18195,   546],
       [ 2991,  2268]], dtype=int64)

In [54]:
confusion_matrix(y_test,rf_classifier.predict(x_test))

array([[4437,  186],
       [ 903,  474]], dtype=int64)

In [48]:
rf_classifier.score(x_test,y_test)

0.8185

In [49]:
confusion_matrix(y_train,rf_classifier.predict(x_train))

array([[18195,   546],
       [ 2991,  2268]], dtype=int64)

In [50]:
confusion_matrix(y_test,rf_classifier.predict(x_test))

array([[4437,  186],
       [ 903,  474]], dtype=int64)