In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot

import numpy as np
from sklearn.model_selection import train_test_split

#Transforming
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

#cluster and Eval
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
from sklearn.manifold import TSNE
from sklearn import decomposition

#Visualization
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.svm import LinearSVC

In [2]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_excel("loan_data_final.xlsx")

In [6]:
X = df.drop(['loan_status'],axis = 1)
y = df['loan_status']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

X_train.shape, X_test.shape
cols = X_train.columns

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])

### KNN

In [7]:
from sklearn.utils import shuffle
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn import utils

In [8]:
train_score = []
test_score = []
k_vals = []

for k in range(1, 21):
    k_vals.append(k)
    knn = KNeighborsClassifier(n_neighbors = k)
    knn.fit(X_train, y_train)
    
    tr_score = knn.score(X_train, y_train)
    train_score.append(tr_score)
    
    te_score = knn.score(X_test, y_test)
    test_score.append(te_score)

In [9]:
## score that comes from the testing set only
max_test_score = max(test_score)
test_scores_ind = [i for i, v in enumerate(test_score) if v == max_test_score]
print('Max test score {} and k = {}'.format(max_test_score * 100, list(map(lambda x: x + 1, test_scores_ind))))

Max test score 91.59344262295082 and k = [12]


In [11]:
#Setup a knn classifier with k neighbors
knn = KNeighborsClassifier(12)
knn.fit(X_train, y_train.ravel())
knn.score(X_test, y_test)

0.9159344262295082

In [13]:
from pandas import Series
from numpy.random import randn
from sklearn import metrics
from sklearn.metrics import make_scorer, f1_score, recall_score, precision_score
from sklearn.metrics import accuracy_score

y_pred_knn = knn.predict(X_test)
print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_knn)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_knn,labels = [1,0]))
print(metrics.classification_report(y_test,y_pred_knn))

Model accuracy score : 0.9159
Confusion metrics :
 [[6414   87]
 [ 554  570]]
              precision    recall  f1-score   support

           0       0.87      0.51      0.64      1124
           1       0.92      0.99      0.95      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.75      0.80      7625
weighted avg       0.91      0.92      0.91      7625



### Parameters Tuning (KNN)

In [37]:
from sklearn.model_selection import GridSearchCV
import numpy as np

gs = {'n_neighbors': np.arange(1,25)}
knn_gs = KNeighborsClassifier()
knn_param_search = GridSearchCV(knn_gs,gs,cv=10)
knn_param_search.fit(X_train,y_train)
knn_param_search.best_params_
print("Tuned hyperparameter parameters: {}".format(knn_param_search.best_params_)) 
print("Best cross-validation score: {}".format(knn_param_search.best_score_))

Tuned hyperparameter parameters: {'n_neighbors': 8}
Best cross-validation score: 0.9211315174979182


In [15]:
knn = KNeighborsClassifier(8)
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9147540983606557

In [16]:
y_pred_knn_gs = knn.predict(X_test)
print('Model accuracy score : {0:0.4f}'. format(accuracy_score(y_test, y_pred_knn)))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_knn,labels = [1,0]))
print(metrics.classification_report(y_test,y_pred_knn))

Model accuracy score : 0.9159
Confusion metrics :
 [[6414   87]
 [ 554  570]]
              precision    recall  f1-score   support

           0       0.87      0.51      0.64      1124
           1       0.92      0.99      0.95      6501

    accuracy                           0.92      7625
   macro avg       0.89      0.75      0.80      7625
weighted avg       0.91      0.92      0.91      7625



### Random Forest

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
rf = RandomForestRegressor(max_features = 5, n_estimators = 100)

In [18]:
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
errors = abs(y_pred_rf - y_test)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.14 degrees.


In [28]:
print(metrics.classification_report(y_test,y_pred_rf.round()))
print('Confusion metrics :\n', metrics.confusion_matrix(y_test, y_pred_rf.round(),labels = [1,0]))
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(accuracy_score(y_test, y_pred_rf.round())))

              precision    recall  f1-score   support

           0       0.86      0.56      0.68      1124
           1       0.93      0.98      0.96      6501

    accuracy                           0.92      7625
   macro avg       0.90      0.77      0.82      7625
weighted avg       0.92      0.92      0.91      7625

Confusion metrics :
 [[6402   99]
 [ 495  629]]
Model accuracy score with default hyperparameters: 0.9221


### Paremeters Tuninng (RF)

In [29]:
n_estimators = [int(x) for x in np.linspace(start = 10, stop = 80, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [2,4]
min_samples_split = [2,5]
min_samples_leaf = [1,2]
bootstrap = [True, False]

import random
scorer = make_scorer(f1_score)
param_grid = {'n_estimators':n_estimators,
             'max_feature': max_features,
             'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'bootstrap':bootstrap}
print(param_grid)

{'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80], 'max_feature': ['auto', 'sqrt'], 'max_depth': [2, 4], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'bootstrap': [True, False]}


In [38]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [43]:
rf_model = RandomForestClassifier().get_params()
RandomForestClassifier().get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [68]:
max_features_range = np.arange(1,6,1)
#n_estimators_range = np.arange(10,210,10)
n_estimators_range = [1,3,10]
param_grid = dict(max_features = max_features_range,
                  n_estimator = n_estimators_range)
scorer = make_scorer(f1_score)
rf = RandomForestClassifier()
grid = GridSearchCV(estimator=rf, 
                    param_grid=param_grid,
                    scoring=scorer,
                    cv=10)

In [67]:
grid.fit(X_train, y_train.round())

ValueError: Invalid parameter n_estimator for estimator RandomForestClassifier(max_features=1). Check the list of available parameters with `estimator.get_params().keys()`.