In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_df = pd.read_csv(Path('Resources/Generator/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/Generator/2020Q1loans.csv'))

In [3]:
# Convert categorical data to numeric and separate target feature for training data
train_df = pd.get_dummies(train_df, columns=['home_ownership','initial_list_status', 'application_type', 'hardship_flag', \
                                 'debt_settlement_flag', 'verification_status', 'pymnt_plan', 'target'], drop_first=True)
train_df

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,initial_list_status_w,application_type_Joint App,hardship_flag_Y,debt_settlement_flag_Y,verification_status_Source Verified,verification_status_Verified,target_low_risk
0,7000.0,0.1894,256.38,75000.0,28.62,0.0,2.0,20.0,0.0,40414.0,...,1,0,0,0,0,0,0,0,0,1
1,40000.0,0.1614,975.71,102000.0,11.72,2.0,0.0,10.0,0.0,43531.0,...,1,0,0,1,0,0,0,1,0,1
2,11000.0,0.2055,294.81,45000.0,37.25,1.0,3.0,23.0,0.0,8242.0,...,0,0,1,1,0,0,0,0,1,1
3,4000.0,0.1612,140.87,38000.0,42.89,1.0,0.0,7.0,0.0,12767.0,...,1,0,0,1,1,0,0,0,0,1
4,14000.0,0.1797,505.93,43000.0,22.16,1.0,0.0,22.0,0.0,11182.0,...,1,0,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,28000.0,28.42,0.0,0.0,15.0,0.0,9679.0,...,0,0,1,1,0,0,0,0,0,0
12176,15000.0,0.1774,540.34,50000.0,23.43,4.0,0.0,16.0,0.0,3193.0,...,0,0,1,1,0,0,0,0,1,0
12177,3600.0,0.1862,131.28,60000.0,28.80,0.0,1.0,14.0,0.0,11804.0,...,0,0,1,1,0,0,0,0,0,0
12178,15000.0,0.0881,475.68,62000.0,11.44,0.0,0.0,5.0,0.0,157.0,...,1,0,0,1,1,0,0,1,0,0


In [4]:
# Convert categorical data to numeric and separate target feature for testing data
test_df = pd.get_dummies(test_df, columns=['home_ownership','initial_list_status', 'application_type', 'hardship_flag', \
                                 'debt_settlement_flag', 'verification_status', 'pymnt_plan', 'target'], drop_first=True)
test_df

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,total_il_high_credit_limit,home_ownership_MORTGAGE,home_ownership_OWN,home_ownership_RENT,initial_list_status_w,application_type_Joint App,hardship_flag_Y,verification_status_Source Verified,verification_status_Verified,target_low_risk
0,40000.0,0.1033,856.40,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,14515.0,0,0,1,1,0,1,1,0,1
1,24450.0,0.1430,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,18925.0,1,0,0,1,0,0,0,0,1
2,13500.0,0.1430,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,61724.0,0,1,0,1,0,1,0,0,1
3,10625.0,0.1774,268.31,60000.0,15.70,0.0,4.0,17.0,0.0,6216.0,...,26255.0,0,0,1,1,0,0,0,1,1
4,6375.0,0.1862,232.46,60000.0,35.50,0.0,0.0,13.0,0.0,12681.0,...,72345.0,0,0,1,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,107388.0,0,0,1,0,0,0,1,0,0
4698,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,459.0,...,30775.0,0,0,1,1,0,0,0,0,0
4699,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,29550.0,0,0,1,0,0,0,0,1,0
4700,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,9657.0,0,0,1,1,0,0,1,0,0


In [5]:
# add missing dummy variables to testing set
# list(test_df)
test_df['debt_settlement_flag'] = 0


In [6]:
X_train = train_df.drop(['target_low_risk'], axis=1)
y_train = train_df['target_low_risk']
X_test = test_df.drop(['target_low_risk'], axis=1)
y_test = test_df['target_low_risk']

print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

12180
12180
4702
4702


In [7]:
# (AI) Scale x_train and x_test 
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Educated Guess on Better Model (LogisticRegression vs. RandomTreeClassifier)
Looking at the variability of the coeffeicient, Logistic Regression would likely not be a good model as it could get confused the different coefficients; whereas, Random Tree classifier a random forest algorithm will sample the data and build several smaller, simpler decisions trees, each tree is much simpler because it is built from a subset of the data. Each tree is considered a “weak classifier” but when you combine them, they form a “strong classifier.”


## Fit model to unscaled and scaled data on LogisticRegression

In [8]:
# Train the Logistic Regression model on the unscaled data and print the model score
# (AI) Fit unscaled data to Linear Regression model
reg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
reg.score(X_test, y_test)
# print(reg.coef_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.5087196937473416

In [9]:
# Fit the Logistic Regression model on the scaled data and print the model score
reg = LogisticRegression().fit(X_train_scaled, y_train)
reg.score(X_test_scaled, y_test)
# print(reg.coef_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.67333049766057

## Fit model to unscaled and scaled data on RandomForestClassifier before feature selection

In [10]:
# Train a Random Forest Classifier model and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 1.0
Testing Score: 0.6407911527009783


In [11]:
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6405784772437261


## Feature selection

In [12]:
# Get feature importance
features = clf.feature_importances_
features = sorted(features, reverse=True)


In [13]:
# Use SelectFromModel to only get the important features
from sklearn.feature_selection import SelectFromModel

# create selection object by passing in the model fitted above
# Note, the original model MUST be one that has a feature_importances_ or coef_
sel = SelectFromModel(clf)

In [14]:
# Fit selection object to the training set
sel.fit(X_train_scaled, y_train)

# Displays an array indicating whether given feature is important
sel.get_support()

array([ True,  True,  True,  True,  True, False, False, False, False,
        True, False,  True,  True,  True,  True,  True,  True,  True,
       False, False,  True, False, False, False, False,  True, False,
       False, False, False,  True,  True,  True, False, False,  True,
        True,  True, False, False, False, False,  True,  True,  True,
       False, False,  True,  True,  True, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True, False, False, False, False, False,
       False, False, False, False])

In [15]:
# Retrain entire model on just selected features by transforming features
# Using the sklearn selection object
# X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X_train_scaled), y_train, random_state=1)
X_selected_train = sel.transform(X_train_scaled)
X_selected_test = sel.transform(X_test_scaled)



## Scale feature selected data

In [16]:
# Scale the data
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

## Educated guess on LogisticRegression vs. RandomForestRegression. 

RF still seems to be the better model to use based on the variability of the features.

## Fit the selected featured scaled data to LogisticRegression and RandomForestRegression models


In [17]:
# Train a Random Forest Classifier model on the scaled data and print the model score
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.6108039132284134


In [18]:
# Fit the selected featured scaled data nto 
clf = LogisticRegression()
clf.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

Training Score: 0.7094417077175698
Testing Score: 0.7735006380263718


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


## The logistic regression model fit to the full dataset is overfitting the training data. That is, the noisy parameters confuse it.  However, fitting to the smaller dataset with the selected features has less noisy parameters to confuse the model. So, we get a more accurate model.

## Hypertuning feature selected data

In [19]:
# KNN classifier
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()

In [20]:
# Create the grid search estimator along with a parameter object containing the values to adjust.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using 10, 50, 100, and 500.
# Include both uniform and distance options for weights.
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance'],
    'leaf_size': [10, 50, 100, 500]
}
grid_clf = GridSearchCV(model, param_grid, verbose=3)

## Fit the model by using the grid search estimator.

In [22]:
# Fit the model by using the grid search estimator.
# This will take the KNN model and try each combination of parameters.
grid_clf.fit(X_selected_train_scaled, y_train)
# grid_clf.fit(X_selected_test_scaled, y_t)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.542, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.6s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.569, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.3s remaining:    0.0s


[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.606, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.604, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=uniform ....................
[CV]  leaf_size=10, n_neighbors=1, weights=uniform, score=0.602, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=1, weights=distance, score=0.542, total=   1.6s
[CV] leaf_size=10, n_neighbors=1, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=1, weights=distance, score=0.569, total=   1.6s
[CV] leaf_size=10, n_neighbors=1, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=1, weights=distance, score=0.606, total=   1.7s
[CV] leaf_size=10, n_neighbors=1, weights=distance ...................
[CV]  leaf_size=10, n_neighbors=1, weights=distance, score=0.604, total=   1.7s
[CV] leaf_size=10

[CV]  leaf_size=10, n_neighbors=11, weights=distance, score=0.671, total=   1.9s
[CV] leaf_size=10, n_neighbors=11, weights=distance ..................
[CV]  leaf_size=10, n_neighbors=11, weights=distance, score=0.666, total=   2.0s
[CV] leaf_size=10, n_neighbors=11, weights=distance ..................
[CV]  leaf_size=10, n_neighbors=11, weights=distance, score=0.661, total=   2.0s
[CV] leaf_size=10, n_neighbors=13, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=13, weights=uniform, score=0.581, total=   2.0s
[CV] leaf_size=10, n_neighbors=13, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=13, weights=uniform, score=0.624, total=   2.0s
[CV] leaf_size=10, n_neighbors=13, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=13, weights=uniform, score=0.672, total=   2.1s
[CV] leaf_size=10, n_neighbors=13, weights=uniform ...................
[CV]  leaf_size=10, n_neighbors=13, weights=uniform, score=0.669, total=   2.0s
[CV] leaf_s

[CV]  leaf_size=50, n_neighbors=3, weights=uniform, score=0.643, total=   1.1s
[CV] leaf_size=50, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=50, n_neighbors=3, weights=uniform, score=0.624, total=   1.1s
[CV] leaf_size=50, n_neighbors=3, weights=uniform ....................
[CV]  leaf_size=50, n_neighbors=3, weights=uniform, score=0.627, total=   1.1s
[CV] leaf_size=50, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=50, n_neighbors=3, weights=distance, score=0.558, total=   1.0s
[CV] leaf_size=50, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=50, n_neighbors=3, weights=distance, score=0.590, total=   1.1s
[CV] leaf_size=50, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=50, n_neighbors=3, weights=distance, score=0.643, total=   1.1s
[CV] leaf_size=50, n_neighbors=3, weights=distance ...................
[CV]  leaf_size=50, n_neighbors=3, weights=distance, score=0.624, total=   1.1s
[CV] leaf_size=50

[CV]  leaf_size=50, n_neighbors=13, weights=distance, score=0.672, total=   1.3s
[CV] leaf_size=50, n_neighbors=13, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=13, weights=distance, score=0.669, total=   1.4s
[CV] leaf_size=50, n_neighbors=13, weights=distance ..................
[CV]  leaf_size=50, n_neighbors=13, weights=distance, score=0.665, total=   1.4s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.576, total=   1.4s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.615, total=   1.3s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.666, total=   1.3s
[CV] leaf_size=50, n_neighbors=15, weights=uniform ...................
[CV]  leaf_size=50, n_neighbors=15, weights=uniform, score=0.669, total=   1.3s
[CV] leaf_s

[CV]  leaf_size=100, n_neighbors=5, weights=uniform, score=0.643, total=   1.2s
[CV] leaf_size=100, n_neighbors=5, weights=uniform ...................
[CV]  leaf_size=100, n_neighbors=5, weights=uniform, score=0.659, total=   1.2s
[CV] leaf_size=100, n_neighbors=5, weights=uniform ...................
[CV]  leaf_size=100, n_neighbors=5, weights=uniform, score=0.635, total=   1.2s
[CV] leaf_size=100, n_neighbors=5, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=5, weights=distance, score=0.564, total=   1.1s
[CV] leaf_size=100, n_neighbors=5, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=5, weights=distance, score=0.609, total=   1.1s
[CV] leaf_size=100, n_neighbors=5, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=5, weights=distance, score=0.643, total=   1.1s
[CV] leaf_size=100, n_neighbors=5, weights=distance ..................
[CV]  leaf_size=100, n_neighbors=5, weights=distance, score=0.659, total=   1.1s
[CV] leaf_

[CV]  leaf_size=100, n_neighbors=15, weights=distance, score=0.615, total=   1.1s
[CV] leaf_size=100, n_neighbors=15, weights=distance .................
[CV]  leaf_size=100, n_neighbors=15, weights=distance, score=0.667, total=   1.2s
[CV] leaf_size=100, n_neighbors=15, weights=distance .................
[CV]  leaf_size=100, n_neighbors=15, weights=distance, score=0.669, total=   1.2s
[CV] leaf_size=100, n_neighbors=15, weights=distance .................
[CV]  leaf_size=100, n_neighbors=15, weights=distance, score=0.671, total=   1.2s
[CV] leaf_size=100, n_neighbors=17, weights=uniform ..................
[CV]  leaf_size=100, n_neighbors=17, weights=uniform, score=0.576, total=   1.2s
[CV] leaf_size=100, n_neighbors=17, weights=uniform ..................
[CV]  leaf_size=100, n_neighbors=17, weights=uniform, score=0.617, total=   1.2s
[CV] leaf_size=100, n_neighbors=17, weights=uniform ..................
[CV]  leaf_size=100, n_neighbors=17, weights=uniform, score=0.675, total=   1.3s
[CV

[CV]  leaf_size=500, n_neighbors=7, weights=uniform, score=0.571, total=   1.1s
[CV] leaf_size=500, n_neighbors=7, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=7, weights=uniform, score=0.617, total=   1.1s
[CV] leaf_size=500, n_neighbors=7, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=7, weights=uniform, score=0.656, total=   1.1s
[CV] leaf_size=500, n_neighbors=7, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=7, weights=uniform, score=0.653, total=   1.1s
[CV] leaf_size=500, n_neighbors=7, weights=uniform ...................
[CV]  leaf_size=500, n_neighbors=7, weights=uniform, score=0.646, total=   1.1s
[CV] leaf_size=500, n_neighbors=7, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=7, weights=distance, score=0.571, total=   1.0s
[CV] leaf_size=500, n_neighbors=7, weights=distance ..................
[CV]  leaf_size=500, n_neighbors=7, weights=distance, score=0.617, total=   1.0s
[CV] leaf_si

[CV]  leaf_size=500, n_neighbors=17, weights=uniform, score=0.676, total=   1.1s
[CV] leaf_size=500, n_neighbors=17, weights=distance .................
[CV]  leaf_size=500, n_neighbors=17, weights=distance, score=0.576, total=   1.0s
[CV] leaf_size=500, n_neighbors=17, weights=distance .................
[CV]  leaf_size=500, n_neighbors=17, weights=distance, score=0.618, total=   1.0s
[CV] leaf_size=500, n_neighbors=17, weights=distance .................
[CV]  leaf_size=500, n_neighbors=17, weights=distance, score=0.675, total=   1.0s
[CV] leaf_size=500, n_neighbors=17, weights=distance .................
[CV]  leaf_size=500, n_neighbors=17, weights=distance, score=0.672, total=   1.0s
[CV] leaf_size=500, n_neighbors=17, weights=distance .................
[CV]  leaf_size=500, n_neighbors=17, weights=distance, score=0.676, total=   1.0s
[CV] leaf_size=500, n_neighbors=19, weights=uniform ..................
[CV]  leaf_size=500, n_neighbors=19, weights=uniform, score=0.579, total=   1.1s
[C

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  9.0min finished


GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'leaf_size': [10, 50, 100, 500],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [23]:
# List the best parameters for this dataset
print(grid_clf.best_params_)

{'leaf_size': 10, 'n_neighbors': 19, 'weights': 'distance'}


In [24]:
# List the best score
print(grid_clf.best_score_)

0.6459770114942529


In [25]:
# Create the parameter object for the randomized search estimator.
# Try adjusting n_neighbors with values of 1 through 19. Adjust leaf_size by using a range from 1 to 500.
# Include both uniform and distance options for weights.
param_grid = {
    'n_neighbors': np.arange(1,20,2),
    'weights': ['uniform', 'distance'],
    'leaf_size': np.arange(1, 500)
}
param_grid

{'n_neighbors': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
 'weights': ['uniform', 'distance'],
 'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        1

In [26]:
# Create the randomized search estimator by using the logistic regression model and the parameter grid that you created.
from sklearn.model_selection import RandomizedSearchCV
random_clf = RandomizedSearchCV(model, param_grid, random_state=0, verbose=3)

## Fit the model by using the randomized search estimator.

In [27]:
# Fit the model by using the randomized search estimator.
# This will take the logistic regression model and a random sample of combinations of parameters.
random_clf.fit(X_selected_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.581, total=   1.1s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.624, total=   1.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.672, total=   1.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.669, total=   1.0s
[CV] weights=uniform, n_neighbors=13, leaf_size=137 ..................
[CV]  weights=uniform, n_neighbors=13, leaf_size=137, score=0.664, total=   1.0s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.564, total=   0.8s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.609, total=   0.9s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.643, total=   0.9s
[CV] weights=distance, n_neighbors=5, leaf_size=493 ..................
[CV]  weights=distance, n_neighbors=5, leaf_size=493, score=0.659, total=   0.9s
[CV] we

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   48.6s finished


RandomizedSearchCV(estimator=KNeighborsClassifier(),
                   param_distributions={'leaf_size': array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,...
       430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442,
       443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455,
       456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468,
       469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
       482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
       495, 496, 497, 498, 499]),
   

In [28]:
# List the best parameters for this dataset
print(random_clf.best_params_)

{'weights': 'distance', 'n_neighbors': 19, 'leaf_size': 243}


In [29]:
# List the best score
print(random_clf.best_score_)

0.6459770114942529


In [30]:
# Make predictions with the hypertuned model
predictions = random_clf.predict(X_selected_test_scaled)

In [32]:
target_names = ["low_risk", "high_risk"]

In [33]:
# Calculate the classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    low_risk       0.57      0.60      0.58      2351
   high_risk       0.57      0.54      0.56      2351

    accuracy                           0.57      4702
   macro avg       0.57      0.57      0.57      4702
weighted avg       0.57      0.57      0.57      4702

