In [99]:
#IMPORTING THE REQUIRED LIBRARIES
import pandas as pd

In [100]:
#READ DATA
data = pd.read_csv("https://raw.githubusercontent.com/subashgandyer/datasets/main/loan_train.csv")

In [101]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [102]:
#CLEAN DATASET
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


In [103]:
data.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [104]:
#DEALING WITH MISSING VALUES
#Since there are nan values I am handling each as expected
data['Gender'].mode()#male values will be filled here since the mode is male
data['Gender'].fillna('Male', inplace=True)

#For self Employed
data['Self_Employed'].mode()
data['Self_Employed'].fillna('Yes', inplace=True)

#For Married
data['Married'].mode()
data['Married'].fillna('Yes', inplace=True)

data['Dependents'] = data['Dependents'].replace(['3+'], 3)
data['Dependents'].fillna('0', inplace=True)

#Fill numerical missing values
data.fillna(data.mean(), inplace=True)
#The above method is used because the missing values are less than 5%

In [105]:
#To ensure that all missing values are dealt with
data.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [106]:
#Convert all categorical data(Gender, Married, Education, Self_Employed) to numerical data

In [107]:
#By using label encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le=LabelEncoder()

In [108]:
#For Gender
data.Gender = le.fit_transform(data.Gender)

#For self Employed
data.Self_Employed = le.fit_transform(data.Self_Employed)

#For Married
data.Married = le.fit_transform(data.Married)

#For Education
data.Education = le.fit_transform(data.Education)

#For Loan_Status
data.Loan_Status = le.fit_transform(data.Loan_Status)


In [109]:
#OneHotEncoder will be used because Property Area has more than one Categories
hc= OneHotEncoder()
area = hc.fit_transform(data[['Property_Area']]).toarray()
n_frame=pd.DataFrame(area)

#crete new data frame to include the transformed data
new_data = pd.concat([data, n_frame], axis=1)

In [110]:
#Ensure test dataset is 100% processed
new_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status,0,1,2
0,LP001002,1,0,0,0,0,5849,0.0,146.412162,360.0,1.0,Urban,1,0.0,0.0,1.0
1,LP001003,1,1,1,0,0,4583,1508.0,128.000000,360.0,1.0,Rural,0,1.0,0.0,0.0
2,LP001005,1,1,0,0,1,3000,0.0,66.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
3,LP001006,1,1,0,1,0,2583,2358.0,120.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
4,LP001008,1,0,0,0,0,6000,0.0,141.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,0,0,0,0,0,2900,0.0,71.000000,360.0,1.0,Rural,1,1.0,0.0,0.0
610,LP002979,1,1,3,0,0,4106,0.0,40.000000,180.0,1.0,Rural,1,1.0,0.0,0.0
611,LP002983,1,1,1,0,0,8072,240.0,253.000000,360.0,1.0,Urban,1,0.0,0.0,1.0
612,LP002984,1,1,2,0,0,7583,0.0,187.000000,360.0,1.0,Urban,1,0.0,0.0,1.0


In [111]:
#Split data into dependent and independent variables

In [112]:
X=new_data.loc[:,['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','Loan_Amount_Term','Credit_History',0,1,2]]
Y=data['Loan_Status']

In [113]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,Loan_Amount_Term,Credit_History,0,1,2
0,1,0,0,0,0,5849,0.0,360.0,1.0,0.0,0.0,1.0
1,1,1,1,0,0,4583,1508.0,360.0,1.0,1.0,0.0,0.0
2,1,1,0,0,1,3000,0.0,360.0,1.0,0.0,0.0,1.0
3,1,1,0,1,0,2583,2358.0,360.0,1.0,0.0,0.0,1.0
4,1,0,0,0,0,6000,0.0,360.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,0,0,2900,0.0,360.0,1.0,1.0,0.0,0.0
610,1,1,3,0,0,4106,0.0,180.0,1.0,1.0,0.0,0.0
611,1,1,1,0,0,8072,240.0,360.0,1.0,0.0,0.0,1.0
612,1,1,2,0,0,7583,0.0,360.0,1.0,0.0,0.0,1.0


In [114]:
Y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 614, dtype: int32

In [115]:
#Split dataset into train and test

In [116]:
#import model selection from skit learn
from sklearn.model_selection import train_test_split
train_X, test_X, train_Y,test_Y = train_test_split(X,Y,train_size=0.5,test_size=0.5, random_state=123)

In [117]:
#Use the decision tree algorithm

In [118]:
from sklearn.tree import DecisionTreeClassifier
tree=DecisionTreeClassifier(criterion='gini', max_depth=None)

In [119]:
tree.fit(train_X, train_Y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [120]:
DecisionTreeClassifier()

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [121]:
prediction = tree.predict(test_X)

In [122]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(test_Y, prediction))

[[ 52  51]
 [ 45 159]]


In [123]:
param_dist={'criterion':['gini','entropy'],'max_depth':[1,2,3,4,5,6,7,None]}

In [124]:
from sklearn.model_selection import GridSearchCV

In [125]:
grid=GridSearchCV(tree, param_grid = param_dist, cv=10, n_jobs=-1)

In [126]:
grid.fit(train_X, train_Y)

GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=None,
                                              splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                    

In [127]:
#Loop through the preprocessing technique with LG
import numpy as np
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()

In [128]:
lr.fit(train_X, train_Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [129]:
LogisticRegression()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [130]:
reg_prediction = lr.predict(test_X)

In [131]:
print(confusion_matrix(test_Y, reg_prediction))

[[ 44  59]
 [  4 200]]


In [135]:
param_dist_reg={'penalty':['l1','l2','elasticnet', 'none'],
                'C':np.logspace(-4, 4, 20),
                 'solver':['lbfgs', 'newton-cg','liblinear','sag','saga'],
                'max_iter':[100, 1000, 2500, 5000]}

In [136]:
reg_grid=GridSearchCV(lr, param_grid=param_dist_reg, verbose=1, cv=3, n_jobs=-1)

In [137]:
reg_grid.fit(train_X, train_Y)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  44 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 1176 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 3176 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed:  1.6min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


GridSearchCV(cv=3, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': array([1.00000000e-04, 2.636...
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                         'max_iter': [100, 1000, 2500, 5000],
               

In [138]:
#The parameter give best results
reg_grid.best_params_

{'C': 0.0001, 'max_iter': 100, 'penalty': 'none', 'solver': 'newton-cg'}

In [139]:
#This estimator gives the highest score
reg_grid.best_estimator_

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False)

In [140]:
grid_predictionsReg=reg_grid.predict(test_X)

In [141]:
print(confusion_matrix(test_Y, grid_predictionsReg),'\n with accuracy',reg_grid.best_score_)

[[ 44  59]
 [  4 200]] 
 with accuracy 0.8273367599466971


In [142]:
#Use the KNN algorithm

In [143]:
#Loop through the preprocessing algorithm
from sklearn.neighbors import KNeighborsClassifier
Knn=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                                    metric_params=None, n_jobs=None, n_neighbors=5, p=2, weights='uniform')


In [144]:
Knn.fit(train_X, train_Y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [145]:
KNeighborsClassifier()

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [146]:
K_prediction = Knn.predict(test_X)

In [148]:
print(confusion_matrix(test_Y, K_prediction))

[[ 12  91]
 [ 30 174]]


In [153]:
param_distk={'n_neighbors': [5,6,11,13,15], 'weights':['uniform', 'distance'], 'metric':['minkowski', 'euclidean', 'manhattan']}

In [154]:
K_grid = GridSearchCV(KNeighborsClassifier(), param_distk, verbose=1, cv=3, n_jobs=-1)

In [155]:
K_grid.fit(train_X, train_Y)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done  83 out of  90 | elapsed:    3.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    3.3s finished


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [5, 6, 11, 13, 15],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [156]:
GridSearchCV(cv=3, estimator = KNeighborsClassifier(), n_jobs=-1, param_grid={'metric':['minkowski', 'euclidean', 'manhattan'],
                                              'n_neighbors':[5,7,9,11,13,15],
                                              'weights': ['uniform', 'distance']}, verbose=1)


GridSearchCV(cv=3, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=-1,
             param_grid={'metric': ['minkowski', 'euclidean', 'manhattan'],
                         'n_neighbors': [5, 7, 9, 11, 13, 15],
                         'weights': ['uniform', 'distance']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=1)

In [157]:
#Use Parameter setting
K_grid.best_params_

{'metric': 'minkowski', 'n_neighbors': 11, 'weights': 'uniform'}

In [158]:
#Estimator gives the highest score
K_grid.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [159]:
KNeighborsClassifier(n_neighbors=10)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [163]:
grid_predictionsK=K_grid.predict(test_X)

In [169]:

print(confusion_matrix(test_Y, grid_predictionsK), '\n with accuracy ', K_grid.best_score_)

[[ 10  93]
 [ 18 186]] 
 with accuracy  0.7426867187004252


In [None]:
#Use the Svm algorithm

In [171]:
from sklearn.svm import SVC
svm_model = SVC(C=1, gamma=0.01, kernel='rbf', random_state=40)

In [172]:
svm_model.fit(train_X, train_Y)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=40, shrinking=True, tol=0.001,
    verbose=False)

In [173]:
svm_prediction = svm_model.predict(test_X)

In [176]:
print(confusion_matrix(test_Y, svm_prediction))

[[  2 101]
 [  3 201]]


In [177]:
param_grid_svm ={'C': [1,10,100,1000,10000], 'gamma':[1,0.1,0.01,0.001,0.0001], 'kernel':['rbf']}

In [179]:
grid_svm = GridSearchCV(SVC(), param_grid_svm, refit=True, verbose=3)

In [180]:
grid_svm.fit(train_X, train_Y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.710, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.710, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.721, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.705, total=   0.0s
[CV] C=1, gamma=1, kernel=rbf ........................................
[CV] ............ C=1, gamma=1, kernel=rbf, score=0.705, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] .......... C=1, gamma=0.1, kernel=rbf, score=0.726, total=   0.0s
[CV] C=1, gamma=0.1, kernel=rbf ......................................
[CV] ..........

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s



[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.694, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.726, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.705, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.721, total=   0.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] ......... C=1, gamma=0.01, kernel=rbf, score=0.689, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.694, total=   0.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.694, total=   0.0s
[CV] 

[Parallel(n_jobs=1)]: Done 125 out of 125 | elapsed:    1.9s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 10, 100, 1000, 10000],
                         'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [181]:
#This parameter gives the best reults
grid_svm.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [182]:
#This estimator gives the highest score
grid_svm.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [183]:
grid_prediction_svm = grid.predict(test_X)

In [184]:
print(confusion_matrix(test_Y, grid_prediction_svm), '\n with accuracy', grid_svm.best_score_)

[[ 43  60]
 [  4 200]] 
 with accuracy 0.7198307773664727


In [186]:
from sklearn.ensemble import RandomForestClassifier
ranf_model= RandomForestClassifier(random_state=0, n_estimators=100)

In [187]:
ranf_model.fit(train_X, train_Y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [188]:
RandomForestClassifier(random_state=0)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [189]:
ranf_prediction=ranf_model.predict(test_X)

In [190]:
print(confusion_matrix(test_Y, ranf_prediction))

[[ 46  57]
 [ 12 192]]


In [191]:
param_grid_ranf ={'bootstrap': [True, False],
                 'max_depth': [5,10,15,20,25,30,35,40,45,50],
                 'max_features': ['auto', 'sqrt'],
                 'min_samples_leaf':[1,2,3],
                 'n_estimators': [20, 40, 60, 80, 100, 120, 140, 160, 180, 200]}

In [193]:
grid_ranf = GridSearchCV(estimator = ranf_model, param_grid =param_grid_ranf, cv=3, verbose=2)

In [208]:
#using the k-means algorithm as any other algorithm
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [209]:
kmeans = KMeans(n_clusters=4)

In [210]:
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [215]:
y_kmeans = kmeans.predict(X)
y_kmeans

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1, 3, 1, 3, 1, 3,
       3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 1, 3, 1, 3, 3, 3, 1, 3,
       3, 0, 3, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 0, 3, 0, 3, 3, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 1, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3,
       3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,