In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import plotly.express as px 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
# Location of heart.csv in the digipodium/Datasets GitHub repository.
url = (
    "https://raw.githubusercontent.com/digipodium/Datasets/"
    "main/classfication/heart.csv"
)

# Read the remote CSV into a DataFrame and preview its first rows.
df = pd.read_csv(url)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Separate the label column from the remaining feature columns.
y = df["target"]
features = df.drop(columns=["target"])

# Standardise the feature columns with a freshly fitted StandardScaler.
# NOTE(review): the scaler is fitted on every row before cross-validation, so
# the statistics used for each CV fold include that fold's held-out rows.
# Consider scaling inside a Pipeline that is passed to GridSearchCV instead.
scaler = StandardScaler()
X = scaler.fit_transform(features)

In [4]:
# Base estimator for the grid search.
# A fixed random_state makes the search repeatable: splitter="random" draws
# random split thresholds, and scikit-learn permutes the features at every
# split even with splitter="best", so an unseeded tree can score differently
# on each run.
clf = DecisionTreeClassifier(random_state=42)

# Hyper-parameter grid: 3 criteria x 2 splitters x 14 depths = 84 candidates.
params = {
    # NOTE: scikit-learn documents "entropy" and "log_loss" as the same
    # Shannon information gain criterion, so these two values are redundant.
    'criterion' : ["gini", "entropy", "log_loss"],
    'splitter' : ["best", "random"],
    # Upper bound on tree depth; the tree stops earlier once leaves are pure.
    'max_depth' : [5,10,15,20,50,100,150,200,250,300,350,400,450,500],
}

In [5]:
# Exhaustive search over `params`, scored with 3-fold cross-validation.
search_options = dict(
    cv=3,
    n_jobs=-1,                # use every available processor
    verbose=2,                # print progress for each fit
    return_train_score=True,  # keep train scores so they can be compared with test scores
)
grid = GridSearchCV(clf, params, **search_options)

In [6]:
# Run the cross-validated search over every candidate in `params`;
# the progress line printed below comes from the verbose setting on `grid`.
grid.fit(X,y)

Fitting 3 folds for each of 84 candidates, totalling 252 fits


In [7]:
# Display the estimator that the search refit using the best-ranked parameters.
grid.best_estimator_

In [8]:
# Tabulate the search output: one row per candidate, with timings,
# parameters and per-fold train/test scores as columns.
grid_tree_results = pd.DataFrame(data=grid.cv_results_)
grid_tree_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0.012106,0.006576,0.005096,0.002994,gini,5,best,"{'criterion': 'gini', 'max_depth': 5, 'splitte...",0.792079,0.712871,0.673267,0.726073,0.049395,64,0.965347,0.920792,0.940594,0.942244,0.018227
1,0.018021,0.007755,0.004962,0.002604,gini,5,random,"{'criterion': 'gini', 'max_depth': 5, 'splitte...",0.782178,0.742574,0.772277,0.765677,0.016828,17,0.871287,0.920792,0.920792,0.904290,0.023337
2,0.038878,0.035185,0.006987,0.003938,gini,10,best,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.752475,0.742574,0.683168,0.726073,0.030606,64,1.000000,1.000000,1.000000,1.000000,0.000000
3,0.017923,0.013208,0.003655,0.002303,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",0.712871,0.752475,0.752475,0.739274,0.018669,44,0.985149,0.995050,0.995050,0.991749,0.004667
4,0.013657,0.003849,0.006468,0.006660,gini,15,best,"{'criterion': 'gini', 'max_depth': 15, 'splitt...",0.732673,0.742574,0.673267,0.716172,0.030606,78,1.000000,1.000000,1.000000,1.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,0.010229,0.004927,0.005171,0.003663,log_loss,400,random,"{'criterion': 'log_loss', 'max_depth': 400, 's...",0.772277,0.782178,0.742574,0.765677,0.016828,17,1.000000,1.000000,1.000000,1.000000,0.000000
80,0.009940,0.003764,0.006802,0.001741,log_loss,450,best,"{'criterion': 'log_loss', 'max_depth': 450, 's...",0.722772,0.752475,0.663366,0.712871,0.037046,80,1.000000,1.000000,1.000000,1.000000,0.000000
81,0.009620,0.002256,0.002667,0.003772,log_loss,450,random,"{'criterion': 'log_loss', 'max_depth': 450, 's...",0.762376,0.762376,0.752475,0.759076,0.004667,21,1.000000,1.000000,1.000000,1.000000,0.000000
82,0.018584,0.007592,0.005539,0.004165,log_loss,500,best,"{'criterion': 'log_loss', 'max_depth': 500, 's...",0.722772,0.792079,0.673267,0.729373,0.048729,59,1.000000,1.000000,1.000000,1.000000,0.000000


In [9]:
# Order the candidates from best to worst test rank.
# The frame is rebuilt from grid.cv_results_ rather than mutated in place so
# that this cell is idempotent: repeating an in-place reset_index would add a
# stray `level_0` column on the second run and raise ValueError on the third.
grid_tree_results = (
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='rank_test_score')
    .reset_index()  # keeps the original candidate number in an `index` column
)
grid_tree_results

Unnamed: 0,index,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,5,0.024751,0.029853,0.003663,0.003185,gini,15,random,"{'criterion': 'gini', 'max_depth': 15, 'splitt...",0.801980,0.792079,0.762376,0.785479,0.016828,1,1.0,1.0,1.0,1.0,0.0
1,83,0.008499,0.002768,0.006190,0.004497,log_loss,500,random,"{'criterion': 'log_loss', 'max_depth': 500, 's...",0.762376,0.851485,0.722772,0.778878,0.053827,2,1.0,1.0,1.0,1.0,0.0
2,43,0.015370,0.000824,0.013698,0.015979,entropy,200,random,"{'criterion': 'entropy', 'max_depth': 200, 'sp...",0.821782,0.772277,0.742574,0.778878,0.032672,2,1.0,1.0,1.0,1.0,0.0
3,45,0.007736,0.003317,0.009315,0.007211,entropy,250,random,"{'criterion': 'entropy', 'max_depth': 250, 'sp...",0.841584,0.752475,0.742574,0.778878,0.044524,2,1.0,1.0,1.0,1.0,0.0
4,73,0.017119,0.007638,0.014258,0.002916,log_loss,250,random,"{'criterion': 'log_loss', 'max_depth': 250, 's...",0.782178,0.801980,0.752475,0.778878,0.020345,2,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,15,0.018244,0.020825,0.006983,0.004938,gini,200,random,"{'criterion': 'gini', 'max_depth': 200, 'split...",0.693069,0.702970,0.742574,0.712871,0.021389,80,1.0,1.0,1.0,1.0,0.0
80,42,0.021410,0.018413,0.007719,0.003255,entropy,200,best,"{'criterion': 'entropy', 'max_depth': 200, 'sp...",0.712871,0.742574,0.683168,0.712871,0.024252,80,1.0,1.0,1.0,1.0,0.0
81,12,0.042710,0.020741,0.004335,0.001519,gini,150,best,"{'criterion': 'gini', 'max_depth': 150, 'split...",0.742574,0.722772,0.673267,0.712871,0.029148,80,1.0,1.0,1.0,1.0,0.0
82,36,0.020678,0.003012,0.001526,0.002158,entropy,50,best,"{'criterion': 'entropy', 'max_depth': 50, 'spl...",0.702970,0.772277,0.663366,0.712871,0.045011,80,1.0,1.0,1.0,1.0,0.0


In [10]:
# List the available result columns (used to pick the fields plotted next).
grid_tree_results.columns

Index(['index', 'mean_fit_time', 'std_fit_time', 'mean_score_time',
       'std_score_time', 'param_criterion', 'param_max_depth',
       'param_splitter', 'params', 'split0_test_score', 'split1_test_score',
       'split2_test_score', 'mean_test_score', 'std_test_score',
       'rank_test_score', 'split0_train_score', 'split1_train_score',
       'split2_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [11]:
# Plot mean train and test CV scores for every candidate. No x is given, so
# the frame's row position (candidates ordered by test rank) is the x-axis.
# The hover fields identify which parameter combination each point represents.
px.line(grid_tree_results,
        y=['mean_test_score','mean_train_score'],
        title='Decision Tree Grid Search',  # fixed typo: was 'Decicion'
        hover_data=['param_criterion', 'param_splitter', 'param_max_depth'])

Implementing KNeighborsClassifier

In [12]:
# Base k-nearest-neighbours estimator for the second search.
clf2 = KNeighborsClassifier()

# Grid: 5 neighbour counts x 2 weightings x 4 algorithms x 4 metrics = 160 candidates.
# NOTE(review): with the default p=2, 'minkowski' is the same distance as
# 'euclidean', so those two metric values should produce identical scores.
params_knn = dict(
    n_neighbors=[3, 5, 7, 9, 10],
    weights=["uniform", "distance"],
    algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    metric=['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
)



In [13]:
# Exhaustive 3-fold cross-validated search over `params_knn`.
# Train scores are not requested here, so only test scores are recorded.
grid2 = GridSearchCV(clf2, params_knn, cv=3, n_jobs=-1, verbose=3)

In [14]:
# Run the KNN search on the same scaled features and labels used for the tree search.
grid2.fit(X,y)

Fitting 3 folds for each of 160 candidates, totalling 480 fits


In [15]:
# Tabulate the KNN search results, ordered from best to worst test rank.
# reset_index() keeps each candidate's original position in an `index` column.
grid_knn_results = (
    pd.DataFrame(grid2.cv_results_)
    .sort_values(by='rank_test_score')
    .reset_index()
)
grid_knn_results

Unnamed: 0,index,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,138,0.005338,0.003775,0.061622,0.007313,brute,manhattan,10,uniform,"{'algorithm': 'brute', 'metric': 'manhattan', ...",0.841584,0.861386,0.782178,0.828383,0.033657,1
1,18,0.010845,0.003217,0.078432,0.039049,auto,manhattan,10,uniform,"{'algorithm': 'auto', 'metric': 'manhattan', '...",0.841584,0.861386,0.782178,0.828383,0.033657,1
2,98,0.006847,0.001662,0.046355,0.009433,kd_tree,manhattan,10,uniform,"{'algorithm': 'kd_tree', 'metric': 'manhattan'...",0.841584,0.861386,0.782178,0.828383,0.033657,1
3,58,0.006261,0.004438,0.113427,0.006880,ball_tree,manhattan,10,uniform,"{'algorithm': 'ball_tree', 'metric': 'manhatta...",0.841584,0.861386,0.782178,0.828383,0.033657,1
4,114,0.008377,0.000538,0.069385,0.027500,kd_tree,minkowski,7,uniform,"{'algorithm': 'kd_tree', 'metric': 'minkowski'...",0.851485,0.831683,0.762376,0.815182,0.038204,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,20,0.032295,0.039230,0.121705,0.011293,auto,chebyshev,3,uniform,"{'algorithm': 'auto', 'metric': 'chebyshev', '...",0.722772,0.752475,0.673267,0.716172,0.032672,155
156,100,0.005331,0.003770,0.051164,0.003462,kd_tree,chebyshev,3,uniform,"{'algorithm': 'kd_tree', 'metric': 'chebyshev'...",0.722772,0.752475,0.673267,0.716172,0.032672,155
157,140,0.000000,0.000000,0.061627,0.003316,brute,chebyshev,3,uniform,"{'algorithm': 'brute', 'metric': 'chebyshev', ...",0.732673,0.732673,0.673267,0.712871,0.028004,158
158,61,0.010840,0.004215,0.024673,0.009772,ball_tree,chebyshev,3,distance,"{'algorithm': 'ball_tree', 'metric': 'chebyshe...",0.722772,0.742574,0.673267,0.712871,0.029148,158


In [16]:
# Print the result column names as a plain list so none are truncated.
print(list(grid_knn_results.columns))

['index', 'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'param_algorithm', 'param_metric', 'param_n_neighbors', 'param_weights', 'params', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'mean_test_score', 'std_test_score', 'rank_test_score']


In [17]:
# Parameter columns shown in the tooltip so each point can be traced to its candidate.
knn_hover_columns = ['param_n_neighbors', 'param_weights', 'param_algorithm', 'param_metric']

# Mean CV test score per candidate; the x-axis is the row position (rank order).
px.line(
    grid_knn_results,
    y='mean_test_score',
    title='KNN Grid Search',
    hover_data=knn_hover_columns,
)