In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
X, y = make_classification(
    n_samples= 1000,
    n_features= 10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42
)

#Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Method 1 : Evaluate the model using train-test split and tune parameters by trial and error

In [3]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier


model = DecisionTreeClassifier(criterion='entropy', max_depth=10)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
report = classification_report(y_pred, y_test)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.85      0.80       115
           1       0.86      0.76      0.81       135

    accuracy                           0.80       250
   macro avg       0.81      0.81      0.80       250
weighted avg       0.81      0.80      0.80       250



### Method 2 : Evaluate the model using cross val score

In [4]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='gini', max_depth=5), X, y, cv=5)

array([0.775, 0.8  , 0.75 , 0.805, 0.775])

In [5]:
cross_val_score(DecisionTreeClassifier(criterion='gini', max_depth=10), X, y, cv=5)

array([0.785, 0.745, 0.805, 0.805, 0.805])

In [6]:
cross_val_score(DecisionTreeClassifier(criterion='entropy', max_depth=10), X, y, cv=5)

array([0.775, 0.785, 0.82 , 0.775, 0.79 ])

In [8]:
criterion = ['gini', 'entropy']
max_depth = [5,10,15]

all_scores = {}

for c in criterion:
    for d in max_depth:
        mdl = DecisionTreeClassifier(criterion=c, max_depth=d)
        scores = cross_val_score(mdl, X, y, cv=5)
        all_scores[f'{c} - {d}'] = np.average(scores)
all_scores

{'gini - 5': np.float64(0.782),
 'gini - 10': np.float64(0.79),
 'gini - 15': np.float64(0.799),
 'entropy - 5': np.float64(0.781),
 'entropy - 10': np.float64(0.7870000000000001),
 'entropy - 15': np.float64(0.817)}

In [9]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(DecisionTreeClassifier(), { 'criterion': criterion, 'max_depth': max_depth }, cv=5, return_train_score=False)

clf.fit(X, y)
clf.cv_results_

{'mean_fit_time': array([0.00688148, 0.01006885, 0.01070948, 0.00951481, 0.01350942,
        0.01435361]),
 'std_fit_time': array([0.00014675, 0.000992  , 0.0014731 , 0.00048528, 0.00097446,
        0.0004107 ]),
 'mean_score_time': array([0.00138183, 0.00094843, 0.0010006 , 0.00060072, 0.0010222 ,
        0.0008009 ]),
 'std_score_time': array([5.08156220e-04, 1.04786634e-04, 4.62310777e-07, 4.90485525e-04,
        4.47832604e-05, 4.00448215e-04]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': '

In [11]:
import pandas as pd

df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006881,0.000147,0.001382,0.0005081562,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.775,0.79,0.755,0.795,0.77,0.777,0.014353,6
1,0.010069,0.000992,0.000948,0.0001047866,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.8,0.73,0.81,0.79,0.81,0.788,0.029933,2
2,0.010709,0.001473,0.001001,4.623108e-07,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.775,0.74,0.79,0.815,0.82,0.788,0.029086,2
3,0.009515,0.000485,0.000601,0.0004904855,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.775,0.755,0.815,0.78,0.778,0.020396,5
4,0.013509,0.000974,0.001022,4.478326e-05,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.77,0.805,0.82,0.755,0.78,0.786,0.023537,4
5,0.014354,0.000411,0.000801,0.0004004482,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.75,0.805,0.825,0.81,0.865,0.811,0.037068,1


In [12]:
df[['param_criterion','mean_test_score']]

Unnamed: 0,param_criterion,mean_test_score
0,gini,0.777
1,gini,0.788
2,gini,0.788
3,entropy,0.778
4,entropy,0.786
5,entropy,0.811


In [13]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [15]:
clf.best_estimator_

In [19]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

model_params = {
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [5,10,15]
        }
    },
    'support_vector_classifier': {
        'model': SVC(gamma='auto'),
        'params': {
            'C': [1,10,20],
            'kernel': ['rbf', 'linear']
        }
    },
    'xgb': {
        'model': XGBClassifier(),
        'params': {
            'n_estimators': [50, 100, 150],
            'max_depth': [3, 5, 7],
        }
    }
}

In [None]:

scores = []

for key, val in model_params.items():
    clf = GridSearchCV(val['model'], val['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

df = pd.DataFrame(scores)
df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.801,"{'criterion': 'entropy', 'max_depth': 15}"
1,support_vector_classifier,0.926,"{'C': 1, 'kernel': 'rbf'}"
2,xgb,0.892,"{'max_depth': 5, 'n_estimators': 50}"


In [24]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(DecisionTreeClassifier(), { 'criterion': criterion, 'max_depth': max_depth }, cv=5, return_train_score=False, n_iter=3)

clf.fit(X, y)
df = pd.DataFrame(clf.cv_results_)

In [25]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008824,0.000418,0.000997,4e-06,10,gini,"{'max_depth': 10, 'criterion': 'gini'}",0.78,0.77,0.785,0.795,0.825,0.791,0.018815,1
1,0.008479,0.000494,0.000602,0.000492,5,entropy,"{'max_depth': 5, 'criterion': 'entropy'}",0.765,0.785,0.755,0.81,0.775,0.778,0.018868,2
2,0.006277,0.000757,0.000598,0.000488,5,gini,"{'max_depth': 5, 'criterion': 'gini'}",0.78,0.8,0.74,0.79,0.775,0.777,0.020396,3


In [26]:
clf.best_params_

{'max_depth': 10, 'criterion': 'gini'}