In [15]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Create synthetic dataset
X, y = make_classification(
    n_features=10,
    n_samples=1000,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42
)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

### Method 1: Evaluate model using train_test_split and tune parameters by Trial and Error

In [8]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion="gini", max_depth=10)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.85      0.82      0.83       130
           1       0.81      0.85      0.83       120

    accuracy                           0.83       250
   macro avg       0.83      0.83      0.83       250
weighted avg       0.83      0.83      0.83       250



### Method 2: Cross Validation score

In [9]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=5), X, y, cv=5)

array([0.78 , 0.79 , 0.74 , 0.805, 0.775])

In [10]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=10), X, y, cv=5)

array([0.79 , 0.735, 0.78 , 0.795, 0.815])

In [11]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=5), X, y, cv=5)

array([0.765, 0.785, 0.75 , 0.815, 0.78 ])

In [12]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=10), X, y, cv=5)

array([0.785, 0.78 , 0.84 , 0.78 , 0.79 ])

### Using For loop for parameters

In [13]:
criteron = ["gini", "entropy"]
max_depth = [5, 10, 15]

avg_scores = {}

for c in criteron:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion=c, max_depth=d)
        scores_list = cross_val_score(clf, X, y, cv=5)
        avg_scores[c + "_" + str(d)] = np.average(scores_list)
avg_scores

{'gini_5': np.float64(0.78),
 'gini_10': np.float64(0.776),
 'gini_15': np.float64(0.7949999999999999),
 'entropy_5': np.float64(0.7799999999999999),
 'entropy_10': np.float64(0.7869999999999999),
 'entropy_15': np.float64(0.807)}

### Grid Search CV

In [14]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15]
    },
    cv=5,
    return_train_score=False
)

clf.fit(X, y)
clf.cv_results_

{'mean_fit_time': array([0.0118422 , 0.01085348, 0.01116834, 0.00994182, 0.0140862 ,
        0.01487923]),
 'std_fit_time': array([0.00726713, 0.00142196, 0.00274711, 0.00110087, 0.00184491,
        0.00202228]),
 'mean_score_time': array([0.00235348, 0.00094008, 0.00100989, 0.0009407 , 0.00115209,
        0.00107994]),
 'std_score_time': array([2.23518744e-03, 5.59677649e-05, 2.17899316e-04, 3.02679409e-04,
        3.58545759e-04, 3.89839081e-04]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': '

In [17]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011842,0.007267,0.002353,0.002235,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.815,0.75,0.8,0.77,0.783,0.022716,5
1,0.010853,0.001422,0.00094,5.6e-05,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.8,0.72,0.81,0.785,0.815,0.786,0.034554,4
2,0.011168,0.002747,0.00101,0.000218,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.8,0.725,0.83,0.8,0.825,0.796,0.037603,2
3,0.009942,0.001101,0.000941,0.000303,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.78,0.76,0.815,0.79,0.782,0.019647,6
4,0.014086,0.001845,0.001152,0.000359,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.775,0.79,0.8,0.785,0.78,0.786,0.008602,3
5,0.014879,0.002022,0.00108,0.00039,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.755,0.81,0.84,0.8,0.86,0.813,0.036,1


In [18]:
df[["param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.783
1,gini,10,0.786
2,gini,15,0.796
3,entropy,5,0.782
4,entropy,10,0.786
5,entropy,15,0.813


In [19]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [20]:
clf.best_estimator_

In [21]:
from sklearn import svm

model_params = {
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [5, 10, 15]
        }
    },
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params': {
            'C': [1, 10, 20],
            'kernel': ['rbf', 'linear']
        }
    }
}

# Store results
scores = []

# Run GridSearchCV for each model
for key, val in model_params.items():
    clf = GridSearchCV(val['model'], val['params'], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
scores

[{'model': 'decision_tree',
  'best_score': np.float64(0.807),
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': np.float64(0.9260000000000002),
  'best_params': {'C': 1, 'kernel': 'rbf'}}]

In [22]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.807,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
