In [16]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [17]:
# Synthetic binary classification problem: 1,000 rows and 10 features
# (8 informative + 2 redundant), with a roughly 90/10 class imbalance
# requested through `weights`.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    weights=[0.9, 0.1],
    random_state=42,
)

# Hold out 25% of the rows. `stratify=y` keeps the minority-class share the
# same in the train and test partitions; with a plain random split on a
# 90/10 target, the handful of positive rows can land unevenly.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

In [18]:
from collections import Counter

# Tally how many samples fall into each label so the class imbalance is
# visible before any splitting or modelling.
class_counts = Counter(y)
class_counts

Counter({np.int64(0): 897, np.int64(1): 103})

In [19]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Show the class mix of each fold's training portion under plain K-fold.
# A fold-local index name is used on purpose: rebinding
# X_train / X_test / y_train / y_test inside this loop would silently replace
# the hold-out split created earlier with the last CV fold.
for fold_train_idx, _ in kf.split(X, y):
    print(f'Target class: {Counter(y[fold_train_idx])}')


Target class: Counter({np.int64(0): 720, np.int64(1): 80})
Target class: Counter({np.int64(0): 718, np.int64(1): 82})
Target class: Counter({np.int64(0): 714, np.int64(1): 86})
Target class: Counter({np.int64(0): 716, np.int64(1): 84})
Target class: Counter({np.int64(0): 720, np.int64(1): 80})


In [20]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Show the class mix of each fold's training portion when the folds are
# stratified on the labels.
# A fold-local index name is used on purpose: rebinding
# X_train / X_test / y_train / y_test inside this loop would silently replace
# the hold-out split created earlier with the last CV fold.
for fold_train_idx, _ in skf.split(X, y):
    print(f'Target class: {Counter(y[fold_train_idx])}')


Target class: Counter({np.int64(0): 717, np.int64(1): 83})
Target class: Counter({np.int64(0): 717, np.int64(1): 83})
Target class: Counter({np.int64(0): 718, np.int64(1): 82})
Target class: Counter({np.int64(0): 718, np.int64(1): 82})
Target class: Counter({np.int64(0): 718, np.int64(1): 82})


### Evaluate Logistic Regression

In [21]:
from sklearn.model_selection import cross_val_score

# Score a default logistic regression on the `kf` folds; with no `scoring`
# argument, cross_val_score reports the classifier's default score (accuracy).
# NOTE(review): about 90% of the labels are class 0, so an accuracy close to
# that share says little about minority-class performance.
logistic_model = LogisticRegression()
scores_logistic = cross_val_score(logistic_model, X, y, cv=kf)
np.mean(scores_logistic)

np.float64(0.897)

### Evaluate Decision Tree

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Mean accuracy of a decision tree over the `kf` folds.
# The tree is seeded so that re-running the notebook gives the same score;
# an unseeded DecisionTreeClassifier permutes features randomly at each split
# and can return a different number on every run.
scores_dtree = cross_val_score(DecisionTreeClassifier(random_state=42), X, y, cv=kf)
np.average(scores_dtree)

np.float64(0.8699999999999999)

### Evaluate XGBoost

In [23]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults, scored on the same `kf`
# folds as the other models so the mean accuracies are comparable.
xgb_model = XGBClassifier()
scores_xgb = cross_val_score(xgb_model, X, y, cv=kf)
np.mean(scores_xgb)

np.float64(0.9340000000000002)

### Evaluate Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier

# Mean accuracy of a 20-tree random forest over the `kf` folds.
# The forest is seeded so the bootstrap samples and feature subsets (and
# therefore the reported score) are the same on every run.
scores_rforest = cross_val_score(
    RandomForestClassifier(n_estimators=20, random_state=42), X, y, cv=kf
)
np.average(scores_rforest)

np.float64(0.914)

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Same experiment with 30 trees. Note that this rebinds `scores_rforest`,
# so the 20-tree scores are no longer available after this cell runs.
# The forest is seeded so the comparison between tree counts reflects the
# hyperparameter rather than run-to-run randomness.
scores_rforest = cross_val_score(
    RandomForestClassifier(n_estimators=30, random_state=42), X, y, cv=kf
)
np.average(scores_rforest)

np.float64(0.917)

In [26]:
from sklearn.model_selection import cross_validate

# Report several metrics per fold at once: accuracy alongside ROC AUC, which
# is less flattered by the class imbalance than accuracy is.
# The tree is seeded so the per-fold scores are reproducible (fit/score
# timings in the returned dict will still vary between runs).
cross_validate(
    DecisionTreeClassifier(random_state=42),
    X,
    y,
    cv=kf,
    scoring=['accuracy', 'roc_auc'],
)

{'fit_time': array([0.01200175, 0.01100183, 0.01200271, 0.01300168, 0.01000023]),
 'score_time': array([0.00400257, 0.00300193, 0.00400114, 0.0030005 , 0.00300241]),
 'test_accuracy': array([0.88 , 0.885, 0.865, 0.855, 0.88 ]),
 'test_roc_auc': array([0.64848931, 0.66254323, 0.63275474, 0.61369584, 0.68631786])}