In [8]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Load scikit-learn's breast-cancer dataset.
# NOTE: despite the name `df`, later cells read it through the `.data` and
# `.target` attributes rather than treating it as a pandas DataFrame.
df = load_breast_cancer()

In [None]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.data,
    df.target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# 3-fold KFold splitter (default settings otherwise); the next cell uses it to
# print which positions fall into the train and test part of each fold.
kf = KFold(n_splits=3)

In [None]:
# Demonstrate the splitter on nine toy samples: each iteration prints the
# train positions, a separator, and the held-out test positions.
toy_samples = list(range(1, 10))
for train_index, test_index in kf.split(toy_samples):
  print(train_index, '|+|', test_index)

[3 4 5 6 7 8] |+| [0 1 2]
[0 1 2 6 7 8] |+| [3 4 5]
[0 1 2 3 4 5] |+| [6 7 8]


In [None]:
# 3-fold stratified splitter.
# NOTE(review): no visible cell reads this global -- the `model_performance`
# functions below build their own StratifiedKFold -- so it may be unused.
skf = StratifiedKFold(n_splits=3)

In [None]:
# Example instantiation with an explicit tree count (the same value the
# 'Random_Forest' entry of the model comparison below uses).
RandomForestClassifier(n_estimators=100)

***StratifiedKFold - test score only***

In [None]:
def model_performance(model,x,y,n_splits):
  """Return the mean test-fold score of `model` under stratified K-fold CV.

  NOTE: a later cell in this notebook defines another `model_performance`
  that returns train and test means; once that cell has run it shadows
  this definition.

  Parameters
  ----------
  model : object with `fit(X, y)` and `score(X, y)` methods.
      It is refit in place on every fold, so after the call it is left
      fitted on the last training fold.
  x : feature container that supports selecting rows by an integer index
      array (e.g. NumPy array) or exposes pandas-style `.iloc`.
  y : target container aligned with `x`, same indexing requirements.
  n_splits : int
      Number of folds passed to `StratifiedKFold`.

  Returns
  -------
  numpy.float64
      `np.mean` of `model.score(...)` over all test folds.
  """
  def take_rows(data, rows):
    # pandas objects need positional `.iloc`; plain `data[rows]` would be a
    # label lookup there. Everything else keeps the original `data[rows]`.
    return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

  skf = StratifiedKFold(n_splits=n_splits)
  model_score = []

  for train_index, test_index in skf.split(x,y):
    X_train, X_test = take_rows(x, train_index), take_rows(x, test_index)
    y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

    model.fit(X_train,y_train)
    model_score.append(model.score(X_test,y_test))

  # Aggregate once the loop is exhausted, so the function can never fall
  # through and return None implicitly.
  return np.mean(model_score)

***StratifiedKFold - train and test scores***

In [3]:
def model_performance(model,x,y,n_splits):
  """Return mean train and test scores of `model` under stratified K-fold CV.

  NOTE: this redefines the `model_performance` from the earlier cell (which
  returned only the mean test score); whichever cell ran last wins.

  Parameters
  ----------
  model : object with `fit(X, y)` and `score(X, y)` methods.
      It is refit in place on every fold, so after the call it is left
      fitted on the last training fold.
  x : feature container that supports selecting rows by an integer index
      array (e.g. NumPy array) or exposes pandas-style `.iloc`.
  y : target container aligned with `x`, same indexing requirements.
  n_splits : int
      Number of folds passed to `StratifiedKFold`.

  Returns
  -------
  list
      `[mean train score, mean test score]`, each the `np.mean` of
      `model.score(...)` over the folds.
  """
  def take_rows(data, rows):
    # pandas objects need positional `.iloc`; plain `data[rows]` would be a
    # label lookup there. Everything else keeps the original `data[rows]`.
    return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

  skf = StratifiedKFold(n_splits=n_splits)
  train_model_score = []
  test_model_score = []

  for train_index, test_index in skf.split(x,y):
    X_train, X_test = take_rows(x, train_index), take_rows(x, test_index)
    y_train, y_test = take_rows(y, train_index), take_rows(y, test_index)

    model.fit(X_train,y_train)
    # Score on the data just fitted as well, to expose over-fitting as a
    # gap between the train and test means.
    train_model_score.append(model.score(X_train,y_train))
    test_model_score.append(model.score(X_test,y_test))

  # Aggregate once the loop is exhausted, so the function can never fall
  # through and return None implicitly.
  return [np.mean(train_model_score),np.mean(test_model_score)]


In [None]:
# 10-fold stratified CV of a random forest on the breast-cancer data.
# The forest is seeded so the score is reproducible between runs.
# NOTE(review): the return shape depends on which `model_performance`
# definition was executed last (single mean vs. [train mean, test mean]).
model_performance(RandomForestClassifier(random_state=42),df.data,df.target,10)

0.9666666666666668

In [None]:
# 10-fold stratified CV of a default SVC on the breast-cancer data.
# NOTE(review): the return shape depends on which `model_performance`
# definition was executed last (single mean vs. [train mean, test mean]).
model_performance(SVC(),df.data,df.target,10)

0.9138784461152882

### Compare models by applying StratifiedKFold

In [4]:
# Synthetic binary-classification problem: 100 samples, 5 features, none of
# them redundant; seeded so the data is the same on every run.
# Author's hint: set n_informative (e.g. n_informative=9) in case n_classes
# is more than 2.
X, y = make_classification(
    n_samples=100,
    n_features=5,
    n_redundant=0,
    n_classes=2,
    random_state=1,
)

In [5]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier

In [31]:
# Candidate classifiers for the comparison, keyed by display name (the keys
# become the row labels of the results table).
# Every estimator that accepts a seed for its internal randomness gets the
# same `random_state`, so the comparison is reproducible between runs.
MODEL_SEED = 42
model_classifiers = {
  'K-Nearest_Neighbors': KNeighborsClassifier(n_neighbors=3),
  'Linear_SVM'         : SVC(kernel="linear", C=0.025),
  'Polynomial_SVM'     : SVC(kernel="poly", degree=3, C=0.025),
  'RBF_SVM'            : SVC(kernel="rbf", C=1, gamma=2),
  'Gaussian_Process'   : GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                                   random_state=MODEL_SEED),
  'Gradient_Boosting'  : GradientBoostingClassifier(n_estimators=100,
                                                    learning_rate=1.0,
                                                    random_state=MODEL_SEED),
  'Decision_Tree'      : DecisionTreeClassifier(max_depth=5,
                                                random_state=MODEL_SEED),
  'Extra_Trees'        : ExtraTreesClassifier(n_estimators=10,
                                              min_samples_split=2,
                                              random_state=MODEL_SEED),
  'Random_Forest'      : RandomForestClassifier(max_depth=5, n_estimators=100,
                                                random_state=MODEL_SEED),
  'Neural_Net'         : MLPClassifier(alpha=1, max_iter=1000,
                                       random_state=MODEL_SEED),
  'AdaBoost'           : AdaBoostClassifier(n_estimators=100,
                                            random_state=MODEL_SEED),
  'Naive_Bayes'        : GaussianNB(),
  'QDA'                : QuadraticDiscriminantAnalysis(),
  'SGD'                : SGDClassifier(loss="hinge", penalty="l2",
                                       random_state=MODEL_SEED),
}

In [33]:
# Score every candidate with the train/test variant of `model_performance`
# (5 stratified folds), then show one row per model.
classifiers = {
  name: model_performance(estimator, X, y, 5)
  for name, estimator in model_classifiers.items()
}
pd.DataFrame(classifiers, index=['Train Score', 'Test Score']).T
  

Unnamed: 0,Train Score,Test Score
K-Nearest_Neighbors,0.9575,0.96
Linear_SVM,0.95,0.95
Polynomial_SVM,0.53,0.52
RBF_SVM,1.0,0.78
Gaussian_Process,0.9525,0.95
Gradient_Boosting,1.0,0.93
Decision_Tree,1.0,0.91
Extra_Trees,1.0,0.91
Random_Forest,1.0,0.93
Neural_Net,0.9575,0.95


***Check cross_val_score***

In [11]:
# Per-fold accuracy from scikit-learn's built-in helper (3 folds); the forest
# is seeded so the three scores are reproducible between runs.
cross_val_score(RandomForestClassifier(random_state=42),X,y,cv=3,scoring='accuracy')

array([0.97058824, 0.93939394, 0.87878788])

In [27]:
# Mean accuracy over 10 folds; AdaBoost is seeded for reproducibility.
cross_val_score(AdaBoostClassifier(random_state=42),X,y,cv=10,scoring='accuracy').mean()

0.9299999999999999

In [30]:
# Same check with 20 folds, to see how the fold count affects the mean
# accuracy; AdaBoost is seeded for reproducibility.
cross_val_score(AdaBoostClassifier(random_state=42),X,y,cv=20,scoring='accuracy').mean()

0.93