# Model Selection

In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [2]:
iris = datasets.load_iris()

In [3]:
df = pd.DataFrame(iris['data'])

In [4]:
df.columns = iris['feature_names']
df['target'] = iris['target']

In [5]:
# random categories
df['cat'] = np.random.choice([1, 2, 3, 4, 5], len(df))

In [6]:
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target,cat
0,5.1,3.5,1.4,0.2,0,4
1,4.9,3.0,1.4,0.2,0,4
2,4.7,3.2,1.3,0.2,0,1
3,4.6,3.1,1.5,0.2,0,3
4,5.0,3.6,1.4,0.2,0,3
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,1
146,6.3,2.5,5.0,1.9,2,3
147,6.5,3.0,5.2,2.0,2,2
148,6.2,3.4,5.4,2.3,2,3


### KFold

In [7]:
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
from sklearn.model_selection import KFold

In [9]:
kf = KFold(n_splits=5)

In [10]:
for i, data_idx in enumerate(kf.split(df)):
    print(f'Fold {i + 1}')
    print(f"Training data: {round(df.loc[data_idx[0], 'target'].value_counts() / (len(df)*0.8) * 100, 2)}")
    print(f"Testing data: {round(df.loc[data_idx[1], 'target'].value_counts() / (len(df)*0.2) * 100, 2)}")
    print('==========================')

Fold 1
Training data: target
1    41.67
2    41.67
0    16.67
Name: count, dtype: float64
Testing data: target
0    100.0
Name: count, dtype: float64
Fold 2
Training data: target
2    41.67
1    33.33
0    25.00
Name: count, dtype: float64
Testing data: target
0    66.67
1    33.33
Name: count, dtype: float64
Fold 3
Training data: target
0    41.67
2    41.67
1    16.67
Name: count, dtype: float64
Testing data: target
1    100.0
Name: count, dtype: float64
Fold 4
Training data: target
0    41.67
1    33.33
2    25.00
Name: count, dtype: float64
Testing data: target
2    66.67
1    33.33
Name: count, dtype: float64
Fold 5
Training data: target
0    41.67
1    41.67
2    16.67
Name: count, dtype: float64
Testing data: target
2    100.0
Name: count, dtype: float64


### Group KFold

In [11]:
from sklearn.model_selection import GroupKFold

In [12]:
gkf = GroupKFold(n_splits=5)

In [13]:
for i, data_idx in enumerate(gkf.split(df, groups=df['cat'])):
    print(f'Fold {i + 1}')
    print(f"Training data: {round(df.loc[data_idx[0], 'cat'].value_counts() / len(df) * 100, 2)}")
    print(f"Testing data: {round(df.loc[data_idx[1], 'cat'].value_counts() / (len(df)/5) * 100, 2)}")
    print('==========================')

Fold 1
Training data: cat
4    21.33
3    20.67
5    18.67
2    17.33
Name: count, dtype: float64
Testing data: cat
1    110.0
Name: count, dtype: float64
Fold 2
Training data: cat
1    22.00
3    20.67
5    18.67
2    17.33
Name: count, dtype: float64
Testing data: cat
4    106.67
Name: count, dtype: float64
Fold 3
Training data: cat
1    22.00
4    21.33
5    18.67
2    17.33
Name: count, dtype: float64
Testing data: cat
3    103.33
Name: count, dtype: float64
Fold 4
Training data: cat
1    22.00
4    21.33
3    20.67
2    17.33
Name: count, dtype: float64
Testing data: cat
5    93.33
Name: count, dtype: float64
Fold 5
Training data: cat
1    22.00
4    21.33
3    20.67
5    18.67
Name: count, dtype: float64
Testing data: cat
2    86.67
Name: count, dtype: float64


### Stratified KFold

In [14]:
from sklearn.model_selection import StratifiedKFold

In [15]:
skf = StratifiedKFold(n_splits=5)

In [16]:
for i, data_idx in enumerate(skf.split(df, y=df['target'])):
    print(f'Fold {i + 1}')
    print(f"Training data: {round(df.loc[data_idx[0], 'target'].value_counts() / (len(df)*0.8) * 100, 2)}")
    print(f"Testing data: {round(df.loc[data_idx[1], 'target'].value_counts() / (len(df)*0.2) * 100, 2)}")
    print('==========================')

Fold 1
Training data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Testing data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Fold 2
Training data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Testing data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Fold 3
Training data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Testing data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Fold 4
Training data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Testing data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Fold 5
Training data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64
Testing data: target
0    33.33
1    33.33
2    33.33
Name: count, dtype: float64


### Stratified Group KFold

In [17]:
from sklearn.model_selection import StratifiedGroupKFold

In [18]:
sgkf = StratifiedGroupKFold(n_splits=5)

In [19]:
for i, data_idx in enumerate(sgkf.split(df, y=df['target'], groups=df['cat'])):
    print(f'Fold {i + 1}')
    print(f"Training data: {round(df.loc[data_idx[0], 'target'].value_counts() / (len(df)*0.8) * 100, 2)}")
    print(f"Training data: {round(df.loc[data_idx[0], 'cat'].value_counts() / (len(df)*0.8) * 100, 2)}")
    print(f"Testing data: {round(df.loc[data_idx[1], 'target'].value_counts() / (len(df)*0.2) * 100, 2)}")
    print('==========================')

Fold 1
Training data: target
0    35.83
2    34.17
1    29.17
Name: count, dtype: float64
Training data: cat
1    27.50
4    26.67
5    23.33
2    21.67
Name: count, dtype: float64
Testing data: target
1    50.00
2    30.00
0    23.33
Name: count, dtype: float64
Fold 2
Training data: target
2    36.67
1    35.83
0    30.83
Name: count, dtype: float64
Training data: cat
1    27.50
4    26.67
3    25.83
5    23.33
Name: count, dtype: float64
Testing data: target
0    43.33
1    23.33
2    20.00
Name: count, dtype: float64
Fold 3
Training data: target
1    36.67
0    33.33
2    31.67
Name: count, dtype: float64
Training data: cat
1    27.50
4    26.67
3    25.83
2    21.67
Name: count, dtype: float64
Testing data: target
2    40.00
0    33.33
1    20.00
Name: count, dtype: float64
Fold 4
Training data: target
0    34.17
2    32.50
1    30.83
Name: count, dtype: float64
Training data: cat
4    26.67
3    25.83
5    23.33
2    21.67
Name: count, dtype: float64
Testing data: target
1    43.3

### Train Test Split

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(df[df.columns.difference(['target'])], df['target'], stratify=df['target'], test_size=0.2)

In [22]:
y_train.value_counts()

target
0    40
1    40
2    40
Name: count, dtype: int64

In [23]:
y_test.value_counts()

target
2    10
0    10
1    10
Name: count, dtype: int64