## Holdout method

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd

In [None]:
# Toy dataset: 50 samples with 2 features each, plus one integer label per sample.
X = np.arange(100).reshape(50, 2)
y = range(50)

In [None]:
# Hold out 33% of the samples for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Manual holdout: draw 30 distinct row indices out of 50 for the training subset.
# A seeded Generator makes the draw reproducible across runs, matching the
# random_state=42 used by the train_test_split calls in this notebook.
train_index = np.random.default_rng(42).choice(50, size=30, replace=False)

In [None]:
# Rows of X selected by the sampled training indices.
X[train_index, :]

In [None]:
# Display the sampled row indices themselves.
train_index

## Train/validation/test split

In [None]:
from sklearn import datasets
# Load scikit-learn's bundled digits dataset; later cells read its "data" and "target" entries.
digits = datasets.load_digits()

In [None]:
# Feature matrix of the digits dataset.
digits.data

In [None]:
# Number of samples per digit class.
Counter(digits.target)

In [None]:
# Fraction of the dataset that falls into each digit class.
pd.Series(digits.target).value_counts().div(digits.target.size)

In [None]:
# Plain two-way holdout on the digits data: 67% train, 33% test.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Stage 1: set aside 20% of the digits as the final test set.
X_dev, X_test, y_dev, y_test = train_test_split(
    digits.data,
    digits.target,
    test_size=0.2,
    random_state=42,
)

# Stage 2: carve the validation set out of the remaining development data.
# 0.25 of the remaining 80% is 20% of the whole, i.e. roughly 60/20/20.
X_train, X_val, y_train, y_val = train_test_split(
    X_dev,
    y_dev,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Number of training samples.
len(y_train)

In [None]:
# Number of validation samples.
len(y_val)

In [None]:
# Number of test samples.
len(y_test)

## K-fold cross validation

In [None]:
from sklearn import datasets
from sklearn.model_selection import KFold
# Load scikit-learn's bundled iris dataset for the cross-validation demos below.
iris = datasets.load_iris()

In [None]:
# Features and labels of the iris dataset.
X, y = iris.data, iris.target

In [None]:
# 10-fold cross validation: each iteration yields the row indices of one
# train/test partition, which are then used to slice X and y.
kf = KFold(n_splits=10)
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("TRAIN:", train_index, "TEST:", test_index)

## Leave-one-out cross validation


In [None]:
from sklearn.model_selection import LeaveOneOut

In [None]:
# Features and labels of the iris dataset.
X, y = iris.data, iris.target

In [None]:
# Leave-one-out expressed through KFold: with as many splits as samples,
# every test fold holds exactly one sample.
kf = KFold(n_splits=y.size)
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# The same procedure using the dedicated LeaveOneOut splitter.
loo = LeaveOneOut()
for train_index, test_index in loo.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("TRAIN:", train_index, "TEST:", test_index)

## Stratified sampling 

In [None]:
# Load the Titanic train/test CSVs (paths are relative to the working directory).
train, test = pd.read_csv('titanic/train.csv'), pd.read_csv('titanic/test.csv')

In [None]:
# Separate the label: pop() removes "Survived" from `train` in place and returns it.
y = train.pop("Survived")
# Drop the PassengerId column from the feature frame.
train = train.drop(labels=['PassengerId'], axis=1)

In [None]:
# Absolute number of samples per label value in y.
Counter(y)

In [None]:
# Share of each label value in the full label column.
pd.Series(y).value_counts().div(y.size)

In [None]:
# Plain (non-stratified) 67/33 split of the Titanic data.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Label proportions in the training part of the split.
pd.Series(y_train).value_counts().div(y_train.size)

In [None]:
# Label proportions in the test part of the split.
pd.Series(y_test).value_counts().div(y_test.size)

In [None]:
# Same 67/33 split, now stratified on the label y.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [None]:
# Label proportions in the training part of the split.
pd.Series(y_train).value_counts().div(y_train.size)

In [None]:
# Label proportions in the test part of the split.
pd.Series(y_test).value_counts().div(y_test.size)

In [None]:
# Same 67/33 split, but stratified on the "Sex" feature column instead of the label y.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.33,
    random_state=42,
    stratify=train["Sex"],
)

In [None]:
# Label proportions in the training part of the split.
pd.Series(y_train).value_counts().div(y_train.size)

In [None]:
# Label proportions in the test part of the split.
pd.Series(y_test).value_counts().div(y_test.size)