<a href="https://colab.research.google.com/github/acapodanno/Machine-Learning-per-AI-Solution-Architect/blob/main/hold_out_and_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Hold-out

In [1]:
!pip install scikit-learn



In [2]:
from sklearn.datasets import make_regression

RANDOM_SEED = 0

X , y= make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    random_state=RANDOM_SEED
)

In [3]:
X.shape

(100, 100)

In [4]:
y.shape

(100,)

In [5]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=RANDOM_SEED)

In [6]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
lr = LinearRegression()
lr.fit(X_train,y_train)
def evalute_model(data,model):
  x , y = data

  y_pred = model.predict(x)
  print(f"MSE : {mean_squared_error(y,y_pred)}")
  print(f"R2 SCORE: {r2_score(y,y_pred)}")

evalute_model((X_train,y_train),lr)

MSE : 3.362548909052471e-26
R2 SCORE: 1.0


In [8]:
lr = LinearRegression()
lr.fit(X_test,y_test)
evalute_model((X_test,y_test),lr)

MSE : 1.2233065902024885e-25
R2 SCORE: 1.0


###Cross-Validation

In [9]:
from sklearn.model_selection import cross_val_score
lr = LinearRegression()

cvs = cross_val_score(lr,X_train,y_train,cv=5)

print(f"{cvs}")
print(f"Mean : {cvs.mean()}")
print(f"Std : {cvs.std()}")
print(f"Min : {cvs.min()}")
print(f"Max : {cvs.max()}")

[0.31552511 0.39853973 0.42129068 0.05408551 0.41143675]
Mean : 0.32017555588014307
Std : 0.13821675514271364
Min : 0.05408551010878526
Max : 0.4212906830849954


In [10]:
from sklearn.model_selection import cross_validate

cv = cross_validate(lr,X_train,y_train,cv=5,return_train_score=True)
print(cv)
print(f"Mean Train Score : {cv['train_score'].mean()}")
print(f"Mean Test Score : {cv['test_score'].mean()}")


{'fit_time': array([0.01125669, 0.00798225, 0.00766754, 0.00568271, 0.0047338 ]), 'score_time': array([0.00240731, 0.00327039, 0.00115895, 0.00222087, 0.00330138]), 'test_score': array([0.31552511, 0.39853973, 0.42129068, 0.05408551, 0.41143675]), 'train_score': array([1., 1., 1., 1., 1.])}
Mean Train Score : 1.0
Mean Test Score : 0.32017555588014307


### Divide in batch with KFold

In [11]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5,shuffle=True, random_state=RANDOM_SEED)
kf.get_n_splits(X)

5

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import  numpy as np
train_score = []
test_score = []

for train_index,test_index in kf.split(X):
  X_train,X_test = X[train_index],X[test_index]
  y_train,y_test = y[train_index],y[test_index]
  ss = StandardScaler()
  X_train = ss.fit_transform(X_train)
  X_test = ss.transform(X_test)
  lr = LinearRegression()
  lr.fit(X_train,y_train)
  r2_train = r2_score(y_train,lr.predict(X_train))
  r2_test = r2_score(y_test,lr.predict(X_test))
  train_score.append(r2_train)
  test_score.append(r2_test)

scores = {
    "train_score":np.array(train_score),
    "test_score":np.array(test_score)
}





In [13]:
scores

{'train_score': array([1., 1., 1., 1., 1.]),
 'test_score': array([0.63227624, 0.65395404, 0.71145494, 0.64973226, 0.87203322])}

In [14]:
print(f"Train score:{scores["train_score"].mean()}")
print(f"Test score:{scores["test_score"].mean()}")

Train score:1.0
Test score:0.7038901400337566


In [16]:
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error
loo = LeaveOneOut()
loo.get_n_splits(X)

100

In [21]:
for train_index,test_index in loo.split(X):
  X_train,X_test = X[train_index],X[test_index]
  y_train,y_test = y[train_index],y[test_index]
  ss = StandardScaler()
  X_train = ss.fit_transform(X_train)
  X_test = ss.transform(X_test)
  lr = LinearRegression()
  lr.fit(X_train,y_train)
  r2_train = mean_squared_error(y_train,lr.predict(X_train))
  r2_test = mean_squared_error(y_test,lr.predict(X_test))
  train_score.append(r2_train)
  test_score.append(r2_test)

costs = {
    "train_cost":np.array(train_score),
    "test_cost":np.array(test_score)
}
costs


{'train_cost': array([1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
        1.00000000e+00, 1.00000000e+00, 1.00000000

In [28]:
print(f"Train cost:{costs["train_cost"].mean()}")
print(f"Test cost:{costs["test_cost"].mean()}")

Train cost:0.25925925925925924
Test cost:nan
