<a href="https://colab.research.google.com/github/acapodanno/Machine-Learning-per-AI-Solution-Architect/blob/main/hold_out_and_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Hold-out

In [48]:
!pip install scikit-learn



In [49]:
from sklearn.datasets import make_regression

RANDOM_SEED = 0

X , y= make_regression(
    n_samples=100,
    n_features=100,
    n_informative=10,
    random_state=RANDOM_SEED
)

In [50]:
X.shape

(100, 100)

In [51]:
y.shape

(100,)

In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=RANDOM_SEED)

In [53]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)


In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,mean_squared_error
lr = LinearRegression()
lr.fit(X_train,y_train)
def evalute_model(data,model):
  x , y = data

  y_pred = model.predict(x)
  print(f"MSE : {mean_squared_error(y,y_pred)}")
  print(f"R2 SCORE: {r2_score(y,y_pred)}")

evalute_model((X_train,y_train),lr)

MSE : 3.362548909052471e-26
R2 SCORE: 1.0


In [55]:
lr = LinearRegression()
lr.fit(X_test,y_test)
evalute_model((X_test,y_test),lr)

MSE : 1.2233065902024885e-25
R2 SCORE: 1.0


###Cross-Validation

In [56]:
from sklearn.model_selection import cross_val_score
lr = LinearRegression()

cvs = cross_val_score(lr,X_train,y_train,cv=5)

print(f"{cvs}")
print(f"Mean : {cvs.mean()}")
print(f"Std : {cvs.std()}")
print(f"Min : {cvs.min()}")
print(f"Max : {cvs.max()}")

[0.31552511 0.39853973 0.42129068 0.05408551 0.41143675]
Mean : 0.32017555588014307
Std : 0.13821675514271364
Min : 0.05408551010878526
Max : 0.4212906830849954


In [57]:
from sklearn.model_selection import cross_validate

cv = cross_validate(lr,X_train,y_train,cv=5,return_train_score=True)
print(cv)
print(f"Mean Train Score : {cv['train_score'].mean()}")
print(f"Mean Test Score : {cv['test_score'].mean()}")


{'fit_time': array([0.00731325, 0.0037508 , 0.00210929, 0.00201178, 0.00230384]), 'score_time': array([0.00271797, 0.00099063, 0.00089955, 0.00083756, 0.00096273]), 'test_score': array([0.31552511, 0.39853973, 0.42129068, 0.05408551, 0.41143675]), 'train_score': array([1., 1., 1., 1., 1.])}
Mean Train Score : 1.0
Mean Test Score : 0.32017555588014307


### Divide in batch with KFold

In [58]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5,shuffle=True, random_state=RANDOM_SEED)
kf.get_n_splits(X)

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
train_score = []
test_score = []

for train_index,test_index in kf.split(X):
  X_train,X_test = X[train_index],X[test_index]
  y_train,y_test = y[train_index],y[test_index]
  ss = StandardScaler()
  X_train = ss.fit_transform(X_train)
  X_test = ss.transform(X_test)
  lr = LinearRegression()
  lr.fit(X_train,y_train)
  r2_train = r2_score(y_train,lr.predict(X_train))
  r2_test = r2_score(y_test,lr.predict(X_test))
  train_score.append(r2_train)
  test_score.append(r2_test)

print(f"Train Score : {r2_train}")
print(f"Test Score : {r2_test}")
