Title: Cross-Validation


Task 1: K-Fold Cross-Validation for House Prices<br>
Apply K-Fold Cross-Validation (K=5) to a linear regression model and check how much its RMSE varies from fold to fold.

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np

# Load the California housing data as pandas objects (as_frame=True):
# X is the feature frame, y the target column.
data = fetch_california_housing(as_frame=True)
X = data.data
y = data.target

# Baseline estimator evaluated by cross-validation
model = LinearRegression()

# 5 shuffled folds; the fixed seed keeps the split (and therefore the scores) reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score with scikit-learn's built-in RMSE scorer rather than a hand-rolled
# make_scorer wrapper. The scorer is negated because cross_val_score follows
# the "greater is better" convention.
neg_rmse_scores = cross_val_score(
    model, X, y, scoring="neg_root_mean_squared_error", cv=kf
)

# Flip the sign back to obtain ordinary (positive) per-fold RMSE values
rmse_scores = -neg_rmse_scores

print("RMSE scores for each fold:")
for i, score in enumerate(rmse_scores, 1):
    print(f" Fold {i}: {score:.4f}")

# Mean = typical error; std (NumPy default, ddof=0) = fold-to-fold variability
print(f"\nMean RMSE: {rmse_scores.mean():.4f}")
print(f"Std deviation of RMSE: {rmse_scores.std():.4f}")


RMSE scores for each fold:
 Fold 1: 0.7456
 Fold 2: 0.7264
 Fold 3: 0.7136
 Fold 4: 0.7105
 Fold 5: 0.7451

Mean RMSE: 0.7283
Std deviation of RMSE: 0.0149


Task 2: Stratified K-Fold for Imbalanced Churn Dataset<br>
Use Stratified K-Fold so that every fold preserves the class proportions of the imbalanced dataset.

In [2]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fold count and seed are defined once so the splitter, the model and the
# summary line below cannot drift apart when one of them is changed.
N_SPLITS = 5
SEED = 42

# Create synthetic imbalanced churn-like dataset (requested class weights 70% / 30%)
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=10, n_redundant=5,
                           n_classes=2, weights=[0.7, 0.3],  # imbalance
                           random_state=SEED)

# Initialize Stratified K-Fold; the labels are passed to split() below so
# the folds are built from y as well as X.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

accuracies = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # A new classifier is created for every fold, so each fold is fitted
    # only on its own training indices.
    model = RandomForestClassifier(random_state=SEED)
    model.fit(X_train, y_train)

    # Predict on validation fold
    y_pred = model.predict(X_val)

    # Evaluate accuracy.
    # NOTE(review): with a ~70/30 class split, always predicting the majority
    # class would already score about 0.70, so read these numbers against
    # that baseline rather than against 0.50.
    acc = accuracy_score(y_val, y_pred)
    accuracies.append(acc)
    print(f"Fold {fold} Accuracy: {acc:.4f}")

# Report the number of folds actually evaluated instead of a hard-coded "5"
print(f"\nMean Accuracy over {len(accuracies)} folds: {np.mean(accuracies):.4f}")


Fold 1 Accuracy: 0.9400
Fold 2 Accuracy: 0.9200
Fold 3 Accuracy: 0.9500
Fold 4 Accuracy: 0.9450
Fold 5 Accuracy: 0.9150

Mean Accuracy over 5 folds: 0.9340


Task 3: Leave-One-Out Cross-Validation for Iris<br>
Use Leave-One-Out Cross-Validation (LOOCV) to assess a model's prediction accuracy on the Iris dataset.

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

# Iris features and class labels
data = load_iris()
X, y = data.data, data.target

# One estimator instance, refitted from scratch on every split
model = LogisticRegression(max_iter=200)

# Leave-One-Out: each split holds out exactly one sample for testing
loo = LeaveOneOut()

# Collect the held-out label and the model's guess for it, split by split
y_true, y_pred = [], []
for fit_rows, held_out in loo.split(X):
    model.fit(X[fit_rows], y[fit_rows])
    # held_out contains a single index, so [0] unwraps the lone label / prediction
    y_true.append(y[held_out][0])
    y_pred.append(model.predict(X[held_out])[0])

# Overall accuracy = share of held-out samples predicted correctly
accuracy = accuracy_score(y_true, y_pred)
print(f"LOOCV Accuracy on Iris dataset: {accuracy:.4f}")


LOOCV Accuracy on Iris dataset: 0.9667
