Title: Data Splitting (Train-Test-Validation)


Task 1: House Prices Dataset (Regression)<br>
Use the House Prices dataset to predict house prices.<br>
Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 1: Load real-world housing dataset from OpenML (House Prices)
try:
    housing = fetch_openml(name='house_prices', as_frame=True)
    df = housing.frame
except Exception as e:
    raise RuntimeError(f"Failed to fetch dataset: {e}")

# Step 2: Select numerical features and drop rows with missing target
df = df[['OverallQual', 'GrLivArea', 'GarageCars', 'TotRmsAbvGrd', 'YearBuilt', 'SalePrice']]
df.dropna(subset=['SalePrice'], inplace=True)

# Define features and target
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Step 3: Split into train (70%), validation (15%), and test (15%) — using random_state=42 for reproducibility
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Step 4: Preprocessing — impute missing values using median (more robust than mean)
numerical_features = X.columns.tolist()
numerical_transformer = SimpleImputer(strategy='median')
preprocessor = ColumnTransformer(transformers=[('num', numerical_transformer, numerical_features)])

# Step 5: Define models — include a simpler model for comparison
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "LinearRegression": LinearRegression()
}

# Step 6: Train and evaluate both models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    try:
        pipeline.fit(X_train, y_train)
        y_val_pred = pipeline.predict(X_val)
        y_test_pred = pipeline.predict(X_test)

        # Validation metrics
        print(f"\n{name} - Validation Set Performance:")
        print(f"MAE: {mean_absolute_error(y_val, y_val_pred):.2f}")
        print(f"MSE: {mean_squared_error(y_val, y_val_pred):.2f}")
        print(f"R2 Score: {r2_score(y_val, y_val_pred):.2f}")

        # Test metrics
        print(f"\n{name} - Test Set Performance:")
        print(f"MAE: {mean_absolute_error(y_test, y_test_pred):.2f}")
        print(f"MSE: {mean_squared_error(y_test, y_test_pred):.2f}")
        print(f"R2 Score: {r2_score(y_test, y_test_pred):.2f}")
        
    except Exception as e:
        print(f"{name} training failed: {e}")


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(subset=['SalePrice'], inplace=True)



RandomForest - Validation Set Performance:
MAE: 22383.14
MSE: 1059954942.95
R2 Score: 0.83

RandomForest - Test Set Performance:
MAE: 21134.10
MSE: 1063910243.56
R2 Score: 0.86

LinearRegression - Validation Set Performance:
MAE: 25551.10
MSE: 1128531065.17
R2 Score: 0.82

LinearRegression - Test Set Performance:
MAE: 29061.71
MSE: 2098725849.80
R2 Score: 0.72


Task 2: Iris Dataset (Classification)<br>
Apply data splitting to the Iris dataset.<br>
Split it into train (70%), validation (15%), and test (15%).


In [5]:
# Task 2: Iris Dataset Classification — Data Splitting
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

data = load_iris()
X = data.data
y = data.target

# 70% train, 15% validation, 15% test
# random_state=42 ensures reproducible splits
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)



Task 3: Customer Churn Dataset (Classification)<br>
Predict customer churn using the telecom dataset.<br>
Split the data into training, validation, and test sets.

In [7]:
# Task 3: Customer Churn Dataset Simulation — Classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Simulate a telecom churn dataset (binary classification)
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Split the dataset: 70% train, 15% validation, 15% test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

