Title: Data Splitting (Train-Test-Validation)


Task 1: House Prices Dataset (Regression)<br>
Use the House Prices dataset to predict house prices.<br>
Split the data into training, validation, and test sets (70% train, 15% validation, 15% test).

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor


# Step 1: Build a small inline sample with House-Prices-style columns.
# NOTE(review): the task text refers to the House Prices dataset; this cell
# only uses these 5 hand-written rows.
data = {
    'OverallQual': [7, 6, 7, 8, 5],
    'GrLivArea': [1500, 1800, 1200, 2500, 1500],
    'GarageCars': [2, 2, 1, 3, 2],
    'TotRmsAbvGrd': [8, 6, 7, 9, 8],
    'YearBuilt': [2000, 1998, 2005, 2010, 2001],
    'Price': [200000, 180000, 160000, 300000, 250000]  # Target variable
}
df = pd.DataFrame(data)

# Step 2: Split the dataset into features (X) and target variable (y)
X = df.drop('Price', axis=1)  # Features
y = df['Price']  # Target

# Step 3: Two-stage split - hold out 30% first, then halve the hold-out into
# validation and test (nominally 70% train, 15% validation, 15% test).
# With only 5 rows the sizes are rounded, so validation and test end up with
# a single row each; the proportions are only approximate on data this small.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


# Step 4: Preprocessing - fill missing values in the numeric columns with the column mean
numerical_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotRmsAbvGrd', 'YearBuilt']
numerical_transformer = SimpleImputer(strategy='mean')

# Every feature in this sample is numeric, so the column transformer only
# needs a single numeric branch (no categorical encoding is configured).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ])

# Step 5: Build a Regression Pipeline using a Random Forest regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Combine preprocessing and model into a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Step 6: Train the model on the training set
pipeline.fit(X_train, y_train)


def _report_metrics(header, y_true, y_pred):
    """Print MAE, MSE and R2 for one data split and return them.

    Args:
        header: Line printed before the metrics (e.g. the split's name).
        y_true: Ground-truth target values for the split.
        y_pred: Model predictions for the same rows.

    Returns:
        A ``(mae, mse, r2)`` tuple.
    """
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    # R2 is not defined for fewer than two samples; on the single-row
    # validation/test splits produced above it comes back as nan.
    r2 = r2_score(y_true, y_pred)

    print(header)
    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")
    return mae, mse, r2


# Step 7: Evaluate the model on the validation set
y_val_pred = pipeline.predict(X_val)
mae_val, mse_val, r2_val = _report_metrics("Validation Set Performance:", y_val, y_val_pred)


# Step 8: Evaluate the model on the held-out test set
y_test_pred = pipeline.predict(X_test)
mae_test, mse_test, r2_test = _report_metrics("\nTest Set Performance:", y_test, y_test_pred)



Validation Set Performance:
MAE: 8400.00
MSE: 70560000.00
R2 Score: nan

Test Set Performance:
MAE: 57600.00
MSE: 3317760000.00
R2 Score: nan




Task 2: Iris Dataset (Classification)<br>
Apply data splitting to the Iris dataset.<br>
Split it into train (70%), validation (15%), and test (15%).


In [2]:
# Load the Iris dataset and split it into train (70%), validation (15%), and test (15%) sets
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the Iris feature matrix (X) and class labels (y)
data = load_iris()
X = data.data
y = data.target

# Two-stage split: hold out 30% of the rows, then halve that hold-out into
# validation and test (70% train / 15% validation / 15% test).
# stratify= keeps the class proportions approximately equal in every subset,
# which a plain random split does not guarantee for a classification target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)



Task 3: Customer Churn Dataset (Classification)<br>
Predict customer churn using the telecom dataset.<br>
Split the data into training, validation, and test sets.

In [1]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Generate a synthetic binary-classification dataset (1000 rows, 20 features).
# NOTE(review): the task text mentions a telecom churn dataset; this cell uses
# generated data instead - swap in the real data when it is available.
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Two-stage split: hold out 30% of the rows, then halve that hold-out into
# validation and test (70% train / 15% validation / 15% test).
# stratify= keeps the class ratio approximately equal in every subset, so an
# under-represented class cannot end up missing from the smaller splits.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

