Title: Model Selection

Task 1: Linear Regression on House Prices<br>
Use Linear Regression and evaluate its performance on the validation set.

In [7]:

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fetch the California housing data as plain (features, target) arrays.
X, y = fetch_california_housing(return_X_y=True)

# Two-stage split with a fixed seed: hold out 30% first, then halve that
# hold-out into validation and test parts (70/15/15 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Standardise the features, then fit a linear regression. Because the scaler
# is a pipeline step, it is fitted on the training split only.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("model", LinearRegression()),
    ]
).fit(X_train, y_train)

y_val_pred = pipeline.predict(X_val)

# Report validation metrics only when predictions line up with the targets.
if y_val_pred.shape == y_val.shape:
    print("Linear Regression:")
    val_mae = mean_absolute_error(y_val, y_val_pred)
    print(f"MAE: {val_mae:.2f}")
    val_mse = mean_squared_error(y_val, y_val_pred)
    print(f"MSE: {val_mse:.2f}")
    val_r2 = r2_score(y_val, y_val_pred)
    print(f"R2 Score: {val_r2:.2f}")



Linear Regression:
MAE: 0.53
MSE: 0.54
R2 Score: 0.58


Task 2: Decision Tree Classifier on Iris Dataset<br>
Train a Decision Tree model and evaluate its performance on validation data.

In [8]:

# `load_iris` must be imported here: it is called below and no other cell in
# this notebook imports it, so a fresh-kernel "Run All" would otherwise fail
# with a NameError.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Load the Iris features and class labels.
iris = load_iris()
X, y = iris.data, iris.target

# Two-stage split with a fixed seed: hold out 30% first, then halve that
# hold-out into validation and test parts (70/15/15 overall).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Search tree depth and split criterion with 3-fold cross-validation on the
# training split only; the validation split stays untouched until scoring.
param_grid = {'max_depth': [2, 3, 4, 5], 'criterion': ['gini', 'entropy']}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=3)
grid.fit(X_train, y_train)

y_val_pred = grid.predict(X_val)

# Report the selected hyperparameters and the validation accuracy only when
# predictions line up with the targets.
if y_val.shape == y_val_pred.shape:
    print("\nDecision Tree:")
    print(f"Best Params: {grid.best_params_}")
    print(f"Validation Accuracy: {grid.score(X_val, y_val):.2f}")




Decision Tree:
Best Params: {'criterion': 'entropy', 'max_depth': 3}
Validation Accuracy: 1.00


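Task 3: Random Forest on Customer Churn<br>
Apply Random Forest and assess its accuracy on the validation set. (The cell below uses a synthetic binary-classification dataset generated with `make_classification` as a stand-in for real churn data.)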

In [9]:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Generate a synthetic two-class dataset (1000 rows, 20 features, fixed seed).
X, y = make_classification(n_samples=1000, n_features=20, n_classes=2, random_state=42)

# Two-stage split with a fixed seed: hold out 30% first, then halve that
# hold-out into validation and test parts (70/15/15 overall).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Candidate forest sizes and depth limits, searched with 3-fold
# cross-validation on the training split only.
param_grid = {'n_estimators': [50, 100], 'max_depth': [None, 10, 20]}
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
).fit(X_train, y_train)

y_val_pred = grid.predict(X_val)

# Report the selected hyperparameters and the validation accuracy only when
# predictions line up with the targets.
if y_val_pred.shape == y_val.shape:
    print("\nRandom Forest:")
    print(f"Best Params: {grid.best_params_}")
    print(f"Validation Accuracy: {grid.score(X_val, y_val):.2f}")



Random Forest:
Best Params: {'max_depth': 10, 'n_estimators': 100}
Validation Accuracy: 0.87
