Title: Model Selection

Task 1: Linear Regression on House Prices<br>
Fit a Linear Regression model to the California Housing dataset and evaluate its performance (RMSE and R²) on the validation set.

In [2]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fetch the California Housing data; as_frame=True requests pandas objects
california = fetch_california_housing(as_frame=True)
X, y = california.data, california.target

# Hold out 15% of the rows for validation (85% train); the fixed seed keeps
# the split reproducible across runs
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.15,
)

# Fit the linear model on the training portion (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_val)

# Validation metrics: RMSE is the square root of the MSE; R² via sklearn
r2 = r2_score(y_val, y_pred)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

print(
    f'Validation RMSE: {rmse:.3f}',
    f'Validation R²: {r2:.3f}',
    sep='\n',
)


Validation RMSE: 0.733
Validation R²: 0.590


Task 2: Decision Tree Classifier on Iris Dataset<br>
Train a Decision Tree model and evaluate its performance on validation data.

In [3]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset: feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Split into train (85%) and validation (15%); stratify=y keeps the class
# proportions the same in both parts, and the seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y)

# Train Decision Tree Classifier (seeded so the fitted tree is reproducible)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predict on validation set
y_pred = clf.predict(X_val)

# Evaluate performance: overall accuracy plus per-class precision/recall/F1,
# labelled with the species names instead of integer codes
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred, target_names=iris.target_names)

print(f'Validation Accuracy: {accuracy:.3f}')
# Build a single string rather than passing two arguments to print():
# print's default sep=' ' would insert a space after the newline and push the
# report's header row one column out of line with the rows beneath it.
print(f'Classification Report:\n{report}')


Validation Accuracy: 0.913
Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         7
  versicolor       0.88      0.88      0.88         8
   virginica       0.88      0.88      0.88         8

    accuracy                           0.91        23
   macro avg       0.92      0.92      0.92        23
weighted avg       0.91      0.91      0.91        23



Task 3: Random Forest on Customer Churn<br>
Apply Random Forest to a synthetic, churn-like binary classification dataset and assess its accuracy on the validation set.

In [4]:
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Generate synthetic churn-like dataset (binary classification):
# 1000 samples, 20 features of which 10 are informative and 5 redundant.
# The seed makes the generated data reproducible.
X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=10, n_redundant=5,
                           n_classes=2, random_state=42)

# Split: 70% train, 30% validation; stratify=y keeps the class balance the
# same in both parts
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y)

# Train Random Forest (100 trees, seeded for reproducibility)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on validation
y_pred = rf.predict(X_val)

# Evaluate accuracy and per-class precision/recall/F1
acc = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f'Validation Accuracy: {acc:.3f}')
# Build a single string rather than passing two arguments to print():
# print's default sep=' ' would insert a space after the newline and push the
# report's header row one column out of line with the rows beneath it.
print(f"Classification Report:\n{report}")


Validation Accuracy: 0.943
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.94       149
           1       0.95      0.94      0.94       151

    accuracy                           0.94       300
   macro avg       0.94      0.94      0.94       300
weighted avg       0.94      0.94      0.94       300

