Title: GridSearchCV & RandomizedSearchCV

Task 1: GridSearchCV for Decision Trees<br>
Use GridSearchCV to tune max_depth and min_samples_split in Decision Tree for Iris.

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Load data
data = load_iris()
X, y = data.data, data.target

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define model and param grid
dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    'max_depth': [2, 3, 4, 5, None],
    'min_samples_split': [2, 5, 10]
}

# GridSearchCV setup
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best params
print("Best Parameters:", grid_search.best_params_)
print(f"Best CV Accuracy: {grid_search.best_score_:.4f}")

# Evaluate on test set
best_dt = grid_search.best_estimator_
y_pred = best_dt.predict(X_test)
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


Best Parameters: {'max_depth': 4, 'min_samples_split': 2}
Best CV Accuracy: 0.9417
Test Accuracy: 1.0000
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Task 2: RandomizedSearchCV for Random Forest<br>
Apply RandomizedSearchCV to optimize hyperparameters of Random Forest for customer churn.

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score

# === Replace this with your actual churn dataset loading ===
# For example, if CSV:
# data = pd.read_csv('customer_churn.csv')
# X = data.drop('Churn', axis=1)
# y = data['Churn']

# Here is a placeholder dummy example:
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=1000, n_features=20, n_informative=10, n_redundant=5, random_state=42)

# Split into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter distribution for RandomizedSearch
param_dist = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize Random Forest
rf = RandomForestClassifier(random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Show best params
print("Best Hyperparameters:", random_search.best_params_)

# Predict and evaluate
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.8s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=True, max_depth=10, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.7s
[CV] END bootstrap=False, max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total t

Task 3: Fine-Tuning SVR with GridSearchCV<br>
Use GridSearchCV to find best parameters for Support Vector Regression on housing data.

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load housing data (California Housing dataset)
data = fetch_california_housing()
X, y = data.data, data.target

# Split dataset into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (important for SVR)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define SVR model
svr = SVR()

# Define hyperparameter grid for GridSearchCV
param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 0.2, 0.5],
    'degree': [2, 3, 4]  # Only relevant for 'poly' kernel
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    n_jobs=-1
)

# Fit GridSearchCV
grid_search.fit(X_train_scaled, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate on test data
best_svr = grid_search.best_estimator_
y_pred = best_svr.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Test MSE: {mse:.4f}")
print(f"Test R^2 Score: {r2:.4f}")


Fitting 5 folds for each of 144 candidates, totalling 720 fits
[CV] END .......C=0.1, degree=2, epsilon=0.01, kernel=linear; total time=   9.3s
[CV] END .......C=0.1, degree=2, epsilon=0.01, kernel=linear; total time=   9.7s
[CV] END .......C=0.1, degree=2, epsilon=0.01, kernel=linear; total time=   9.7s
[CV] END .......C=0.1, degree=2, epsilon=0.01, kernel=linear; total time=   9.5s
[CV] END .......C=0.1, degree=2, epsilon=0.01, kernel=linear; total time=   9.0s
[CV] END ..........C=0.1, degree=2, epsilon=0.01, kernel=rbf; total time=  12.1s
[CV] END ..........C=0.1, degree=2, epsilon=0.01, kernel=rbf; total time=  12.1s
[CV] END ..........C=0.1, degree=2, epsilon=0.01, kernel=rbf; total time=  12.4s
[CV] END ..........C=0.1, degree=2, epsilon=0.01, kernel=rbf; total time=  12.1s
[CV] END ..........C=0.1, degree=2, epsilon=0.01, kernel=rbf; total time=  12.2s
[CV] END .........C=0.1, degree=2, epsilon=0.01, kernel=poly; total time=  10.7s
[CV] END .........C=0.1, degree=2, epsilon=0.0