In [None]:
# Cell 1: Setup, Imports, and Data Loading (Based on original Cell 98)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Cell 2: Load Data (Based on original Cell 99)
# Read the dataset from 'housing.csv'; the relative path is resolved against
# the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [None]:
# Cell 3: Handle Missing Values (Based on original Cell 101)
# Drop every row that contains an NA in ANY column (dropna's default), not
# just one particular column. The result is rebound to `df` instead of
# mutating the loaded frame with inplace=True.
# NOTE(review): the original comment singles out 'total_bedrooms' as the
# column with gaps - confirm against the data.
df = df.dropna()

In [None]:
# Cell 4: Define X and y (Based on original Cell 102)
# Split the frame into target and features before any transformation:
# y is the 'median_house_value' column, X is every other column.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

In [None]:
# Cell 5: Train-Test Split (The Consistency Fix)
# Hold out 20% of the rows for testing. The fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the resulting partition sizes
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

In [None]:
# Cell 6: Feature Transformation Function (The Data Leakage Fix)
def transform_features(X_data):
    """Log-transform skewed columns, add ratio features, and one-hot encode.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature frame containing the columns 'total_rooms', 'total_bedrooms',
        'population', 'households' and 'ocean_proximity'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows and index as ``X_data`` in which:

        * the four count columns are replaced by ``log1p`` of their values;
        * 'bedroom_ratio' and 'household_rooms' are appended, both computed
          from the already log-transformed columns;
        * 'ocean_proximity' is replaced by 'ocean_<category>' dummy columns
          (first category dropped), appended after all other columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``X_data``.

    Notes
    -----
    The dummy columns are derived from the categories present in ``X_data``
    itself, so frames transformed separately can end up with different column
    sets and must be aligned by the caller.
    """
    # Work on a copy so the caller's frame is left untouched
    X_transformed = X_data.copy()

    # 1. Log transform for skewed features; np.log1p(x) is log(x + 1)
    skewed_features = ['total_rooms', 'total_bedrooms', 'population', 'households']
    for col in skewed_features:
        X_transformed[col] = np.log1p(X_transformed[col])

    # 2. Feature engineering: ratios of the log-transformed columns above
    X_transformed['bedroom_ratio'] = X_transformed['total_bedrooms'] / X_transformed['total_rooms']
    X_transformed['household_rooms'] = X_transformed['total_rooms'] / X_transformed['households']

    # 3. One-hot encode 'ocean_proximity'; drop_first=True avoids a redundant,
    #    perfectly collinear dummy. Encoding the column inside get_dummies
    #    (rather than joining a separately built dummy frame on the index)
    #    keeps one output row per input row even when index labels repeat.
    return pd.get_dummies(
        X_transformed,
        columns=['ocean_proximity'],
        prefix='ocean',
        drop_first=True,
    )

# Run the training and test partitions through transform_features
# independently (training first, then test)
X_train_processed, X_test_processed = (
    transform_features(partition) for partition in (X_train, X_test)
)

In [None]:
# Cell 7: Column Alignment for Robustness
# Safety step: after separate one-hot encoding, the test frame may lack some
# training columns or carry extra ones.
# Training columns that the test frame lacks (kept for inspection)
missing_cols = set(X_train_processed.columns) - set(X_test_processed.columns)

# One reindex gives the test frame exactly the training columns, in training
# order: absent columns are created and filled with 0, and columns unknown to
# the training frame are discarded.
X_test_processed = X_test_processed.reindex(
    columns=X_train_processed.columns,
    fill_value=0,
)

In [None]:
# Cell 8: Scaling Numerical Features (Based on original Cell 110)
# The scaler is fitted on the processed training frame only; the test frame
# is merely transformed with the fitted scaler, so nothing from the test
# partition enters the fit.
scaler = StandardScaler().fit(X_train_processed)

# Apply the fitted scaler to both partitions
X_train_scaled = scaler.transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

In [None]:
# Cell 9: Train Linear Regression (Based on original Cell 113)
# Fit a default LinearRegression on the scaled training matrix
reg = LinearRegression()
reg.fit(X=X_train_scaled, y=y_train)

In [None]:
# Cell 10: Score Linear Regression (Based on original Cell 114)
# Print reg.score evaluated on the scaled test matrix and test targets,
# formatted to four decimal places.
# NOTE(review): the expected value below is carried over from the original
# notebook and cannot be verified from this file.
# Expected Score (R^2): ~0.66
print(f"Linear Regression R^2 Score: {reg.score(X_test_scaled, y_test):.4f}")

In [None]:
# Cell 11: Train Random Forest Regressor (Based on original Cell 115)
# Default hyperparameters; the fixed random_state makes the fit repeatable.
forest = RandomForestRegressor(random_state=42)
forest.fit(X=X_train_scaled, y=y_train)

In [None]:
# Cell 12: Score Random Forest Regressor (Based on original Cell 116)
# Print forest.score evaluated on the scaled test matrix and test targets,
# formatted to four decimal places.
# NOTE(review): the expected value below is carried over from the original
# notebook and cannot be verified from this file.
# Expected Score (R^2): ~0.80
print(f"Random Forest R^2 Score (Default): {forest.score(X_test_scaled, y_test):.4f}")

In [None]:
# Cell 13: Grid Search Setup and Fit (Based on original Cell 126)
# Base estimator for the search, with a fixed random_state. It gets its own
# name so that `reg` keeps referring to the fitted LinearRegression from
# Cell 9 and the earlier scoring cell stays re-runnable.
rf_base = RandomForestRegressor(random_state=42)

# Candidate hyperparameters: 3 x 4 x 3 = 36 combinations
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_features": [2, 4, 6, 8],
    "max_depth": [None, 5, 8],
}

# 5-fold cross-validation scored by negative mean squared error, using all
# available cores (n_jobs=-1); training-fold scores are recorded as well.
grid_search = GridSearchCV(
    rf_base,
    param_grid,
    n_jobs=-1,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
)

# Fit on the scaled, processed training data
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Cell 14: Get Best Estimator (Based on original Cell 128)
# Show the winning hyperparameter combination, then keep the estimator that
# the search refitted with it.
print(f"Best Hyperparameters: {grid_search.best_params_}")
best_reg = grid_search.best_estimator_

In [None]:
# Cell 15: Score Best Estimator (Based on original Cell 129)
# Print best_reg.score (the estimator selected by the grid search) evaluated
# on the scaled test matrix and test targets, formatted to four decimal places.
print(f"Random Forest R^2 Score (Grid Search): {best_reg.score(X_test_scaled, y_test):.4f}")