In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [2]:
# Load the housing dataset into a pandas DataFrame.
# "housing.csv" is a relative path, so it is resolved against the kernel's
# current working directory.
data = pd.read_csv("housing.csv")

In [3]:
# Drop every row that contains at least one missing value.
# The name is rebound instead of using inplace=True: the resulting frame is the
# same, but the step stays chain-friendly and avoids in-place mutation of an
# object that other references might still point to.
data = data.dropna()

In [4]:
# Separate the prediction target from the predictors:
#   y -> the 'median_house_value' column
#   X -> every remaining column of `data`
y = data['median_house_value']
X = data.drop(columns=['median_house_value'])

In [5]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Column names handed to the numerical transformer.
numerical_cols = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
]

# Column names handed to the categorical transformer.
categorical_cols = ['ocean_proximity']

In [7]:
# Numerical transformer: standardize each numeric column with StandardScaler
# (centre to zero mean, scale to unit variance).
# The step is named 'scaler' because that is what it contains; no log transform
# is applied anywhere in this pipeline.
numerical_transformer = Pipeline([
    ('scaler', StandardScaler())
])


In [8]:
# Categorical transformer: one-hot encode the categorical columns.
# handle_unknown='ignore' means a category not seen during fit does not raise
# at transform time.
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [9]:
# Preprocessing stage: apply each transformer to its own subset of columns
# ('num' -> numerical_cols, 'cat' -> categorical_cols).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)



In [10]:
# End-to-end model: the preprocessing stage followed by a random-forest
# regressor seeded with random_state=42.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ],
)


In [11]:
# Hyperparameter search space. The 'model__' prefix addresses parameters of the
# pipeline step named 'model'.
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__min_samples_split=[2, 4, 6],
    model__max_depth=[None, 10, 20],
)

In [12]:
# Exhaustive search over param_grid using 5-fold cross-validation.
# Candidates are ranked by negated mean squared error (higher is better), and
# training-fold scores are recorded alongside the validation scores.
# n_jobs=-1 spreads the candidate fits over all available CPU cores; it changes
# only the wall-clock time, not the scores.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
    n_jobs=-1,
)


In [13]:
# Run the grid search on the training split only; the test split is not used here.
# fit() is the last expression in the cell, so its return value (the fitted
# GridSearchCV object) is rendered as the cell output.
grid_search.fit(X_train, y_train)


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('log_transform',
                                                                                          StandardScaler())]),
                                                                         ['longitude',
                                                                          'latitude',
                                                                          'housing_median_age',
                                                                          'total_rooms',
                                                                          'total_bedrooms',
                                                                          'population',
                                                                          'households',
 