1. Import the necessary libraries


In [2]:
# Mount Google Drive at /content/drive (Colab-only module); the dataset path
# and the model save path used later in this notebook both live under it.
from google.colab import drive
drive.mount('/content/drive')


import pandas as pd
import numpy as np
import pickle  # For saving the model

# Scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


Mounted at /content/drive


2. Load the dataset

In [3]:
file_path = '/content/drive/MyDrive/CSC/house_pricing_train.csv'

# Read the training data from Drive.
# If the file is missing, report the offending path and re-raise: swallowing
# the error would leave `df` undefined and make the next cell fail with a
# confusing NameError instead of pointing at the real problem.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    raise
else:
    print("Dataset loaded successfully.")

Dataset loaded successfully.


3. Feature Selection and Train-Test Split

In [4]:
# Predictor columns used by the model, and the column to be predicted.
selected_features = ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'FullBath']
target = 'SalePrice'

# Keep only the chosen predictors in X; the target column goes into y.
X = df[selected_features]
y = df[target]

print(f"Features selected: {selected_features}")

# Hold out 20% of the rows for evaluation, with a fixed seed so the split is
# reproducible. Splitting happens before any preprocessing is fitted, which
# prevents data leakage from the test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Features selected: ['OverallQual', 'GrLivArea', 'TotalBsmtSF', 'GarageCars', 'YearBuilt', 'FullBath']


4. Model Pipeline

In [5]:
# Preprocessing and the estimator are chained in a single Pipeline object so
# that pickling the pipeline later also captures the fitted imputer and scaler.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='median')),    # fill missing values with the column median
    ('scaler', StandardScaler()),                     # standardize the features
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),  # seeded random forest regressor
]
model_pipeline = Pipeline(steps=pipeline_steps)

5. Train the model


In [6]:
# Fit every pipeline step (imputer, scaler, regressor) on the training split only.
print("Training the model...")
model_pipeline.fit(X=X_train, y=y_train)
print("Training complete.")

Training the model...
Training complete.


6. Evaluate the model

In [7]:
# Score the fitted pipeline on the held-out test split.
y_pred = model_pipeline.predict(X_test)

# Error metrics; RMSE is derived from MSE so it is in the same units as the target.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Assemble the report first, then emit it with a single print call.
separator = "="*30
report_lines = [
    "\n" + separator,
    "MODEL EVALUATION METRICS",
    separator,
    f"Mean Absolute Error (MAE): ${mae:,.2f}",
    f"Mean Squared Error (MSE):  {mse:,.2f}",
    f"Root Mean Squared Error (RMSE): ${rmse:,.2f}",
    f"R-squared (R²):            {r2:.4f}",
    separator,
]
print("\n".join(report_lines))


MODEL EVALUATION METRICS
Mean Absolute Error (MAE): $19,227.60
Mean Squared Error (MSE):  840,780,132.23
Root Mean Squared Error (RMSE): $28,996.21
R-squared (R²):            0.8904


7. Save the model

In [8]:
# Persist the entire fitted pipeline (imputer + scaler + model) so that
# inference code can load a single object and call .predict() directly.
save_path = '/content/drive/MyDrive/CSC/house_price_model.pkl'

with open(save_path, 'wb') as file:
    pickle.dump(model_pipeline, file)

print(f"\nModel saved successfully to: {save_path}")

# Round-trip check: reload the pickle and confirm it works without retraining.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open(save_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Predict one held-out row with both the reloaded and the in-memory pipeline.
# The assertion makes this an actual verification rather than just a printout:
# the cell fails loudly if the saved artifact does not reproduce the model.
verification_row = X_test.iloc[[0]]
test_prediction = loaded_model.predict(verification_row)
np.testing.assert_allclose(
    test_prediction,
    model_pipeline.predict(verification_row),
    err_msg="Reloaded model does not reproduce the in-memory model's prediction.",
)
print(f"Verification - Prediction for first test house: ${test_prediction[0]:,.2f}")



Model saved successfully to: /content/drive/MyDrive/CSC/house_price_model.pkl
Verification - Prediction for first test house: $139,335.50
