Mounted at /content/drive


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# -*- coding: utf-8 -*-
"""House Price Prediction Script"""

# 🚀 Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# 🚀 Step 1: Load the Dataset
file_path = "/content/drive/MyDrive/GithubProjects/AmesHousingDataset/AmesHousing.csv"
df = pd.read_csv(file_path)

# 🚀 Step 2: Data Preprocessing
# Remove spaces in column names
df.columns = df.columns.str.replace(' ', '_')

# A row without a target cannot be trained on or scored; drop it instead of
# inventing a median price for it.
df = df.dropna(subset=['SalePrice']).reset_index(drop=True)

# Feature columns grouped by dtype. The target is kept out of the numeric
# list so it is never touched by imputation.
num_features = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'SalePrice'
]
cat_features = df.select_dtypes(include=['object']).columns.tolist()

# Decide which rows are train/test *before* learning any statistics, so that
# imputation values and feature selection never see the held-out rows.
# Same test_size and random_state as the final split used previously.
train_rows, test_rows = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Fill missing values: numeric columns get the training-set median,
# categorical columns get the constant label "None".
train_medians = df.iloc[train_rows][num_features].median()
df[num_features] = df[num_features].fillna(train_medians)
df[cat_features] = df[cat_features].fillna("None")

# 🚀 Step 3: Feature Engineering
df['TotalSF'] = df['1st_Flr_SF'] + df['2nd_Flr_SF'] + df['Total_Bsmt_SF']
df['SalePrice'] = np.log1p(df['SalePrice'])  # Log transformation of target

# 🚀 Step 4: Encode Categorical Variables
# Encoding is done on the full frame so train and test share one column set.
df_encoded = pd.get_dummies(df, columns=cat_features, drop_first=True)

# 🚀 Step 5: Feature Selection (Remove Low-Correlation Features)
# Correlation with the (log) target is measured on training rows only.
corr = df_encoded.iloc[train_rows].corr()['SalePrice'].sort_values(ascending=False)
# A NaN correlation means the column is constant in the training rows; treat
# it as uninformative so it is dropped together with the weak features.
low_corr_features = corr[corr.abs().fillna(0) < 0.1].index.tolist()
df_encoded = df_encoded.drop(columns=low_corr_features)

# 🚀 Step 6: Train-Test Split
X = df_encoded.drop(columns=['SalePrice'])
y = df_encoded['SalePrice']

# Materialise the split that was fixed above.
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# 🚀 Step 7: Hyperparameter Tuning
def optimize_model(model, param_grid, X=None, y=None, cv=3,
                   scoring='neg_mean_absolute_error'):
    """Grid-search ``param_grid`` for ``model`` and return the best estimator.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X: Training features. Defaults to the module-level ``X_train``.
        y: Training target. Defaults to the module-level ``y_train``.
        cv: Number of cross-validation folds (default 3).
        scoring: Scorer name passed to ``GridSearchCV``
            (default ``'neg_mean_absolute_error'``).

    Returns:
        ``best_estimator_`` of the fitted search.
    """
    # Fall back to the script's global training split so existing two-argument
    # calls behave exactly as before.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=cv,
        scoring=scoring,
        verbose=1,
        n_jobs=-1,  # run candidate fits on all available cores
    )
    grid_search.fit(X, y)
    print(f"Best Parameters for {model.__class__.__name__}: {grid_search.best_params_}")
    return grid_search.best_estimator_

# Tune the Random Forest: 2 x 3 x 2 = 12 candidate settings.
rf_params = {
    'n_estimators': [100, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
}
rf_base = RandomForestRegressor(random_state=42)
best_rf = optimize_model(rf_base, rf_params)

# Tune XGBoost: 2 x 2 x 2 = 8 candidate settings.
xgb_params = {
    'n_estimators': [100, 300],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 6],
}
xgb_base = XGBRegressor(random_state=42)
best_xgb = optimize_model(xgb_base, xgb_params)

# 🚀 Step 8: Model Training and Evaluation
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data and print the result.

    The errors are computed directly on whatever scale ``y_test`` is in; in
    this script the target was log1p-transformed, so the numbers are in log
    units rather than currency.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        Tuple ``(mae, rmse)``.
    """
    predictions = model.predict(X_test)

    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)

    label = type(model).__name__
    print(f"{label} - MAE: {mae:.4f}, RMSE: {rmse:.4f}")
    return mae, rmse

print("\n🎯 Evaluating Models:")
mae_rf, rmse_rf = evaluate_model(best_rf, X_test, y_test)
mae_xgb, rmse_xgb = evaluate_model(best_xgb, X_test, y_test)

# 🚀 Step 9: Ensemble Learning
# Simple unweighted average of the two models' predictions (log-price scale).
y_pred_ensemble = (best_rf.predict(X_test) + best_xgb.predict(X_test)) / 2
mae_ensemble = mean_absolute_error(y_test, y_pred_ensemble)
rmse_ensemble = np.sqrt(mean_squared_error(y_test, y_pred_ensemble))

print(f"\n🔥 Ensemble Model - MAE: {mae_ensemble:.4f}, RMSE: {rmse_ensemble:.4f}")

# 🚀 Step 10: Save the Best Model
# Persist whichever tuned model actually scored the lower held-out MAE instead
# of always saving XGBoost. Ties go to XGBoost, which was the model saved
# unconditionally before.
best_model = best_rf if mae_rf < mae_xgb else best_xgb
print(f"\n💾 Saving best single model: {best_model.__class__.__name__}")
joblib.dump(best_model, "optimized_house_price_model.pkl")

# 🚀 Step 11: Make Predictions on New Data
# Uses the first test row as a stand-in for a new house; expm1 undoes the
# log1p applied to SalePrice so the printed value is on the original scale.
new_house = X_test.iloc[0:1]
predicted_price = np.expm1(best_model.predict(new_house)[0])
print(f"\n🏡 Predicted House Price: ${predicted_price:.2f}")


Fitting 3 folds for each of 12 candidates, totalling 36 fits
