In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_absolute_error,
    r2_score,
    root_mean_squared_error,
)

## 🔍 1️⃣ Exploratory Data Analysis (EDA)


#### 1. Load Data


In [None]:
# Read the training split into a DataFrame (only train.csv is loaded here).
# low_memory=False turns off chunked parsing so each column's dtype is
# inferred from the whole file.
df = pd.read_csv(
    "dataset/train.csv",
    low_memory=False,
)

In [None]:
frame_shape = df.shape  # (n_rows, n_columns)
print("Dataset shape:", frame_shape)

In [None]:
# Plain-text preview of the first five rows, printed beneath a label.
preview = df.head()
print("First rows", preview, sep="\n")

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None.
# Passing it to print() therefore emitted the summary first and then the
# label followed by a stray "None"; print the label, then call info().
print("Info")
df.info()

#### 2. Handle Missing Values


In [None]:
# Number of missing values per column, largest first.
missing_counts = df.isnull().sum()
print(missing_counts.sort_values(ascending=False))

In [None]:
# Numeric (int64/float64) columns: fill gaps with each column's own median.
numerical_features = df.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_block = df[numerical_features]
column_medians = numeric_block.median()
df[numerical_features] = numeric_block.fillna(column_medians)

In [None]:
# Text (object-dtype) columns: replace missing entries with the literal
# "MISSING" so that absence becomes a category of its own.
categorical_features = df.select_dtypes(include=["object"]).columns
categorical_block = df[categorical_features]
df[categorical_features] = categorical_block.fillna("MISSING")

In [None]:
# Re-check after imputation: missing values per column, largest first.
missing_counts = df.isnull().sum()
print(missing_counts.sort_values(ascending=False))

#### 3. Summary Statistics


In [None]:
# Descriptive statistics table; the bare last expression is rendered by the notebook.
summary_stats = df.describe()
summary_stats

#### 4. Visualizing Target Variable (`SalePrice`)


In [None]:
# Refresh the two column groups used by the plotting cells:
# numeric (int64/float64) columns first, then text (object) columns.
numerical_features, categorical_features = (
    df.select_dtypes(include=dtype_group).columns
    for dtype_group in (["int64", "float64"], ["object"])
)

###### Distribution of Numerical Features with SalePrice


In [None]:
# Histogram with KDE overlay for every numeric column, laid out on a 4-wide grid.
n_cols = 4
n_rows = len(numerical_features) // n_cols + 1  # room for a partially filled last row
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(20, len(numerical_features) * 1.2),
)
fig.suptitle("Distribution of Numerical Features with SalePrice", fontsize=16)

axes = axes.flatten()  # 2D grid -> 1D sequence, one slot per feature

for i, col in enumerate(numerical_features):
    ax = axes[i]
    sns.histplot(df[col], kde=True, ax=ax, color="royalblue")
    ax.set_title(f"{col} Distribution")

# Remove the grid slots that did not receive a plot
for unused_ax in axes[i + 1 :]:
    fig.delaxes(unused_ax)

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

###### SalePrice Distribution Across Categorical Features


In [None]:
# Box plot of SalePrice for every categorical column, laid out on a 3-wide grid.
fig, axes = plt.subplots(
    nrows=len(categorical_features) // 3 + 1,
    ncols=3,
    figsize=(20, len(categorical_features) * 1.5),
)
fig.suptitle("SalePrice Distribution Across Categorical Features", fontsize=16)

axes = axes.flatten()  # 2D grid -> 1D sequence, one slot per feature

for i, col in enumerate(categorical_features):
    sns.boxplot(x=df[col], y=df["SalePrice"], ax=axes[i])
    axes[i].set_title(f"{col} vs SalePrice")
    # Rotate the existing tick labels in place. Re-assigning them through
    # set_xticklabels(get_xticklabels(), ...) pins the label texts and can
    # trigger matplotlib's "fixed number of ticks" warning.
    axes[i].tick_params(axis="x", labelrotation=45)

# Hide empty subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

###### Scatterplots of Numerical Features vs. SalePrice


In [None]:
# Scatter plot of each numeric column against SalePrice on a 3-wide grid.
n_cols = 3
n_rows = len(numerical_features) // n_cols + 1  # room for a partially filled last row
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    figsize=(20, len(numerical_features) * 1.5),
)
fig.suptitle("Scatterplots of Numerical Features vs. SalePrice", fontsize=16)

axes = axes.flatten()  # 2D grid -> 1D sequence, one slot per feature

sale_price = df["SalePrice"]
for i, col in enumerate(numerical_features):
    ax = axes[i]
    sns.scatterplot(x=df[col], y=sale_price, alpha=0.6, color="darkorange", ax=ax)
    ax.set_title(f"{col} vs SalePrice")

# Remove the grid slots that did not receive a plot
for unused_ax in axes[i + 1 :]:
    fig.delaxes(unused_ax)

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

#### 5. Correlation Analysis


In [None]:
# Pairwise Pearson correlations (the DataFrame.corr default, spelled out here)
# between numeric columns; non-numeric columns are left out.
corr_matrix = df.corr(method="pearson", numeric_only=True)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(
    corr_matrix,
    cmap="coolwarm",
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Correlation Matrix Heatmap", fontsize=16)
plt.show()

In [None]:
# Sanity check before preprocessing: missing values per column, largest first.
missing_counts = df.isnull().sum()
print(missing_counts.sort_values(ascending=False))

## 📌 2️⃣ Data Preprocessing


#### 1. Feature Engineering


In [None]:
# Derived features, all computed from the existing columns in a single step.
df = df.assign(
    # 🏠 House age at sale time: older houses may be priced differently from newer ones.
    HouseAge=df["YrSold"] - df["YearBuilt"],
    # 🔨 Remodeling indicator: 1 when the remodel year differs from the build year, else 0.
    WasRemodeled=(df["YearRemodAdd"] != df["YearBuilt"]).astype(int),
    # 📏 Total square footage: basement + first floor + second floor.
    TotalSF=df["TotalBsmtSF"] + df["1stFlrSF"] + df["2ndFlrSF"],
    # Interaction term: lot area multiplied by the overall quality rating.
    LotQual=df["LotArea"] * df["OverallQual"],
)

#### 2. Encoding Categorical Features


In [None]:
# Some categorical variables have a natural order (e.g., quality ratings).
quality_features = ["ExterQual", "ExterCond", "HeatingQC", "KitchenQual"]

bsmt_features = [
    "BsmtQual",
    "BsmtCond",
    "FireplaceQu",
    "GarageQual",
    "GarageCond",
]

# Mapping pour chaque feature
quality_mapping = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
bsmt_mapping = {"MISSING": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}

# Appliquer le mapping
for col in quality_features:
    df[col] = df[col].map(quality_mapping)

for col in bsmt_features:
    df[col] = df[col].map(bsmt_mapping)

In [None]:
# The text columns that remain have no inherent ranking, so each one is
# expanded into indicator columns; drop_first=True omits the first level
# of every encoded column.
categorical_features = df.select_dtypes(include=["object"]).columns
df = pd.get_dummies(
    data=df,
    columns=categorical_features,
    drop_first=True,
)

## 📊 3️⃣ Feature Selection


In [None]:
# NOTE: this entire cell is commented out, so no feature selection is applied
# and `selected_features_xgb` is never defined when the notebook runs.
# NOTE(review): as written, the importance model would be fitted on the full
# frame (before the train/test split) — consider fitting on the training split
# only if this cell is re-enabled.
# ✅ Step 2: Feature Importance using XGBoost
# *******************************************

# X = df.drop(columns=["SalePrice"])  # Features
# y = df["SalePrice"]  # Target

# # Train an XGBoost model
# xgb = XGBRegressor(n_estimators=100, random_state=42)
# xgb.fit(X, y)

# # Get feature importance
# xgb_importance = xgb.feature_importances_
# xgb_importance_df = pd.DataFrame({"Feature": X.columns, "Importance": xgb_importance})
# xgb_importance_df.sort_values(by="Importance", ascending=False, inplace=True)

# # Plot top 20 important features
# plt.figure(figsize=(12, 6))
# sns.barplot(x="Importance", y="Feature", data=xgb_importance_df[:20])
# plt.title("Top 20 Important Features (XGBoost)")
# plt.show()

# # Drop low-importance features
# threshold = 0.005
# selected_features_xgb = xgb_importance_df[xgb_importance_df["Importance"] > threshold][
#     "Feature"
# ].tolist()

# print(f"Selected features (XGBoost): {selected_features_xgb}")

In [None]:
# NOTE: disabled — the frame keeps every column. Re-enabling this line requires
# `selected_features_xgb`, which is only produced by the (also commented-out)
# XGBoost feature-importance cell.
# ✅ Step 3: Keep only the selected features in the dataset
# ************************************************

# df = df[selected_features_xgb + ["SalePrice"]]

## 🚀 4️⃣ Model Training & Evaluation


#### 📌 Step 1: Train-Test Split


In [None]:
# Separate the target from the predictors.
target_column = "SalePrice"
y = df[target_column]
X = df.drop(columns=[target_column])
# NOTE(review): every remaining column is used as a predictor; if the CSV
# carries a row-identifier column it is still part of X — confirm and drop it.

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training Set: {X_train.shape}, Testing Set: {X_test.shape}")

#### 📌 Step 2: Standardization


In [None]:
# Learn the per-column mean/std on the training split only, then apply the
# same transform to both splits so that no test statistics leak into scaling.
# NOTE(review): the model cells visible in this notebook fit on the unscaled
# X_train / X_test, so these scaled arrays are not consumed there.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### 📌 Step 3: Choose & Train Models


In [None]:
# Baseline regressors with hand-picked (untuned) settings and a fixed seed.
models = {
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42
    ),
}

# Metric label -> scoring function, each called as metric(y_true, y_pred).
metric_functions = {
    "R² Score": r2_score,
    "MAE": mean_absolute_error,
    "RMSE": root_mean_squared_error,
}

# Fit every model on the training split and score it on the held-out split.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    y_pred = model.fit(X_train, y_train).predict(X_test)
    results[name] = {
        label: metric(y_test, y_pred) for label, metric in metric_functions.items()
    }

# One row per model, one column per metric.
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# Search space for XGBoost tuning: 4 * 3 * 3 * 3 * 3 = 324 combinations,
# each evaluated with 3-fold cross-validation (972 fits in total).
param_grid = {
    "n_estimators": [100, 250, 300, 500],
    "max_depth": [3, 5, 7],
    "learning_rate": [0.01, 0.05, 0.1],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.7, 0.8, 0.9],
}

# Exhaustive grid search on the training split, scored by R².
xgb_tuned = GridSearchCV(
    XGBRegressor(random_state=42), param_grid, cv=3, scoring="r2", n_jobs=-1, verbose=2
)
xgb_tuned.fit(X_train, y_train)

# Best parameters
print("🚀 Best XGBoost Parameters:", xgb_tuned.best_params_)

# GridSearchCV refits the winning configuration on the whole training split
# by default (refit=True), using a clone of the base estimator (same
# random_state=42). Reuse that fitted model instead of training an identical
# one a second time.
best_xgb = xgb_tuned.best_estimator_

# Predict on test set
y_pred_best_xgb = best_xgb.predict(X_test)

# Evaluate performance on the held-out split
xgb_r2_best = r2_score(y_test, y_pred_best_xgb)
xgb_mae_best = mean_absolute_error(y_test, y_pred_best_xgb)
xgb_rmse_best = root_mean_squared_error(y_test, y_pred_best_xgb)

print(
    f"✅ Optimized XGBoost Results: R² = {xgb_r2_best:.4f}, MAE = {xgb_mae_best:.4f}, RMSE = {xgb_rmse_best:.4f}"
)

In [None]:
# NOTE: this Random Forest grid search is fully commented out, so `best_rf`
# is never defined when the notebook runs.
# The grid below spans 5 * 3 * 3 * 3 * 2 = 270 combinations with cv=5.
# # Define parameter grid
# rf_param_grid = {
#     "n_estimators": [100, 120, 150, 180, 200],  # Number of trees
#     "max_depth": [10, 20, None],  # Tree depth
#     "min_samples_split": [2, 5, 10],  # Minimum samples to split
#     "min_samples_leaf": [1, 2, 4],  # Minimum samples per leaf
#     "bootstrap": [True, False],  # Bootstrapping technique
# }

# # Initialize RandomForestRegressor
# rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# # Grid search with cross-validation
# rf_grid_search = GridSearchCV(
#     rf_model,
#     rf_param_grid,
#     cv=5,
#     scoring="r2",
#     n_jobs=-1,
# )

# # Train the model
# rf_grid_search.fit(X_train, y_train)

# # Get the best model
# best_rf = rf_grid_search.best_estimator_

# # Print best parameters
# print("🔥 Best Random Forest Parameters:", rf_grid_search.best_params_)

In [None]:
# NOTE: disabled weighted-average ensemble (0.6 * XGBoost + 0.4 * Random Forest).
# Re-enabling it requires `best_rf` from the commented-out Random Forest grid
# search as well as `best_xgb` from the XGBoost tuning cell.
# NOTE(review): adjusting the blend weights by looking at test-set R² would
# tune on the held-out data — prefer choosing them on a validation split.
# rf_pred = best_rf.predict(X_test)
# xgb_pred = best_xgb.predict(X_test)

# # Weighted average (adjust weights based on R² performance)
# final_pred = (0.6 * xgb_pred) + (0.4 * rf_pred)

# # Compute metrics
# ensemble_r2 = r2_score(y_test, final_pred)
# ensemble_mae = mean_absolute_error(y_test, final_pred)
# ensemble_rmse = root_mean_squared_error(y_test, final_pred)

# # Print results
# print("⚡ Ensemble Model Performance:")
# print(f"   📌 R² Score: {ensemble_r2:.4f}")
# print(f"   📌 MAE: {ensemble_mae:.2f}")
# print(f"   📌 RMSE: {ensemble_rmse:.2f}")