# SIG720 Task 5D: Paris Housing Price Prediction

This notebook contains the complete solution for Task 5D using the **Paris Housing Price Prediction** dataset from Kaggle.

**Dataset Source:** [Paris Housing Price Prediction – Kaggle](https://www.kaggle.com/datasets/mssmartypants/paris-housing-price-prediction)

---

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import shap
import streamlit as st

In [None]:
# Read the Paris housing CSV into a DataFrame (the file must sit next to this notebook)
csv_path = 'ParisHousing.csv'
data = pd.read_csv(csv_path)
# Report (rows, columns), then show the first five records as the cell output
print(data.shape)
data.head()

## Data Preprocessing and EDA

In [None]:
# Check for null values and data types.
# DataFrame.info() prints each column's dtype and non-null count; a non-null
# count lower than the total number of rows means that column has missing values.
data.info()

In [None]:
# Summary statistics.
# With default arguments, describe() reports count, mean, std, min, quartiles
# and max for the numeric columns; the result is displayed as the cell output.
data.describe()

In [None]:
# Histogram of the target column (30 bins) with a kernel-density curve overlaid
price_ax = sns.histplot(data['price'], kde=True, bins=30)
price_ax.set_title('Housing Price Distribution')
plt.show()

In [None]:
# Pairwise correlations between the columns, drawn as an annotated heatmap
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Separate the target column from the predictors, then standardise the predictors
y = data['price']
X = data.drop('price', axis=1)
# The scaler learns its per-column mean/std from every row of X
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

## Model Development and Evaluation

In [None]:
# Train-test split
# Split the *unscaled* features so the scaler never sees the held-out rows:
# a StandardScaler fitted on the full dataset leaks test-set statistics
# (per-column mean/std) into the training features. random_state is fixed so
# the partition is reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply that same transform to
# the test rows. Both results are NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
# Candidate regressors; the stochastic ones share a fixed seed for reproducibility
linear_model = LinearRegression()
forest_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# Keyed by the display name printed alongside each model's scores
models = {
    'Linear Regression': linear_model,
    'Random Forest': forest_model,
    'XGBoost': xgb_model,
}

In [None]:
# Evaluate models using cross-validation, then once on the held-out test split
for name, model in models.items():
    # 5-fold CV on the training split only. cross_val_score fits clones of the
    # estimator, so `model` itself is still unfitted after this call.
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2')
    print(f"{name} - CV R2: {np.mean(scores):.4f} (+/- {np.std(scores):.4f})")

    # Fit on the whole training split and score on rows the model has never
    # seen; CV alone leaves the held-out split (and the imported metrics) unused.
    model.fit(X_train, y_train)
    test_pred = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
    test_r2 = r2_score(y_test, test_pred)
    print(f"{name} - Test MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")

## Feature Importance

In [None]:
# Train a fresh Random Forest on the training split and list each feature's importance
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
importances = rf.feature_importances_
# X_train is an array without column labels, so pair the scores with X's column names
for column_name, weight in zip(X.columns, importances):
    print(f"{column_name}: {weight:.4f}")

In [None]:
# SHAP values for interpretability
# X_train is a plain NumPy array (the scaler's output), so it carries no column
# names. Pass them explicitly; otherwise the beeswarm plot labels every feature
# by its positional index instead of its real name.
explainer = shap.Explainer(rf, X_train, feature_names=list(X.columns))
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values)

## Deployment

In [None]:
# Deployment is handled outside this notebook, in a separate Streamlit script (app.py).
# That script is provided separately and is not reproduced here.

---
**End of notebook**