<a href="https://colab.research.google.com/github/ahmadyadgari/house_price_predictor/blob/main/house_price_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🏠 House-Price Predictor Notebook

In [1]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# 2. Imports & Settings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Show all columns in DataFrame outputs
pd.set_option('display.max_columns', None)

In [3]:
# 3. Load Dataset
data_dir = '/content/drive/MyDrive/house-prices/'
train_df = pd.read_csv(f'{data_dir}train.csv')
test_df  = pd.read_csv(f'{data_dir}test.csv')

print(f"Training set: {train_df.shape}, Test set: {test_df.shape}")
train_df.head()

Training set: (17000, 9), Test set: (3000, 9)


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [4]:
# 4.1 Summary statistics
display(train_df.describe())

# 4.2 Missing-value report (top 10)
missing = train_df.isnull().sum().sort_values(ascending=False)
display(missing[missing > 0].head(10))

# 4.3 SalePrice distribution
plt.figure(figsize=(8,6))
sns.histplot(train_df['SalePrice'], kde=True)
plt.title('SalePrice Distribution')
plt.xlabel('SalePrice')
plt.show()

# 4.4 Correlation heatmap for top features
corr = train_df.corr()['SalePrice'].abs().sort_values(ascending=False).head(10)
plt.figure(figsize=(6,4))
sns.barplot(x=corr.values[1:], y=corr.index[1:])
plt.title('Top 10 Features Correlated with SalePrice')
plt.show()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


Unnamed: 0,0


KeyError: 'SalePrice'

<Figure size 800x600 with 0 Axes>

In [5]:
# Work on a copy
df = train_df.copy()

# 5.1 Handle missing data
df['LotFrontage'].fillna(df['LotFrontage'].median(), inplace=True)
for col in ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']:
    df[col].fillna('None', inplace=True)

# Drop any remaining rows with missing values (or apply custom imputations)
df.dropna(inplace=True)

# 5.2 Encode categorical variables
df = pd.get_dummies(df, drop_first=True)

# 5.3 Feature creation
df['HouseAge'] = df['YrSold'] - df['YearBuilt']

# 5.4 Define X and y, then split
X = df.drop(['SalePrice','Id'], axis=1)
y = df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")


KeyError: 'LotFrontage'

In [6]:
# 6.1 Initialize and train
lr = LinearRegression()
lr.fit(X_train, y_train)

# 6.2 Predict on test set
y_pred_lr = lr.predict(X_test)

# 6.3 Evaluate
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
r2_lr   = r2_score(y_test, y_pred_lr)

print(f"Linear Regression → RMSE: {rmse_lr:.2f},  R²: {r2_lr:.3f}")

NameError: name 'X_train' is not defined

In [7]:
# 7.1 Initialize (100 trees, can tune later)
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)
# 7.2 Train
rf.fit(X_train, y_train)

# 7.3 Predict
y_pred_rf = rf.predict(X_test)

# 7.4 Evaluate
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf   = r2_score(y_test, y_pred_rf)

print(f"Random Forest → RMSE: {rmse_rf:.2f},  R²: {r2_rf:.3f}")

NameError: name 'X_train' is not defined

In [8]:
# 8.1 Metrics comparison
results = pd.DataFrame({
    'Model': ['Linear Regression','Random Forest'],
    'RMSE': [rmse_lr, rmse_rf],
    'R2':   [r2_lr,   r2_rf]
})
display(results)

# 8.2 Feature importances from RF
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top20 = importances.sort_values(ascending=False).head(20)

plt.figure(figsize=(10,6))
sns.barplot(x=top20, y=top20.index)
plt.title('Top 20 Random Forest Feature Importances')
plt.xlabel('Importance')
plt.show()

NameError: name 'rmse_lr' is not defined