# Portland Housing Prices — Full EDA & Modeling Notebook
This notebook performs a full exploratory data analysis (EDA) and builds forecasting models for monthly median home prices in Portland, OR (2012–2025).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Apply seaborn's white-grid theme to every figure drawn below.
sns.set_theme(style="whitegrid")

# Load the raw price table; the path is relative to the working directory.
df = pd.read_csv("portland, or prices 2012-2022.csv")

df.head()

## Data Cleaning

In [None]:

# Normalise the raw export in one pass: give the two columns of interest short
# names, parse the period label into real timestamps, order the rows
# chronologically with a fresh 0..n-1 index, and keep only those two columns.
column_names = {
    "Month of Period End": "Date",
    "Measure Values": "median_price",
}

df = (
    df.rename(columns=column_names)
      .assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
      .sort_values("Date")
      .reset_index(drop=True)
      .loc[:, ["Date","median_price"]]
)
df.head()

## Basic Time Series Plot

In [None]:

# Line chart of the cleaned price series. The explicit figure/axes interface is
# used so that every label is attached to a known Axes object, and both axes
# are labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(df["Date"], df["median_price"])
ax.set_title("Median Home Price Over Time (Portland, OR)")
ax.set_xlabel("Date")
# NOTE(review): the unit/currency of median_price is not visible in this
# notebook -- add it to the label once confirmed.
ax.set_ylabel("Median price")
# Tilt the date labels so neighbouring ticks do not overlap.
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()


## Feature Engineering

In [None]:

# Calendar features extracted from the timestamp column.
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month

# Lagged prices: the value observed 1, 3 and 12 rows earlier
# (i.e. months, assuming the data holds one row per month -- TODO confirm).
df["lag1"] = df["median_price"].shift(1)
df["lag3"] = df["median_price"].shift(3)
df["lag12"] = df["median_price"].shift(12)

# Rolling means over the *previous* 3 / 12 rows. The series is shifted by one
# step before windowing so each window ends at t-1 and never contains the
# current row's median_price. An unshifted rolling mean includes the very
# value the models are asked to predict, which leaks the target into the
# features and inflates every downstream score.
prior_price = df["median_price"].shift(1)
df["roll3"] = prior_price.rolling(3).mean()
df["roll12"] = prior_price.rolling(12).mean()

# Row-over-row percentage change. It is derived from the current price, so it
# is descriptive only and must stay out of the model feature list.
df["pct_change"] = df["median_price"].pct_change()*100

# Lags and rolling windows leave NaN in the first rows; drop those warm-up
# rows and renumber so df_fe is fully populated.
df_fe = df.dropna().reset_index(drop=True)
df_fe.head()

## Correlation Heatmap

In [None]:

# Pick every numeric column regardless of bit width. Listing only
# 'float64'/'int64' silently drops numeric columns stored as other widths
# (for example int32, which newer pandas uses for .dt.year / .dt.month).
numeric_cols = df_fe.select_dtypes(include="number").columns.tolist()
corr = df_fe[numeric_cols].corr()

# Diverging palette centred on 0 so positive and negative correlations are
# visually symmetric.
plt.figure(figsize=(10,8))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Correlation Matrix")
plt.show()


## Train-Test Split (Time-Aware)

In [None]:

# Chronological hold-out: rows dated before split_year are used for fitting,
# rows from split_year onwards are reserved for evaluation, so no model is
# trained on observations that come after the ones it is scored on.
split_year = 2020
train = df_fe.loc[df_fe["Year"] < split_year]
test = df_fe.loc[df_fe["Year"] >= split_year]

# Predictor columns shared by every model below; the target is median_price.
feature_cols = ["Year","Month","lag1","lag3","lag12","roll3","roll12"]
X_train, y_train = train[feature_cols], train["median_price"]
X_test, y_test = test[feature_cols], test["median_price"]

X_train.shape, X_test.shape

## Baseline Model

In [None]:

# Persistence baseline: forecast each test row with its lag1 value (the price
# one row earlier). Any fitted model should beat these error figures.
y_pred_base = X_test["lag1"]
baseline_mae, baseline_rmse = (
    mean_absolute_error(y_test, y_pred_base),
    np.sqrt(mean_squared_error(y_test, y_pred_base)),
)
baseline_mae, baseline_rmse

## Linear Regression Model

In [None]:

# Fit a linear regression on the training window (fit() returns the estimator
# itself, so construction and fitting are chained), then score it on the
# held-out rows with MAE, RMSE and R^2.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

lr_mae, lr_rmse, lr_r2 = (
    mean_absolute_error(y_test, y_pred_lr),
    np.sqrt(mean_squared_error(y_test, y_pred_lr)),
    r2_score(y_test, y_pred_lr),
)

lr_mae, lr_rmse, lr_r2

## Random Forest Model

In [None]:

# 300-tree random forest with a fixed seed so repeated runs give the same
# numbers; evaluated on the held-out rows with MAE, RMSE and R^2.
rf = RandomForestRegressor(n_estimators=300, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

rf_mae, rf_rmse, rf_r2 = (
    mean_absolute_error(y_test, y_pred_rf),
    np.sqrt(mean_squared_error(y_test, y_pred_rf)),
    r2_score(y_test, y_pred_rf),
)

rf_mae, rf_rmse, rf_r2

## Gradient Boosting Model

In [None]:

# Gradient-boosted trees with library-default hyperparameters and a fixed
# seed; evaluated on the held-out rows with MAE, RMSE and R^2.
gb = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

gb_mae, gb_rmse, gb_r2 = (
    mean_absolute_error(y_test, y_pred_gb),
    np.sqrt(mean_squared_error(y_test, y_pred_gb)),
    r2_score(y_test, y_pred_gb),
)

gb_mae, gb_rmse, gb_r2

## Model Comparison Summary

In [None]:

# Side-by-side comparison of all four forecasters on the same held-out rows.
# The baseline's R^2 is computed from its own predictions instead of being left
# blank, so every row of the table reports the same three metrics.
results = pd.DataFrame({
    "Model":["Baseline","Linear Regression","Random Forest","Gradient Boosting"],
    "MAE":[baseline_mae, lr_mae, rf_mae, gb_mae],
    "RMSE":[baseline_rmse, lr_rmse, rf_rmse, gb_rmse],
    "R2":[r2_score(y_test, y_pred_base), lr_r2, rf_r2, gb_r2]
})
results