# Tooele County Housing Affordability Forecast
### A Capstone Data Analysis
#### Author: Adam F.
#### Repository: [GitHub - capstone_project_1](https://github.com/adamichaelf/capstone_project_1)
---

## Table of Contents
1. [Data Acquisition & Cleaning](#Data-Acquisition-&-Cleaning)
2. [Exploratory Data Analysis (EDA)](#EDA)
3. [Modeling & Forecasting](#Modeling-&-Forecasting)
4. [Affordability Analysis](#Affordability-Analysis)
5. [Summary & Decision Impact](#Summary-&-Decision-Impact)
---

## Data Acquisition & Cleaning <a id='Data-Acquisition-&-Cleaning'></a>

In [None]:
# Load necessary libraries
import os
import warnings
from datetime import datetime

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

warnings.filterwarnings('ignore')

### Load and Clean Data

In [None]:
# Placeholder: Load cleaned `housing_df`, `income_df`, and `rent_df`
# Ensure they have columns: 'date', 'value', 'year' as needed
# Your real data loading logic goes here

In [None]:
# INCOME DATA
# Fetch median household income (ACS 5-year, variable B19013_001E) for
# state 49 / county 045 from the Census Bureau API, one request per year.

# NOTE(review): the literal fallback below is a credential committed to source
# control. Prefer exporting CENSUS_API_KEY in the environment and rotating
# this key; the fallback is kept only so the notebook keeps running as before.
CENSUS_API_KEY = os.environ.get(
    "CENSUS_API_KEY", "de53f7491b5574a451233a6d04721c4176263ab4"
)
years = range(2015, 2023)  # 2015-2022 inclusive; 2023 ACS not fully available yet
results = []

for year in years:
    url = (
        f"https://api.census.gov/data/{year}/acs/acs5"
        f"?get=NAME,B19013_001E&for=county:045&in=state:49&key={CENSUS_API_KEY}"
    )
    try:
        # A timeout keeps one stalled request from hanging the whole cell.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Request failed for {year}: {e}")
        continue

    if response.status_code != 200:
        print(f"Failed for {year}: {response.status_code}")
        continue

    try:
        data = response.json()
        # Second row, second field: the B19013_001E value in the order requested
        # by `get=NAME,B19013_001E` (row 0 is presumably the header - confirm).
        value = data[1][1]
        results.append({"year": year, "median_income": int(value)})
    except (ValueError, TypeError, IndexError, KeyError) as e:
        # Malformed JSON, unexpected payload shape, or a non-integer value.
        print(f"Error parsing {year}: {e}")

# Convert to DataFrame. Naming the columns explicitly keeps the sort and the
# monthly-income step working even when every request failed (empty results).
income_df = pd.DataFrame(results, columns=["year", "median_income"])
income_df = income_df.sort_values("year")
income_df['monthly_income'] = income_df["median_income"] / 12
income_df.tail()

In [None]:
# Housing Prices – Zillow ZHVI (Zillow Home Value Index)

# Download the Kaggle dataset and read its ZHVI table
path = kagglehub.dataset_download("robikscube/zillow-home-value-index")
zhvi_path = os.path.join(path, "ZHVI.csv")  # Adjust if the filename is different
zhvi_df = pd.read_csv(zhvi_path)

# Give the first column an explicit 'date' name
zhvi_df = zhvi_df.rename(columns={zhvi_df.columns[0]: "date"})

# Keep only the date and the Utah series
zhvi_ut_df = zhvi_df.loc[:, ['date', 'Utah']].copy()

# Parse the dates and snap each one to the first day of its month
zhvi_ut_df = zhvi_ut_df.assign(
    date=lambda frame: pd.to_datetime(frame['date']).dt.to_period('M').dt.to_timestamp()
)
zhvi_ut_df.head()

In [None]:
# Join Utah ZHVI data with Tooele and Salt Lake County Data

# County data is reported on the last day of the month and State data on the
# first. To align them, county dates are snapped to the first of their month
# and then moved forward one month (e.g. 2015-01-31 -> 2015-01-01 -> 2015-02-01),
# which is the same as shifting the month-end stamp forward by one day.

zhvi_county_df = pd.read_csv("data/zhvi_county_data.csv")
# Format 'date' to datetime and normalize dates to first of the month
zhvi_county_df['date'] = pd.to_datetime(zhvi_county_df['date']).dt.to_period('M').dt.to_timestamp()
# Shift month forward by 1
zhvi_county_df['date'] = zhvi_county_df['date'] + pd.DateOffset(months=1)

# Outer join keeps dates present in only one of the two sources (as NaN on the other side)
zhvi_all_df = pd.merge(zhvi_ut_df, zhvi_county_df, on='date', how='outer')

# Take an explicit copy: later cells add columns to housing_df, and writing into
# a boolean-filtered slice of zhvi_all_df is ambiguous (SettingWithCopy).
housing_df = zhvi_all_df[zhvi_all_df['date'] >= '2015-01-01'].copy()

# housing_df.info()
housing_df.head(10)
# housing_df.tail(10)

In [None]:
# Download and load the Zillow Rent Index dataset
path = kagglehub.dataset_download("zillow/rent-index")
rent_index_path = os.path.join(path, "price.csv")
rent_index_df = pd.read_csv(rent_index_path)

# Filter for Tooele County; copy so the edits below act on an independent
# frame rather than a slice of the full table
rent_index_df = rent_index_df[rent_index_df['County'] == "Tooele"].copy()

# Drop unneeded columns
columns_to_drop = ["City Code", "Metro", "State", "Population Rank"]
rent_index_df = rent_index_df.drop(columns=columns_to_drop)

# Index by city, then transpose so each city is a column and each remaining
# source column (the period columns, plus anything not dropped above) is a row
rent_index_df = rent_index_df.set_index('City')
rent_df1 = rent_index_df.T.copy()

# Parse the row labels as dates; labels that are not dates become NaT
rent_df1.index = pd.to_datetime(rent_df1.index, errors='coerce')

# Remove the NaT rows (non-date labels such as the retained 'County' column)
rent_df1 = rent_df1[rent_df1.index.notnull()]

# A surviving text column leaves the transposed frame as object dtype; coerce
# to numbers so the row-wise mean below is computed on numeric data.
# NOTE(review): assumes price.csv period columns hold numeric rents - confirm.
rent_df1 = rent_df1.apply(pd.to_numeric, errors='coerce')

# Add a County mean summary column (mean across the city columns)
rent_df1['Tooele County'] = rent_df1.mean(axis=1)

# Drop periods where no city had a value
rent_df1 = rent_df1.dropna(subset=["Tooele County"])

rent_df1 = rent_df1.reset_index().rename(columns={'index': 'date'})
rent_df1.index.name = None
# Keep only 2015-01 through 2016-12 from this source
rent_df1 = rent_df1[(rent_df1['date'] >= '2015-01-01') & (rent_df1['date'] < '2017-01-01')]

rent_df1.tail()

In [None]:
# Read the county-level ZORI extract from the 'data' directory
csv_path = os.path.join("data", "county_zori_data.csv")
rent_index_df2 = pd.read_csv(csv_path)

# Keep only the Tooele County row(s)
is_tooele = rent_index_df2['RegionName'] == "Tooele County"
rent_index_df2 = rent_index_df2[is_tooele]

# Remove the descriptive columns so that only RegionName and the period columns remain
columns_to_drop = ["RegionID", "RegionType", "StateName", "State", "Metro", "SizeRank", "StateCodeFIPS", "MunicipalCodeFIPS"]
rent_index_df2 = rent_index_df2.drop(columns=columns_to_drop)

# Index by region name, then transpose: one row per period, one column per region
rent_index_df2 = rent_index_df2.set_index('RegionName')
rent_df2 = rent_index_df2.T.copy()

# Parse the row labels as dates and discard any label that is not a date
rent_df2.index = pd.to_datetime(rent_df2.index, errors='coerce')
rent_df2 = rent_df2[rent_df2.index.notnull()]

# Discard periods with no Tooele County value
rent_df2 = rent_df2.dropna(subset=["Tooele County"])

# Move the dates out of the index into a 'date' column
rent_df2 = rent_df2.reset_index().rename(columns={'index': 'date'})
rent_df2.index.name = None

# Snap each date to the first of its month, then move it forward one month
rent_df2['date'] = (
    pd.to_datetime(rent_df2['date']).dt.to_period('M').dt.to_timestamp()
    + pd.DateOffset(months=1)
)

# Keep January 2015 onward
rent_df2 = rent_df2[rent_df2['date'] >= '2015-01-01']

rent_df2.head()

In [None]:
# Build one continuous monthly Tooele County rent series from the two sources.

# Select just the date and Tooele County columns
df1 = rent_df1[['date', 'Tooele County']].copy()
df2 = rent_df2[['date', 'Tooele County']].copy()

# Concatenate the two DataFrames (df1 first, df2 second)
combined_df = pd.concat([df1, df2])

# Set 'date' as the index
combined_df.set_index('date', inplace=True)

# If both sources report the same month, keep the later-listed one (df2).
# reindex() below raises on duplicate index labels, so this must happen first.
combined_df = combined_df[~combined_df.index.duplicated(keep='last')].sort_index()

# Create a complete monthly date range (month-start frequency)
full_index = pd.date_range(start=combined_df.index.min(), end=combined_df.index.max(), freq='MS')

# Reindex onto the full range; months with no observation become NaN
combined_df = combined_df.reindex(full_index)
combined_df.index.name = 'date'

# No gap filling is applied here: missing months stay NaN.
# To fill them instead, e.g.:
# combined_df['Tooele County'] = combined_df['Tooele County'].interpolate(method='linear', limit_direction='both')

# Move the dates back into a 'date' column
rent_df = combined_df.reset_index()

rent_df.tail(50)

## Exploratory Data Analysis (EDA) <a id='EDA'></a>

In [None]:
# Line plots: Housing and Income trends over time
# Uses the frames as built in the Data Acquisition cells: housing_df has a
# 'date' column plus one price column per region; income_df has 'year' and
# 'median_income'.

# Prefer a Tooele County price column; fall back to the statewide 'Utah' series.
# NOTE(review): county column names come from data/zhvi_county_data.csv, which
# is not visible here - confirm the matched column is the intended one.
price_col = next((c for c in housing_df.columns if 'tooele' in str(c).lower()), 'Utah')

# Derive the calendar year on a temporary frame so housing_df itself is untouched
housing_plot_df = housing_df.assign(year=housing_df['date'].dt.year)

plt.figure(figsize=(12, 6))
# Several monthly rows share each year; seaborn aggregates them per x value
sns.lineplot(data=housing_plot_df, x='year', y=price_col, label='Housing Prices')
sns.lineplot(data=income_df, x='year', y='median_income', label='Income')
plt.title('Historical Trends in Housing Prices and Income')
plt.legend(); plt.grid(); plt.show()

In [None]:
# Optional: YoY percentage change
# Uses the frames as built in the Data Acquisition cells.

# Prefer a Tooele County price column; fall back to the statewide 'Utah' series.
# NOTE(review): county column names come from data/zhvi_county_data.csv, which
# is not visible here - confirm the matched column is the intended one.
price_col = next((c for c in housing_df.columns if 'tooele' in str(c).lower()), 'Utah')

# Work on a chronologically sorted, independent frame before adding a column
housing_df = housing_df.sort_values('date')
# housing_df has one row per month, so year-over-year means a 12-row lag.
# NOTE(review): assumes no months are missing from housing_df - confirm.
housing_df['pct_change'] = housing_df[price_col].pct_change(periods=12) * 100

# income_df has one row per year, so a one-row lag is already year-over-year
income_df['pct_change'] = income_df['median_income'].pct_change() * 100

## Modeling & Forecasting <a id='Modeling-&-Forecasting'></a>

In [None]:
# Forecast 2025 using linear regression (value ~ calendar year)
# Uses the frames as built in the Data Acquisition cells.

# Prefer a Tooele County price column; fall back to the statewide 'Utah' series.
# NOTE(review): county column names come from data/zhvi_county_data.csv, which
# is not visible here - confirm the matched column is the intended one.
price_col = next((c for c in housing_df.columns if 'tooele' in str(c).lower()), 'Utah')

# Housing: derive the year from 'date' and drop rows without a price, since
# the outer merge can leave NaNs and LinearRegression rejects them
housing_fit_df = housing_df.assign(year=housing_df['date'].dt.year)[['year', price_col]].dropna()
X = housing_fit_df[['year']]
y = housing_fit_df[price_col]
model = LinearRegression().fit(X, y)
# Predict with the same column name used for fitting
pred_2025 = model.predict(pd.DataFrame({'year': [2025]}))
housing_forecast = pred_2025[0]

# Income: same approach on the annual median household income
income_fit_df = income_df[['year', 'median_income']].dropna()
income_model = LinearRegression().fit(income_fit_df[['year']], income_fit_df['median_income'])
income_forecast = income_model.predict(pd.DataFrame({'year': [2025]}))[0]

## Affordability Analysis <a id='Affordability-Analysis'></a>

In [None]:
# Affordability for 2025: forecast housing value relative to forecast income.
# Requires housing_forecast and income_forecast to be defined by an earlier cell.
afford_index = housing_forecast / income_forecast
# Same ratio expressed as a percentage
afford_pct = 100 * afford_index

## Summary & Decision Impact <a id='Summary-&-Decision-Impact'></a>
### Key Findings:
- Forecasted 2025 Housing Price: $XXX
- Forecasted 2025 Income: $YYY
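- Affordability Index: Z% (forecast housing price as a percentage of forecast income; ⚠️ the common 30% threshold applies to housing *costs* as a share of income, so it is not directly comparable to this ratio)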
- Conclusion: Is affordability getting better or worse?
- Impact: Support decisions on housing programs, zoning, and subsidies.