In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Redfin's public ZIP-code market tracker: a gzip-compressed, tab-separated dump.
# pandas infers the gzip compression from the ".gz" suffix of the URL.
redfin_url = 'https://redfin-public-data.s3.us-west-2.amazonaws.com/redfin_market_tracker/zip_code_market_tracker.tsv000.gz'

# Download and parse the whole file in one go (this is the slow step of the notebook).
df_redfin = pd.read_csv(
    filepath_or_buffer=redfin_url,
    delimiter="\t",
)

In [5]:
# Keep only ZIP-level rows and pull the five-digit code out of the `region` column.
# Column names are lower-cased first so the rest of the notebook can use one casing.
df_redfin.columns = df_redfin.columns.str.lower()

# na=False: a missing `region` would otherwise put NaN in the mask, and pandas
# refuses to index with a boolean mask that contains NaN.
is_zip_row = df_redfin["region"].str.contains("Zip Code:", na=False)

# .copy(): make the filtered frame independent so the column assignment below
# does not raise SettingWithCopyWarning (or silently write to a temporary).
df_redfin = df_redfin[is_zip_row].copy()

# NOTE: the ZIP is stored as a number, so leading zeros are not preserved
# (e.g. "02134" becomes 2134). Rows whose region has no 5-digit code become NaN.
df_redfin["ZIP"] = pd.to_numeric(
    df_redfin["region"].str.extract(r"Zip Code:\s*(\d{5})")[0],
    errors="coerce",
)

In [6]:
# Source column -> modelling name. Only the columns listed here survive;
# insertion order defines the column order of df_model_ready.
selected_cols = dict(
    ZIP="zip_code",
    state_code="state",
    property_type="property_type",
    median_list_price="list_price",
    median_ppsf="price_per_sqft",
    median_list_ppsf="list_price_per_sqft",
    inventory="active_inventory",
    new_listings="new_listings",
    pending_sales="pending_sales",
    avg_sale_to_list="sale_to_list_ratio",
    sold_above_list="percent_above_list",
    off_market_in_two_weeks="off_market_2w",
    homes_sold="homes_sold",
    median_sale_price="predicted_price",
    median_dom="predicted_days_on_market",
)

# Select the source columns, then apply the new names in a single pass.
df_model_ready = df_redfin.loc[:, list(selected_cols)].rename(columns=selected_cols)

# Everything except the identifier columns is coerced to a numeric dtype;
# values that cannot be parsed become NaN.
columns_to_float = [
    name
    for name in df_model_ready.columns
    if name not in ("zip_code", "state", "property_type")
]
df_model_ready[columns_to_float] = df_model_ready[columns_to_float].apply(
    lambda series: pd.to_numeric(series, errors="coerce")
)


In [7]:
# Attach a state-level crime rate (crimes per 100k covered population) to every Redfin row.
crime_path = "Crimes_Merged_Cleaned.xlsx"

# header=4 takes the column labels from the fifth row of the sheet.
# The unnamed second column is used as the covered population.
# .copy() keeps the later column assignments from raising SettingWithCopyWarning.
df_crime = pd.read_excel(crime_path, header=4)
df_crime = df_crime[["State", "Unnamed: 1", "Total"]].copy()
df_crime.columns = ["state", "Population_Covered", "Total"]

# state names to abbreviations
state_abbrev_map = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA', 'Rhode Island': 'RI', 'South Carolina': 'SC',
    'South Dakota': 'SD', 'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virginia': 'VA', 'Washington': 'WA',
    'West Virginia': 'WV', 'Wisconsin': 'WI', 'Wyoming': 'WY'
}

# Strip stray whitespace BEFORE the lookup so a padded name such as "Texas " still maps.
df_crime["state"] = df_crime["state"].str.strip().map(state_abbrev_map)

# Every label that is not in the map (the "Total" row, blank rows, notes) is NaN
# after mapping, so comparing against "Total" at this point would remove nothing.
# Drop those rows explicitly: NaN keys on both sides of a merge match each other
# and would otherwise attach summary rows to Redfin rows that have no state.
df_crime = df_crime.dropna(subset=["state"]).copy()

df_crime["Population_Covered"] = pd.to_numeric(df_crime["Population_Covered"], errors="coerce")
df_crime["Total"] = pd.to_numeric(df_crime["Total"], errors="coerce")

# scale crime rate
df_crime["crimes_per_100k"] = (df_crime["Total"] / df_crime["Population_Covered"]) * 100_000

# merge crime onto redfin
# The crime keys come straight from state_abbrev_map (already clean, upper-case),
# so only the Redfin side needs normalising.
df_model_ready["state"] = df_model_ready["state"].str.strip().str.upper()
df_model_ready = (
    df_model_ready
    # Makes the cell safe to re-run: without this a second run would produce
    # crimes_per_100k_x / crimes_per_100k_y instead of a single column.
    .drop(columns="crimes_per_100k", errors="ignore")
    .merge(
        df_crime[["state", "crimes_per_100k"]],
        on="state",
        how="left",
        # Raises pandas.errors.MergeError if a state appears more than once in
        # the crime table, instead of silently duplicating Redfin rows.
        validate="many_to_one",
    )
)

In [8]:
# Missing values are filled in two passes per column:
#   1. the mean of the same (zip_code, property_type) group,
#   2. the column-wide mean for anything still missing (e.g. groups with no data at all).
group_cols = ["zip_code", "property_type"]
numeric_cols = [
    col
    for col in df_model_ready.columns
    if col not in ("zip_code", "state", "property_type")
]

for col_name in numeric_cols:
    # Both means are taken from the column as it is before any filling.
    global_mean = df_model_ready[col_name].mean()
    group_mean = df_model_ready.groupby(group_cols)[col_name].transform("mean")

    df_model_ready[col_name] = (
        df_model_ready[col_name]
        .fillna(group_mean)
        .fillna(global_mean)
    )


In [None]:
# Give the model columns their final export names.
# NOTE: the price and days-on-market columns originate from Redfin's median_*
# fields (see the selected_cols mapping), even though the export names say "avg".
# Each source name appears once: a repeated key in a dict literal is silently
# collapsed, which hides typos. The result is reassigned rather than using
# inplace=True so the cell produces an explicit new frame.
df_model_ready = df_model_ready.rename(columns={
    'predicted_price': 'avg_price_sold',
    'predicted_days_on_market': 'avg_days_on_market',
    'list_price': 'avg_listed_price',
    'price_per_sqft': 'avg_price_per_sqft',
    'list_price_per_sqft': 'avg_list_price_per_sqft',
})

# Columns written to the CSV, in output order. `state` is intentionally absent
# from this list, so it is not exported.
cols_to_keep = [
    'zip_code',
    'property_type',
    'active_inventory',
    'new_listings',
    'pending_sales',
    'homes_sold',
    'sale_to_list_ratio',
    'percent_above_list',
    'avg_days_on_market',
    'crimes_per_100k',
    'off_market_2w',
    'avg_list_price_per_sqft',
    'avg_listed_price',
    'avg_price_per_sqft',
    'avg_price_sold'
]


df_model_ready = df_model_ready[cols_to_keep]

# export to CSV (written to the current working directory, without the index column)
output_path = os.path.join(os.getcwd(), "redfin_with_crime.csv")
df_model_ready.to_csv(output_path, index=False)