In [46]:
import pandas as pd
import numpy as np
from scipy.stats import linregress


In [40]:
# Load the wide single-family-home price table and report how many rows it has.
sfh_prices = pd.read_csv("data/sfh_values_mid_tiers.csv")
print(len(sfh_prices.index))

30807


In [42]:
# Pre-processing: drop zipcodes that do not have a significant population.
twenty_ten_census_zip_populations = pd.read_csv("data/population_by_zip_2010.csv")

# Collapse the census rows into one total population per zipcode.
zip_populations = twenty_ten_census_zip_populations.groupby(
    'zipcode', as_index=False
)['population'].sum()

# Attach the population to each price row (default inner join on the zipcode).
prices_with_pops = pd.merge(
    sfh_prices,
    zip_populations,
    left_on="RegionName",
    right_on="zipcode",
)
print(len(prices_with_pops.index))

# Keep only zips with more than 5K people living there.
min_population = 5000
is_populated = prices_with_pops['population'] > min_population
prices_with_pops = prices_with_pops.loc[is_populated]
print(len(prices_with_pops.index))

# Downstream cells keep using the name sfh_prices for the filtered table.
sfh_prices = prices_with_pops

30208
19382


In [44]:
# Reshape from one wide row per zipcode into one (zipcode, date, price) row per
# observation. The price columns are the float64 ones; their headers are the dates.
date_cols = sfh_prices.select_dtypes(include='float64').columns.tolist()
region_cols = [
    'RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName',
    'State', 'City', 'Metro', 'CountyName',
]
melted_prices = sfh_prices.melt(
    id_vars=region_cols,
    value_vars=date_cols,
    var_name="date",
    value_name="price",
)
# The former column headers are strings; turn them into real timestamps.
melted_prices['date'] = pd.to_datetime(melted_prices['date'])


In [47]:
#add in the year distnace from the last date so we can perform regression on the price
max_date = melted_prices.date.max()
melted_prices['months_difference'] = (melted_prices.date-max_date)/(np.timedelta64(1, 'Y'))
zipcode_slopes = melted_prices.sort_values(["RegionName", "date"]).groupby("RegionName").apply(
            lambda x: linregress(x.months_difference, x['price'])[0]).round(3)
zipcode_stds = melted_prices.groupby("RegionName")['price'].std()

In [48]:
# Display the per-zipcode regression slopes (pandas truncates the long Series).
zipcode_slopes

RegionName
602           NaN
693           NaN
705           NaN
917           NaN
1001     4763.474
           ...   
99712         NaN
99801         NaN
99824         NaN
99833         NaN
99901         NaN
Length: 19382, dtype: float64

In [4]:
# sfh_prices['present'] = sfh_prices['2021-07-31']
# sfh_prices['1_months_ago'] = sfh_prices['2021-06-30']
# sfh_prices['3_months_ago'] = sfh_prices['2021-04-30']
# sfh_prices['6_months_ago'] = sfh_prices['2021-01-31']
# sfh_prices['1_year_ago'] = sfh_prices['2020-07-31']
# sfh_prices['3_years_ago'] = sfh_prices['2017-07-31']
# sfh_prices['5_years_ago'] = sfh_prices['2016-07-31']

In [5]:
# sfh_prices['1_month_diff'] = sfh_prices['present']/sfh_prices['1_months_ago']-1
# sfh_prices['3_month_diff'] = sfh_prices['present']/sfh_prices['3_months_ago']-1
# sfh_prices['6_month_diff'] = sfh_prices['present']/sfh_prices['6_months_ago']-1
# sfh_prices['1_year_diff'] = sfh_prices['present']/sfh_prices['1_year_ago']-1

# sfh_prices['weighted_growth'] = sfh_prices['1_month_diff']*12 + sfh_prices['3_month_diff']*4 + sfh_prices['6_month_diff']*2 + sfh_prices['1_year_diff']

In [6]:

# 2020 census data is reportedly being released to the public on Sept 16th (unconfirmed).
# Using the 2010 census data for now.




In [7]:
# Row counts of the price table and of the per-zipcode population table.
print(len(sfh_prices.index))
print(len(zip_populations.index))

# Small amount of data loss; assuming it's from really small/obscure zips.
print(len(prices_with_pops.index))

30807
33119
30208
