In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

cwd = os.getcwd()
#print(cwd)

In [16]:
# Load the CSV data into a Pandas DataFrame
data = pd.read_csv('clean_median_sale_price_v2_df.csv', index_col=0)
data.head()

Unnamed: 0,RegionName,StateName,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
0,"New York, NY",NY,34491100,33935000,33535000,33351700,34200000,35600000,37100000,38066700,...,54333300,52833300,51833300,51533300,50950000,51033300,51666700,53583300,55833300,57966700
1,"Los Angeles, CA",CA,39400000,39733300,40733300,43066700,45066700,46500000,46833300,47166700,...,85200000,84033300,82700000,81833300,81666700,82733300,83900000,85400000,87333300,88833300
2,"Chicago, IL",IL,16283300,15800000,16021800,17021800,18338500,19783300,20750000,21233300,...,28500000,27833300,27333300,27000000,26900000,27563300,28730000,30030000,31316700,31983300
3,"Dallas, TX",TX,15166700,15216700,15533300,16450000,17280000,17846700,18163300,18112500,...,39050000,38250000,37333300,36333300,36233300,36700000,37700000,38466700,39500000,40033300
4,"Houston, TX",TX,14983300,14857200,15040500,16015500,16741700,17490000,17765000,17865000,...,32100000,31600000,31066700,30600000,30433300,30633300,31333300,32166700,32866700,33366700


In [17]:
# Author: anderoos
# This script is meant to break up Zillow Data from the continental US into five distinct regions
# West, Midwest, Southwest, Southeast, Northeast
def split_regions(df):
    df['StateName'] = df['StateName'].astype('str')
    # Define categories
    northeast = ['ME', 'NH', 'VT', 'MA', 'RI', 'CT', 'NY', 'NJ', 'PA']
    southeast = ['DE', 'MD', 'VA', 'WV', 'NC', 'SC', 'GA', 'FL', 'AL', 'MS', 'LA', 'KY', 'TN']
    midwest = ['OH', 'IN', 'IL', 'MI', 'WI', 'MN', 'IA', 'MO', 'ND', 'SD', 'NE', 'KS']
    west = ['MT', 'ID', 'WY', 'CO', 'NM', 'AZ', 'UT', 'NV', 'CA', 'OR', 'WA', 'AK', 'HI']
    southwest = ['TX', 'OK', 'AR']
    # Filters state codes into separate dataframes
    ne_df = df[df['StateName'].isin(northeast)]
    se_df = df[df['StateName'].isin(southeast)]
    mw_df = df[df['StateName'].isin(midwest)]
    w_df = df[df['StateName'].isin(west)]
    sw_df = df[df['StateName'].isin(southwest)]
    return ne_df, se_df, mw_df, w_df, sw_df

In [18]:
# Split the data into regions
ne_df, se_df, mw_df, w_df, sw_df = split_regions(data)

In [19]:
ne_df.head()

Unnamed: 0,RegionName,StateName,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
0,"New York, NY",NY,34491100,33935000,33535000,33351700,34200000,35600000,37100000,38066700,...,54333300,52833300,51833300,51533300,50950000,51033300,51666700,53583300,55833300,57966700
6,"Philadelphia, PA",PA,20156700,19541700,19375000,19750000,20800000,21966700,22966700,23333300,...,32113300,31613300,31246700,30800000,30300000,30500000,31066700,32233300,33733300,35000000
9,"Boston, MA",MA,31566700,30833300,30533300,31600000,33000000,35066700,36111700,36778300,...,60666700,59666700,58633300,57666700,56000000,57141700,59391700,62741700,65266700,67016700
26,"Pittsburgh, PA",PA,12500000,12475000,12708300,13132300,13682300,14155700,14348300,14390000,...,21300000,20633300,20000000,19166700,18833300,19163300,20063300,21063300,22366700,23233400
37,"Providence, RI",RI,19525000,19075000,18833300,19166700,20000000,21280000,21980000,22230000,...,39966700,39850000,39183300,38716700,38183300,38183300,38850000,40050000,41716700,42716700


In [20]:
# Define a function to calculate seasonal prices
def calculate_seasonal_prices(df):
    # Extract year from date columns
    date_columns = df.columns[1:]
    for column in date_columns:
        df[column] = pd.to_datetime(df[column], format='%m/%d/%Y').dt.year
    
    # Create a dictionary to map months to seasons
    month_to_season = {
        1: 'Winter',
        2: 'Winter',
        3: 'Spring',
        4: 'Spring',
        5: 'Spring',
        6: 'Summer',
        7: 'Summer',
        8: 'Summer',
        9: 'Fall',
        10: 'Fall',
        11: 'Fall',
        12: 'Winter'
    }
    
    # Map months to seasons
    df['Season'] = df['Month'].map(month_to_season)
    
    # Group by state and season, then calculate the mean
    seasonal_prices = df.groupby(['StateName', 'Season']).mean().reset_index()
    
    return seasonal_prices


In [21]:
ne_df_months = ne_df.drop(columns=['RegionName', 'StateName'])
ne_df_months.head()

Unnamed: 0,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,9/30/2013,10/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
0,34491100,33935000,33535000,33351700,34200000,35600000,37100000,38066700,37808300,36908300,...,54333300,52833300,51833300,51533300,50950000,51033300,51666700,53583300,55833300,57966700
6,20156700,19541700,19375000,19750000,20800000,21966700,22966700,23333300,22500000,21500000,...,32113300,31613300,31246700,30800000,30300000,30500000,31066700,32233300,33733300,35000000
9,31566700,30833300,30533300,31600000,33000000,35066700,36111700,36778300,35711700,34650000,...,60666700,59666700,58633300,57666700,56000000,57141700,59391700,62741700,65266700,67016700
26,12500000,12475000,12708300,13132300,13682300,14155700,14348300,14390000,14016700,13566700,...,21300000,20633300,20000000,19166700,18833300,19163300,20063300,21063300,22366700,23233400
37,19525000,19075000,18833300,19166700,20000000,21280000,21980000,22230000,21656700,21123300,...,39966700,39850000,39183300,38716700,38183300,38183300,38850000,40050000,41716700,42716700


In [22]:
se_df_months = se_df.drop(columns=['RegionName', 'StateName'])
se_df_months.head()

Unnamed: 0,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,9/30/2013,10/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
5,32366700,32000000,32500000,34000000,35833300,37443900,38094900,37928200,36551000,35400000,...,48641700,47808300,47225000,46041700,46208300,47566700,50273300,52606700,54331700,54666700
7,16333300,16166700,16233300,16933300,17800000,18766700,19500000,19966700,19766700,19266700,...,41333300,41166700,40833300,40866700,41366700,42533300,43633300,44580000,45580000,45880000
8,14191800,14500000,15050000,16183300,17350000,18083300,18746700,18543300,17876700,17046700,...,36183300,35516700,35116700,34433300,34393300,34760000,35826700,36700000,37733300,38333300
17,12330000,12256700,12493300,12826700,13396700,13830000,14496700,14733300,14650000,14206700,...,36496700,35863300,35666700,34996700,35163300,35493300,36163300,36663300,37166700,37500000
19,23333300,22833300,22833300,23663300,24826700,25958300,26795000,26965000,26750000,26083300,...,34666700,33700000,32933300,32600000,32750000,33319300,34486000,36002700,37366700,38033300


In [23]:
mw_df_months = mw_df.drop(columns=['RegionName', 'StateName'])
mw_df_months.head()

Unnamed: 0,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,9/30/2013,10/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
2,16283300,15800000,16021800,17021800,18338500,19783300,20750000,21233300,20516700,19516700,...,28500000,27833300,27333300,27000000,26900000,27563300,28730000,30030000,31316700,31983300
13,11413300,11183300,11283300,11733300,12500000,13166700,13941700,14508300,14508300,14116700,...,24116700,23443400,22676700,21610000,21516700,21866500,23033200,23949800,25200000,25866700
15,16400000,16100000,16500000,17166700,18250000,19178400,20011700,20328400,20023200,19489900,...,34650000,33933300,33050000,32575000,32258400,32758400,33563300,34580000,35913300,36416700
20,13568300,13360000,13526700,13908300,14608300,15306700,15761700,15825000,15368300,14963300,...,23686900,23303600,22985800,22366700,22033300,21866700,22733300,23400000,24833300,25533300
27,13468300,13401700,13743300,14100000,14833300,15360000,15668300,15485000,14991700,14483300,...,24558300,24275000,23816700,23016700,22883300,23016700,24350000,25500000,26725000,27058300


In [24]:
w_df_months = w_df.drop(columns=['RegionName', 'StateName'])
w_df_months.head()

Unnamed: 0,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,9/30/2013,10/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
1,39400000,39733300,40733300,43066700,45066700,46500000,46833300,47166700,46866700,46533300,...,85200000,84033300,82700000,81833300,81666700,82733300,83900000,85400000,87333300,88833300
10,16266700,16366700,16766700,17430000,17996700,18493300,18833500,18833500,18770200,18733300,...,45248300,44296700,43163300,42233300,41966700,42466700,42866700,43466700,44300000,44666700
11,40583300,39850000,41166700,46500000,51666700,55500000,56500000,55833300,54166700,53133300,...,105433300,106266700,102266700,96333300,91983300,95316700,101316700,107333300,111000000,111666700
12,21233300,21400000,21833300,22600000,23400000,23966700,24466700,24666700,24833300,24866700,...,53333300,52000000,51250000,50416700,50916700,51166700,52166700,52850000,53350000,53516700
14,26771300,26688000,27248300,28915000,30381700,31250000,32250000,32225000,32163000,31496300,...,66333300,65166700,63666700,61833300,61233300,61966700,63633300,66233300,68300000,69466700


In [25]:
sw_df_months = sw_df.drop(columns=['RegionName', 'StateName'])
sw_df_months.head()

Unnamed: 0,1/31/2013,2/28/2013,3/31/2013,4/30/2013,5/31/2013,6/30/2013,7/31/2013,8/31/2013,9/30/2013,10/31/2013,...,10/31/2022,11/30/2022,12/31/2022,1/31/2023,2/28/2023,3/31/2023,4/30/2023,5/31/2023,6/30/2023,7/31/2023
3,15166700,15216700,15533300,16450000,17280000,17846700,18163300,18112500,17812500,17379200,...,39050000,38250000,37333300,36333300,36233300,36700000,37700000,38466700,39500000,40033300
4,14983300,14857200,15040500,16015500,16741700,17490000,17765000,17865000,17483300,17100000,...,32100000,31600000,31066700,30600000,30433300,30633300,31333300,32166700,32866700,33366700
23,14696700,14696700,14778300,15180000,15731700,16415000,16906700,16855000,16525000,16095000,...,32266700,31600000,31233300,30566700,30600000,30366700,30700000,31066700,31400000,31733300
28,19383300,19683300,20100000,21066700,21766700,22275000,22208300,22041700,21716700,21350000,...,49627700,48294400,46805800,45978100,45311400,45733300,46400000,47400000,48466700,48333300
40,12263300,12306400,12656400,12891400,13098300,13348300,13683300,13808300,13541700,13108300,...,23933300,23733300,22950000,22383300,22183300,22966700,23750000,24450000,24866700,25283300


In [26]:
# Calculate seasonal prices for each region
ne_seasonal_prices = calculate_seasonal_prices(ne_df_months)
se_seasonal_prices = calculate_seasonal_prices(se_df_months)
mw_seasonal_prices = calculate_seasonal_prices(mw_df_months)
w_seasonal_prices = calculate_seasonal_prices(w_df_months)
sw_seasonal_prices = calculate_seasonal_prices(sw_df_months)

ValueError: time data "33935000" doesn't match format "%m/%d/%Y", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.