In [2]:
# Wrangling
import pandas as pd
import numpy as np

# Reading
import json
import pickle

# Navigating
import os

# Import Data

## Real Estate Transactions

In [3]:
# folder navigation
print(os.path.abspath(os.curdir))

/Users/Chris/Documents/GitHub/SIADS691-Covid-Real-Estate


In [4]:
# Import 
with open('datasets/CA_data/counties_pull.txt') as project_file:    
    data = json.load(project_file)  

ca_geo_df = pd.json_normalize(data)

In [5]:
# CO06037 = LA ; CO06075 = SF
ca_geo_df[ca_geo_df.name.isin(['Los Angeles','San Francisco'])]

Unnamed: 0,geo_center_latitude,geo_center_longitude,geo_key,id,name,type
28,34.196398,-118.261862,CO06037,CO06037,Los Angeles,CO
51,37.727239,-123.032229,CO06075,CO06075,San Francisco,CO


In [36]:
# Build Dataframes from json files

# Initialize dataframe holder
ca_df_dict = {}

# Loop to build list of files for concatination
folders = ['Los Angeles','San Francisco','San Diego','San Bernardino','Sacramento','Riverside','Orange']
for folder in folders:
    print(f'Building {folder}')
    # Get directories 
    files = os.listdir("datasets/CA_data/"+folder)

    # Get json files in directory
    for county_f in files:
        with open(f'datasets/CA_data/'+folder + '/' + county_f) as project_file:
            try:
                data = json.load(project_file)
                df = pd.json_normalize(data) 

                # Concat dataframes
                if folder in ca_df_dict:
                    df2 = ca_df_dict[folder]
                    h_concat_df = pd.concat([df2, df], axis=0)
                    ca_df_dict[folder] = h_concat_df
                else:
                    ca_df_dict[folder] = df
            except:
                print(f'Failed: {project_file}')

print('Dictionary Created!')

Building Los Angeles
Building San Francisco
Building San Diego
Building San Bernardino
Building Sacramento
Building Riverside
Building Orange
Dictionary Created!


In [37]:
# Create dictionary for renaming
col_dict = {'sale.saleTransDate':'transaction_date','address.postal1':'zipcode','sale.amount.saleamt':'sale_price','lot.lotSize1':'lot_size','building.size.universalsize':'size','location.latitude':'latitude','location.longitude':'longitude','summary.propsubtype':'sub_type','summary.proptype':'type','summary.yearbuilt':'year_built','building.rooms.bathstotal':'bathrooms','building.rooms.beds':'bedrooms'}
# Create list for anti filter drop
col_drop_list = [ col for col in ca_df_dict['Orange'].columns if col not in col_dict.keys()]

In [42]:
# Clean the DataFrames
for data in ca_df_dict.keys():
    ca_df_dict[data].rename(columns = col_dict, inplace=True)
    ca_df_dict[data].drop(col_drop_list, inplace=True, axis=1, errors='ignore')
    ca_df_dict[data].reset_index(drop=True, inplace=True)
    ca_df_dict[data].dropna(subset=['transaction_date'], inplace=True) #

pandas.core.frame.DataFrame

In [43]:
# export cleaned dictionary of dataframe
with open("ca_df_dict.pkl","wb") as ca_dump:
    pickle.dump(ca_df_dict,ca_dump)
    print('done')

done


In [67]:
total_size = 0
for bc in ca_pickled.keys():
    total_size += ca_pickled[bc].shape[0]
print(f'Total length is {total_size}')

Total length is 434395


In [50]:
# Read pickle back in
ca_pickled = pd.read_pickle("datasets/CA_data/ca_df_dict.pkl")
#print('done')

### We need to isolate only the residential transactions of interest

In [59]:
ca_pickled['Los Angeles'].type.unique()

array(['SFR', 'LIGHT INDUSTRIAL', 'CONDOMINIUM', 'TRIPLEX', 'PUD',
       'APARTMENT', 'QUADRUPLEX', 'DUPLEX', 'RESIDENTIAL ACREAGE',
       'VACANT LAND (NEC)', 'SERVICE STATION', 'TYPE UNKNOWN',
       'OFFICE BUILDING', 'MOTEL', 'HOSPITAL', 'WAREHOUSE',
       'STORE BUILDING', 'AGRICULTURAL (NEC)', 'STORES & OFFICES',
       'SUPERMARKET', 'COMMERCIAL ACREAGE', 'ANIMAL HOSPITAL/VET',
       'PARKING STRUCTURE', nan, 'RESTAURANT BUILDING', 'MOBILE HOME',
       'INDUSTRIAL (NEC)', 'COMMON AREA', 'INDUSTRIAL ACREAGE',
       'MEDICAL BUILDING', 'SHOPPING CENTER', 'FARMS', 'RELIGIOUS',
       'PUBLIC SCHOOL', 'CLUB', 'FUNERAL HOME', 'RESORT HOTEL',
       'MULTI FAMILY DWELLING', 'ORCHARD', 'NURSERY/HORTICULTURE',
       'DEPARTMENT STORE', 'TRANSPORT FACILITY', 'PUBLIC (NEC)',
       'HEAVY INDUSTRIAL', 'AUTO SALES', 'THEATER', 'FOOD PROCESSING',
       'MINI WAREHOUSE', 'COOPERATIVE', 'COMMUNICATION FACILITY',
       'GOLF COURSE', 'FINANCIAL BUILDING', 'MOBILE HOME PARK',
       'L

In [61]:
# Isolating SFR
residential = ['APARTMENT', 'CONDOMINIUM','SFR', 'DUPLEX','TOWNHOUSE/ROWHOUSE','QUADRUPLEX','TRIPLEX']
for data in ca_df_dict.keys():
    ca_pickled[data] = ca_pickled[data][ca_pickled[data].type.isin(residential)]

In [62]:
ca_pickled['Los Angeles'].type.unique()

array(['SFR', 'CONDOMINIUM', 'TRIPLEX', 'APARTMENT', 'QUADRUPLEX',
       'DUPLEX'], dtype=object)

## Income 

In [11]:
# Federal income data reported 2019
income_data = pd.read_excel("datasets/income/est19-all_income.xls", header=3)[['Postal Code', 'Name','Median Household Income']]

# Select relevant income data
income_data_CA_NY = income_data[(income_data['Postal Code'].isin(['CA'])) & income_data['Name'].str.contains('County')]

# Select relevant columns
median_income_df = income_data_CA_NY[['Postal Code', 'Name','Median Household Income']]
median_income_df.rename(columns={'Name':'County','Postal Code':'State ABV'}, inplace=True)

# Isolate each county's Area Median Income (AMI).
ca_median_income_df = median_income_df[median_income_df.County.isin(['San Francisco County','Los Angeles County','San Diego County','San Bernardino County','Sacramento County','Riverside County','Orange County'])]

ca_median_income_df

Unnamed: 0,State ABV,County,Median Household Income
210,CA,Los Angeles County,72721
221,CA,Orange County,95761
224,CA,Riverside County,72905
225,CA,Sacramento County,71891
227,CA,San Bernardino County,67398
228,CA,San Diego County,83576
229,CA,San Francisco County,121795


## Population

In [12]:
# Need to import population for Covid factorization 
# (Daily infection / Population) -> DEFAULT TO NYT
# (Daily death / Population) -> DEFAULT TO NYT

# Density for filtering on the "urban factor" & causal county comparison 
# https://worldpopulationreview.com/us-counties/states/ca
# https://worldpopulationreview.com/us-counties/states/ny
pop_density = pd.read_csv("datasets/population/ca_density_pop_by_county.csv")
pop_density['state'] = 'California'

# Rename for Title Caps
pop_density.rename(columns={'CTYNAME':'City Name','popDensity':'Pop Density','pop2021': 'Pop 2021','state':'State'}, inplace=True)

pop_density[pop_density['City Name'].isin(['San Francisco County','Los Angeles County','San Diego County','San Bernardino County','Sacramento County','Riverside County','Orange County'])]

Unnamed: 0,City Name,Pop 2021,GrowthRate,Pop Density,State
0,Los Angeles County,9969510,1.4889,13488.8513,California
1,San Diego County,3347270,7.8648,4528.8913,California
2,Orange County,3175130,5.3052,4295.9841,California
3,Riverside County,2520060,14.466,3409.6675,California
4,San Bernardino County,2206750,8.129,2985.7558,California
7,Sacramento County,1578680,11.0667,2135.9706,California
11,San Francisco County,883255,9.6523,1195.0533,California


## Covid-19

In [13]:
# Read in New York Times data - > SELECTED
# This represents a cumulative number
# https://github.com/nytimes/covid-19-data/blob/master/us-counties.csv
nyt_cum = pd.read_csv("datasets/covid/nyt-us-counties.csv")
nyt_cum = nyt_cum[
    (nyt_cum.county.isin(
        ['San Francisco','Los Angeles','San Diego','San Bernardino','Sacramento','Riverside','Orange'])) & (nyt_cum.state == 'California')].drop(['fips'], axis=1)
nyt_cum = nyt_cum.rename(columns={'cases':'cum_cases','deaths':'cum_deaths'})

# Next represents daily reports and averages - > SELECTED
# https://github.com/nytimes/covid-19-data/tree/master/rolling-averages
nyt_avgs = pd.read_csv("datasets/covid/nyt-rolling-us-counties.csv")
nyt_avgs = nyt_avgs[nyt_avgs.county.isin(['San Francisco','Los Angeles','San Diego','San Bernardino','Sacramento','Riverside','Orange']) & (nyt_avgs.state == 'California')].drop(['geoid'], axis=1)
nyt_avgs = nyt_avgs.rename(columns={'cases':'New Cases','cases_avg':'R7 Cases AVG','deaths':'Days Deaths','cases_avg_per_100k':'R7 Cases AVG Per 100k','deaths_avg':'R7 Deaths AVG','deaths_avg_per_100k':'R7 Deaths AVG Per 100k'})

# Merge the dataframes into one
covid = pd.merge(nyt_cum, nyt_avgs, on=['date','county','state'], how='outer')
covid.rename(columns={'date':'Date', 'county':'County', 'state':'State', 'cum_cases': 'CUM Cases', 'cum_deaths':'CUM Deaths'}, inplace=True)

In [14]:
covid.sample(2)

Unnamed: 0,Date,County,State,CUM Cases,CUM Deaths,New Cases,R7 Cases AVG,R7 Cases AVG Per 100k,Days Deaths,R7 Deaths AVG,R7 Deaths AVG Per 100k
2475,2021-02-02,Orange,California,247886.0,3162.0,851,1244.43,39.19,53,56.29,1.77
2675,2021-03-02,San Diego,California,261064.0,3317.0,376,456.57,13.68,14,14.14,0.42


In [47]:
# export covid df
with open("covid.pkl","wb") as covid_dump:
    pickle.dump(covid,covid_dump)
    print('done')

done


# Functions for Metrics, Statistics, Classification 

<em>Drilling into true affordability requires fincial data we don't have</em>

The first-time homebuyer affordability index is based upon the assumption of 70% median household income, 85% of the median house price, a 10% down payment on a 30-year fixed rate mortgage at prevailing rates plus 0.25% added per month for mortgage insurance. The assumption of 25% of gross income for mortgage repayments also applies.

The composite housing affordability index assumes a median household income, median house prices, a 20% down payment on a 30-year fixed rate mortgage at prevailing rates. The assumption is also made that the lender will not allow principal and interest costs to exceed 25% of gross income.

<em>Therefore we need to use some generic metric such as the 30% rule.</em>

A ratio of housing cost to income is the gold standard, it is known as the 30 percent rule, and it is established because of legal actions.

https://www.forbes.com/sites/rogervaldez/2020/02/07/housing-ideas-for-the-feds-find-a-new-measure-of-affordable/?sh=2e9f3c3c36f8

https://www.huduser.gov/portal/pdredge/pdr_edge_featd_article_092214.html

In [22]:
# For some reason nesting this inside summary states isn't working
def affordability(row, monthly_30):
    """
    Row wise operations for apply function
    Return row of values
    """

    # Mortgage variables
    # Cite: http://www.freddiemac.com/pmms/docs/30yr_pmmsmnth.xls
    freddie_mac_mortgage_rates = {2019:3.94,2020:3.11,2021:2.92}
    n = 30 * 12

    P = row['sale_price']
    r = freddie_mac_mortgage_rates[row['transaction_date'].year]
    row['Monthly Mortgage'] = (P * r * ((1 + r)**n)) / (((1 + r)**n) - 1)
    row['Affordability'] = 100 * ((monthly_30 - row['Monthly Mortgage']) / monthly_30 )
    return row

In [23]:
def summary_stats(df, county, ami):
    """
    Requires numpy 
    Input variable = df (dataframe) , area median income (int), county (string)

    Fixed Monthly Mortgage Repayment Calculation = P * r * (1 + r)n / [(1 + r)n – 1]
    where P = Outstanding loan amount, r = Effective monthly interest rate, 
    n = Total number of years * 12 months

    Return new unit price columns -> 'price_per_size_unit', 'price_per_bed_unit', 'price_per_lot_size_unit'
    Return class based on 'price_per_size_unit'  -> 'tier' [lower, mid , top]
    Return affordability -> 'monthly_mortgage' , 'affordability' , 'ami'
    """

    # Unit price columns
    df['Price Per Size Unit'] = np.where(df['size'] != 0, 
        round(df['sale_price'] / df['size'], 2), # True
        np.nan # False
        )
    df['Price Per Bed Unit'] = np.where(df['bedrooms'] != 0, 
        round(df['sale_price'] / df['bedrooms'], 2), # True
        np.nan # False
        )
    df['Price Per Lot Size Unit'] = np.where(df['lot_size'] != 0, 
        round(df['sale_price'] / df['lot_size'], 2), # True
        np.nan # False
        )

    # Binning into 3 equal tiers based on price
    df['Tier'] = pd.qcut(
        df[df['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'], 
        ordered=True)

    # Convert to datetime
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])


    # Apply the affordability function
    # Monthly is needed for the afordability calculation
    monthly_30 = ( ami / 12 ) * .3
    df = df.apply(lambda x: affordability( row = x, monthly_30 = monthly_30), axis=1)
    

    return df


In [63]:
for county_key in sorted(ca_pickled.keys()):
    missing_counties = list()
    try:
        ca_pickled[county_key] = summary_stats(
                df = ca_pickled[county_key], 
                county = county_key, 
                ami = int(median_income_df[
                        median_income_df['County'].str.contains(str(county_key))]['Median Household Income'])
        )
    except:
        missing_counties.append(county_key)
        print(f"County income data mising for {county_key}. Continuing...")
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price Per Size Unit'] = np.where(df['size'] != 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price Per Bed Unit'] = np.where(df['bedrooms'] != 0,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Price Per Lot Size Unit'] = np.where(df['lot_size'] != 0,
A value is trying to be set on a

In [216]:
# Repeating the binning
ca_pickled['San Francisco']['Tier'], sf_bins = pd.qcut(
        ca_pickled['San Francisco'][ca_pickled['San Francisco']['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'],
        precision = 0,
        retbins=True)
ca_pickled['Los Angeles']['Tier'], la_bins = pd.qcut(
        ca_pickled['Los Angeles'][ca_pickled['Los Angeles']['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'],
        precision = 0,
        retbins=True)

In [215]:
print(sf_bins)

[3.65000000e+00 9.29643333e+02 1.22737000e+03 6.80000000e+05]


In [217]:
print(la_bins)

[1.00300000e+01 3.79770000e+02 5.47470000e+02 1.71428571e+06]


In [195]:
ca_pickled['San Francisco'][ca_pickled['San Francisco']['Price Per Size Unit'].notna()]['Price Per Size Unit'].describe()

count     11181.000000
mean       1268.021655
std        6697.544770
min           3.650000
25%         834.930000
50%        1077.810000
75%        1319.550000
max      680000.000000
Name: Price Per Size Unit, dtype: float64

In [203]:
ca_pickled['San Francisco']['Tier'].value_counts()

Bottom    3727
Middle    3727
Top       3727
Name: Tier, dtype: int64

In [218]:
ca_pickled['Los Angeles']['Tier'].value_counts()

Bottom    45116
Top       45112
Middle    45109
Name: Tier, dtype: int64

In [64]:
# export cleaned dictionary of dataframe
with open("ca_df_dict_residential_stats.pkl","wb") as ca_dump:
    pickle.dump(ca_pickled,ca_dump)
    print('done')

done


In [106]:
ca_pickled['San Francisco']['sale_price'].describe()

Unnamed: 0,lot_size,zipcode,latitude,longitude,sub_type,type,year_built,size,bathrooms,bedrooms,transaction_date,sale_price,Price Per Size Unit,Price Per Bed Unit,Price Per Lot Size Unit,Tier,Monthly Mortgage,Affordability
0,0.0556,94109,37.798711,-122.41882,RESIDENTIAL,APARTMENT,1961,3424,4.0,0,2020-01-01,256000,74.77,,4604316.55,Bottom,796160.0,-26047.543003
1,0.0,94108,37.791088,-122.407931,RESIDENTIAL,CONDOMINIUM,1920,288,1.0,0,2020-01-02,480000,1666.67,,,Bottom,1492800.0,-48926.64313
3,0.0,94121,37.775637,-122.502045,RESIDENTIAL,CONDOMINIUM,1998,670,1.0,2,2020-01-02,729000,1088.06,364500.0,,Bottom,2267190.0,-74359.214253
4,0.0,94132,37.710298,-122.466269,RESIDENTIAL,CONDOMINIUM,2002,1186,2.0,3,2020-01-02,830000,699.83,276666.67,,Bottom,2581300.0,-84675.237079
5,0.0,94105,37.787586,-122.390748,RESIDENTIAL,CONDOMINIUM,1991,795,1.0,1,2020-01-02,860000,1081.76,860000.0,,Bottom,2674600.0,-87739.402274


In [None]:
"""
ca_df_dict['Orange'] = summary_stats(
    df= ca_df_dict['Orange'], 
    county = 'Orange', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Orange')]['Median Household Income'])
    )
    
ca_df_dict['San Diego'] = summary_stats(
    df= ca_df_dict['San Diego'], 
    county = 'San Diego', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Diego')]['Median Household Income'])
    )

ca_df_dict['San Francisco'] = summary_stats(
    df= ca_df_dict['San Francisco'], 
    county = 'San Francisco', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Francisco')]['Median Household Income'])
    )

ca_df_dict['Los Angeles'] = summary_stats(
    df= ca_df_dict['Los Angeles'], 
    county = 'Los Angeles', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Los Angeles')]['Median Household Income'])
    )

ca_df_dict['Scremento'] = summary_stats(
    df= ca_df_dict['Scremento'], 
    county = 'Scremento', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Scremento')]['Median Household Income'])
    )
    
ca_df_dict['San Bernardino'] = summary_stats(
    df= ca_df_dict['San Bernardino'], 
    county = 'San Bernardino', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Bernardino')]['Median Household Income'])
    )

ca_df_dict['Riverside'] = summary_stats(
    df= ca_df_dict['Riverside'], 
    county = 'Riverside', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Riverside')]['Median Household Income'])
    )

ca_df_dict['Riverside'][ca_df_dict['Riverside']['transaction_date'].isna()]
ca_df_dict['Los Angeles'][ca_df_dict['Los Angeles']['transaction_date'].isna()]
ca_df_dict['San Diego'][ca_df_dict['San Diego']['transaction_date'].isna()]

"""

# EDA

In [132]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [179]:
fig2 = px.line(covid, x='Date', y='R7 Cases AVG Per 100k', color='County')
fig2.show()

In [180]:
# Focus on two market's for control and test group
filtered_covid = covid[covid.County.isin(['San Francisco','Los Angeles'])][['Date','R7 Cases AVG Per 100k','County']]

fig = px.line(filtered_covid, x='Date', y='R7 Cases AVG Per 100k', color='County')
fig.show()

Treatment variable: R7 cases per 100K 
Treatment variable threshold: ~15

Post treatment period : June 22, 2020 to July 22nd 2020
Pre treatment period : June 22, 2019 to July 2019

In DiD, in post-treatment period, the treatment group is affected, but the control group is not.
In DiD, in pre-treatment period, neither treatment nor control group is affected.

Post-treatment outcome variable for Control group: Median home prices for June 22, 2020 to July 22nd 2020
Post-treatment outcome variable for Treatment Group : Median home prices for LA for June 22, 2020 to July 22nd 2020

Pre-treatment outcome variable for Control group : Median home prices for SF for June 22, 2019 to July 22nd 2019
Pre-treatment outcome variable for Treatment Group : Median home prices for LA for June 22, 2019 to July 22nd 2019

In [135]:
# Los Angles numbers
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 15) & (covid.County == 'Los Angeles')].head(10)


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,R7 Cases AVG Per 100k,County
899,2020-06-22,17.29,Los Angeles
906,2020-06-23,18.75,Los Angeles
913,2020-06-24,17.5,Los Angeles
920,2020-06-25,18.84,Los Angeles
927,2020-06-26,19.39,Los Angeles
934,2020-06-27,19.54,Los Angeles
941,2020-06-28,20.63,Los Angeles
948,2020-06-29,21.1,Los Angeles
955,2020-06-30,21.73,Los Angeles
962,2020-07-01,22.79,Los Angeles


In [136]:
# Los Angles numbers
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 34) & (covid.County == 'Los Angeles')]


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,R7 Cases AVG Per 100k,County
1956,2020-11-20,35.00,Los Angeles
1963,2020-11-21,36.05,Los Angeles
1970,2020-11-22,35.54,Los Angeles
1977,2020-11-23,40.28,Los Angeles
1984,2020-11-24,42.16,Los Angeles
...,...,...,...
2509,2021-02-07,42.13,Los Angeles
2516,2021-02-08,40.15,Los Angeles
2523,2021-02-09,39.43,Los Angeles
2530,2021-02-10,36.80,Los Angeles


In [137]:
# San Francisco
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 15) & (covid.County == 'San Francisco')].head(10)


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,R7 Cases AVG Per 100k,County
1185,2020-08-01,15.17,San Francisco
2004,2020-11-26,16.14,San Francisco
2018,2020-11-28,17.02,San Francisco
2025,2020-11-29,16.29,San Francisco
2032,2020-11-30,16.68,San Francisco
2039,2020-12-01,15.83,San Francisco
2053,2020-12-03,16.29,San Francisco
2060,2020-12-04,17.5,San Francisco
2067,2020-12-05,18.18,San Francisco
2074,2020-12-06,21.65,San Francisco


In [138]:
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 34) & (covid.County == 'San Francisco')].head(10)


Boolean Series key will be reindexed to match DataFrame index.



Unnamed: 0,Date,R7 Cases AVG Per 100k,County
2172,2020-12-20,36.14,San Francisco
2179,2020-12-21,36.48,San Francisco
2186,2020-12-22,35.36,San Francisco
2298,2021-01-07,35.65,San Francisco
2305,2021-01-08,34.76,San Francisco
2312,2021-01-09,36.19,San Francisco
2319,2021-01-10,37.17,San Francisco
2326,2021-01-11,39.96,San Francisco
2333,2021-01-12,40.69,San Francisco
2340,2021-01-13,41.19,San Francisco


In [143]:
ca_pickled['San Francisco'].columns

Index(['lot_size', 'zipcode', 'latitude', 'longitude', 'sub_type', 'type',
       'year_built', 'size', 'bathrooms', 'bedrooms', 'transaction_date',
       'sale_price', 'Price Per Size Unit', 'Price Per Bed Unit',
       'Price Per Lot Size Unit', 'Tier', 'Monthly Mortgage', 'Affordability'],
      dtype='object')

In [148]:
ca_pickled.keys()

dict_keys(['Los Angeles', 'San Francisco', 'San Diego', 'San Bernardino', 'Sacramento', 'Riverside', 'Orange'])

In [151]:
sf = ca_pickled['San Francisco']


Unnamed: 0_level_0,transaction_date,Tier
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02,3,3
2019-01-03,6,6
2019-01-04,4,4
2019-01-07,5,5
2019-01-08,2,2
...,...,...
2021-02-20,1,1
2021-02-21,2,2
2021-02-22,5,5
2021-02-23,3,3


In [219]:
ca_pickled['San Francisco']['County'] = 'San Francisco'
ca_pickled['Los Angeles']['County'] = 'Los Angeles'
sf_la = pd.concat([ca_pickled['San Francisco'], ca_pickled['Los Angeles']])

In [220]:
sf_la.head()

Unnamed: 0,lot_size,zipcode,latitude,longitude,sub_type,type,year_built,size,bathrooms,bedrooms,transaction_date,sale_price,Price Per Size Unit,Price Per Bed Unit,Price Per Lot Size Unit,Tier,Monthly Mortgage,Affordability,County
0,0.0556,94109,37.798711,-122.41882,RESIDENTIAL,APARTMENT,1961,3424,4.0,0,2020-01-01,256000,74.77,,4604316.55,Bottom,796160.0,-26047.543003,San Francisco
1,0.0,94108,37.791088,-122.407931,RESIDENTIAL,CONDOMINIUM,1920,288,1.0,0,2020-01-02,480000,1666.67,,,Top,1492800.0,-48926.64313,San Francisco
3,0.0,94121,37.775637,-122.502045,RESIDENTIAL,CONDOMINIUM,1998,670,1.0,2,2020-01-02,729000,1088.06,364500.0,,Middle,2267190.0,-74359.214253,San Francisco
4,0.0,94132,37.710298,-122.466269,RESIDENTIAL,CONDOMINIUM,2002,1186,2.0,3,2020-01-02,830000,699.83,276666.67,,Bottom,2581300.0,-84675.237079,San Francisco
5,0.0,94105,37.787586,-122.390748,RESIDENTIAL,CONDOMINIUM,1991,795,1.0,1,2020-01-02,860000,1081.76,860000.0,,Middle,2674600.0,-87739.402274,San Francisco


In [221]:
# Look at daily counts
sales_count_by_type = sf_la.groupby(by=['County','transaction_date','type'])[['sub_type']].count().reset_index()
sales_count_by_tier = sf_la.groupby(by=['County','transaction_date','Tier'])[['sub_type']].count().reset_index()

In [178]:
# There are clearly errors in the pd cut function
sf_la['Tier'].value_counts()

Bottom    13998
Top           2
Middle        1
Name: Tier, dtype: int64

In [165]:
sales_count_by_type.head()

Unnamed: 0,County,transaction_date,type,sub_type
0,Los Angeles,2019-01-01,CONDOMINIUM,1
1,Los Angeles,2019-01-01,SFR,2
2,Los Angeles,2019-01-02,APARTMENT,1
3,Los Angeles,2019-01-02,CONDOMINIUM,34
4,Los Angeles,2019-01-02,DUPLEX,4


In [225]:
la_sales_type = px.bar(sales_count_by_type[sales_count_by_type.County =='Los Angeles'], x='transaction_date', y='sub_type', color='type', title='Los Angeles Daily Sales by Type')
la_sales_type.show()

In [226]:
sf_sales_type = px.bar(sales_count_by_type[sales_count_by_type.County =='San Francisco'], x='transaction_date', y='sub_type', color='type', title='San Francisco Daily Sales by Type')
sf_sales_type.show()

In [223]:
la_tier_chart = px.bar(sales_count_by_tier[sales_count_by_tier.County =='Los Angeles'], x='transaction_date', y='sub_type', color='Tier', title='Los Angeles Daily Sales by Tier')
la_tier_chart.show()

In [224]:
sf_tier_chart = px.bar(sales_count_by_tier[sales_count_by_tier.County =='San Francisco'], x='transaction_date', y='sub_type', color='Tier', title='San Francisco Daily Sales by Tier')
la_tier_chart.show()