In [1]:
# Wrangling
import pandas as pd
import numpy as np

# Reading
import json
import pickle

# Navigating
import os

# Import Data

## Real Estate Transactions

In [2]:
# Read pickle back in
ca_pickled = pd.read_pickle("ca_df_dict_with_stats.pkl")
print('done')

done


In [3]:
# Confirm that we only have residential data
ca_pickled['San Francisco'].type.unique()

array(['APARTMENT', 'CONDOMINIUM', 'SFR', 'DUPLEX', 'TOWNHOUSE/ROWHOUSE'],
      dtype=object)

In [4]:
ca_pickled['San Francisco'].columns

Index(['lot_size', 'zipcode', 'latitude', 'longitude', 'sub_type', 'type',
       'year_built', 'size', 'bathrooms', 'bedrooms', 'transaction_date',
       'sale_price', 'Price Per Size Unit', 'Price Per Bed Unit',
       'Price Per Lot Size Unit', 'Tier', 'Monthly Mortgage', 'Affordability'],
      dtype='object')

# Skip to next section if pickle file exists

In [3]:
# folder navigation
print(os.path.abspath(os.curdir))

/Users/Chris/Documents/GitHub/SIADS691-Covid-Real-Estate


In [4]:
# Import 
with open('datasets/CA_data/counties_pull.txt') as project_file:    
    data = json.load(project_file)  

ca_geo_df = pd.json_normalize(data)

In [5]:
# CO06037 = LA ; CO06075 = SF
ca_geo_df[ca_geo_df.name.isin(['Los Angeles','San Francisco'])]

Unnamed: 0,geo_center_latitude,geo_center_longitude,geo_key,id,name,type
28,34.196398,-118.261862,CO06037,CO06037,Los Angeles,CO
51,37.727239,-123.032229,CO06075,CO06075,San Francisco,CO


In [36]:
# Build Dataframes from json files

# Initialize dataframe holder
ca_df_dict = {}

# Loop to build list of files for concatination
folders = ['Los Angeles','San Francisco','San Diego','San Bernardino','Sacramento','Riverside','Orange']
for folder in folders:
    print(f'Building {folder}')
    # Get directories 
    files = os.listdir("datasets/CA_data/"+folder)

    # Get json files in directory
    for county_f in files:
        with open(f'datasets/CA_data/'+folder + '/' + county_f) as project_file:
            try:
                data = json.load(project_file)
                df = pd.json_normalize(data) 

                # Concat dataframes
                if folder in ca_df_dict:
                    df2 = ca_df_dict[folder]
                    h_concat_df = pd.concat([df2, df], axis=0)
                    ca_df_dict[folder] = h_concat_df
                else:
                    ca_df_dict[folder] = df
            except:
                print(f'Failed: {project_file}')

print('Dictionary Created!')

Building Los Angeles
Building San Francisco
Building San Diego
Building San Bernardino
Building Sacramento
Building Riverside
Building Orange
Dictionary Created!


In [37]:
# Create dictionary for renaming
col_dict = {'sale.saleTransDate':'transaction_date','address.postal1':'zipcode','sale.amount.saleamt':'sale_price','lot.lotSize1':'lot_size','building.size.universalsize':'size','location.latitude':'latitude','location.longitude':'longitude','summary.propsubtype':'sub_type','summary.proptype':'type','summary.yearbuilt':'year_built','building.rooms.bathstotal':'bathrooms','building.rooms.beds':'bedrooms'}
# Create list for anti filter drop
col_drop_list = [ col for col in ca_df_dict['Orange'].columns if col not in col_dict.keys()]

In [42]:
# Clean the DataFrames
for data in ca_df_dict.keys():
    ca_df_dict[data].rename(columns = col_dict, inplace=True)
    ca_df_dict[data].drop(col_drop_list, inplace=True, axis=1, errors='ignore')
    ca_df_dict[data].reset_index(drop=True, inplace=True)
    ca_df_dict[data].dropna(subset=['transaction_date'], inplace=True) #

In [43]:
# export cleaned dictionary of dataframe
with open("ca_df_dict.pkl","wb") as ca_dump:
    pickle.dump(ca_df_dict,ca_dump)
    print('done')

done


In [67]:
total_size = 0
for bc in ca_pickled.keys():
    total_size += ca_pickled[bc].shape[0]
print(f'Total length is {total_size}')

Total length is 434395


### We need to isolate only the residential transactions of interest

In [59]:
ca_pickled['Los Angeles'].type.unique()

array(['SFR', 'LIGHT INDUSTRIAL', 'CONDOMINIUM', 'TRIPLEX', 'PUD',
       'APARTMENT', 'QUADRUPLEX', 'DUPLEX', 'RESIDENTIAL ACREAGE',
       'VACANT LAND (NEC)', 'SERVICE STATION', 'TYPE UNKNOWN',
       'OFFICE BUILDING', 'MOTEL', 'HOSPITAL', 'WAREHOUSE',
       'STORE BUILDING', 'AGRICULTURAL (NEC)', 'STORES & OFFICES',
       'SUPERMARKET', 'COMMERCIAL ACREAGE', 'ANIMAL HOSPITAL/VET',
       'PARKING STRUCTURE', nan, 'RESTAURANT BUILDING', 'MOBILE HOME',
       'INDUSTRIAL (NEC)', 'COMMON AREA', 'INDUSTRIAL ACREAGE',
       'MEDICAL BUILDING', 'SHOPPING CENTER', 'FARMS', 'RELIGIOUS',
       'PUBLIC SCHOOL', 'CLUB', 'FUNERAL HOME', 'RESORT HOTEL',
       'MULTI FAMILY DWELLING', 'ORCHARD', 'NURSERY/HORTICULTURE',
       'DEPARTMENT STORE', 'TRANSPORT FACILITY', 'PUBLIC (NEC)',
       'HEAVY INDUSTRIAL', 'AUTO SALES', 'THEATER', 'FOOD PROCESSING',
       'MINI WAREHOUSE', 'COOPERATIVE', 'COMMUNICATION FACILITY',
       'GOLF COURSE', 'FINANCIAL BUILDING', 'MOBILE HOME PARK',
       'L

In [32]:
# Isolating SFR
residential = ['APARTMENT', 'CONDOMINIUM','SFR', 'DUPLEX','TOWNHOUSE/ROWHOUSE','QUADRUPLEX','TRIPLEX']
for data in ca_pickled.keys():
    ca_pickled[data] = ca_pickled[data][ca_pickled[data].type.isin(residential)]

## Income 

In [5]:
# Federal income data reported 2019
income_data = pd.read_excel("datasets/income/est19-all_income.xls", header=3)[['Postal Code', 'Name','Median Household Income']]

# Select relevant income data
income_data_CA_NY = income_data[(income_data['Postal Code'].isin(['CA'])) & income_data['Name'].str.contains('County')]

# Select relevant columns
median_income_df = income_data_CA_NY[['Postal Code', 'Name','Median Household Income']]
median_income_df.rename(columns={'Name':'County','Postal Code':'State ABV'}, inplace=True)

# Isolate each county's Area Median Income (AMI).
ca_median_income_df = median_income_df[median_income_df.County.isin(['San Francisco County','Los Angeles County','San Diego County','San Bernardino County','Sacramento County','Riverside County','Orange County'])]

ca_median_income_df

Unnamed: 0,State ABV,County,Median Household Income
210,CA,Los Angeles County,72721
221,CA,Orange County,95761
224,CA,Riverside County,72905
225,CA,Sacramento County,71891
227,CA,San Bernardino County,67398
228,CA,San Diego County,83576
229,CA,San Francisco County,121795


## Population

In [6]:
# Need to import population for Covid factorization 
# (Daily infection / Population) -> DEFAULT TO NYT
# (Daily death / Population) -> DEFAULT TO NYT

# Density for filtering on the "urban factor" & causal county comparison 
# https://worldpopulationreview.com/us-counties/states/ca
# https://worldpopulationreview.com/us-counties/states/ny
pop_density = pd.read_csv("datasets/population/ca_density_pop_by_county.csv")
pop_density['state'] = 'California'

# Rename for Title Caps
pop_density.rename(columns={'CTYNAME':'City Name','popDensity':'Pop Density','pop2021': 'Pop 2021','state':'State'}, inplace=True)

pop_density[pop_density['City Name'].isin(['San Francisco County','Los Angeles County','San Diego County','San Bernardino County','Sacramento County','Riverside County','Orange County'])]

Unnamed: 0,City Name,Pop 2021,GrowthRate,Pop Density,State
0,Los Angeles County,9969510,1.4889,13488.8513,California
1,San Diego County,3347270,7.8648,4528.8913,California
2,Orange County,3175130,5.3052,4295.9841,California
3,Riverside County,2520060,14.466,3409.6675,California
4,San Bernardino County,2206750,8.129,2985.7558,California
7,Sacramento County,1578680,11.0667,2135.9706,California
11,San Francisco County,883255,9.6523,1195.0533,California


## Covid-19

In [7]:
# Read in New York Times data - > SELECTED
# This represents a cumulative number
# https://github.com/nytimes/covid-19-data/blob/master/us-counties.csv
nyt_cum = pd.read_csv("datasets/covid/nyt-us-counties.csv")
nyt_cum = nyt_cum[
    (nyt_cum.county.isin(
        ['San Francisco','Los Angeles','San Diego','San Bernardino','Sacramento','Riverside','Orange'])) & (nyt_cum.state == 'California')].drop(['fips'], axis=1)
nyt_cum = nyt_cum.rename(columns={'cases':'cum_cases','deaths':'cum_deaths'})

# Next represents daily reports and averages - > SELECTED
# https://github.com/nytimes/covid-19-data/tree/master/rolling-averages
nyt_avgs = pd.read_csv("datasets/covid/nyt-rolling-us-counties.csv")
nyt_avgs = nyt_avgs[nyt_avgs.county.isin(['San Francisco','Los Angeles','San Diego','San Bernardino','Sacramento','Riverside','Orange']) & (nyt_avgs.state == 'California')].drop(['geoid'], axis=1)
nyt_avgs = nyt_avgs.rename(columns={'cases':'New Cases','cases_avg':'R7 Cases AVG','deaths':'Days Deaths','cases_avg_per_100k':'R7 Cases AVG Per 100k','deaths_avg':'R7 Deaths AVG','deaths_avg_per_100k':'R7 Deaths AVG Per 100k'})

# Merge the dataframes into one
covid = pd.merge(nyt_cum, nyt_avgs, on=['date','county','state'], how='outer')
covid.rename(columns={'date':'Date', 'county':'County', 'state':'State', 'cum_cases': 'CUM Cases', 'cum_deaths':'CUM Deaths'}, inplace=True)

In [14]:
covid.sample(2)

Unnamed: 0,Date,County,State,CUM Cases,CUM Deaths,New Cases,R7 Cases AVG,R7 Cases AVG Per 100k,Days Deaths,R7 Deaths AVG,R7 Deaths AVG Per 100k
2475,2021-02-02,Orange,California,247886.0,3162.0,851,1244.43,39.19,53,56.29,1.77
2675,2021-03-02,San Diego,California,261064.0,3317.0,376,456.57,13.68,14,14.14,0.42


In [47]:
# export covid df
"""
with open("covid.pkl","wb") as covid_dump:
    pickle.dump(covid,covid_dump)
    print('done')
"""

done


# Functions for Metrics, Statistics, Classification 

<em>Drilling into true affordability requires fincial data we don't have</em>

The first-time homebuyer affordability index is based upon the assumption of 70% median household income, 85% of the median house price, a 10% down payment on a 30-year fixed rate mortgage at prevailing rates plus 0.25% added per month for mortgage insurance. The assumption of 25% of gross income for mortgage repayments also applies.

The composite housing affordability index assumes a median household income, median house prices, a 20% down payment on a 30-year fixed rate mortgage at prevailing rates. The assumption is also made that the lender will not allow principal and interest costs to exceed 25% of gross income.

<em>Therefore we need to use some generic metric such as the 30% rule.</em>

A ratio of housing cost to income is the gold standard, it is known as the 30 percent rule, and it is established because of legal actions.

https://www.forbes.com/sites/rogervaldez/2020/02/07/housing-ideas-for-the-feds-find-a-new-measure-of-affordable/?sh=2e9f3c3c36f8

https://www.huduser.gov/portal/pdredge/pdr_edge_featd_article_092214.html

In [7]:
# For some reason nesting this inside summary states isn't working
def affordability(row, monthly_30):
    """
    Row wise operations for apply function
    Return row of values

    Fixed Monthly Mortgage Repayment Calculation = P * r * (1 + r)n / [(1 + r)n – 1]
    where P = Outstanding loan amount, r = Effective monthly interest rate, 
    n = Total number of years * 12 months
    """

    # Mortgage variables
    # Cite: http://www.freddiemac.com/pmms/docs/30yr_pmmsmnth.xls
    freddie_mac_mortgage_rates = {2019:3.94,2020:3.11,2021:2.92}
    n = 30 * 12

    P = row['sale_price']
    r = freddie_mac_mortgage_rates[row['transaction_date'].year]
    row['Monthly Mortgage'] = (P * r * ((1 + r)**n)) / (((1 + r)**n) - 1)
    row['Affordability'] = 100 * ((monthly_30 - row['Monthly Mortgage']) / monthly_30 )
    return row

In [22]:
def summary_stats(df, county, ami):
    """
    Requires numpy 
    Input variable = df (dataframe) , area median income (int), county (string)

    Return new unit price columns -> 'price_per_size_unit', 'price_per_bed_unit', 'price_per_lot_size_unit'
    Return class based on 'price_per_size_unit'  -> 'tier' [lower, mid , top]
    Return affordability -> 'monthly_mortgage' , 'affordability' , 'ami'
    """

    # Unit price columns
    df['Price Per Size Unit'] = np.where(df['size'] != 0, 
        round(df['sale_price'] / df['size'], 2), # True
        np.nan # False
        )
    df['Price Per Bed Unit'] = np.where(df['bedrooms'] != 0, 
        round(df['sale_price'] / df['bedrooms'], 2), # True
        np.nan # False
        )
    df['Price Per Lot Size Unit'] = np.where(df['lot_size'] != 0, 
        round(df['sale_price'] / df['lot_size'], 2), # True
        np.nan # False
        )

    # Binning into 3 equal tiers based on price
    df['Tier'] = pd.qcut(
        df[df['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'])

    # Convert to datetime
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])


    # Apply the affordability function
    # Monthly is needed for the afordability calculation
    monthly_30 = ( ami / 12 ) * .3
    df = df.apply(lambda x: affordability( row = x, monthly_30 = monthly_30), axis=1)
    

    return df


In [34]:
for county_key in sorted(ca_pickled.keys()):
    missing_counties = list()
    try:
        ca_pickled[county_key] = summary_stats(
                df = ca_pickled[county_key], 
                county = county_key, 
                ami = int(median_income_df[
                        median_income_df['County'].str.contains(str(county_key))]['Median Household Income'])
        )
    except:
        missing_counties.append(county_key)
        print(f"County income data mising for {county_key}. Continuing...")
        continue

In [35]:
# export cleaned dictionary of dataframe
with open("ca_df_dict_with_stats.pkl","wb") as ca_dump:
    pickle.dump(ca_pickled,ca_dump)
    print('done')

done


In [None]:
# To Peak inside the binnning we can repeat th code
"""
ca_pickled['San Francisco']['Tier'], sf_bins = pd.qcut(
        ca_pickled['San Francisco'][ca_pickled['San Francisco']['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'],
        precision = 0,
        retbins=True)
ca_pickled['Los Angeles']['Tier'], la_bins = pd.qcut(
        ca_pickled['Los Angeles'][ca_pickled['Los Angeles']['Price Per Size Unit'].notna()]['Price Per Size Unit'], 
        3, 
        labels=['Bottom','Middle','Top'],
        precision = 0,
        retbins=True)
"""

In [215]:
print(sf_bins)

[3.65000000e+00 9.29643333e+02 1.22737000e+03 6.80000000e+05]


In [217]:
print(la_bins)

[1.00300000e+01 3.79770000e+02 5.47470000e+02 1.71428571e+06]


In [195]:
ca_pickled['San Francisco'][ca_pickled['San Francisco']['Price Per Size Unit'].notna()]['Price Per Size Unit'].describe()

count     11181.000000
mean       1268.021655
std        6697.544770
min           3.650000
25%         834.930000
50%        1077.810000
75%        1319.550000
max      680000.000000
Name: Price Per Size Unit, dtype: float64

In [36]:
ca_pickled['Los Angeles'][ca_pickled['Los Angeles']['Price Per Size Unit'].notna()]['Price Per Size Unit'].describe()

count    1.353370e+05
mean     6.186799e+02
std      5.157070e+03
min      1.003000e+01
25%      3.446400e+02
50%      4.524500e+02
75%      6.174500e+02
max      1.714286e+06
Name: Price Per Size Unit, dtype: float64

In [None]:
"""
ca_df_dict['Orange'] = summary_stats(
    df= ca_df_dict['Orange'], 
    county = 'Orange', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Orange')]['Median Household Income'])
    )
    
ca_df_dict['San Diego'] = summary_stats(
    df= ca_df_dict['San Diego'], 
    county = 'San Diego', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Diego')]['Median Household Income'])
    )

ca_df_dict['San Francisco'] = summary_stats(
    df= ca_df_dict['San Francisco'], 
    county = 'San Francisco', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Francisco')]['Median Household Income'])
    )

ca_df_dict['Los Angeles'] = summary_stats(
    df= ca_df_dict['Los Angeles'], 
    county = 'Los Angeles', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Los Angeles')]['Median Household Income'])
    )

ca_df_dict['Scremento'] = summary_stats(
    df= ca_df_dict['Scremento'], 
    county = 'Scremento', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Scremento')]['Median Household Income'])
    )
    
ca_df_dict['San Bernardino'] = summary_stats(
    df= ca_df_dict['San Bernardino'], 
    county = 'San Bernardino', 
    ami = int(median_income_df[median_income_df['County'].str.contains('San Bernardino')]['Median Household Income'])
    )

ca_df_dict['Riverside'] = summary_stats(
    df= ca_df_dict['Riverside'], 
    county = 'Riverside', 
    ami = int(median_income_df[median_income_df['County'].str.contains('Riverside')]['Median Household Income'])
    )

ca_df_dict['Riverside'][ca_df_dict['Riverside']['transaction_date'].isna()]
ca_df_dict['Los Angeles'][ca_df_dict['Los Angeles']['transaction_date'].isna()]
ca_df_dict['San Diego'][ca_df_dict['San Diego']['transaction_date'].isna()]

"""

# COVID EDA

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [231]:
covid.columns

Index(['Date', 'County', 'State', 'CUM Cases', 'CUM Deaths', 'New Cases',
       'R7 Cases AVG', 'R7 Cases AVG Per 100k', 'Days Deaths', 'R7 Deaths AVG',
       'R7 Deaths AVG Per 100k'],
      dtype='object')

In [235]:
# LA's First case was 2020-01-26
covid[covid.County =='Los Angeles'].head(1)

Unnamed: 0,Date,County,State,CUM Cases,CUM Deaths,New Cases,R7 Cases AVG,R7 Cases AVG Per 100k,Days Deaths,R7 Deaths AVG,R7 Deaths AVG Per 100k
1,2020-01-26,Los Angeles,California,1.0,0.0,1,0.14,0.0,0,0.0,0.0


In [236]:
# San Francisco's first case was 2020-02-02	
covid[covid.County =='San Francisco'].head(1)

Unnamed: 0,Date,County,State,CUM Cases,CUM Deaths,New Cases,R7 Cases AVG,R7 Cases AVG Per 100k,Days Deaths,R7 Deaths AVG,R7 Deaths AVG Per 100k
17,2020-02-02,San Francisco,California,2.0,0.0,2,0.29,0.03,0,0.0,0.0


In [179]:
fig2 = px.line(covid, x='Date', y='R7 Cases AVG Per 100k', color='County')
fig2.show()

In [180]:
# Focus on two market's for control and test group
filtered_covid = covid[covid.County.isin(['San Francisco','Los Angeles'])][['Date','R7 Cases AVG Per 100k','County']]

fig = px.line(filtered_covid, x='Date', y='R7 Cases AVG Per 100k', color='County')
fig.show()

Treatment variable: R7 cases per 100K 
Treatment variable threshold: ~15

Post treatment period : June 22, 2020 to July 22nd 2020
Pre treatment period : June 22, 2019 to July 2019

In DiD, in post-treatment period, the treatment group is affected, but the control group is not.
In DiD, in pre-treatment period, neither treatment nor control group is affected.

Post-treatment outcome variable for Control group: Median home prices for June 22, 2020 to July 22nd 2020
Post-treatment outcome variable for Treatment Group : Median home prices for LA for June 22, 2020 to July 22nd 2020

Pre-treatment outcome variable for Control group : Median home prices for SF for June 22, 2019 to July 22nd 2019
Pre-treatment outcome variable for Treatment Group : Median home prices for LA for June 22, 2019 to July 22nd 2019

In [None]:
# Los Angles numbers
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 15) & (covid.County == 'Los Angeles')].head(1)

In [None]:
# Los Angles numbers
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 34) & (covid.County == 'Los Angeles')].head(1)

In [None]:
# San Francisco
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 15) & (covid.County == 'San Francisco')].head(1)

In [None]:
filtered_covid[(filtered_covid['R7 Cases AVG Per 100k'] > 34) & (covid.County == 'San Francisco')].head(1)

# Real Estate EDA

In [9]:
ca_pickled['San Francisco']['County'] = 'San Francisco'
ca_pickled['Los Angeles']['County'] = 'Los Angeles'
sf_la = pd.concat([ca_pickled['San Francisco'], ca_pickled['Los Angeles']])

In [10]:
sf_la.head()

Unnamed: 0,lot_size,zipcode,latitude,longitude,sub_type,type,year_built,size,bathrooms,bedrooms,transaction_date,sale_price,Price Per Size Unit,Price Per Bed Unit,Price Per Lot Size Unit,Tier,Monthly Mortgage,Affordability,County
0,0.0556,94109,37.798711,-122.41882,RESIDENTIAL,APARTMENT,1961,3424,4.0,0,2020-01-01,256000,74.77,,4604316.55,Bottom,796160.0,-26047.543003,San Francisco
1,0.0,94108,37.791088,-122.407931,RESIDENTIAL,CONDOMINIUM,1920,288,1.0,0,2020-01-02,480000,1666.67,,,Top,1492800.0,-48926.64313,San Francisco
3,0.0,94121,37.775637,-122.502045,RESIDENTIAL,CONDOMINIUM,1998,670,1.0,2,2020-01-02,729000,1088.06,364500.0,,Middle,2267190.0,-74359.214253,San Francisco
4,0.0,94132,37.710298,-122.466269,RESIDENTIAL,CONDOMINIUM,2002,1186,2.0,3,2020-01-02,830000,699.83,276666.67,,Bottom,2581300.0,-84675.237079,San Francisco
5,0.0,94105,37.787586,-122.390748,RESIDENTIAL,CONDOMINIUM,1991,795,1.0,1,2020-01-02,860000,1081.76,860000.0,,Middle,2674600.0,-87739.402274,San Francisco


In [11]:
# Look at daily counts
sales_count_by_type = sf_la.groupby(by=['County','transaction_date','type'])[['sub_type']].count().reset_index()
sales_count_by_tier = sf_la.groupby(by=['County','transaction_date','Tier'])[['sub_type']].count().reset_index()

In [165]:
sales_count_by_type.head()

Unnamed: 0,County,transaction_date,type,sub_type
0,Los Angeles,2019-01-01,CONDOMINIUM,1
1,Los Angeles,2019-01-01,SFR,2
2,Los Angeles,2019-01-02,APARTMENT,1
3,Los Angeles,2019-01-02,CONDOMINIUM,34
4,Los Angeles,2019-01-02,DUPLEX,4


In [66]:
la_violin_chart = px.histogram(ca_pickled['Los Angeles'], x="sale_price", histfunc="count",marginal="violin", title='Los Angeles Sale Prices')
la_violin_chart.show()

In [18]:
sf_la.columns

Index(['lot_size', 'zipcode', 'latitude', 'longitude', 'sub_type', 'type',
       'year_built', 'size', 'bathrooms', 'bedrooms', 'transaction_date',
       'sale_price', 'Price Per Size Unit', 'Price Per Bed Unit',
       'Price Per Lot Size Unit', 'Tier', 'Monthly Mortgage', 'Affordability',
       'County'],
      dtype='object')

In [39]:
sf_la['size'].describe()

count    147065.000000
mean       1998.344072
std        2769.586027
min           0.000000
25%        1160.000000
50%        1539.000000
75%        2156.000000
max      305404.000000
Name: size, dtype: float64

In [37]:
# Curiously this data has some anomalies
# 14 bathroom in 51 sq ft residence
sf_la[sf_la["size"] <= 150].sort_values(by='size').tail(10)

Unnamed: 0,lot_size,zipcode,latitude,longitude,sub_type,type,year_built,size,bathrooms,bedrooms,transaction_date,sale_price,Price Per Size Unit,Price Per Bed Unit,Price Per Lot Size Unit,Tier,Monthly Mortgage,Affordability,County
124445,0.1601,91214,34.210806,-118.243086,MISCELLANEOUS INDUSTRIAL,SFR,1986,3,4.0,1,2020-09-08,1129000,376333.33,1129000.0,7051842.6,Top,3511190.0,-193032.1,Los Angeles
9340,0.0341,94124,37.7328,-122.391,RESIDENTIAL,TOWNHOUSE/ROWHOUSE,2019,3,2.0,3,2019-11-12,382000,127333.33,127333.33,11202346.04,Top,1505080.0,-49329.94,San Francisco
24490,0.7986,90077,34.083073,-118.439049,MISCELLANEOUS INDUSTRIAL,SFR,1925,7,7.0,5,2020-11-19,12000000,1714285.71,2400000.0,15026296.02,Top,37320000.0,-2052677.0,Los Angeles
134662,0.8051,90094,33.970956,-118.427663,RESIDENTIAL,SFR,2005,20,0.0,0,2021-02-04,749000,37450.0,,930319.22,Top,2187080.0,-120199.8,Los Angeles
140741,0.1126,90066,34.003015,-118.449437,MISCELLANEOUS INDUSTRIAL,SFR,1939,51,14.0,3,2020-08-31,1688000,33098.04,562666.67,14991119.01,Top,5249680.0,-288657.3,Los Angeles
147899,0.1696,90290,34.102624,-118.621148,MISCELLANEOUS INDUSTRIAL,SFR,1947,96,0.0,0,2019-09-08,755000,7864.58,,4451650.94,Top,2974700.0,-163522.6,Los Angeles
127607,0.5326,90265,34.019324,-118.810809,RESIDENTIAL,SFR,1986,120,0.0,0,2019-08-22,1000000,8333.33,,1877581.67,Top,3940000.0,-216618.7,Los Angeles
79527,0.0968,90501,33.830122,-118.306272,RESIDENTIAL,SFR,1945,120,0.0,0,2019-06-07,1079000,8991.67,,11146694.21,Top,4251260.0,-233739.5,Los Angeles
51386,0.2494,91502,34.180277,-118.303818,RESIDENTIAL,CONDOMINIUM,1992,135,0.0,0,2020-04-23,530000,3925.93,,2125100.24,Top,1648300.0,-90564.32,Los Angeles
141501,0.1192,90290,34.103951,-118.591971,MISCELLANEOUS INDUSTRIAL,SFR,1950,150,1.0,0,2020-08-12,1360000,9066.67,,11409395.97,Top,4229600.0,-232548.1,Los Angeles


In [46]:
sf_la_size_fltr = sf_la["size"] <= 150
sf_la_sz = sf_la[~sf_la_size_fltr]

In [None]:
sf_la_sz

In [48]:
# .sort_values(by='size').head(10)
sf_la_sz[sf_la_sz["bedrooms"] == 0].shape

(4199, 19)

In [24]:
sf_la[sf_la["Price Per Size Unit"] >= 50000].head()

Unnamed: 0,lot_size,zipcode,latitude,longitude,sub_type,type,year_built,size,bathrooms,bedrooms,transaction_date,sale_price,Price Per Size Unit,Price Per Bed Unit,Price Per Lot Size Unit,Tier,Monthly Mortgage,Affordability,County
2628,0.0,94103,37.774712,-122.412201,RESIDENTIAL,CONDOMINIUM,1906,1,1.0,1,2020-07-27,680000,680000.0,680000.0,,Top,2114800.0,-69354.41,San Francisco
9340,0.0341,94124,37.7328,-122.391,RESIDENTIAL,TOWNHOUSE/ROWHOUSE,2019,3,2.0,3,2019-11-12,382000,127333.33,127333.33,11202346.04,Top,1505080.0,-49329.94,San Francisco
24490,0.7986,90077,34.083073,-118.439049,MISCELLANEOUS INDUSTRIAL,SFR,1925,7,7.0,5,2020-11-19,12000000,1714285.71,2400000.0,15026296.02,Top,37320000.0,-2052677.0,Los Angeles
26679,0.6824,90212,34.061228,-118.395009,MISCELLANEOUS INDUSTRIAL,CONDOMINIUM,2015,690,1.0,1,2020-11-20,35800000,51884.06,35800000.0,52461899.18,Top,111338000.0,-6124018.0,Los Angeles
27498,0.6824,90212,34.061335,-118.395189,MISCELLANEOUS INDUSTRIAL,CONDOMINIUM,2015,690,1.0,1,2020-11-20,35800000,51884.06,35800000.0,52461899.18,Top,111338000.0,-6124018.0,Los Angeles


In [23]:
violin_c = px.histogram(sf_la[sf_la["Price Per Size Unit"] >= 50000], x='Price Per Size Unit', color='County',histfunc="count",marginal="violin", title='Distirbution of Residential properties less than 50k per SQ FT')
violin_c.show()

In [268]:
sf_violin_chart = px.histogram(ca_pickled['San Francisco'], x="Price Per Size Unit", marginal="violin", title='San Francisco Price Per SQ FT')
sf_violin_chart.show()

In [63]:
la_sales_type = px.histogram(sales_count_by_type[sales_count_by_type.County =='Los Angeles'], x='transaction_date', y='sub_type', color='type', histfunc="sum", labels={
                     "transaction_date": "transaction period",
                     "sub_type": "residential types",
                 }, title='Los Angeles Monthly Sales by Type')
la_sales_type.update_traces(xbins_size="M1")

# Annotations
la_sales_type.add_annotation(x='2020-01-26',y=7000 ,text="LA's First Covid-19 Case")
la_sales_type.add_annotation(x='2020-03-16',y=5000 ,text='State Wide Shelter In Place Order')
la_sales_type.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-01-26', x1= '2020-01-26',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
la_sales_type.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-03-19', x1= '2020-03-19',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
la_sales_type.show()

In [67]:
sf_sales_type = px.histogram(sales_count_by_type[sales_count_by_type.County =='San Francisco'], x='transaction_date', y='sub_type', color='type', histfunc="sum", labels={
                     "transaction_date": "transaction period",
                     "sub_type": "residential types",
                 }, title='San Francisco Monthly Sales by Type')
sf_sales_type.update_traces(xbins_size="M1")

# add text annotations
sf_sales_type.add_annotation(x='2020-02-02',y=600 ,text="SF's First Covid-19 Case")
sf_sales_type.add_annotation(x='2020-03-16',y=400 ,text='Shelter In Place Order')
sf_sales_type.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-02-02', x1= '2020-02-02',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
sf_sales_type.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-03-16', x1= '2020-03-16',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )

sf_sales_type.show()

In [61]:
la_tier_chart = px.histogram(sales_count_by_tier[sales_count_by_tier.County =='Los Angeles'], x='transaction_date', y='sub_type', color='Tier', histfunc="sum", title='Los Angeles Monthly Sales by Tier')
la_tier_chart.update_traces(xbins_size="M1")


la_tier_chart.add_annotation(x='2020-01-26',y=7000 ,text="LA's First Covid-19 Case")
la_tier_chart.add_annotation(x='2020-03-16',y=5000 ,text='State Wide Shelter In Place Order')
la_tier_chart.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-01-26', x1= '2020-01-26',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
la_tier_chart.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-03-19', x1= '2020-03-19',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
la_tier_chart.show()

In [250]:
sf_tier_chart = px.histogram(sales_count_by_tier[sales_count_by_tier.County =='San Francisco'], x='transaction_date', y='sub_type', color='Tier', histfunc="sum", title='San Francisco Daily Sales by Tier')
sf_tier_chart.update_traces(xbins_size="M1")

# add text annotations
sf_tier_chart.add_annotation(x='2020-02-02',y=600 ,text="SF's First Covid-19 Case")
sf_tier_chart.add_annotation(x='2020-03-16',y=400 ,text='Shelter In Place Order')
sf_tier_chart.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-02-02', x1= '2020-02-02',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
sf_tier_chart.add_shape(
      dict(
        type= 'line',
        yref= 'paper', y0= 0, y1= 1,
        xref= 'x', x0= '2020-03-16', x1= '2020-03-16',
      line=dict(color="Gray",width=1,dash="dashdot")
      ),
    )
sf_tier_chart.show()