# Data Manipulation

In [None]:
# Load Modin for faster pandas if needed
# https://towardsdatascience.com/get-faster-pandas-with-modin-even-on-your-laptops-b527a2eeda74
# !pip install modin

In [1]:
import numpy as np
# import modin.pandas as pd # This is for when we have all the files
import pandas as pd  # if modin is not needed
import pickle
import os

In [3]:
# folder navigation
os.chdir("..") # Move up directory
print(os.path.abspath(os.curdir))

/Users/Chris/Documents/GitHub/SIADS691-Covid-Real-Estate


# Import Data

## New York

In [217]:
# Read in New York Dictionary of dataframes
ny_dict_df = pd.read_pickle("datasets/NY_data/final_data_frame/NY_dict_df_col_cleaned_v2.pkl")
print('done')

done


## Income

In [60]:
# Federal income data reported 2019
income_data = pd.read_excel("datasets/income/est19-all_income.xls", header=3)[['Postal Code', 'Name','Median Household Income']]

In [77]:
# Select relevant income data
income_data_CA_NY = income_data[(income_data['Postal Code'].isin(['CA','NY'])) & income_data['Name'].str.contains('County')]

# Select relevant columns
median_income_df = income_data_CA_NY[['Postal Code', 'Name','Median Household Income']]
median_income_df.rename(columns={'Name':'County','Postal Code':'State ABV'}, inplace=True)

# Isolate each state's Area Median Income (AMI).
ny_median_income_df = median_income_df[median_income_df['State ABV'] == 'NY']
ca_median_income_df = median_income_df[median_income_df['State ABV'] == 'CA']

## Population

In [66]:
# Need to import population for Covid factorization 
# (Daily infection / Population) -> DEFAULT TO NYT
# (Daily death / Population) -> DEFAULT TO NYT

# Density for filtering on the "urban factor" & causal county comparison 
# https://worldpopulationreview.com/us-counties/states/ca
# https://worldpopulationreview.com/us-counties/states/ny
ny_density = pd.read_csv("datasets/population/ny_density_pop_by_county.csv")
ca_density = pd.read_csv("datasets/population/ca_density_pop_by_county.csv")
ny_density['state'] = 'New York'
ca_density['state'] = 'California'

# Concatenate files together
pop_density = pd.concat([ny_density,ca_density], axis = 0).drop(['GrowthRate'], axis=1)

# Rename for Title Caps
pop_density.rename(columns={'CTYNAME':'City Name','popDensity':'Pop Density','pop2021': 'Pop 2021','state':'State'}, inplace=True)

# Secondary dataset 
pop_data = pd.read_csv("datasets/population/co-est2019-alldata.csv")
pop_data = pop_data[pop_data.STNAME.isin(['California','New York']) ][['STNAME', 'CTYNAME','POPESTIMATE2019']]


In [30]:
ca_density.sort_values(by='CTYNAME').head()

Unnamed: 0,CTYNAME,pop2021,GrowthRate,popDensity
6,Alameda County,1680480,11.0701,2273.707
57,Alpine County,1209,4.1344,1.6358
45,Amador County,40446,6.7571,54.7239
28,Butte County,196880,-10.4883,266.3807
43,Calaveras County,46319,1.8716,62.6701


In [7]:
pop_data.head()

Unnamed: 0,STNAME,CTYNAME,POPESTIMATE2019
190,California,California,39512223
191,California,Alameda County,1671329
192,California,Alpine County,1129
193,California,Amador County,39752
194,California,Butte County,219186


## Covid
Covid-19 reporting starts on 2020.01.20

In [79]:
# Read in CDC data -> NOT NEEDED
# cdc = pd.read_csv("datasets/covid/CDC_COVID-19_Case_Surveillance_Public_Use_Data_with_Geography.csv")

# Read in New York Times data - > SELECTED
# This represents a cumulative number
# https://github.com/nytimes/covid-19-data/blob/master/us-counties.csv
nyt_cum = pd.read_csv("datasets/covid/nyt-us-counties.csv")
nyt_cum = nyt_cum[nyt_cum.state.isin(['New York','California'])].drop(['fips'], axis=1)
nyt_cum = nyt_cum.rename(columns={'cases':'cum_cases','deaths':'cum_deaths'})

# Next represents daily reports and averages - > SELECTED
# https://github.com/nytimes/covid-19-data/tree/master/rolling-averages
nyt_avgs = pd.read_csv("datasets/covid/nyt-rolling-us-counties.csv")
nyt_avgs = nyt_avgs[nyt_avgs.state.isin(['New York','California'])].drop(['geoid'], axis=1)
nyt_avgs = nyt_avgs.rename(columns={'cases':'New Cases','cases_avg':'R7 Cases AVG','deaths':'Days Deaths','cases_avg_per_100k':'R7 Cases AVG Per 100k','deaths_avg':'R7 Deaths AVG','deaths_avg_per_100k':'R7 Deaths AVG Per 100k'})

# Merge the dataframes into one
covid = pd.merge(nyt_cum, nyt_avgs, on=['date','county','state'], how='outer')
covid.rename(columns={'date':'Date', 'county':'County', 'state':'State', 'cum_cases': 'CUM Cases', 'cum_deaths':'CUM Deaths'}, inplace=True)

# Read in Johns Hopkins data -> NOT NEEDED : SAME AS NYT
# jh_confirmed = pd.read_csv("datasets/covid/jh_time_series_covid19_confirmed_US.csv")
# jh_death = pd.read_csv("datasets/covid/jh_time_series_covid19_deaths_US.csv")

In [72]:
# Need to check if each county is labelled as the same 
# Perhaps default to string contains _____
nyt[nyt.county == 'New York City'].head()

Unnamed: 0,date,county,state,fips,cases,deaths
416,2020-03-01,New York City,New York,,1,0.0
448,2020-03-02,New York City,New York,,1,0.0
482,2020-03-03,New York City,New York,,2,0.0
518,2020-03-04,New York City,New York,,2,0.0
565,2020-03-05,New York City,New York,,4,0.0


In [49]:
nyt[nyt.date == '2021-04-02'].head()

Unnamed: 0,date,county,state,fips,cases,deaths
1180117,2021-04-02,Autauga,Alabama,1001.0,6606,99.0
1180118,2021-04-02,Baldwin,Alabama,1003.0,20519,301.0
1180119,2021-04-02,Barbour,Alabama,1005.0,2228,55.0
1180120,2021-04-02,Bibb,Alabama,1007.0,2544,58.0
1180121,2021-04-02,Blount,Alabama,1009.0,6455,132.0


In [52]:
jh_confirmed[['Admin2','Province_State','4/2/21']].head()

Unnamed: 0,Admin2,Province_State,4/2/21
0,Autauga,Alabama,6606
1,Baldwin,Alabama,20519
2,Barbour,Alabama,2228
3,Bibb,Alabama,2544
4,Blount,Alabama,6455


In [53]:
jh_death[['Admin2','Province_State','4/2/21']].head()

Unnamed: 0,Admin2,Province_State,4/2/21
0,Autauga,Alabama,99
1,Baldwin,Alabama,301
2,Barbour,Alabama,55
3,Bibb,Alabama,58
4,Blount,Alabama,132


In [None]:
# Investigating Conclusion
# NYT maintains JH integrity and is in a better format for wrangling.

# Functions for Metrics, Statistics, Classification 

<em>Drilling into true affordability requires fincial data we don't have</em>

The first-time homebuyer affordability index is based upon the assumption of 70% median household income, 85% of the median house price, a 10% down payment on a 30-year fixed rate mortgage at prevailing rates plus 0.25% added per month for mortgage insurance. The assumption of 25% of gross income for mortgage repayments also applies.

The composite housing affordability index assumes a median household income, median house prices, a 20% down payment on a 30-year fixed rate mortgage at prevailing rates. The assumption is also made that the lender will not allow principal and interest costs to exceed 25% of gross income.

<em>Therefore we need to use some generic metric such as the 30% rule.</em>

A ratio of housing cost to income is the gold standard, it is known as the 30 percent rule, and it is established because of legal actions.

https://www.forbes.com/sites/rogervaldez/2020/02/07/housing-ideas-for-the-feds-find-a-new-measure-of-affordable/?sh=2e9f3c3c36f8

https://www.huduser.gov/portal/pdredge/pdr_edge_featd_article_092214.html

In [255]:
# For some reason nesting this inside summary states isn't working
def affordability(row):
    """
    Row wise operations for apply function
    Return row of values
    """
    P = row['sale_price']
    r = freddie_mac_mortgage_rates[row['transaction_date'].year]
    row['Monthly Mortgage'] = (P * r * ((1 + r)**n)) / (((1 + r)**n) - 1)
    row['Affordability'] = 100 * ((monthly_30 - row['Monthly Mortgage']) / monthly_30 )
    return row

In [304]:
def summary_stats(df, county, median):
    """
    Requires numpy 
    Input variable = df (dataframe) , area median income (int), county (string)

    Fixed Monthly Mortgage Repayment Calculation = P * r * (1 + r)n / [(1 + r)n â€“ 1]
    where P = Outstanding loan amount, r = Effective monthly interest rate, 
    n = Total number of years * 12 months

    Return new unit price columns -> 'price_per_size_unit', 'price_per_bed_unit', 'price_per_lot_size_unit'
    Return class based on 'price_per_size_unit'  -> 'tier' [lower, mid , top]
    Return affordability -> 'monthly_mortgage' , 'affordability' , 'ami'
    """

    # Unit price columns
    df['Price Per Size Unit'] = np.where(df['size'] != 0, 
        round(df['sale_price'] / df['size'], 2), # True
        np.nan # False
        )
    df['Price Per Bed Unit'] = np.where(df['bedrooms'] != 0, 
        round(df['sale_price'] / df['bedrooms'], 2), # True
        np.nan # False
        )
    df['Price Per Lot Size Unit'] = np.where(df['lot_size'] != 0, 
        round(df['sale_price'] / df['lot_size'], 2), # True
        np.nan # False
        )

    # Convert to datetime
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])

    # Binning into 3 equal tiers based on price
    df['Tier'] = pd.cut(
        df[
            (df['Price Per Size Unit'].notna()) & 
            (df['sub_type'].isin(['RESIDENTIAL','HOUSE']))
            ]['Price Per Size Unit'], 
        bins=3, 
        labels=['Bottom','Middle','Top'], 
        ordered=True)

    # Mortgage variables
    # Cite: http://www.freddiemac.com/pmms/docs/30yr_pmmsmnth.xls
    freddie_mac_mortgage_rates = {2019:3.94,2020:3.11,2021:2.92}
    n = 30 * 12
    monthly_30 = ( median / 12 ) * .3

    # Apply the affordability function
    df = df.apply(affordability, axis=1)
    

    return df


In [None]:
# Abandoned this calculation for NYT stats for integrity reasons

def covid_stats(county, covid_df = nyt, pop_df = pop_data):
    """
    This function calculates rolling averages and percentage of population statistics
    INPUT -> county = county (string) ; covid_df = new york times (dataframe) ; pop_df = population (dataframe)

    RRTURN -> None ( in-place dataframe transformation )
    """

    # Day's Confirmed Cases / Population
    # covid_df['confirmed_pct'] = covid_df['cases'] / pop_data[pop_data.CTYNAME.str.contains(county)]['POPESTIMATE2019']

    # Day's Deaths / Population
    #covid_df['death_pct'] = covid_df['deaths'] / pop_data[pop_data.CTYNAME.str.contains(county)]['POPESTIMATE2019']

    # 7 day rolling cases_avg_per_100k
    # 7 day rolling deaths_avg_per_100k


    return none

# Run summary_stats() on NYC

In [289]:
for county_key in sorted(ny_dict_df.keys()):
    missing_counties = list()
    try:
        ny_dict_df[county_key] = summary_stats(
                df = ny_dict_df[county_key], 
                county = county_key, 
                median = int(median_income_df[
                        (median_income_df['State ABV'] == 'NY') & 
                        (median_income_df['County'].str.contains(str(county_key)))
                        ]['Median Household Income'])
                        )
    except:
        missing_counties.append(county_key)
        print(f"County income data mising for {county_key}. Continuing...")
        continue

County income data mising for Bradford. Continuing...
County income data mising for Bergen. Continuing...
County income data mising for McKean. Continuing...
County income data mising for Suffolk. Continuing...
County income data mising for Erie_2. Continuing...
County income data mising for Middlesex_1. Continuing...
County income data mising for Kings. Continuing...
County income data mising for Lewis. Continuing...
County income data mising for New Haven. Continuing...
County income data mising for Pike. Continuing...
County income data mising for Delaware. Continuing...
County income data mising for Potter. Continuing...
County income data mising for Berkshire. Continuing...
County income data mising for Fairfield. Continuing...
County income data mising for Hamilton. Continuing...
County income data mising for Chemung. Continuing...
County income data mising for Bennington. Continuing...
County income data mising for Rutland. Continuing...
County income data mising for Genesee. Co