In [1]:
# Dependencies and Setup
import csv
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress 
import seaborn as sns 
from scipy import stats
import scipy.stats as sts


Alicia's analysis 

* Households as of March of the following year. Income in current and 2019 CPI-U-RS adjusted dollars (see footnote 28). Beginning in 2010, standard errors were calculated using replicate weights.
* Table H-8. Median Household Income by State: 1984 to 2019
* Source: U.S. Census Bureau, Current Population Survey, Annual Social and Economic Supplements (CPS ASEC). For information on confidentiality protection, sampling error, nonsampling error, and definitions, see <https://www2.census.gov/programs-surveys/cps/techdocs/cpsmar20.pdf>.
* Footnotes are available at <https://www.census.gov/topics/income-poverty/income/guidance/cps-historic-footnotes.html>.

In [2]:
# Location of the median household income CSV, given relative to the notebook's working directory
median_income_path = "../Data/Income/Median-Household-Income-5y.csv"

In [3]:
# Load the income CSV into a DataFrame and preview its first five rows
median_income_data = pd.read_csv(filepath_or_buffer=median_income_path)
median_income_data.head(n=5)


Unnamed: 0,State,2019 Median income,2019 Standard error,2018 Median income,2018 Standard error,2017 Median income,2017 Standard error,2016 Median income,2016 Standard error,2015 Median income,2015 Standard error,2014 Median income,2014 Standard error
0,United States,68703,550,63179,420,61372,335,59039,436,56516,321,53657,392
1,Alabama,56200,2512,49936,2423,51113,845,47221,2301,44509,3419,42278,1529
2,Alaska,78394,6685,68734,3390,72231,2719,75723,4086,75112,3485,67629,3153
3,Arizona,70674,3391,62283,2291,61125,2642,57100,1971,52248,2008,49254,2304
4,Arkansas,54539,2384,49781,2108,48829,2642,45907,2165,42798,1572,44922,2546


In [4]:
# Convert every column except "State" to float, printing each column name as it is visited.
# The values are stripped of "," before the cast, so the raw columns are presumably text
# with thousands separators (e.g. "68,703").
# Columns that are already numeric skip the string step: the .str accessor raises
# AttributeError on numeric data, which previously made this cell crash when re-run.
for x in median_income_data.columns:
    if x != "State":
        column = median_income_data[x]
        if not pd.api.types.is_numeric_dtype(column):
            # regex=False: treat "," as a literal character rather than a pattern
            column = column.str.replace(",", "", regex=False)
        median_income_data[x] = column.astype(float)
    print(x)

State
2019 Median income
2019 Standard error
2018 Median income
2018 Standard error
2017 Median income
2017 Standard error
2016 Median income
2016 Standard error
2015 Median income
2015 Standard error
2014 Median income
2014 Standard error


In [5]:
# Verify the conversion: every column except "State" should now report float64
median_income_data.dtypes

State                   object
2019 Median income     float64
2019 Standard error    float64
2018 Median income     float64
2018 Standard error    float64
2017 Median income     float64
2017 Standard error    float64
2016 Median income     float64
2016 Standard error    float64
2015 Median income     float64
2015 Standard error    float64
2014 Median income     float64
2014 Standard error    float64
dtype: object

In [6]:
# Print the per-column dtypes as plain text
print(median_income_data.dtypes)

State                   object
2019 Median income     float64
2019 Standard error    float64
2018 Median income     float64
2018 Standard error    float64
2017 Median income     float64
2017 Standard error    float64
2016 Median income     float64
2016 Standard error    float64
2015 Median income     float64
2015 Standard error    float64
2014 Median income     float64
2014 Standard error    float64
dtype: object


In [7]:
# Build a slimmer frame: the state name plus one median-income column per year (2014-2019),
# relabelling each "<year> Median income" column to just "<year>".
household_income_df = pd.DataFrame(
    {
        "State": median_income_data["State"],
        **{
            year: median_income_data[f"{year} Median income"]
            for year in ("2019", "2018", "2017", "2016", "2015", "2014")
        },
    }
)
household_income_df.head()

Unnamed: 0,State,2019,2018,2017,2016,2015,2014
0,United States,68703.0,63179.0,61372.0,59039.0,56516.0,53657.0
1,Alabama,56200.0,49936.0,51113.0,47221.0,44509.0,42278.0
2,Alaska,78394.0,68734.0,72231.0,75723.0,75112.0,67629.0
3,Arizona,70674.0,62283.0,61125.0,57100.0,52248.0,49254.0
4,Arkansas,54539.0,49781.0,48829.0,45907.0,42798.0,44922.0


In [14]:
# Drop the first row, which holds the nationwide "United States" figures,
# so that only the per-state rows remain
allstates_df = household_income_df.iloc[1:, :]
allstates_df.head()

Unnamed: 0,State,2019,2018,2017,2016,2015,2014
1,Alabama,56200.0,49936.0,51113.0,47221.0,44509.0,42278.0
2,Alaska,78394.0,68734.0,72231.0,75723.0,75112.0,67629.0
3,Arizona,70674.0,62283.0,61125.0,57100.0,52248.0,49254.0
4,Arkansas,54539.0,49781.0,48829.0,45907.0,42798.0,44922.0
5,California,78105.0,70489.0,69759.0,66637.0,63636.0,60487.0


In [21]:
# Keep only the state name and the most recent (2019) median income
latest_data = allstates_df.loc[:, ["State", "2019"]]
latest_data.head()


Unnamed: 0,State,2019
1,Alabama,56200.0
2,Alaska,78394.0
3,Arizona,70674.0
4,Arkansas,54539.0
5,California,78105.0


In [23]:
# Save the 2019 snapshot to disk. With to_csv's default index=True the row index
# is written as the first, unnamed CSV column.
latest_data.to_csv("../Data/Alicia/latest_data.csv")

In [9]:
# Flip rows and columns so each state becomes a column and each year a row
test = allstates_df.transpose()
test

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,41,42,43,44,45,46,47,48,49,50
State,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Florida,Georgia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
2019,56200,78394,70674,54539,78105,72499,87291,74194,58368,56628,...,64255,56627,67444,84523,74305,81313,82454,53706,67355,65134
2018,49936,68734,62283,49781,70489,73034,72812,65012,54644,55821,...,59463,56060,59785,77067,70066,77151,79726,50573,62629,62539
2017,51113,72231,61125,48829,69759,74172,72780,62318,53681,57016,...,56894,55240,59295,71319,63805,71293,75418,45392,63451,57837
2016,47221,75723,57100,45907,66637,70566,75923,58046,51176,53527,...,57450,51344,58146,67481,60837,66451,70310,44354,59817,57829
2015,44509,75112,52248,42798,63636,66596,72889,57756,48825,50768,...,55065,47330,56473,66258,59494,61486,67243,42824,55425,60925
2014,42278,67629,49254,44922,60487,60940,70161,57522,46140,49555,...,53053,43716,53875,63383,60708,66155,59068,39552,58080,55690


In [10]:
# Collect the year column labels: every column of allstates_df except "State"
column_names = ["State"]
year_data = [label for label in allstates_df.columns if label not in column_names]

In [11]:
# Build one record per year column: the state names, the year label, and that
# year's median household incomes, all aligned on allstates_df's index.
all_df = []
for x in year_data:
    a = {
        "State": allstates_df["State"],
        # Fix: store the year label itself. Previously allstates_df["2019"]
        # (the 2019 income values) was stored under "Year" for every record.
        # A Series is used so the value keeps the same shape as its siblings.
        "Year": pd.Series(x, index=allstates_df.index),
        # NOTE(review): key spelling ("Houshold") left unchanged in case other code reads it
        "Median Houshold Income": allstates_df[x],
    }
    all_df.append(a)
        


In [12]:
# Gather the real-estate table's value columns: every column that is not one of
# the identifier columns listed below.
# NOTE(review): real_estate_data is not created in any visible cell, so this cell
# raises NameError on a fresh kernel until the cell that loads it has run.
real_estate_date = []
column_names = ["RegionID", "SizeRank", "RegionName", "RegionType", "StateName"]
real_estate_date.extend(
    label for label in real_estate_data.columns if label not in column_names
)

NameError: name 'real_estate_data' is not defined

In [44]:
# Reshape the wide real-estate table into a flat list of records: one dict per
# (region, date column) pair. Despite its name, real_estate_df is a plain list here.
real_estate_df = []
for _, region in real_estate_data.iterrows():
    # Identifier fields shared by every record of this region
    region_fields = {
        "Region ID": region["RegionID"],
        "Size Rank": region["SizeRank"],
        "Region Name": region["RegionName"],
        "RegionType": region["RegionType"],
        "State Name": region["StateName"],
    }
    for date_label in real_estate_date:
        # Column labels are split on "-"; presumably they look like "YYYY-MM-DD"
        parts = date_label.split("-")
        year, month, day = parts[0], parts[1], parts[2]
        real_estate_df.append(
            {
                **region_fields,
                # Re-ordered to MM-DD-YYYY with month and day zero-padded to two digits
                "Date": f"{month.rjust(2,'0')}-{day.rjust(2,'0')}-{year}",
                "Year": year,
                "Month": month,
                "Housing Price": region[date_label],
            }
        )
        


In [53]:
# Turn the list of records into a DataFrame and preview the first rows
new_RE_df = pd.DataFrame(data=real_estate_df)
new_RE_df.head()

NameError: name 'real_estate_df' is not defined

In [48]:
# Remove every row that contains at least one missing value
new_RE_df = new_RE_df.dropna(how="any")

# Drop exact duplicate rows, trim "Region Name" to the text before the first comma,
# then rename that column to "State".
# Built as one chain with .assign() rather than assigning a column into the
# de-duplicated frame, which avoids pandas' SettingWithCopyWarning.
no_duplicate_RE_df = (
    new_RE_df
    .drop_duplicates()
    .assign(**{"Region Name": lambda frame: frame["Region Name"].str.split(",").str[0]})
    .rename(columns={"Region Name": "State"})
)
no_duplicate_RE_df


Unnamed: 0,Region ID,Size Rank,State,RegionType,State Name,Date,Year,Month,Housing Price
0,9,0,California,State,CA,01-31-2015,2015,01,428868.0
1,9,0,California,State,CA,02-28-2015,2015,02,431039.0
2,9,0,California,State,CA,03-31-2015,2015,03,434305.0
3,9,0,California,State,CA,04-30-2015,2015,04,437028.0
4,9,0,California,State,CA,05-31-2015,2015,05,439880.0
...,...,...,...,...,...,...,...,...,...
3667,62,50,Wyoming,State,WY,08-31-2020,2020,08,257141.0
3668,62,50,Wyoming,State,WY,09-30-2020,2020,09,257853.0
3669,62,50,Wyoming,State,WY,10-31-2020,2020,10,258747.0
3670,62,50,Wyoming,State,WY,11-30-2020,2020,11,260035.0
