# Statistics to Use (Ideas)
- Political - ?
- Population Density
- Migration patterns
- Age demographics
- Culture - ?
- Interest Rates
- GDP
- Number of housing units
- Proximity to social/metro centres
- City Age -- History (how would we quantify?) 
- Imports/Exports - ?
- Main Industry/Sector - Resources, raw materials
- Average Income
- Number of Schools, Education
- Geographic Location
- Number of highways, transporation infrastructure
- Terrain 

## Economic
Interest Rates, GDP, Imports/Exports, Average Income

## Important
- <b>Politics</b>
    - Partisan segration, polarity
- <b>Education</b>
- Abundance of housing units - ?
- City Age/History
- <b>Main Industry Sector</b>
- <b> Economic Measures</b>
- <b>Geographic Location</b>
    - Proximity to other cities, metro centres
- Number of highways, transporation infrastructure

# TO-DO:
- Add more features, fix poverty data

In [139]:
# Import Necessary Libraries
import numpy as np 
import pandas as pd

# FEATURE 1: RATIO OF PEOPLE TAKING PUBLIC TRANSPORATION TO WORK

transportation_data = pd.read_csv('../datasets/transportation_data/ACSST5Y2010.S0802_data_with_overlays_2020-03-04T134515.csv')
transportation_data.NAME = transportation_data.NAME.map(lambda x: x.replace(' Metro Area', '').replace(' Micro Area', ''))


# Filter data to get only appropriate metropolitan areas
transportation_data = transportation_data[~transportation_data.NAME.str.contains('United State')]
num_public_transport = transportation_data['S0802_C04_001E'].to_numpy()[1:].astype(int)
num_workers = transportation_data['S0802_C01_001E'].to_numpy()[1:].astype(int)
ratio = np.divide(num_public_transport, num_workers)

# Standardisation, Feature Scaling
ratio = (ratio - np.mean(ratio))/(ratio.std())
ratio = [float(r) for r in ratio]

# Create new DataFrame with each metro area and corresponding attribute values
new_df = pd.DataFrame(zip(transportation_data['NAME'].to_numpy()[1:],  ratio), columns = ['metro_name', 'ratio_of_public_transport']).set_index('metro_name').sort_values('metro_name')
new_df['ratio_of_public_transport'].fillna(0, inplace = True)
new_df

Unnamed: 0_level_0,ratio_of_public_transport
metro_name,Unnamed: 1_level_1
"Abbeville, LA",-0.355924
"Aberdeen, SD",-0.362291
"Aberdeen, WA",0.248173
"Abilene, TX",-0.332337
"Ada, OK",-0.520086
...,...
"York-Hanover, PA",0.002138
"Youngstown-Warren-Boardman, OH-PA",-0.186601
"Yuba City, CA",-0.016467
"Yuma, AZ",0.635026


In [140]:
# Standardise a pandas Series to prepare for K-Means
def standardise(df, series):
    df = df.copy(deep = False)
    column = df.loc[:, series]
    standardised = (column - column.mean()) / column.std()
    return standardised

# Standardise when converted to array
def standardise_array(arr):
    return (arr - arr.mean()) / arr.std()

In [141]:

# FEATURE 2: MEDIAN HOUSEHOLD INCOME

income_data = pd.read_csv('../datasets/income_data/ACSST5Y2010.S2503_data_with_overlays_2020-03-04T163518.csv')
income_data = income_data[~income_data.NAME.str.contains('United States')]
income_data.NAME = income_data.NAME.map(lambda x: x.replace(' Metro Area', '').replace(' Micro Area', ''))
median_income = income_data['S2503_C01_013E'].to_numpy()[1:].astype(int)

# Standardise
median_income = standardise_array(median_income)

# Make sure every element of the median_income column is a float
new_df['median_income'] = [float(m) for m in median_income]
new_df

Unnamed: 0_level_0,ratio_of_public_transport,median_income
metro_name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Abbeville, LA",-0.355924,-0.622420
"Aberdeen, SD",-0.362291,1.245645
"Aberdeen, WA",0.248173,0.120348
"Abilene, TX",-0.332337,0.407350
"Ada, OK",-0.520086,0.195562
...,...,...
"York-Hanover, PA",0.002138,0.532464
"Youngstown-Warren-Boardman, OH-PA",-0.186601,0.268693
"Yuba City, CA",-0.016467,0.193791
"Yuma, AZ",0.635026,-0.604710


In [142]:
# FEATURE 3: PERCENT BELOW POVERTY LEVEL

#### CURRENTLY NOT WORKING #####

poverty_data = pd.read_csv('../datasets/poverty_data/ACSST1Y2010.S1701_data_with_overlays_2020-03-04T183351.csv')
poverty_data = poverty_data[~poverty_data.NAME.str.contains('United States')]
poverty_data.NAME = poverty_data.NAME.map(lambda x: x.replace(' Metro Area', '').replace(' Micro Area', ''))
percentages = poverty_data['S1701_C03_001E'].to_numpy()[1:].astype(float)
percentages = standardise_array(percentages)

poverty_df = pd.DataFrame(zip(poverty_data['NAME'][1:], percentages),
             columns = ['metro_name', 'pctg_below_poverty_line']).sort_values('metro_name')

new_df2 = new_df.merge(poverty_df, on = 'metro_name', how='outer', suffixes=('', '_y')).fillna(0)
new_df2                 

Unnamed: 0,metro_name,ratio_of_public_transport,median_income,pctg_below_poverty_line
0,"Abbeville, LA",-0.355924,-0.622420,0.000000
1,"Aberdeen, SD",-0.362291,1.245645,0.000000
2,"Aberdeen, WA",0.248173,0.120348,0.013290
3,"Abilene, TX",-0.332337,0.407350,0.139864
4,"Ada, OK",-0.520086,0.195562,0.000000
...,...,...,...,...
950,"York-Hanover, PA",0.002138,0.532464,-1.252450
951,"Youngstown-Warren-Boardman, OH-PA",-0.186601,0.268693,-0.002531
952,"Yuba City, CA",-0.016467,0.193791,0.345547
953,"Yuma, AZ",0.635026,-0.604710,0.472121


In [143]:
# FEATURE 4: POPULATION OF PEOPLE 3 AND OVER ENROLLED IN SCHOOL

school_data = pd.read_csv('../datasets/school_data/ACSST5Y2010.S1401_data_with_overlays_2020-03-04T192940.csv')
school_data.NAME = school_data.NAME.map(lambda x: x.replace(' Metro Area', '').replace(' Micro Area', ''))

# Log-transform, standardise
enrolled = school_data['S1401_C01_001E'].to_numpy()[1:].astype(int)
enrolled = standardise_array(enrolled)

# Correct type, save to test DataFrame
new_df2['enrolled_in_school'] = [float(e) for e in enrolled]
new_df2.to_csv('test_01.csv')
new_df2

Unnamed: 0,metro_name,ratio_of_public_transport,median_income,pctg_below_poverty_line,enrolled_in_school
0,"Abbeville, LA",-0.355924,-0.622420,0.000000,0.035869
1,"Aberdeen, SD",-0.362291,1.245645,0.000000,0.346162
2,"Aberdeen, WA",0.248173,0.120348,0.013290,-0.077303
3,"Abilene, TX",-0.332337,0.407350,0.139864,0.455581
4,"Ada, OK",-0.520086,0.195562,0.000000,-0.228167
...,...,...,...,...,...
950,"York-Hanover, PA",0.002138,0.532464,-1.252450,-0.257513
951,"Youngstown-Warren-Boardman, OH-PA",-0.186601,0.268693,-0.002531,0.005607
952,"Yuba City, CA",-0.016467,0.193791,0.345547,-0.255924
953,"Yuma, AZ",0.635026,-0.604710,0.472121,-0.217837


In [147]:
cbsa = pd.read_csv('../datasets/cbsa.csv')[['NAME', 'SQMI', 'HSE_UNITS', 'AVE_FAM_SZ', 'POP10_SQMI']]

for col in cbsa.columns[1:]:
    cbsa[col] = standardise(cbsa, col)

new_df2 = new_df2.merge(cbsa, left_on = 'metro_name', right_on = 'NAME', suffixes = ('', '_y'))
new_df2 = new_df2.dropna().drop(columns = ['NAME'])
new_df2

Unnamed: 0,metro_name,ratio_of_public_transport,median_income,pctg_below_poverty_line,enrolled_in_school,NAME_y,SQMI_y,HSE_UNITS_y,AVE_FAM_SZ_y,POP10_SQMI_y,...,AVE_FAM_SZ_y.1,POP10_SQMI_y.1,SQMI,HSE_UNITS,AVE_FAM_SZ,POP10_SQMI,SQMI_y.1,HSE_UNITS_y.1,AVE_FAM_SZ_y.2,POP10_SQMI_y.2
0,"Aberdeen, SD",-0.362291,1.245645,0.000000,0.346162,"Aberdeen, SD",0.419313,-0.265327,0.373006,-0.551088,...,0.373006,-0.551088,0.419313,-0.265327,0.373006,-0.551088,0.419313,-0.265327,0.373006,-0.551088
1,"Aberdeen, WA",0.248173,0.120348,0.013290,-0.077303,"Aberdeen, WA",0.028771,-0.207854,0.414232,-0.426037,...,0.414232,-0.426037,0.028771,-0.207854,0.414232,-0.426037,0.028771,-0.207854,0.414232,-0.426037
2,"Abilene, TX",-0.332337,0.407350,0.139864,0.455581,"Abilene, TX",0.368355,-0.087447,0.480195,-0.307371,...,0.480195,-0.307371,0.368355,-0.087447,0.480195,-0.307371,0.368355,-0.087447,0.480195,-0.307371
3,"Ada, OK",-0.520086,0.195562,0.000000,-0.228167,"Ada, OK",-0.465152,-0.272564,0.455459,-0.351006,...,0.455459,-0.351006,-0.465152,-0.272564,0.455459,-0.351006,-0.465152,-0.272564,0.455459,-0.351006
4,"Adrian, MI",-0.424802,0.868949,-0.429719,-0.224541,"Adrian, MI",-0.450319,-0.178981,0.463704,0.072041,...,0.463704,0.072041,-0.450319,-0.178981,0.463704,0.072041,-0.450319,-0.178981,0.463704,0.072041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,"York-Hanover, PA",0.002138,0.532464,-1.252450,-0.257513,"York-Hanover, PA",-0.389084,0.292187,0.447214,1.915353,...,0.447214,1.915353,-0.389084,0.292187,0.447214,1.915353,-0.389084,0.292187,0.447214,1.915353
795,"Youngstown-Warren-Boardman, OH-PA",-0.186601,0.268693,-0.002531,0.005607,"Youngstown-Warren-Boardman, OH-PA",-0.047236,0.574632,0.414232,1.099591,...,0.414232,1.099591,-0.047236,0.574632,0.414232,1.099591,-0.047236,0.574632,0.414232,1.099591
796,"Yuba City, CA",-0.016467,0.193791,0.345547,-0.255924,"Yuba City, CA",-0.248958,-0.116118,0.818251,0.083215,...,0.818251,0.083215,-0.248958,-0.116118,0.818251,0.083215,-0.248958,-0.116118,0.818251,0.083215
797,"Yuma, AZ",0.635026,-0.604710,0.472121,-0.217837,"Yuma, AZ",1.500424,-0.024277,0.785270,-0.437744,...,0.785270,-0.437744,1.500424,-0.024277,0.785270,-0.437744,1.500424,-0.024277,0.785270,-0.437744
