### Imports

In [9]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import array as array
from scipy.stats import pearsonr
%matplotlib inline
import seaborn as sns
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
#pd.set_option("display.max_rows", None, "display.max_columns", None)

### Data Loading

In [10]:
demographics=pd.read_csv("data/income/demographic.csv")
c_results=pd.read_csv("data/output/countyData.csv")
s_results=pd.read_csv("data/output/stateData.csv")
print('County result:',c_results.shape)
print('State result:',s_results.shape)
print('demographic:',demographics.shape)

County result: (4682, 7)
State result: (64, 5)
demographic: (3220, 37)


### Data Cleaning - Demographics

In [11]:
#Porto Rico doesnt vote
demographics.drop(demographics[demographics.State=='Puerto Rico'].index, inplace=True)

#Drop Columns
demographics.drop(['CountyId','IncomeErr','IncomePerCapErr'], axis=1, inplace=True)

#General Data adjustment for mapping
demographics['County'] = demographics['County'].str.replace(' County', '')
demographics['County'] = demographics['County'].str.replace(' Parish', '')
demographics['County'] = demographics['County'].str.replace(' parish', '')

#Virginia
m = demographics['State'] == 'Virginia'
demographics.loc[m, 'County'] = demographics.loc[m, 'County'].replace({' city': ''}, regex=True)
demographics.loc[m, 'County'] = demographics.loc[m, 'County'].replace({' City': ''}, regex=True)

#Alaska
m = demographics['State'] == 'Alaska'
demographics.loc[m, 'County'] = 'Alaska'

#Alaska Merge

#Sorting
demographics.sort_values(by=['State', 'County'], inplace=True, ascending=True)

### Data Cleaning - c_result

In [12]:
#Drop NA for Now until fix
c_results=c_results.dropna()

#TEMPORARY FIX /// Drop all Alaska Except one
c_results.drop(c_results[(c_results.state=='Alaska') & (c_results.county_id > 13)].index, inplace=True)

#Sorting
#c_results.sort_values(by=['state', 'county'], inplace=True, ascending=True)

### Data Cleaning - s_result

In [13]:
#Drop NA for Now until fix
s_results=s_results.dropna()

#Sorting
#c_results.sort_values(by=['state', 'county'], inplace=True, ascending=True)

### Data County Merge

In [14]:
County_merged=pd.merge(right=demographics, left=c_results, right_on=['State','County'], left_on=['state','county'])
County_merged.drop(['State', 'County'], axis=1, inplace=True)
County_merged.shape
County_merged.to_csv (r'data/output/Merged_County_V1.csv', index = False, header=True)

### Data State Merge

In [21]:
def wAvrgAggr(df, cName):
    totalW = 0.0
    totVal = 0.0
    for value, weight in zip(df[cName], df['TotalPop']):
        totVal += float(value) * float(weight)
        totalW += weight
    return totVal/totalW

def sumAggregation(df, cName):
    return df[cName].sum()

In [16]:
#Merge the data that needs to be summed
column_to_sum_list = ['TotalPop','Men', 'Women']
states_to_list = demographics['State'].unique()

for a in column_to_sum_list:
    mylist = []
    for b in states_to_list:
        mylist.append(sumMerge(demographics, a, b))
    s_results[a] = mylist
    #print(len(mylist))

In [17]:
s_results 
s_results.to_csv (r'data/output/Merged_State_V1.csv', index = False, header=True)

### NEED SOME WORK FOR WEIGHTED AVERAGE

#### 1rst try with my own function

In [18]:
#Merge the data that needs to be averaged
weightedAverageMerge(County_merged.loc[County_merged.state == 'Alaska'], 'Income')

#Results Are too Low. I must have made a mistake

279.27782388821566

#### 2nd Try

In [19]:
g = demographics.groupby('State')
demographics['wa']=demographics.Income / g.Income.transform("sum") * demographics.TotalPop
g.wa.sum()

#Results Are too High. I must have made a mistake

State
Alabama                  85052.526500
Alaska                   29045.053217
Arizona                 536780.607774
Arkansas                 45512.761175
California              759714.564592
Colorado                104667.028150
Connecticut             452467.621496
Delaware                328418.178645
District of Columbia    672391.000000
Florida                 328009.341073
Georgia                  81288.843396
Hawaii                  312951.801911
Idaho                    41917.211964
Illinois                149863.480248
Indiana                  73853.129842
Iowa                     33685.296036
Kansas                   31706.252247
Kentucky                 42469.469863
Louisiana                81081.338100
Maine                    90348.393569
Maryland                280782.108812
Massachusetts           517320.187104
Michigan                135826.429494
Minnesota                74080.791981
Mississippi              42143.596717
Missouri                 62701.448805
Montan

#### 3rd Try

In [22]:
stateData = County_merged.copy()
g = stateData.groupby('state')
for state, group in g:
    # Note that TotalPop should be aggregated last, as it is used in the wAvrg
    sumAgg = ['Men','Women','TotalPop','DEM_votes','REP_votes']
    wAvgAgg = ['normalized_election_outcome', 'Hispanic', 'White','Black','Native','Asian','Pacific', 'Poverty', 'ChildPoverty','Unemployment']
    
    for col in wAvgAgg:  
        newVal = wAvrgAggr(group, col)
        stateData.loc[(stateData['state'] == state), col] = newVal
    for col in sumAgg:
        newVal = sumAggregation(group, col)
        stateData.loc[(stateData['state'] == state), col] = newVal
        
    # And aggregate the
stateData = stateData[['state','state_id','DEM_votes','REP_votes','normalized_election_outcome', 'TotalPop','Men','Women','Hispanic','White','Black','Native','Asian','Pacific','IncomePerCap','Poverty','ChildPoverty','Unemployment']]
stateData.drop_duplicates()

Unnamed: 0,state,state_id,DEM_votes,REP_votes,normalized_election_outcome,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,Poverty,ChildPoverty,Unemployment
0,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,21857,17.974093,25.944204,7.552098
1,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,20020,17.974093,25.944204,7.552098
2,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,22710,17.974093,25.944204,7.552098
3,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,25013,17.974093,25.944204,7.552098
4,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,25803,17.974093,25.944204,7.552098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,32175,11.178200,12.607930,4.734684
3176,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,27345,11.178200,12.607930,4.734684
3177,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,31700,11.178200,12.607930,4.734684
3178,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,30939,11.178200,12.607930,4.734684


In [23]:
stateData

Unnamed: 0,state,state_id,DEM_votes,REP_votes,normalized_election_outcome,TotalPop,Men,Women,Hispanic,White,Black,Native,Asian,Pacific,IncomePerCap,Poverty,ChildPoverty,Unemployment
0,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,21857,17.974093,25.944204,7.552098
1,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,20020,17.974093,25.944204,7.552098
2,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,22710,17.974093,25.944204,7.552098
3,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,25013,17.974093,25.944204,7.552098
4,Alabama,1.0,1434159.0,843473.0,0.629458,4850771,2350806,2499965,4.090538,65.925899,26.394896,0.468075,1.271648,0.019180,25803,17.974093,25.944204,7.552098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3175,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,32175,11.178200,12.607930,4.734684
3176,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,27345,11.178200,12.607930,4.734684
3177,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,31700,11.178200,12.607930,4.734684
3178,Wyoming,56.0,193559.0,73491.0,0.730315,583200,298301,284899,9.718460,84.302438,0.989222,2.043968,0.843658,0.066466,30939,11.178200,12.607930,4.734684
