In [75]:
import pandas as pd
import numpy as np

data = pd.read_csv('data/dataset.csv')

years = [2008, 2012, 2016, 2020]

idx = ['year',
        'gisjoin',
        'state',
        'county']

sex_maritals = ['male_never_married',
                'male_married',
                'male_separated',
                'male_widowed',
                'male_divorced',
                'female_never_married',
                'female_married',
                'female_separated',
                'female_widowed',
                'female_divorced']

nativities = ['persons_native', 
              'persons_foreign_born']

sex_age_edus = ['male_18_24_less_than_9th',
                'male_18_24_some_hs',
                'male_18_24_hs_grad',
                'male_18_24_some_college',
                'male_18_24_associates',
                'male_18_24_bachelors',
                'male_18_24_graduate',
                'male_25_34_less_than_9th',
                'male_25_34_some_hs',
                'male_25_34_hs_grad',
                'male_25_34_some_college',
                'male_25_34_associates',
                'male_25_34_bachelors',
                'male_25_34_graduate',
                'male_35_44_less_than_9th',
                'male_35_44_some_hs',
                'male_35_44_hs_grad',
                'male_35_44_some_college',
                'male_35_44_associates',
                'male_35_44_bachelors',
                'male_35_44_graduate',
                'male_45_64_less_than_9th',
                'male_45_64_some_hs',
                'male_45_64_hs_grad',
                'male_45_64_some_college',
                'male_45_64_associates',
                'male_45_64_bachelors',
                'male_45_64_graduate',
                'male_65plus_less_than_9th',
                'male_65plus_some_hs',
                'male_65plus_hs_grad',
                'male_65plus_some_college',
                'male_65plus_associates',
                'male_65plus_bachelors',
                'male_65plus_graduate',
                'female_18_24_less_than_9th',
                'female_18_24_some_hs',
                'female_18_24_hs_grad',
                'female_18_24_some_college',
                'female_18_24_associates',
                'female_18_24_bachelors',
                'female_18_24_graduate',
                'female_25_34_less_than_9th',
                'female_25_34_some_hs',
                'female_25_34_hs_grad',
                'female_25_34_some_college',
                'female_25_34_associates',
                'female_25_34_bachelors',
                'female_25_34_graduate',
                'female_35_44_less_than_9th',
                'female_35_44_some_hs',
                'female_35_44_hs_grad',
                'female_35_44_some_college',
                'female_35_44_associates',
                'female_35_44_bachelors',
                'female_35_44_graduate',
                'female_45_64_less_than_9th',
                'female_45_64_some_hs',
                'female_45_64_hs_grad',
                'female_45_64_some_college',
                'female_45_64_associates',
                'female_45_64_bachelors',
                'female_45_64_graduate',
                'female_65plus_less_than_9th',
                'female_65plus_some_hs',
                'female_65plus_hs_grad',
                'female_65plus_some_college',
                'female_65plus_associates',
                'female_65plus_bachelors',
                'female_65plus_graduate']

labors = ['labor_force_total',
            'labor_force_armed',
            'labor_force_civilian',
            'labor_force_employed',
            'labor_force_unemployed',
            'not_in_labor_force']

households = ['households_total',
              'households_income_under_10k', 
              'households_income_10k_15k',
              'households_income_15k_25k',
              'households_income_25k_plus']

sexes = ['persons_male',
         'persons_female']

incomes = ['median_household_income',
           'per_capita_income']

sex_races = ['male_white',
            'female_white',
            'male_black',
            'female_black',
            'male_aian',
            'female_aian',
            'male_asian',
            'female_asian',
            'male_nhpi',
            'female_nhpi',
            'male_other',
            'female_other',
            'male_multi',
            'female_multi']

targets = ['democrat',
            'other',
            'republican',
            'non_voter']

misc_persons = ['persons_hispanic',
                'persons_below_poverty']

land = ['land_area_sqkm']

# combine all columns which contain features that count persons
person_cols = sexes + sex_maritals + sex_age_edus + sex_races + nativities +  labors + misc_persons

set(person_cols + targets + land + households + idx + incomes + ['persons_total']) == set(data.columns)

True

In [76]:
dfs = []
for year in years:
    df = data[data['year'] == year].copy()
    df.reset_index(drop=True, inplace=True)
    df_prob = df[idx + incomes].copy()  # keep only the index columns

    # per capita area
    df_prob['per_capita_area'] = df['land_area_sqkm'] / df['persons_total']

    # per capita households by income
    df_households = pd.DataFrame({f'per_capita_{col}': df[col] / df['persons_total'] for col in households[1:]})
    df_prob = pd.concat([df_prob, df_households], axis=1)

    # probability of living in a county
    df_prob['P(C)'] = df['persons_total'] / df['persons_total'].sum()

    # probability of x given county for x in person cols
    df_persons = pd.DataFrame({f'P({col}|C)': df[col] / df['persons_total'] for col in person_cols})

    # probability distribution of targets for each county
    df_targets = pd.DataFrame({f'P({col}|C)' : df[col] / df['persons_total'] for col in targets})

    #make a list of P(x|C) columns for x in sex_age_edus
    P_sae = [f'P({col}|C)' for col in sex_age_edus]

    # make a column for P(18plus|C) by summing all P(x|C) for x in P_sae
    df_persons['P(18plus|C)'] = df_persons[P_sae].sum(axis=1)

    # subtract (1- P(18plus|C)) from P(non-voter|C) so the latter represents non-voters that are 18plus
    df_targets['P(non_voter|C)'] -= 1 - df_persons['P(18plus|C)']

    #concatenate the dataframes
    df_prob = pd.concat([df_prob, df_persons, df_targets], axis=1)

    # Store the dataframe for each year
    dfs.append(df_prob)

In [77]:
df_final = pd.concat(dfs, ignore_index=True)
df_final

Unnamed: 0,year,gisjoin,state,county,median_household_income,per_capita_income,per_capita_area,per_capita_households_income_under_10k,per_capita_households_income_10k_15k,per_capita_households_income_15k_25k,...,P(labor_force_employed|C),P(labor_force_unemployed|C),P(not_in_labor_force|C),P(persons_hispanic|C),P(persons_below_poverty|C),P(18plus|C),P(democrat|C),P(other|C),P(republican|C),P(non_voter|C)
0,2008,G0100010,Alabama,Autauga County,53255,24568,0.029448,0.024758,0.018738,0.032433,...,0.457925,0.030308,0.257643,0.023196,0.105785,0.722980,0.114627,0.002728,0.327401,0.278224
1,2008,G0100030,Alabama,Baldwin County,50147,26469,0.029868,0.023585,0.019074,0.046942,...,0.446667,0.031361,0.315852,0.038955,0.120689,0.766916,0.110279,0.004301,0.348545,0.303793
2,2008,G0100050,Alabama,Barbour County,33219,15875,0.084576,0.061194,0.034803,0.049460,...,0.357739,0.037944,0.405321,0.045958,0.223943,0.775082,0.205675,0.002419,0.211777,0.355211
3,2008,G0100070,Alabama,Bibb County,41770,19918,0.071728,0.024193,0.024193,0.049845,...,0.434277,0.043653,0.316541,0.005927,0.125166,0.765679,0.101681,0.003671,0.276957,0.383370
4,2008,G0100090,Alabama,Blount County,45549,21070,0.029724,0.030727,0.021873,0.047291,...,0.435793,0.035472,0.310802,0.074896,0.132470,0.752205,0.062125,0.006280,0.359645,0.324155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12355,2020,G5600370,Wyoming,Sweetwater County,79375,40268,0.645738,0.020248,0.011003,0.025904,...,0.497873,0.030158,0.245063,0.162076,0.114855,0.745146,0.090853,0.015352,0.290620,0.348321
12356,2020,G5600390,Wyoming,Teton County,108279,76296,0.467860,0.007753,0.008652,0.024372,...,0.649190,0.013878,0.180245,0.151975,0.068791,0.823739,0.421828,0.025615,0.185942,0.190354
12357,2020,G5600410,Wyoming,Uinta County,78164,32955,0.263227,0.007495,0.012216,0.019420,...,0.465395,0.016500,0.271294,0.098559,0.064003,0.720724,0.077436,0.018106,0.364840,0.260343
12358,2020,G5600430,Wyoming,Washakie County,61875,32979,0.751918,0.016570,0.014239,0.047249,...,0.512751,0.009320,0.293204,0.142783,0.065113,0.781230,0.084272,0.017605,0.420065,0.259288


In [78]:
list(df_final.columns)

['year',
 'gisjoin',
 'state',
 'county',
 'median_household_income',
 'per_capita_income',
 'per_capita_area',
 'per_capita_households_income_under_10k',
 'per_capita_households_income_10k_15k',
 'per_capita_households_income_15k_25k',
 'per_capita_households_income_25k_plus',
 'P(C)',
 'P(persons_male|C)',
 'P(persons_female|C)',
 'P(male_never_married|C)',
 'P(male_married|C)',
 'P(male_separated|C)',
 'P(male_widowed|C)',
 'P(male_divorced|C)',
 'P(female_never_married|C)',
 'P(female_married|C)',
 'P(female_separated|C)',
 'P(female_widowed|C)',
 'P(female_divorced|C)',
 'P(male_18_24_less_than_9th|C)',
 'P(male_18_24_some_hs|C)',
 'P(male_18_24_hs_grad|C)',
 'P(male_18_24_some_college|C)',
 'P(male_18_24_associates|C)',
 'P(male_18_24_bachelors|C)',
 'P(male_18_24_graduate|C)',
 'P(male_25_34_less_than_9th|C)',
 'P(male_25_34_some_hs|C)',
 'P(male_25_34_hs_grad|C)',
 'P(male_25_34_some_college|C)',
 'P(male_25_34_associates|C)',
 'P(male_25_34_bachelors|C)',
 'P(male_25_34_gradua

In [79]:
idx = ['year', 'gisjoin', 'state', 'county']

incomes= ['median_household_income','per_capita_income']

land = ['per_capita_area']

households = [
        'per_capita_households_income_under_10k',
        'per_capita_households_income_10k_15k',
        'per_capita_households_income_15k_25k',
        'per_capita_households_income_25k_plus'
                ]

sexes = ['P(persons_male|C)', 'P(persons_female|C)']

sex_maritals = [
        'P(male_never_married|C)',
        'P(male_married|C)',
        'P(male_separated|C)',
        'P(male_widowed|C)',
        'P(male_divorced|C)',
        'P(female_never_married|C)',
        'P(female_married|C)',
        'P(female_separated|C)',
        'P(female_widowed|C)',
        'P(female_divorced|C)'
                ]

sex_age_edus = [
        'P(male_18_24_less_than_9th|C)',
        'P(male_18_24_some_hs|C)',
        'P(male_18_24_hs_grad|C)',
        'P(male_18_24_some_college|C)',
        'P(male_18_24_associates|C)',
        'P(male_18_24_bachelors|C)',
        'P(male_18_24_graduate|C)',
        'P(male_25_34_less_than_9th|C)',
        'P(male_25_34_some_hs|C)',
        'P(male_25_34_hs_grad|C)',
        'P(male_25_34_some_college|C)',
        'P(male_25_34_associates|C)',
        'P(male_25_34_bachelors|C)',
        'P(male_25_34_graduate|C)',
        'P(male_35_44_less_than_9th|C)',
        'P(male_35_44_some_hs|C)',
        'P(male_35_44_hs_grad|C)',
        'P(male_35_44_some_college|C)',
        'P(male_35_44_associates|C)',
        'P(male_35_44_bachelors|C)',
        'P(male_35_44_graduate|C)',
        'P(male_45_64_less_than_9th|C)',
        'P(male_45_64_some_hs|C)',
        'P(male_45_64_hs_grad|C)',
        'P(male_45_64_some_college|C)',
        'P(male_45_64_associates|C)',
        'P(male_45_64_bachelors|C)',
        'P(male_45_64_graduate|C)',
        'P(male_65plus_less_than_9th|C)',
        'P(male_65plus_some_hs|C)',
        'P(male_65plus_hs_grad|C)',
        'P(male_65plus_some_college|C)',
        'P(male_65plus_associates|C)',
        'P(male_65plus_bachelors|C)',
        'P(male_65plus_graduate|C)',
        'P(female_18_24_less_than_9th|C)',
        'P(female_18_24_some_hs|C)',
        'P(female_18_24_hs_grad|C)',
        'P(female_18_24_some_college|C)',
        'P(female_18_24_associates|C)',
        'P(female_18_24_bachelors|C)',
        'P(female_18_24_graduate|C)',
        'P(female_25_34_less_than_9th|C)',
        'P(female_25_34_some_hs|C)',
        'P(female_25_34_hs_grad|C)',
        'P(female_25_34_some_college|C)',
        'P(female_25_34_associates|C)',
        'P(female_25_34_bachelors|C)',
        'P(female_25_34_graduate|C)',
        'P(female_35_44_less_than_9th|C)',
        'P(female_35_44_some_hs|C)',
        'P(female_35_44_hs_grad|C)',
        'P(female_35_44_some_college|C)',
        'P(female_35_44_associates|C)',
        'P(female_35_44_bachelors|C)',
        'P(female_35_44_graduate|C)',
        'P(female_45_64_less_than_9th|C)',
        'P(female_45_64_some_hs|C)',
        'P(female_45_64_hs_grad|C)',
        'P(female_45_64_some_college|C)',
        'P(female_45_64_associates|C)',
        'P(female_45_64_bachelors|C)',
        'P(female_45_64_graduate|C)',
        'P(female_65plus_less_than_9th|C)',
        'P(female_65plus_some_hs|C)',
        'P(female_65plus_hs_grad|C)',
        'P(female_65plus_some_college|C)',
        'P(female_65plus_associates|C)',
        'P(female_65plus_bachelors|C)',
        'P(female_65plus_graduate|C)'
            ]

sex_races = [
        'P(male_white|C)',
        'P(female_white|C)',
        'P(male_black|C)',
        'P(female_black|C)',
        'P(male_aian|C)',
        'P(female_aian|C)',
        'P(male_asian|C)',
        'P(female_asian|C)',
        'P(male_nhpi|C)',
        'P(female_nhpi|C)',
        'P(male_other|C)',
        'P(female_other|C)',
        'P(male_multi|C)',
        'P(female_multi|C)'
            ]

nativities = ['P(persons_native|C)', 'P(persons_foreign_born|C)']

labors = [
    'P(labor_force_total|C)',
    'P(labor_force_armed|C)',
    'P(labor_force_civilian|C)',
    'P(labor_force_employed|C)',
    'P(labor_force_unemployed|C)',
    'P(not_in_labor_force|C)'
        ]

misc = ['P(persons_hispanic|C)', 
        'P(persons_below_poverty|C)', 
        'P(18plus|C)',
        'P(C)']

targets = ['P(democrat|C)',
           'P(other|C)',
           'P(republican|C)',
           'P(non_voter|C)']

features = incomes \
        + land \
        + households \
        + sexes \
        + sex_maritals \
        + sex_races \
        + sex_age_edus \
        + nativities \
        + labors \
        + misc

In [80]:
df_final.to_csv('data/final_dataset.csv', index=False)

In [81]:
df = pd.read_csv('data/final_dataset.csv')

In [82]:
from sklearn.preprocessing import StandardScaler
def load_data(self, 
                fit_years, # list of years to fit StandardScalar
                transform_years, # list of years to transform StandardScaler
                features_to_drop = [] #list of features to be dropped
                ):
    data = pd.read_csv('data/final_dataset.csv')

    # make datasets with fit years and transform years
    df_fit = data[data['year'].isin(fit_years)].reset_index(drop=True)
    df_transform = data[data['year'].isin(transform_years)].reset_index(drop=True)

    # make wts
    wts_fit = df_fit[['P(C)']]
    wts_transform = df_transform[['P(C)']]

    # make y's
    y_fit = df_fit[targets]
    y_transform = df_transform[targets]

    # make X's
    features_to_keep = list(set(all_features) - set(features_to_drop))
    X_fit = df_fit[features_to_keep]
    X_transform = df_transform[features_to_keep]

    #apply StandardScalar to X's
    scaler = StandardScaler()
    X_fit = scaler.fit_transform(X_fit) #fit and transform X_fit
    X_transform = scaler.transform(X_transform) #transform X_transform

    return X_fit, y_fit, wts_fit, X_transform, y_transform, wts_transform

In [83]:
X_train, y_train, wts_train, X_val,y_val,wts_val = load_data(fit_years=[2008, 2012, 2016], transform_years=[2020])

TypeError: load_data() missing 1 required positional argument: 'self'