In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

pd.set_option('display.max_columns', None)

In [None]:
# read the county election data
df = gpd.read_file('data/election/final_data/county_data_with_elections_2012_2016_2020_census_MAIN.geojson')
df_2012 = gpd.read_file('data/election/final_data/county_data_with_elections_2012_census_MAIN.geojson')
df_2016 = gpd.read_file('data/election/final_data/county_data_with_elections_2016_census_MAIN.geojson')
df_2020 = gpd.read_file('data/election/final_data/county_data_with_elections_2020_census_MAIN.geojson')

In [None]:
# remove commas from the values in all columns
df = df.replace({',': ''}, regex=True)
df_2012 = df_2012.replace({',': ''}, regex=True)
df_2016 = df_2016.replace({',': ''}, regex=True)
df_2020 = df_2020.replace({',': ''}, regex=True)

In [None]:
def convert_to_number(col):
    try:
        return pd.to_numeric(col)
    except:
        return col
    
df = df.apply(convert_to_number)
df_2012 = df_2012.apply(convert_to_number)
df_2016 = df_2016.apply(convert_to_number)
df_2020 = df_2020.apply(convert_to_number)

In [None]:
# save the dfs back to geojson
# df.to_file('data/election/final_data/county_data_with_elections_2012_2016_2020_census_MAIN.geojson', driver='GeoJSON')
# df_2012.to_file('data/election/final_data/county_data_with_elections_2012_census_MAIN.geojson', driver='GeoJSON')
# df_2016.to_file('data/election/final_data/county_data_with_elections_2016_census_MAIN.geojson', driver='GeoJSON')
# df_2020.to_file('data/election/final_data/county_data_with_elections_2020_census_MAIN.geojson', driver='GeoJSON')

In [None]:
# for every column with 'workers' or 'occupation' in the name, append INDUSTRY_ to the beginning of the column name
# this is to make the column names consistent with the other data
for col in df.columns:
    if 'workers' in col or 'occupation' in col:
        df.rename(columns={col: 'INDUSTRY_' + col}, inplace=True)
        df_2012.rename(columns={col: 'INDUSTRY_' + col}, inplace=True)
        df_2016.rename(columns={col: 'INDUSTRY_' + col}, inplace=True)
        df_2020.rename(columns={col: 'INDUSTRY_' + col}, inplace=True)

In [None]:
df

In [17]:
# show columns that have pct in the name
total_columns = [col for col in df.columns if 'percent' in col]
print(total_columns)

['10 to 14 years_percent_2020', '10 to 14 years_percent female_2020', '10 to 14 years_percent male_2020', '15 to 17 years_percent_2020', '15 to 17 years_percent female_2020', '15 to 17 years_percent male_2020', '15 to 19 years_percent_2020', '15 to 19 years_percent female_2020', '15 to 19 years_percent male_2020', '15 to 44 years_percent_2020', '15 to 44 years_percent female_2020', '15 to 44 years_percent male_2020', '16 years and over_percent_2020', '16 years and over_percent female_2020', '16 years and over_percent male_2020', '18 to 24 years_percent_2020', '18 to 24 years_percent female_2020', '18 to 24 years_percent male_2020', '18 years and over_percent_2020', '18 years and over_percent female_2020', '18 years and over_percent male_2020', '20 to 24 years_percent_2020', '20 to 24 years_percent female_2020', '20 to 24 years_percent male_2020', '21 years and over_percent_2020', '21 years and over_percent female_2020', '21 years and over_percent male_2020', '25 to 29 years_percent_202

In [None]:
# write all column names to a text file
with open('data/election/final_data/column_names.txt', 'w') as f:
    for col in df.columns:
        f.write(col + '\n')

In [None]:
df_population_votes = df[['Total population_total_2012', 'Total population_total_2016', 'Total population_total_2020', 'votes_dem_2012', 'votes_gop_2012', 'votes_dem_2016', 'votes_gop_2016', 'votes_dem_2020', 'votes_gop_2020', 'per_dem_2012', 'per_gop_2012', 'per_dem_2016', 'per_gop_2016', 'per_dem_2020', 'per_gop_2020']]

In [None]:
df_population_votes

-------

##### ML

In [18]:
elections_demographics_data = df.drop(columns=['geometry'])
elections_demographics_data_2012 = df_2012.drop(columns=['geometry'])
elections_demographics_data_2016 = df_2016.drop(columns=['geometry'])
elections_demographics_data_2020 = df_2020.drop(columns=['geometry'])

In [19]:
elections_demographics_data = pd.DataFrame(elections_demographics_data)
elections_demographics_data_2012 = pd.DataFrame(elections_demographics_data_2012)
elections_demographics_data_2016 = pd.DataFrame(elections_demographics_data_2016)
elections_demographics_data_2020 = pd.DataFrame(elections_demographics_data_2020)

In [None]:
elections_demographics_data

In [20]:
for col in elections_demographics_data.columns:
    if not np.issubdtype(elections_demographics_data[col].dtype, np.number):
        print(col)

per_point_diff_2016
winner_2012
winner_2016
winner_2020
Child dependency ratio_total_2012
county_name
state_name
state_abbr


In [21]:
# create a new df from election_demographics_data with only the columns that have dates 2016 and before in the column name (2012, 2016, 2000, 2001, '2008-12', etc)
elections_demographics_data_2016_and_before = elections_demographics_data[[col for col in elections_demographics_data.columns if '2000' in col or '2001' in col or '2002' in col or '2003' in col or '2004' in col or '2005' in col or '2006' in col or '2007' in col or '2008' in col or '2009' in col or '2010' in col or '2011' in col or '2012' in col or '2013' in col or '2014' in col or '2015' in col or '2016' in col or '2008-12' in col]]

In [22]:
elections_demographics_data_2016_and_before.winner_2016

0       GOP
1       GOP
2       GOP
3       GOP
4       GOP
       ... 
3100    GOP
3101    DEM
3102    GOP
3103    GOP
3104    GOP
Name: winner_2016, Length: 3105, dtype: object

In [23]:
# show valuecounts of column dtypes
elections_demographics_data_2016_and_before.dtypes.value_counts()

float64    799
int32      133
object       4
dtype: int64

In [24]:
# drop Child dependency ratio_total_2012
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.drop(columns=['Child dependency ratio_total_2012'])

In [None]:
# scale the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numeric_cols = elections_demographics_data_2016_and_before.select_dtypes(include=[np.number]).columns
elections_demographics_data_2016_and_before[numeric_cols] = scaler.fit_transform(elections_demographics_data_2016_and_before[numeric_cols])

In [25]:
# drop per_point_diff_2016
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.drop(columns=['per_point_diff_2016'])

In [27]:
# encode winner_2012 and winner_2016 with GOP = 0 and DEM = 1
elections_demographics_data_2016_and_before['winner_2012'] = elections_demographics_data_2016_and_before['winner_2012'].replace({'GOP': 0, 'DEM': 1})
elections_demographics_data_2016_and_before['winner_2016'] = elections_demographics_data_2016_and_before['winner_2016'].replace({'GOP': 0, 'DEM': 1})

In [None]:
elections_demographics_data_2016_and_before

In [None]:
pd.set_option('display.max_rows', 1000)

In [28]:
# find what values contribute the most to the winner_2016 column
elections_demographics_data_2016_and_before.corr()['winner_2016'].sort_values(ascending=False)

winner_2016                                            1.000000
per_dem_2016                                           0.770546
winner_2012                                            0.765184
per_dem_2012                                           0.682834
per_point_diff_2012                                    0.682287
                                                         ...   
white_alone_not_hispanic_or_latino_nonveterans_2012   -0.489716
white_alone_not_hispanic_or_latino_total_2012         -0.495746
white_alone_not_hispanic_or_latino_veterans_2012      -0.496979
per_gop_2012                                          -0.681091
per_gop_2016                                          -0.761486
Name: winner_2016, Length: 934, dtype: float64

In [29]:
# drop all rows with missing values
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.dropna()

In [30]:
# create columns called win_gop_2012 and win_gop_2016 and win_dem_2012 and win_dem_2016 that are 1 if the GOP or DEM won in 2012 or 2016 and 0 otherwise
elections_demographics_data_2016_and_before['win_gop_2012'] = elections_demographics_data_2016_and_before['winner_2012'].apply(lambda x: 1 if x == 0 else 0)
elections_demographics_data_2016_and_before['win_gop_2016'] = elections_demographics_data_2016_and_before['winner_2016'].apply(lambda x: 1 if x == 0 else 0)
elections_demographics_data_2016_and_before['win_dem_2012'] = elections_demographics_data_2016_and_before['winner_2012'].apply(lambda x: 1 if x == 1 else 0)
elections_demographics_data_2016_and_before['win_dem_2016'] = elections_demographics_data_2016_and_before['winner_2016'].apply(lambda x: 1 if x == 1 else 0)

In [34]:
# perform a random forest classifier on the data to see which columns are the most important in predicting the win_gop_2016 column
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X = elections_demographics_data_2016_and_before.drop(columns=['winner_2012', 'winner_2016', 'win_gop_2012', 'win_gop_2016', 'win_dem_2012', 'win_dem_2016', 'per_gop_2012', 'per_gop_2016', 'per_dem_2012', 'per_dem_2016', 'votes_gop_2012', 'votes_gop_2016', 'votes_dem_2012', 'votes_dem_2016', 'per_point_diff_2012'])
y = elections_demographics_data_2016_and_before['winner_2016']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# print accuracy
print(rf.score(X_test, y_test))

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking formatted so the numbers are all aligned
print("Feature ranking:")
for f in range(X.shape[1]):
    print("%d. %s (%f)" % (f + 1, X.columns[indices[f]], importances[indices[f]]))

0.9279569892473118
Feature ranking:
1. white_total_2012 (0.036814)
2. white_alone_not_hispanic_or_latino_total_2012 (0.033932)
3. white_alone_not_hispanic_or_latino_veterans_2012 (0.033182)
4. white_alone_not_hispanic_or_latino_nonveterans_2012 (0.030021)
5. white_nonveterans_2012 (0.019000)
6. black_or_african_american_total_2012 (0.014088)
7. black_or_african_american_nonveterans_2012 (0.012829)
8. black_or_african_american_veterans_2012 (0.012584)
9. Percent of adults with a bachelor's degree or higher, 2000 (0.011185)
10. Asian alone_2012 (0.010854)
11. 300_to_499_occupied_housing_units_2016 (0.008829)
12. Asian_Total_2012 (0.007699)
13. 25_to_29_years_Total_2016 (0.007491)
14. white_veterans_2012 (0.006913)
15. bachelor's_degree_or_higher_nonveterans_2012 (0.006605)
16. Percent of adults with a bachelor's degree or higher, 2008-12 (0.006530)
17. Percent of adults with a high school diploma only, 2000 (0.006378)
18. Asian_alone_Total_2016 (0.006162)
19. 300_to_399_occupied_housing_

In [40]:
# find the most common first node amongst all the decision trees
first_nodes = [estimator.tree_.feature[0] for estimator in rf.estimators_]
first_nodes = [node for node in first_nodes if node != -2]
first_nodes = pd.Series(first_nodes)
first_nodes.value_counts()

791    5
767    4
792    4
793    4
92     4
      ..
819    1
770    1
778    1
49     1
306    1
Length: 64, dtype: int64

In [35]:
from sklearn.tree import export_graphviz

# Export as dot file
export_graphviz(rf.estimators_[0], out_file='tree.dot', 
                feature_names = X.columns,
                class_names = ['GOP', 'DEM'],
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png using system command (requires Graphviz)
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

0

In [None]:
# gridsearch tuning and cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# print the best parameters
print(grid_search.best_params_)

# print the best score
print(grid_search.best_score_)

# print the accuracy of the model
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)