In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd

# Never truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None

In [None]:
# Load the combined 2012/2016/2020 county GeoJSON plus one GeoJSON per election year.
df, df_2012, df_2016, df_2020 = (
    gpd.read_file(geojson_path)
    for geojson_path in (
        'data/election/final_data/county_data_with_elections_2012_2016_2020_census_MAIN.geojson',
        'data/election/final_data/county_data_with_elections_2012_census_MAIN.geojson',
        'data/election/final_data/county_data_with_elections_2016_census_MAIN.geojson',
        'data/election/final_data/county_data_with_elections_2020_census_MAIN.geojson',
    )
)

In [None]:
# Strip every comma out of the string values of all four frames
# (regex replacement of ',' with the empty string).
df, df_2012, df_2016, df_2020 = (
    frame.replace({',': ''}, regex=True)
    for frame in (df, df_2012, df_2016, df_2020)
)

In [None]:
def convert_to_number(col):
    """Convert a column to a numeric dtype when possible.

    Parameters
    ----------
    col : pandas.Series (or anything accepted by ``pd.to_numeric``)
        The column to convert.

    Returns
    -------
    The result of ``pd.to_numeric(col)`` when the conversion succeeds;
    otherwise ``col`` itself, unchanged.
    """
    try:
        return pd.to_numeric(col)
    except (ValueError, TypeError):
        # pd.to_numeric raises ValueError for unparseable strings and TypeError
        # for unsupported object types; such columns are kept as they are.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return col
    
# Run convert_to_number over every column of each frame; columns that cannot be
# converted come back unchanged.
df, df_2012, df_2016, df_2020 = (
    frame.apply(convert_to_number)
    for frame in (df, df_2012, df_2016, df_2020)
)

In [None]:
# Write each cleaned frame back out as GeoJSON.
# NOTE: these are the same paths the frames were read from, so the source files are overwritten.
for frame_to_save, geojson_out_path in (
    (df, 'data/election/final_data/county_data_with_elections_2012_2016_2020_census_MAIN.geojson'),
    (df_2012, 'data/election/final_data/county_data_with_elections_2012_census_MAIN.geojson'),
    (df_2016, 'data/election/final_data/county_data_with_elections_2016_census_MAIN.geojson'),
    (df_2020, 'data/election/final_data/county_data_with_elections_2020_census_MAIN.geojson'),
):
    frame_to_save.to_file(geojson_out_path, driver='GeoJSON')

In [None]:
# Tabular (geometry-free) versions of the four frames.
(
    elections_demographics_data,
    elections_demographics_data_2012,
    elections_demographics_data_2016,
    elections_demographics_data_2020,
) = (
    frame.drop(columns=['geometry'])
    for frame in (df, df_2012, df_2016, df_2020)
)

In [None]:
# Re-wrap each geometry-free table in a plain pandas DataFrame.
(
    elections_demographics_data,
    elections_demographics_data_2012,
    elections_demographics_data_2016,
    elections_demographics_data_2020,
) = map(
    pd.DataFrame,
    (
        elections_demographics_data,
        elections_demographics_data_2012,
        elections_demographics_data_2016,
        elections_demographics_data_2020,
    ),
)

In [None]:
# Display the combined (all election years) geometry-free table as this cell's output
elections_demographics_data

In [None]:
# List the name of every column whose dtype is not a NumPy numeric type.
for column_name, column_dtype in elections_demographics_data.dtypes.items():
    if np.issubdtype(column_dtype, np.number):
        continue
    print(column_name)

In [None]:
# Build a new frame from elections_demographics_data holding only the columns whose
# name contains a year from 2000 through 2016 (e.g. 2012, 2016, 2000, 2001, '2008-12').
# A range label such as '2008-12' is already matched through its '2008' part, so it
# needs no separate check.
years_through_2016 = tuple(str(year) for year in range(2000, 2017))
elections_demographics_data_2016_and_before = elections_demographics_data[
    [
        col
        for col in elections_demographics_data.columns
        if any(year in col for year in years_through_2016)
    ]
].copy()  # explicit copy: later cells assign columns into this frame

In [None]:
# Inspect the winner_2016 column of the filtered frame
elections_demographics_data_2016_and_before['winner_2016']

In [None]:
# Count how many columns of the filtered frame have each dtype
dtype_per_column = elections_demographics_data_2016_and_before.dtypes
dtype_per_column.value_counts()

In [None]:
# Remove the 'Child dependency ratio_total_2012' column from the filtered frame
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.drop(
    labels=['Child dependency ratio_total_2012'],
    axis='columns',
)

In [None]:
# Min-max scale every numeric column of the filtered frame in place
from sklearn.preprocessing import MinMaxScaler

numeric_cols = elections_demographics_data_2016_and_before.select_dtypes(include=[np.number]).columns

# NOTE(review): the scaler is fit on all rows, before the train/test split done
# further down — confirm this is intended.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(elections_demographics_data_2016_and_before[numeric_cols])
elections_demographics_data_2016_and_before[numeric_cols] = scaled_values

In [None]:
# Remove the per_point_diff_2016 column from the filtered frame
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.drop(
    labels=['per_point_diff_2016'],
    axis='columns',
)

In [None]:
# Encode the winner columns numerically: 'GOP' becomes 0 and 'DEM' becomes 1.
# Any other value is left as it is by replace().
party_codes = {'GOP': 0, 'DEM': 1}
for winner_col in ('winner_2012', 'winner_2016'):
    elections_demographics_data_2016_and_before[winner_col] = (
        elections_demographics_data_2016_and_before[winner_col].replace(party_codes)
    )

In [None]:
# Allow up to 1000 rows to be shown when a frame or series is rendered.
pd.options.display.max_rows = 1000

In [None]:
# Rank the columns by their correlation with winner_2016, strongest positive first
winner_2016_correlations = elections_demographics_data_2016_and_before.corr()['winner_2016']
winner_2016_correlations.sort_values(ascending=False)

In [None]:
# Keep only the rows that have no missing value in any column
elections_demographics_data_2016_and_before = elections_demographics_data_2016_and_before.dropna(
    axis='index',
    how='any',
)

In [None]:
# Add 0/1 indicator columns win_gop_2012, win_gop_2016, win_dem_2012 and win_dem_2016.
# The winner_* columns are coded GOP = 0 and DEM = 1, so a GOP win is "winner == 0"
# and a DEM win is "winner == 1".
def _indicator_for(code):
    """Return a function mapping a value to 1 when it equals `code`, else 0."""
    return lambda value: 1 if value == code else 0

elections_demographics_data_2016_and_before['win_gop_2012'] = (
    elections_demographics_data_2016_and_before['winner_2012'].apply(_indicator_for(0))
)
elections_demographics_data_2016_and_before['win_gop_2016'] = (
    elections_demographics_data_2016_and_before['winner_2016'].apply(_indicator_for(0))
)
elections_demographics_data_2016_and_before['win_dem_2012'] = (
    elections_demographics_data_2016_and_before['winner_2012'].apply(_indicator_for(1))
)
elections_demographics_data_2016_and_before['win_dem_2016'] = (
    elections_demographics_data_2016_and_before['winner_2016'].apply(_indicator_for(1))
)

In [None]:
# Fit a random forest classifier to see which columns are the most important in
# predicting the winner_2016 column (coded GOP = 0, DEM = 1).
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Columns kept out of the feature matrix: the target, the derived win indicators,
# and the 2012/2016 vote share, vote count and point-difference columns.
excluded_cols = [
    'winner_2012', 'winner_2016',
    'win_gop_2012', 'win_gop_2016', 'win_dem_2012', 'win_dem_2016',
    'per_gop_2012', 'per_gop_2016', 'per_dem_2012', 'per_dem_2016',
    'votes_gop_2012', 'votes_gop_2016', 'votes_dem_2012', 'votes_dem_2016',
    'per_point_diff_2012',
]
X = elections_demographics_data_2016_and_before.drop(columns=excluded_cols)
y = elections_demographics_data_2016_and_before['winner_2016']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Seed the forest as well as the split; otherwise the accuracy and the importance
# ranking change on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# print accuracy on the held-out 30%
print(rf.score(X_test, y_test))

importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]  # feature positions, most important first

# Print the feature ranking as "<rank>. <column> (<importance>)"
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. %s (%f)" % (rank, X.columns[feature_idx], importances[feature_idx]))

In [None]:
from subprocess import call
from sklearn.tree import export_graphviz

# Write the first tree of the fitted forest to a Graphviz dot file
first_tree = rf.estimators_[0]
export_graphviz(
    first_tree,
    out_file='tree.dot',
    feature_names=X.columns,
    class_names=['GOP', 'DEM'],
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)

# Render the dot file to a 600 dpi PNG with the `dot` command-line tool (requires Graphviz);
# the command's return code becomes the cell output
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

In [None]:
# Tune the random forest with a cross-validated grid search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Every combination of these values is tried (3 x 5 = 15 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, 40, 50]
}

# Seed the estimator so that the candidates are compared on the hyper-parameters
# rather than on run-to-run randomness, and so the search is repeatable.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# print the best parameters
print(grid_search.best_params_)

# print the best cross-validation score
print(grid_search.best_score_)

# print the accuracy of the search's predictions on the held-out test set
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)