In [36]:
from google.colab import drive
# NOTE(review): `default` is not referenced in any cell visible here — confirm it is still needed.
from google.auth import default
# Attach Google Drive at /content/drive so the project files are reachable from this runtime.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
# Work from the project folder on the mounted Drive so that relative paths such as
# 'full-data.xlsx' resolve.
# NOTE(review): this is an absolute, user-specific path — confirm it matches your Drive layout.
%cd /content/drive/MyDrive/Coursework/STATS/315B/project

/content/drive/MyDrive/Coursework/STATS/315B/project


## Imports and Setup

In [38]:
## Project Imports
import gspread
import os
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [39]:
# Read worksheets 0 and 1 of the project workbook; they are bound to
# `pres_data` and `sen_data` respectively.
pres_data, sen_data = (
    pd.read_excel('full-data.xlsx', sheet_name=sheet_idx) for sheet_idx in (0, 1)
)

In [40]:
## Remember the current working directory (per the original note, it is needed for writing csvs)
cwd = os.getcwd()
# Echo it so the reader can confirm the notebook is running from the project folder.
print(cwd)

/content/drive/MyDrive/Coursework/STATS/315B/project


In [41]:
## Presidential predictions come first.
## The 2016 results act as the training target, so training uses every feature
## except the 2020 population and the 2020 polling columns.

## Integer-encode the Region labels found in the presidential sheet.
le = LabelEncoder()
region_enc = le.fit_transform(pres_data['Region'])

## The same encoded vector is written into both frames.
## NOTE(review): this assumes both sheets list their rows in the same order — confirm.
pres_data["Region"] = sen_data["Region"] = region_enc

In [42]:
## Columns withheld from the training design matrix: the state label, the
## 2020-era inputs, and both outcome columns.
noninc_cols_train = [
    'State',
    '2020 Population',
    'recent-five-polling-avg-20',
    'polling-party-lead-20',
    'pres2016',
    'pres2020',
]
pres_train_X = pres_data.drop(columns=noninc_cols_train)
## The target stays a one-column frame; later cells flatten it with .values.ravel().
pres_train_y = pres_data.loc[:, ['pres2016']]

## Model Creation

Models here are: logistic regression with no penalty, logistic regression with L1, L2, and elastic-net penalties, random forests, and xgboost.

Note that for logistic regression we apply standard scaling, which is necessary when applying regularization terms. We keep the standardization even for the no-penalty (regular) logistic regression for the sake of completeness. No standardization is done for the tree-based models (random forests and xgboost).

In [43]:
## Logistic-regression family. Every estimator is wrapped in a pipeline that
## standardizes the features first.
##
## Each pipeline owns its own StandardScaler instance: a scaler object shared
## between pipelines would be refit (and therefore changed for all of them)
## whenever any single pipeline is refit.
## The saga-based estimators use a fixed random_state so that the stochastic
## solver produces repeatable coefficients from run to run.

## Logistic Regression with Scaling (no penalty)
## NOTE(review): penalty='none' is deprecated in newer scikit-learn releases in
## favour of penalty=None — switch once the installed version is confirmed.
scaler = StandardScaler()
lr = LogisticRegression(penalty='none', solver='saga', random_state=83)
lrmodel = Pipeline([('standardize', scaler), ('log_reg', lr)])
lrmodel.fit(pres_train_X, pres_train_y.values.ravel())

## L1-Penalty Logistic Regression ("LASSO-like" logistic regression)
lassor = LogisticRegression(penalty='l1', solver='saga', random_state=83)
lassomodel = Pipeline([('standardize', StandardScaler()), ('lasso_log_reg', lassor)])
lassomodel.fit(pres_train_X, pres_train_y.values.ravel())

## L2-Penalty Logistic Regression (scikit-learn's default penalty and solver)
ridger = LogisticRegression()
ridgemodel = Pipeline([('standardize', StandardScaler()), ('ridge_log_reg', ridger)])
ridgemodel.fit(pres_train_X, pres_train_y.values.ravel())

## Elastic Net Logistic Regression (l1_ratio = 0.2)
elo2r = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.2, random_state=83)
elo2model = Pipeline([('standardize', StandardScaler()), ('elo2_reg', elo2r)])
elo2model.fit(pres_train_X, pres_train_y.values.ravel())

## Elastic Net Logistic Regression (l1_ratio = 0.4)
elo4r = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.4, random_state=83)
elo4model = Pipeline([('standardize', StandardScaler()), ('elo4_reg', elo4r)])
elo4model.fit(pres_train_X, pres_train_y.values.ravel())

## Elastic Net Logistic Regression (l1_ratio = 0.6)
elo6r = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.6, random_state=83)
elo6model = Pipeline([('standardize', StandardScaler()), ('elo6_reg', elo6r)])
elo6model.fit(pres_train_X, pres_train_y.values.ravel())

## Elastic Net Logistic Regression (l1_ratio = 0.8)
## Left as the last expression so the notebook echoes the fitted pipeline.
elo8r = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.8, random_state=83)
elo8model = Pipeline([('standardize', StandardScaler()), ('elo8_reg', elo8r)])
elo8model.fit(pres_train_X, pres_train_y.values.ravel())



Pipeline(steps=[('standardize', StandardScaler()),
                ('elo8_reg',
                 LogisticRegression(l1_ratio=0.8, penalty='elasticnet',
                                    solver='saga'))])

## Some Inference Info (Coefficients)

In [44]:
## For every fitted model, print its coefficients sorted in ascending (signed)
## order, followed by the column indices that produce that order.
## Note the sort is by signed value, not by magnitude: large negative
## coefficients appear first and large positive ones last.
##
## Each label is paired with its own estimator, so the index row always belongs
## to the coefficient row printed directly above it. (The elastic-net index rows
## must come from elo2r/elo4r/elo6r/elo8r, not from the lr/lasso/ridge fits.)
for _label, _estimator in [
    ("Logistic Regression - No Penalty", lr),
    ("LASSO Regression", lassor),
    ("Ridge Regression", ridger),
    ("EL-0.2 Logistic Regression", elo2r),
    ("EL-0.4 Logistic Regression", elo4r),
    ("EL-0.6 Logistic Regression", elo6r),
    ("EL-0.8 Logistic Regression", elo8r),
]:
    print(_label)
    print(np.sort(_estimator.coef_))
    print(np.argsort(_estimator.coef_))

Logistic Regression - No Penalty
[[-0.61525887 -0.56087586 -0.49170165 -0.46214553 -0.31986924 -0.27638648
  -0.26463274 -0.16136923 -0.1552292  -0.06600457  0.04432042  0.05786461
   0.13415904  0.14595002  0.17575262  0.18525188  0.19190916  0.27163769
   0.33871503  0.34883374  0.35736832  0.39137783  0.40827679  0.50072429
   0.50334703  0.50756721  0.51124186  0.56831296  0.67326698  0.76150059
   1.93833018]]
[[ 9 24  1 11  6 21  8 17 10 15  0 23 19  2 29 22  3 18 16 12 27  4 20 25
   7  5 26 14 13 28 30]]
LASSO Regression
[[-0.02320296  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.12947297  0.22492336  0.26591988  0.65383391
   2.41681161]]
[[ 9  0 27 26 25 24 23 22 21 20 19 18 17 16 29 15 12 11 10  8  6  4  3  2
   1 14  7  5 13 28 30]]
Ridge Regres

## Creating a Test Set

The test set uses features similar to the training data, with the following changes:

*   We remove any 2016-related info, i.e. the 2010 population counts and the 2016 presidential polling averages.
*   In the training data we fit the models using the 2000 presidential election results; for the test set we instead use the 2016 presidential election results in their place.



In [45]:
## Build the 2020 test design matrix from the same sheet.
## Dropped: the state label, the 2016-era inputs (2010 population, 2016 polling),
## the 2000 result, and the 2020 outcome (which becomes the test target).
noninc_cols_test = ['State','2010 Population','recent-five-polling-avg-16','polling-party-lead-16','pres2000','pres2020']
pres_test_X = pres_data.drop(columns=noninc_cols_test)
pres_test_y = pres_data[['pres2020']]

## Rename the 2020-era columns to the names the models were trained on.
## A little bit hacky: the 2016 result stands in for the 'pres2000' feature,
## so 2004, 2008, 2012 and 2016 results are the predictors of the 2020 result.
## The mapping is deliberately not called `dict`, which would shadow the builtin
## for the rest of the notebook.
test_to_train_names = {'2020 Population': '2010 Population',
                       'polling-party-lead-20': 'polling-party-lead-16',
                       'pres2016': 'pres2000',
                       'recent-five-polling-avg-20': 'recent-five-polling-avg-16',}
pres_test_X = pres_test_X.rename(columns=test_to_train_names)

## Align the columns with the training matrix by name instead of by a
## hard-coded insert position. This raises a KeyError if a training column is
## missing, rather than silently feeding features in the wrong order.
pres_test_X = pres_test_X[pres_train_X.columns]

In [46]:
## Predict the test set with each elastic-net pipeline
## (plain logistic regression/ridge/lasso are omitted here, see other notebook).
elo2preds, elo4preds, elo6preds, elo8preds = (
    pipeline.predict(pres_test_X)
    for pipeline in (elo2model, elo4model, elo6model, elo8model)
)

In [47]:
## Show the four prediction vectors one per line; notably they are all identical.
print(elo2preds, elo4preds, elo6preds, elo8preds, sep="\n")

[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]


## Senate Predictions

Here we form predictions for the US senate for the upcoming 2022 Midterm Elections.

As a note, the preprocessing for the Senate data is different. We have a set of 13 "competitive" states, considered competitive based on the [2022 US Senate Election Ratings](https://en.wikipedia.org/wiki/2022_United_States_Senate_elections#Predictions). These form our test set, and the training set consists of the other states, where either (1) there is no Senate race in 2022, so we use the party outcome of their most recent Senate election in 2020, or (2) forecasting and demographic/polling factors indicate a party direction for the Senate result so strongly that no forecasting is necessary and polling is usually sparse. The paper addresses this in more depth as a potential challenge.

In [48]:
## Split the Senate rows by whether the most recent result is known:
## rows with a missing 'recent-res' are the test set (competitive states),
## every other row is a training row.
null_results = pd.isnull(sen_data["recent-res"])

## Both position lists are built in ascending row order from the same mask and
## cover every row of the frame. Deriving `other_states` from a set difference
## over a hard-coded range(50) would neither guarantee an order nor adapt to
## the actual number of rows.
competitive_states = [pos for pos, is_missing in enumerate(null_results) if is_missing]
other_states = [pos for pos, is_missing in enumerate(null_results) if not is_missing]
print(other_states)
print(competitive_states)

## errors="ignore" keeps this cell re-runnable after 'State' has already been removed.
sen_data = sen_data.drop(columns=["State"], errors="ignore")

## Features and labels are selected with the same positions so the rows of
## sen_train_X and sen_train_y always correspond.
sen_train_X = sen_data.iloc[other_states].drop(columns=["recent-res"])
sen_train_y = sen_data.iloc[other_states]["recent-res"]
sen_test_X = sen_data.iloc[competitive_states].drop(columns=["recent-res"])

[0, 3, 4, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 49]
[1, 2, 5, 8, 9, 24, 27, 28, 32, 34, 37, 46, 48]


In [49]:
## Refit every logistic-regression pipeline on the Senate training rows.
for _pipeline in (lrmodel, lassomodel, ridgemodel, elo2model, elo4model, elo6model):
    _pipeline.fit(sen_train_X, sen_train_y.values.ravel())
## The final fit stays the cell's last expression so the fitted pipeline is echoed.
elo8model.fit(sen_train_X, sen_train_y.values.ravel())



Pipeline(steps=[('standardize', StandardScaler()),
                ('elo8_reg',
                 LogisticRegression(l1_ratio=0.8, penalty='elasticnet',
                                    solver='saga'))])

In [50]:
## Predict the competitive-state rows with every logistic-regression pipeline.
(
    logpreds,
    lassopreds,
    ridgepreds,
    elo2preds,
    elo4preds,
    elo6preds,
    elo8preds,
) = (
    pipeline.predict(sen_test_X)
    for pipeline in (
        lrmodel,
        lassomodel,
        ridgemodel,
        elo2model,
        elo4model,
        elo6model,
        elo8model,
    )
)

The entries of each prediction array above (and the competitive-state indices printed earlier) correspond, in order, to the following states: [Alaska, Arizona, Colorado, Florida, Georgia, Missouri, Nevada, New Hampshire, North Carolina, Ohio, Pennsylvania, Washington, Wisconsin]

## More Inference for Senate Model

In [51]:
## For each Senate-fitted model: its coefficients in ascending (signed) order,
## then the column indices that give that order.
_senate_estimators = {
    "Logistic Regression - No Penalty": lr,
    "LASSO Regression": lassor,
    "Ridge Regression": ridger,
    "EL-0.2 Logistic Regression": elo2r,
    "EL-0.4 Logistic Regression": elo4r,
    "EL-0.6 Logistic Regression": elo6r,
    "EL-0.8 Logistic Regression": elo8r,
}
for _title, _est in _senate_estimators.items():
    print(_title)
    print(np.sort(_est.coef_))
    print(np.argsort(np.array(_est.coef_)))

Logistic Regression - No Penalty
[[-0.32816069 -0.30355349 -0.24572165 -0.16059677 -0.12099383 -0.11596618
  -0.07718537 -0.04641651 -0.04133707  0.03654313  0.06216207  0.07152417
   0.07470964  0.12012585  0.12355474  0.16363187  0.18095112  0.19615543
   0.21001494  0.22690729  0.23517212  0.27989027  0.27989027  0.28585068
   0.34094177  0.35560668  0.36238357  0.37242244  0.40485376  0.41668697
   0.44012307  0.4577108   0.47852678  0.47852678  0.5181924   0.55877095
   0.57437096  0.71489026  0.73914093]]
[[27 15 31 17 30 37 16 35 20 21 25  0 29 18 24  1 14 23 22 33 26  6 36 32
  19  4 10  2  5  3  8 38  7 34 12 28 13 11  9]]
LASSO Regression
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.00445296 0.00643072 0.05738035 0.05738035 0.06387875
  0.07328902 0.18118123 0.18337846 

## New Data Set

Here we apply a couple of small feature changes and train and test over similar, but new datasets.

In [52]:
# Read worksheets 2 and 3 of the same workbook; they are bound to
# `pres_data_new` and `sen_data_new` respectively.
pres_data_new, sen_data_new = (
    pd.read_excel('full-data.xlsx', sheet_name=sheet_idx) for sheet_idx in (2, 3)
)

In [53]:
## Training Set Info
## Re-encode Region from the new presidential sheet with the existing encoder
## object; the encoded vector is written into both new frames.
## NOTE(review): this assumes both sheets list their rows in the same order — confirm.
region_enc = le.fit_transform(pres_data_new['Region'])
pres_data_new["Region"] = sen_data_new["Region"] = region_enc

## Columns withheld from training: state label, 2020-era inputs, both outcomes.
noninc_cols_train = [
    'State',
    '2020 Population',
    'recent-five-polling-avg-20',
    'polling-party-lead-20',
    'pres2016',
    'pres2020',
]
pres_train_X = pres_data_new.drop(columns=noninc_cols_train)
pres_train_y = pres_data_new.loc[:, ['pres2016']]

In [54]:
## Test Set Info
## Dropped: the state label, the 2016-era inputs (2010 population, 2016 polling),
## the 2000 result, and the 2020 outcome (which becomes the test target).
noninc_cols_test = ['State','2010 Population','recent-five-polling-avg-16','polling-party-lead-16','pres2000','pres2020']
pres_test_X = pres_data_new.drop(columns=noninc_cols_test)
pres_test_y = pres_data_new[['pres2020']]

## Rename the 2020-era columns to the names the models were trained on; the
## 2016 result stands in for the 'pres2000' feature.
## The mapping is deliberately not called `dict`, which would shadow the builtin
## for the rest of the notebook.
test_to_train_names = {'2020 Population': '2010 Population',
                       'polling-party-lead-20': 'polling-party-lead-16',
                       'pres2016': 'pres2000',
                       'recent-five-polling-avg-20': 'recent-five-polling-avg-16',}
pres_test_X = pres_test_X.rename(columns=test_to_train_names)

## Align the columns with the training matrix by name instead of by a
## hard-coded insert position. This raises a KeyError if a training column is
## missing, rather than silently feeding features in the wrong order.
pres_test_X = pres_test_X[pres_train_X.columns]

In [55]:
# Preview the first five training rows to sanity-check the feature columns.
pres_train_X.head(n=5)

Unnamed: 0,Region,2010 Population,pres2000,pres2004,pres2008,pres2012,gdp-per-capita,high-school-pop,some-college,associates,...,white,inc-party,inc-w-recent,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg-16,polling-party-lead-16,pvi,party-pvi
0,2,4779736.0,0.0,0.0,0.0,0.0,49027.0,30.3,21.6,8.7,...,0.640074,1.0,1.0,6829526.55,37491.77,52035.0,16.0,0.0,15.0,0.0
1,3,710231.0,0.0,0.0,0.0,0.0,75027.0,28.4,26.0,8.7,...,0.627645,0.0,1.0,7527972.46,3566.69,77790.0,4.0,1.0,9.0,0.0
2,3,6392017.0,0.0,0.0,0.0,0.0,56511.0,23.8,24.9,8.9,...,0.517834,0.0,0.0,13839094.88,38865284.52,61529.0,2.0,0.0,3.0,0.0
3,2,2915918.0,0.0,0.0,0.0,0.0,47770.0,33.9,22.0,7.5,...,0.692194,1.0,1.0,5973540.8,26852.08,49475.0,23.0,0.0,16.0,0.0
4,3,37253956.0,1.0,1.0,1.0,1.0,85546.0,20.4,20.9,8.0,...,0.330164,1.0,1.0,135852.28,9467165.58,78672.0,28.0,1.0,14.0,1.0


In [56]:
## Refit every logistic-regression pipeline on the new presidential training set.
for _pipeline in (lrmodel, lassomodel, ridgemodel, elo2model, elo4model, elo6model):
    _pipeline.fit(pres_train_X, pres_train_y.values.ravel())
## The final fit stays the cell's last expression so the fitted pipeline is echoed.
elo8model.fit(pres_train_X, pres_train_y.values.ravel())



Pipeline(steps=[('standardize', StandardScaler()),
                ('elo8_reg',
                 LogisticRegression(l1_ratio=0.8, penalty='elasticnet',
                                    solver='saga'))])

In [57]:
## For each refitted model: its coefficients in ascending (signed) order,
## then the column indices that give that order.
_pres_estimators = {
    "Logistic Regression - No Penalty": lr,
    "LASSO Regression": lassor,
    "Ridge Regression": ridger,
    "EL-0.2 Logistic Regression": elo2r,
    "EL-0.4 Logistic Regression": elo4r,
    "EL-0.6 Logistic Regression": elo6r,
    "EL-0.8 Logistic Regression": elo8r,
}
for _title, _est in _pres_estimators.items():
    print(_title)
    print(np.sort(_est.coef_))
    print(np.argsort(np.array(_est.coef_)))

Logistic Regression - No Penalty
[[-0.66194529 -0.61136946 -0.55860429 -0.45792765 -0.24629473 -0.18206904
  -0.12793425  0.03503794  0.0353491   0.10341796  0.12540735  0.2076227
   0.20910578  0.2162791   0.23811112  0.25133489  0.25718787  0.25897391
   0.34196647  0.34312929  0.429106    0.45271046  0.48956586  0.49363849
   0.54266221  0.55993131  0.61601266  0.78431241  2.05591217]]
[[ 7  1 22  9 19 15 13  8 21  0 17 27  6  2 16 20 10  3 25 14 24 18  4 12
   5 23 11 26 28]]
LASSO Regression
[[-0.06753375  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.01231057  0.23534152  0.2354424   0.69051165  2.46068395]]
[[ 7  0 25 24 23 22 21 20 19 18 17 16 15 27 14 12 10  9  8  6  3  2  1 13
   4 11  5 26 28]]
Ridge Regression
[[-4.13380778e-01 -3.85599559e-01 -3.72958971e-01 -2.2137204

In [58]:
# Translate hard-coded column positions into feature names, one per line.
# NOTE(review): the positions look hand-copied from the rankings printed above — re-check after any refit.
print(*pres_train_X.columns[[28, 26, 5, 11, 4, 24]], sep="\n")

party-pvi
polling-party-lead-16
pres2012
grad-professional
pres2008
median-hh-income


In [70]:
# Translate a second set of hard-coded column positions into feature names, one per line.
# NOTE(review): positions are hard-coded — confirm which ranking they were taken from.
print(*pres_train_X.columns[[10, 9, 8, 6, 3, 2]], sep="\n")

bachelors
associates
some-college
gdp-per-capita
pres2004
pres2000


In [59]:
## Tree ensembles on the presidential training set; both use random_state 83.
forest = RandomForestClassifier(random_state=83, n_estimators=120)
forest.fit(pres_train_X, pres_train_y.values.ravel())

xgboost1 = xgb.XGBClassifier(
    n_estimators=120,
    learning_rate=0.1,
    colsample_bytree=0.3,
    alpha=10,
    booster='gbtree',
    base_score=0.5,
    random_state=83,
)
xgboost1.fit(pres_train_X, pres_train_y.values.ravel())

## Feature importances for each ensemble: ascending values, then the column
## indices that give that order (forest first, xgboost second).
print(
    np.sort(forest.feature_importances_),
    np.argsort(np.array(forest.feature_importances_)),
    np.sort(xgboost1.feature_importances_),
    np.argsort(np.array(xgboost1.feature_importances_)),
    sep="\n",
)

[1.49664751e-04 1.96348605e-03 2.19787194e-03 2.23746898e-03
 4.09094629e-03 4.10388258e-03 5.64675067e-03 6.71188500e-03
 7.35686638e-03 7.45568485e-03 8.70821793e-03 1.11084831e-02
 1.24441818e-02 1.29264470e-02 1.29269990e-02 1.29874984e-02
 1.42689151e-02 1.76963942e-02 3.20600728e-02 3.59874119e-02
 3.91224566e-02 4.79442737e-02 5.10695269e-02 5.36274339e-02
 6.11344545e-02 7.54317723e-02 1.05225719e-01 1.16296323e-01
 2.37118912e-01]
[21 13 15 20  0  9 12 17 25 23  7 19  2  6  8  1 16 27 22 18 10 14  3  5
 24  4 11 26 28]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.0032079  0.00460977 0.00479008 0.00483478
 0.00488032 0.0060679  0.00692078 0.00782292 0.01818532 0.02912245
 0.02963205 0.03391373 0.05099395 0.05742973 0.0588125  0.05913836
 0.0653655  0.09270841 0.11119918 0.12247662 0.22788776]
[ 0 25 23 21  8  9 15 13 18 10 19  2  6 12 14 20 16 22  1 17  7  4 27 11
  3 26 28 24  5]


In [60]:
# Predict the presidential test set with both tree ensembles and show each
# prediction vector under its heading.
rfpreds, xgbpreds = forest.predict(pres_test_X), xgboost1.predict(pres_test_X)
print("Random Forests", rfpreds, "Gradient Boosting", xgbpreds, sep="\n")

Random Forests
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
Gradient Boosting
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]


## Senate Fits

In [61]:
## errors="ignore" keeps this cell re-runnable after 'State' has already been removed.
sen_data_new = sen_data_new.drop(columns=["State"], errors="ignore")

## Derive the split from sen_data_new itself: rows with a missing 'recent-res'
## are the test set, all other rows are training rows. The mask must come from
## this frame, not from the earlier `sen_data`, so that features and labels
## are guaranteed to describe the same rows of the new data.
_new_result_missing = sen_data_new["recent-res"].isnull()
sen_train_X = sen_data_new.loc[~_new_result_missing].drop(columns=["recent-res"])
sen_train_y = sen_data_new.loc[~_new_result_missing, "recent-res"]
sen_test_X = sen_data_new.loc[_new_result_missing].drop(columns=["recent-res"])

In [62]:
# Preview the first five rows of the new Senate frame.
sen_data_new.head(n=5)

Unnamed: 0,Region,2020 Population,pres2012,pres2016,pres2020,sen1-recent,sen2-recent,gdp-per-capita,high-school-pop,some-college,...,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg,polling-party-lead,pvi,party-pvi,employment-rate,urban-pct-2010,recent-res
0,2,5024279.0,0.0,0.0,0.0,0.0,0.0,49027.0,30.3,21.6,...,6829526.55,37491.77,52035.0,20.4,0.0,15.0,0.0,54.0,59.04,0.0
1,3,733391.0,0.0,0.0,0.0,0.0,0.0,75027.0,28.4,26.0,...,7527972.46,3566.69,77790.0,6.5,0.0,9.0,0.0,59.6,66.02,
2,3,7151502.0,0.0,0.0,1.0,1.0,1.0,56511.0,23.8,24.9,...,13839094.88,38865284.52,61529.0,7.0,1.0,3.0,0.0,56.2,89.81,
3,2,3011524.0,0.0,0.0,0.0,0.0,0.0,47770.0,33.9,22.0,...,5973540.8,26852.08,49475.0,45.0,0.0,16.0,0.0,54.8,56.16,0.0
4,3,39538223.0,1.0,1.0,1.0,1.0,1.0,85546.0,20.4,20.9,...,135852.28,9467165.58,78672.0,25.0,1.0,14.0,1.0,59.4,94.95,1.0


In [63]:
## Refit every logistic-regression pipeline on the new Senate training rows.
for _pipeline in (lrmodel, lassomodel, ridgemodel, elo2model, elo4model, elo6model):
    _pipeline.fit(sen_train_X, sen_train_y.values.ravel())
## The final fit stays the cell's last expression so the fitted pipeline is echoed.
elo8model.fit(sen_train_X, sen_train_y.values.ravel())



Pipeline(steps=[('standardize', StandardScaler()),
                ('elo8_reg',
                 LogisticRegression(l1_ratio=0.8, penalty='elasticnet',
                                    solver='saga'))])

In [64]:
## For every fitted model, print its coefficients sorted in ascending (signed)
## order, followed by the column indices that produce that order.
## Note the sort is by signed value, not by magnitude: large negative
## coefficients appear first and large positive ones last.
##
## Each label is paired with its own estimator, so the index row always belongs
## to the coefficient row printed directly above it. (The elastic-net index rows
## must come from elo2r/elo4r/elo6r/elo8r, not from the lr/lasso/ridge fits.)
for _label, _estimator in [
    ("Logistic Regression - No Penalty", lr),
    ("LASSO Regression", lassor),
    ("Ridge Regression", ridger),
    ("EL-0.2 Logistic Regression", elo2r),
    ("EL-0.4 Logistic Regression", elo4r),
    ("EL-0.6 Logistic Regression", elo6r),
    ("EL-0.8 Logistic Regression", elo8r),
]:
    print(_label)
    print(np.sort(_estimator.coef_))
    print(np.argsort(_estimator.coef_))

Logistic Regression - No Penalty
[[-0.38130064 -0.31374824 -0.30988359 -0.20462848 -0.17997388 -0.16930799
  -0.09505919 -0.08087766  0.02059206  0.036911    0.04578041  0.04962242
   0.05823185  0.10549211  0.11061087  0.14069052  0.14317127  0.19927628
   0.22338205  0.26518764  0.26518764  0.26656195  0.27353887  0.33767021
   0.49774141  0.70228456  0.7119952   0.74964777  0.74964777  0.76423691
   0.93464787  1.27956513]]
[[24 20  8 10 30 28  0  9 23 13 17 14 18  1 22 19 11  7 16  3 29 26 15 25
  12 31  2 27  4  5 21  6]]
LASSO Regression
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.08232949 0.22735818 0.26778139 0.27242637 0.38136467 0.42208585
  0.42208585 1.60189338]]
[[ 0 29 28 26 25 24 23 22 20 19 18 17 16 30 15 13 11 10  9  8  1  7 14  3
  12  5 31  2 21 27  4  6]]
Ridge Regressi

In [65]:
# Translate hard-coded column positions into Senate feature names, one per line.
# NOTE(review): the positions look hand-copied from the coefficient rankings above — re-check after any refit.
print(*sen_train_X.columns[[6, 21, 27, 4]], sep="\n")

sen2-recent
inc-party
polling-party-lead
pres2020


In [66]:
## Refit both tree ensembles on the Senate training rows.
for _tree_model in (forest, xgboost1):
    _tree_model.fit(sen_train_X, sen_train_y.values.ravel())

## Predict the competitive-state rows with all nine models.
(
    logpreds,
    lassopreds,
    ridgepreds,
    elo2preds,
    elo4preds,
    elo6preds,
    elo8preds,
    rfpreds,
    xgbpreds,
) = (
    fitted.predict(sen_test_X)
    for fitted in (
        lrmodel,
        lassomodel,
        ridgemodel,
        elo2model,
        elo4model,
        elo6model,
        elo8model,
        forest,
        xgboost1,
    )
)

## One prediction vector per line, in the same model order as above.
print(
    logpreds,
    lassopreds,
    ridgepreds,
    elo2preds,
    elo4preds,
    elo6preds,
    elo8preds,
    rfpreds,
    xgbpreds,
    sep="\n",
)

[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.]
[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 1.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.]
[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.]


The entries of each prediction array above correspond, in order, to the following states: [Alaska, Arizona, Colorado, Florida, Georgia, Missouri, Nevada, New Hampshire, North Carolina, Ohio, Pennsylvania, Washington, Wisconsin]

In [67]:
# Feature importances of the Senate-fitted ensembles: ascending values, then the
# column indices that give that order (forest first, xgboost second).
print(
    np.sort(forest.feature_importances_),
    np.argsort(np.array(forest.feature_importances_)),
    np.sort(xgboost1.feature_importances_),
    np.argsort(np.array(xgboost1.feature_importances_)),
    sep="\n",
)

[0.         0.         0.00063837 0.00189809 0.00337442 0.00339247
 0.0045115  0.00587309 0.00630449 0.00646727 0.00647933 0.00747065
 0.0076196  0.00903482 0.00909733 0.01174924 0.0126972  0.01469967
 0.01558472 0.01882768 0.02053692 0.03598656 0.03621964 0.04266007
 0.04860869 0.05625146 0.0601764  0.0979208  0.09905442 0.10028748
 0.10743275 0.1491449 ]
[ 1 22 14  0 16 17 10 30 13  7 28 26 19 18 20  8 11  9 24 23 21 15 31 25
  5  3 29  4 27  2 12  6]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.00917806 0.01031382 0.01047874 0.01969595 0.02188664
 0.02883478 0.02916104 0.03806818 0.09670313 0.10515018 0.11828082
 0.20135534 0.3108933 ]
[ 0 20 19 18 23 16 30 14 13 11  8  7 24 25 26 28  1  9 22  3 12 10 21 31
 15  4 17  2  6 27 29  5]


In [68]:
# Names of the four highest-importance columns for the random forest, most important first.
# The positions are hard-coded from the importance ranking printed by the previous cell.
print(*sen_train_X.columns[[6, 12, 2, 27]], sep="\n")

# Same for xgboost, again with positions hard-coded from its ranking.
print(*sen_train_X.columns[[5, 29, 27, 6]], sep="\n")

sen2-recent
grad-professional
pres2012
polling-party-lead
sen1-recent
party-pvi
polling-party-lead
sen2-recent
