In [145]:
from google.colab import drive
from google.auth import default
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [146]:
%cd /content/drive/MyDrive/Coursework/STATS/315B/project

/content/drive/MyDrive/Coursework/STATS/315B/project


## Imports and Setup

In [147]:
## Project Imports
import gspread
import os
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [148]:
pres_data = pd.read_excel('full-data.xlsx', sheet_name=0)
sen_data = pd.read_excel('full-data.xlsx', sheet_name=1)

In [149]:
cwd = os.getcwd() ## Needed for writing csvs 
print(cwd)

/content/drive/MyDrive/Coursework/STATS/315B/project


## A Bit of Data Exploration

Mostly just a view into different features as part of the models (will elaborate this later in the final paper and within this notebook).

In [150]:
pres_data.head()

Unnamed: 0,State,Region,2010 Population,2020 Population,pres2000,pres2004,pres2008,pres2012,sen1-3rdrecent,sen2-3rdrecent,...,dem-finance,median-hh-income,recent-five-polling-avg-16,polling-party-lead-16,recent-five-polling-avg-20,polling-party-lead-20,pvi,party-pvi,pres2016,pres2020
0,Alabama,South,4779736.0,5024279.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37491.77,52035.0,16.0,0.0,19.5,0.0,15.0,0.0,0.0,0.0
1,Alaska,West,710231.0,733391.0,0.0,0.0,0.0,0.0,1.0,0.0,...,3566.69,77790.0,4.0,1.0,7.7,0.0,9.0,0.0,0.0,0.0
2,Arizona,West,6392017.0,7151502.0,0.0,0.0,0.0,0.0,0.0,0.0,...,38865284.52,61529.0,2.0,0.0,2.6,1.0,3.0,0.0,0.0,1.0
3,Arkansas,South,2915918.0,3011524.0,0.0,0.0,0.0,0.0,1.0,1.0,...,26852.08,49475.0,23.0,0.0,22.8,0.0,16.0,0.0,0.0,0.0
4,California,West,37253956.0,39538223.0,1.0,1.0,1.0,1.0,1.0,1.0,...,9467165.58,78672.0,28.0,1.0,29.2,1.0,14.0,1.0,1.0,1.0


In [151]:
sen_data.head()

Unnamed: 0,State,Region,2020 Population,pres2000,pres2004,pres2008,pres2012,pres2016,pres2020,sen1-recent,...,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg,polling-party-lead,pvi,party-pvi,employment-rate,urban-pct-2010,recent-res
0,Alabama,South,5024279.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6829526.55,37491.77,52035.0,20.4,0.0,15.0,0.0,54.0,59.04,0.0
1,Alaska,West,733391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,7527972.46,3566.69,77790.0,6.5,0.0,9.0,0.0,59.6,66.02,
2,Arizona,West,7151502.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,13839094.88,38865284.52,61529.0,7.0,1.0,3.0,0.0,56.2,89.81,
3,Arkansas,South,3011524.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5973540.8,26852.08,49475.0,45.0,0.0,16.0,0.0,54.8,56.16,0.0
4,California,West,39538223.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,135852.28,9467165.58,78672.0,25.0,1.0,14.0,1.0,59.4,94.95,1.0


In [152]:
## First we form predictions for presidential data
## We will use 2016 results as a training set, this will involve using all 
## features above except for 2020 population and 2020 polling info columns 
## (and obviously the )


le = LabelEncoder()
region_enc = le.fit_transform(pres_data['Region'])
pres_data["Region"] = region_enc
sen_data["Region"] = region_enc

In [153]:
noninc_cols_train = ['State','2020 Population','recent-five-polling-avg-20','polling-party-lead-20','pres2016','pres2020']
pres_train_X = pres_data.drop(noninc_cols_train, axis=1)
pres_train_y = pres_data[['pres2016']]

pres_train_X.to_csv(cwd + "\\pres_train_X.csv")
pres_train_y.to_csv(cwd + "\\pres_train_y.csv")

In [154]:
pres_train_X.head()

Unnamed: 0,Region,2010 Population,pres2000,pres2004,pres2008,pres2012,sen1-3rdrecent,sen2-3rdrecent,gdp,high-school-pop,...,white,inc-party,inc-w-recent,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg-16,polling-party-lead-16,pvi,party-pvi
0,2,4779736.0,0.0,0.0,0.0,0.0,0.0,0.0,226896.5,30.3,...,0.640074,1.0,1.0,6829526.55,37491.77,52035.0,16.0,0.0,15.0,0.0
1,3,710231.0,0.0,0.0,0.0,0.0,1.0,0.0,49820.0,28.4,...,0.627645,0.0,1.0,7527972.46,3566.69,77790.0,4.0,1.0,9.0,0.0
2,3,6392017.0,0.0,0.0,0.0,0.0,0.0,0.0,373719.0,23.8,...,0.517834,0.0,0.0,13839094.88,38865284.52,61529.0,2.0,0.0,3.0,0.0
3,2,2915918.0,0.0,0.0,0.0,0.0,1.0,1.0,130750.7,33.9,...,0.692194,1.0,1.0,5973540.8,26852.08,49475.0,23.0,0.0,16.0,0.0
4,3,37253956.0,1.0,1.0,1.0,1.0,1.0,1.0,3007187.7,20.4,...,0.330164,1.0,1.0,135852.28,9467165.58,78672.0,28.0,1.0,14.0,1.0


In [155]:
pres_train_y.head()

Unnamed: 0,pres2016
0,0.0
1,0.0
2,0.0
3,0.0
4,1.0


## Model Creation

Models here are: logistic regression with no penalty, logistic regression models with l1 and l2 penalization terms, random forests, xgboost.

Note for logistic regression we employ a standard scaling as necessary when applying regularization terms. We keep the standardization even for no-penalty (regular) logistic regression just for the sake of completion. No standardization is done for 

In [156]:
## Logistic Regression with Scaling
scaler = StandardScaler()
lr = LogisticRegression(penalty='none', solver = 'saga')
lrmodel = Pipeline([('standardize', scaler), ('log_reg', lr)])
lrmodel.fit(pres_train_X,pres_train_y.values.ravel())

## L1-Penalty Logistic Regression ("LASSO-like" logistic regression)
lassor = LogisticRegression(penalty='l1', solver = 'saga')
lassomodel = Pipeline([('standardize', scaler), ('lasso_log_reg', lassor)])
lassomodel.fit(pres_train_X,pres_train_y.values.ravel())

## L2-Penalty Logistic Regression
ridger = LogisticRegression()
ridgemodel = Pipeline([('standardize', scaler), ('ridge_log_reg', ridger)])
ridgemodel.fit(pres_train_X,pres_train_y.values.ravel())



Pipeline(steps=[('standardize', StandardScaler()),
                ('ridge_log_reg', LogisticRegression())])

In [157]:
## Decision Trees
dtree = DecisionTreeClassifier(random_state = 83)
dtree.fit(pres_train_X,pres_train_y.values.ravel())

DecisionTreeClassifier(random_state=83)

In [158]:
## Random Forests
forest = RandomForestClassifier(n_estimators = 120, random_state = 83)
forest.fit(pres_train_X,pres_train_y.values.ravel())

RandomForestClassifier(n_estimators=120, random_state=83)

In [159]:
## XGBoost
xgboost1 = xgb.XGBClassifier(base_score=0.5, booster='gbtree', random_state = 83, colsample_bytree = 0.3, learning_rate = 0.1, alpha = 10, n_estimators = 120)
xgboost1.fit(pres_train_X,pres_train_y.values.ravel())

XGBClassifier(alpha=10, colsample_bytree=0.3, n_estimators=120, random_state=83)

## Some Inference Info (Coefficients)

In [160]:
## Coefficients organized from least important to most important
print(np.sort(lr.coef_))
print(np.argsort(np.array(lr.coef_)))

print(np.sort(lassor.coef_))
print(np.argsort(np.array(lassor.coef_)))

print(np.sort(ridger.coef_))
print(np.argsort(np.array(ridger.coef_)))

[[-0.63522668 -0.54890935 -0.46846031 -0.46732825 -0.30468706 -0.26275472
  -0.23327019 -0.13048911 -0.1100392  -0.07166002  0.08120737  0.08534572
   0.11665046  0.11951665  0.16046987  0.1901478   0.19141705  0.22784826
   0.30762721  0.32658518  0.36894064  0.40303548  0.43748837  0.47168472
   0.50418456  0.50858985  0.54875366  0.55347809  0.68910551  0.77184036
   1.96047481]]
[[ 9 24 11  1  6 21  8 17 10 15 23  0 22 19  2 29  3 18 12 16 27 20  4 26
   5 25  7 14 13 28 30]]
[[-0.03390862  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.12143516  0.21769316  0.24887075  0.65538543
   2.42285615]]
[[ 9  0 27 26 25 24 23 22 21 20 19 18 17 16 29 15 12 11 10  8  6  4  3  2
   1 14  7  5 13 28 30]]
[[-0.40214395 -0.37667092 -0.28359664 -0.21455842 -0.15021314 

In [161]:
## Get Tree Representation
text_representation = tree.export_text(dtree)
print(text_representation)

|--- feature_30 <= 0.50
|   |--- class: 0.0
|--- feature_30 >  0.50
|   |--- class: 1.0



In [162]:
## Feature Importances for Ensembles
print(np.sort(dtree.feature_importances_))
print(np.argsort(np.array(dtree.feature_importances_)))
print(np.sort(forest.feature_importances_))
print(np.argsort(np.array(forest.feature_importances_)))
print(np.sort(xgboost1.feature_importances_))
print(np.argsort(np.array(xgboost1.feature_importances_)))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1.]
[ 0 28 27 26 25 24 23 22 21 20 19 18 17 16 29 15 13 12 11 10  9  8  7  6
  5  4  3  2  1 14 30]
[0.         0.00136609 0.0027355  0.00519779 0.00554901 0.00579912
 0.00656748 0.00728432 0.00760413 0.00809159 0.00954432 0.00963875
 0.01029661 0.01032012 0.01124802 0.01372405 0.01460365 0.02029505
 0.03013778 0.03021072 0.0319415  0.03309494 0.03346494 0.041301
 0.04318408 0.04996164 0.05742606 0.06927306 0.08075092 0.14893616
 0.20045158]
[ 6 23 15 17 11 19  8  1 27 25 14 22 29  0 18 10 21 20  9 24  4  3 12 16
  7  2  5 13 26 28 30]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.00160409 0.00553188 0.00649872
 0.00676417 0.00776622 0.0078066  0.00786895 0.00876377 0.01102983
 0.01496952 0.01569865 0.01605816 0.01612681 0.01779142 0.02417387
 0.05457897 0.05931142 0.08839258 0.09506239 0.09619368 0.17068237
 0.26732597]
[ 0 25 23 17 29 14 10 15  

In [182]:
print(pres_train_X.columns[30])
print(pres_train_X.columns[28])
print(pres_train_X.columns[7])
print(pres_train_X.columns[5])

print(pres_train_X.columns[26])
print(pres_train_X.columns[2])
print(pres_train_X.columns[13])

party-pvi
polling-party-lead-16
sen2-3rdrecent
pres2012
median-hh-income
pres2000
grad-professional


## Creating a Test Set

Test set here is similar features to the training data, however we make the following changes:

*   Obviously remove any 2016 related info, so we remove 2010 population counts and the 2016 presidential polling averages.
*   From the training data, we fit the model with using 2000 presidential election results, but instead we will use 



In [164]:
noninc_cols_test = ['State','2010 Population','recent-five-polling-avg-16','polling-party-lead-16','pres2000','pres2020']
pres_test_X = pres_data.drop(noninc_cols_test, axis=1)
pres_test_y = pres_data[['pres2020']]

## Move 2016 results to position where 2012 results
## Here we use 2004, 2008, 2012, 2016 results as predictors towards 2020 results
## Since 2000 was removed we have to rearrange columns here
col_move = pres_test_X.pop("pres2016")
pres_test_X.insert(2, "pres2016", col_move)

## Rename columns just so the models can run
## A little bit hacky but we turn the 2000 results into 2016 results
dict = {'2020 Population': '2010 Population',
        'polling-party-lead-20': 'polling-party-lead-16',
        'pres2016': 'pres2000',
        'recent-five-polling-avg-20': 'recent-five-polling-avg-16',}
 
# call rename () method
pres_test_X.rename(columns=dict,inplace=True)

pres_test_X.to_csv(cwd + "\\pres_test_X.csv")
pres_test_y.to_csv(cwd + "\\pres_test_y.csv")

In [165]:
pres_test_X.head() ## We see the data matches the format as our train set :)

Unnamed: 0,Region,2010 Population,pres2000,pres2004,pres2008,pres2012,sen1-3rdrecent,sen2-3rdrecent,gdp,high-school-pop,...,white,inc-party,inc-w-recent,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg-16,polling-party-lead-16,pvi,party-pvi
0,2,5024279.0,0.0,0.0,0.0,0.0,0.0,0.0,226896.5,30.3,...,0.640074,1.0,1.0,6829526.55,37491.77,52035.0,19.5,0.0,15.0,0.0
1,3,733391.0,0.0,0.0,0.0,0.0,1.0,0.0,49820.0,28.4,...,0.627645,0.0,1.0,7527972.46,3566.69,77790.0,7.7,0.0,9.0,0.0
2,3,7151502.0,0.0,0.0,0.0,0.0,0.0,0.0,373719.0,23.8,...,0.517834,0.0,0.0,13839094.88,38865284.52,61529.0,2.6,1.0,3.0,0.0
3,2,3011524.0,0.0,0.0,0.0,0.0,1.0,1.0,130750.7,33.9,...,0.692194,1.0,1.0,5973540.8,26852.08,49475.0,22.8,0.0,16.0,0.0
4,3,39538223.0,1.0,1.0,1.0,1.0,1.0,1.0,3007187.7,20.4,...,0.330164,1.0,1.0,135852.28,9467165.58,78672.0,29.2,1.0,14.0,1.0


In [166]:
pres_test_y.head()

Unnamed: 0,pres2020
0,0.0
1,0.0
2,1.0
3,0.0
4,1.0


In [167]:
## Form predictions over all models
logpreds = lrmodel.predict(pres_test_X)
lassopreds = lassomodel.predict(pres_test_X)
ridgepreds = ridgemodel.predict(pres_test_X)
treepreds = dtree.predict(pres_test_X)
rfpreds = forest.predict(pres_test_X)
xgbpreds = xgboost1.predict(pres_test_X)

In [168]:
## Notably, all the classification results are the same
print(logpreds)
print(lassopreds)
print(ridgepreds)
print(treepreds)
print(rfpreds)
print(xgbpreds)

[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]
[0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0.
 0. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0.
 0. 0.]


In [169]:
preds = np.array(xgbpreds)
true = np.array(pres_test_y.values.ravel())
print(np.equal(preds,true))
print(np.where(np.not_equal(preds,true)))

[ True  True False  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True False  True  True  True  True  True  True  True  True  True  True
 False  True]
(array([ 2,  9, 21, 37, 48]),)


From the above difference checking between predictions and true values, we see that indices 2,9,21,37,48 in our states data were misclassified. These correspond in the data to the following states: Arizona, Georgia, Michigan, Pennsylvania, and Wisconsin. These were all flips between the 2016 and 2020 election.

## Senate Predictions

Here we form predictions for the US senate for the upcoming 2022 Midterm Elections.

As a note, the preprocessing here for the senate data is altered. We have a set of 13 "competitive" states that are considered competitive based off [2022 US Senate Election Ratings](https://en.wikipedia.org/wiki/2022_United_States_Senate_elections#Predictions). These will form our test set,and the training set will be over other states where (1) they have no Senate competition in 2022 so we use their most recent party outcome for the most recent Senate election in 2020 or (2) forecasting and demographic/polling factors strongly indicate a party direction for the Senate results, to the point where no forecasting is necessary and polling is usually sparse. The paper addresses this in more depth as a potential challenge.

In [170]:
null_results = pd.isnull(sen_data["recent-res"])
competitive_states = [i for i, n in enumerate(null_results) if n == True] 
other_states = list(set(list(range(50))) - set(competitive_states))
print(other_states)
print(competitive_states)

sen_data = sen_data.drop(["State"], axis = 1)
sen_train_X = sen_data.iloc[other_states]
sen_train_X = sen_train_X.drop(["recent-res"], axis = 1)
sen_train_y = sen_data[~sen_data['recent-res'].isnull()]['recent-res']
sen_test_X = sen_data.iloc[competitive_states]
sen_test_X = sen_test_X.drop(["recent-res"], axis = 1)

## Write CSVs, this is for inference analysis in R for logistic models
sen_train_X.to_csv(cwd + "\\sen_train_X.csv")
sen_train_y.to_csv(cwd + "\\sen_train_y.csv")
sen_test_X.to_csv(cwd + "\\sen_test_X.csv")

[0, 3, 4, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 43, 44, 45, 47, 49]
[1, 2, 5, 8, 9, 24, 27, 28, 32, 34, 37, 46, 48]


In [171]:
sen_train_X.head()

Unnamed: 0,Region,2020 Population,pres2000,pres2004,pres2008,pres2012,pres2016,pres2020,sen1-recent,sen2-recent,...,inc-w-recent,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg,polling-party-lead,pvi,party-pvi,employment-rate,urban-pct-2010
0,2,5024279.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,6829526.55,37491.77,52035.0,20.4,0.0,15.0,0.0,54.0,59.04
3,2,3011524.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,5973540.8,26852.08,49475.0,45.0,0.0,16.0,0.0,54.8,56.16
4,3,39538223.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,135852.28,9467165.58,78672.0,25.0,1.0,14.0,1.0,59.4,94.95
6,0,3605944.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1088131.76,7442253.87,79855.0,19.6,1.0,7.0,1.0,61.7,87.99
7,2,989948.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,479515.11,7217773.49,69110.0,30.0,1.0,6.0,1.0,58.0,83.3


In [172]:
sen_train_y.head()

0    0.0
3    0.0
4    1.0
6    1.0
7    1.0
Name: recent-res, dtype: float64

In [173]:
sen_test_X.head()

Unnamed: 0,Region,2020 Population,pres2000,pres2004,pres2008,pres2012,pres2016,pres2020,sen1-recent,sen2-recent,...,inc-w-recent,rep-finance,dem-finance,median-hh-income,recent-five-polling-avg,polling-party-lead,pvi,party-pvi,employment-rate,urban-pct-2010
1,3,733391.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,7527972.46,3566.69,77790.0,6.5,0.0,9.0,0.0,59.6,66.02
2,3,7151502.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,13839094.88,38865284.52,61529.0,7.0,1.0,3.0,0.0,56.2,89.81
5,3,5773714.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,11162092.65,75231.0,8.5,1.0,3.0,1.0,64.3,86.15
8,2,21538187.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,30159319.08,30773246.01,57703.0,9.0,0.0,3.0,0.0,55.4,91.16
9,2,10711908.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,16470998.24,73223096.79,61224.0,0.5,0.0,3.0,0.0,59.2,75.07


In [174]:
lrmodel.fit(sen_train_X,sen_train_y.values.ravel())
lassomodel.fit(sen_train_X,sen_train_y.values.ravel())
ridgemodel.fit(sen_train_X,sen_train_y.values.ravel())
dtree.fit(sen_train_X,sen_train_y.values.ravel())
forest.fit(sen_train_X,sen_train_y.values.ravel())
xgboost1.fit(sen_train_X,sen_train_y.values.ravel())



XGBClassifier(alpha=10, colsample_bytree=0.3, n_estimators=120, random_state=83)

In [175]:
## Form predictions over all models
logpreds = lrmodel.predict(sen_test_X)
lassopreds = lassomodel.predict(sen_test_X)
ridgepreds = ridgemodel.predict(sen_test_X)
treepreds = dtree.predict(sen_test_X)
rfpreds = forest.predict(sen_test_X)
xgbpreds = xgboost1.predict(sen_test_X)

In [176]:
## Differences here!
print(logpreds)
print(lassopreds)
print(ridgepreds)
print(treepreds)
print(rfpreds)
print(xgbpreds)

[0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 1.]
[0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0.]
[0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 1.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.]
[0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0.]
[0. 1. 1. 0. 1. 0. 1. 1. 0. 0. 0. 1. 0.]


The indices above correspond to the following states: [Alaska, Arizona, Colorado, Florida, Georgia, Missouri, Nevada, New Hampshire, North Carolina, Ohio, Pennsylvania, Washington, Wisconsin]

## More Inference for Senate Model

In [177]:
## Coefficients organized from least important to most important
print(np.sort(lr.coef_))
print(np.argsort(np.array(lr.coef_)))

print(np.sort(lassor.coef_))
print(np.argsort(np.array(lassor.coef_)))

print(np.sort(ridger.coef_))
print(np.argsort(np.array(ridger.coef_)))

[[-0.30257861 -0.29466422 -0.28885609 -0.19722093 -0.09989724 -0.08165607
  -0.0799471  -0.07744254 -0.04599377 -0.02751796 -0.00417817  0.06489185
   0.06517315  0.12781807  0.13935924  0.14017504  0.14771562  0.15892526
   0.17514169  0.20333729  0.20939747  0.24555895  0.24555895  0.29297501
   0.3681319   0.3681733   0.3749692   0.37919283  0.38373684  0.41359122
   0.44662094  0.44662094  0.44743388  0.48920077  0.49662945  0.50902328
   0.63755006  0.72786629  0.76343373]]
[[31 27 15 17 30 37 35 16 21 20  0 29 25 23 14  1 18 33 24 22 26  6 36 32
   3  4 19  2 10  5  7 34  8 12 28 38 13 11  9]]
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.0045224  0.06244795 0.06244795 0.06692976
  0.07793053 0.16883411 0.17740417 0.20030645 0.2585641  0.35815732
  0.35815732 0.76

In [183]:
print(sen_train_X.columns[9])
print(sen_train_X.columns[11])
print(sen_train_X.columns[13])
print(sen_train_X.columns[34])
print(sen_train_X.columns[28])
print(sen_train_X.columns[7])
print(sen_train_X.columns[16])
print(sen_train_X.columns[15])
print(sen_train_X.columns[19])
print(sen_train_X.columns[38])

sen2-recent
sen2-2ndrecent
sen2-3rdrecent
polling-party-lead
inc-party
pres2020
some-college
high-school-pop
grad-professional
urban-pct-2010


In [179]:
## Get Tree Representation
text_representation = tree.export_text(dtree)
print(text_representation)

|--- feature_9 <= 0.50
|   |--- class: 0.0
|--- feature_9 >  0.50
|   |--- class: 1.0



In [180]:
## Feature Importances for Ensembles
print(np.sort(dtree.feature_importances_))
print(np.argsort(np.array(dtree.feature_importances_)))
print(np.sort(forest.feature_importances_))
print(np.argsort(np.array(forest.feature_importances_)))
print(np.sort(xgboost1.feature_importances_))
print(np.argsort(np.array(xgboost1.feature_importances_)))

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
[ 0 21 22 23 24 25 26 27 20 28 30 31 32 33 34 35 36 29 37 19 17  1  2  3
  4  5  6  7 18  8 10 11 12 13 14 15 16 38  9]
[0.         0.         0.         0.         0.         0.00025305
 0.00068519 0.00077083 0.00088916 0.00091766 0.00123467 0.00163139
 0.00180508 0.00231108 0.002372   0.00280412 0.00404633 0.00504132
 0.00644731 0.00691513 0.01043537 0.0111667  0.01175279 0.01782676
 0.01998911 0.02058077 0.02112438 0.02780488 0.03595817 0.03722988
 0.04234075 0.04881164 0.05141508 0.06861354 0.07005088 0.07988107
 0.0875037  0.11174531 0.18764492]
[24 21 18 26 29 16  0  1 35 25 23 27 37 14 12 17 30 33 15 10  3 20 31 13
  4 38 28 32  8  2 19 22 36  6  5  7 34 11  9]
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0. 