# ECMA 31330 Final Project
### Abby Beckler, Miles Brown

In [95]:
# imports
import os
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.impute import KNNImputer
from sklearn.linear_model import Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

## Load Data + Use KNN to Impute Missing Values

In [96]:
# set seed to ensure reproducibility
np.random.seed(25)

# load the data
ANES = pd.read_csv('data/ANES.csv')

# remove rows with missing 'pocket', 'retro', and 'trumpft' values
ANES = ANES.dropna(subset=['pocket', 'retro', 'trumpft'])

# impute using k-nearest neighbors imputer
imputer = KNNImputer(n_neighbors=5)
ANES = pd.DataFrame(imputer.fit_transform(ANES), columns=ANES.columns)

# add relevant columns 'pocketEval' and 'econEval'
ANES['pocketEval'] = (ANES['pocket'] - 3) * (-1/2)
ANES['econEval'] = (ANES['retro'] - 3) * (-1/2)

## Perform Initial OLS Regressions (same as K&K)

In [97]:
# regress 'pocketEval' on 'trumpft'
X_pocket = ANES['pocketEval']
X_pocket = sm.add_constant(X_pocket)
y = ANES['trumpft']
model_pocket = sm.OLS(y, X_pocket).fit()

# regresss 'econEval' on 'trumpft'
X_econ = ANES['econEval']
X_econ = sm.add_constant(X_econ)
model_econ = sm.OLS(y, X_econ).fit()

# compare the models
print(model_pocket.summary())
print(model_econ.summary())

                            OLS Regression Results                            
Dep. Variable:                trumpft   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.037
Method:                 Least Squares   F-statistic:                     304.3
Date:                Sun, 09 Mar 2025   Prob (F-statistic):           6.59e-67
Time:                        13:24:42   Log-Likelihood:                -40764.
No. Observations:                7998   AIC:                         8.153e+04
Df Residuals:                    7996   BIC:                         8.155e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         40.1407      0.443     90.694      0.0

## More Complex OLS Regressions (+ Covariates)

In [98]:
# create list of covariate columns
covariates = ANES.columns.tolist()
removed = ['trumpft', 'trumpft_post', 'pocket', 'retro', 'pocketEval', 'econEval', 'votepres', 
           'bidenft', 'bidenft_post', 'pid_lean', 'pid_strong', 'pid3', 'pid7']
for col in removed:
    covariates.remove(col)

# regress 'trumpft' on 'pocketEval' variable + covariates
X_pocket_cov = ANES[['pocketEval'] + covariates]
X_pocket_cov = sm.add_constant(X_pocket_cov)
model_pocket_cov = sm.OLS(y, X_pocket_cov).fit()

# regress 'trumpft' on 'econEval' variable + covariates
X_econ_cov = ANES[['econEval'] + covariates]
X_econ_cov = sm.add_constant(X_econ_cov)
model_econ_cov = sm.OLS(y, X_econ_cov).fit()

# compare the models with the covariates
# NOTE: the coefficient for pocketEval is no longer significant at p<0.05
print(model_pocket_cov.summary())
print(model_econ_cov.summary())


                            OLS Regression Results                            
Dep. Variable:                trumpft   R-squared:                       0.874
Model:                            OLS   Adj. R-squared:                  0.874
Method:                 Least Squares   F-statistic:                     1153.
Date:                Sun, 09 Mar 2025   Prob (F-statistic):               0.00
Time:                        13:24:42   Log-Likelihood:                -32617.
No. Observations:                7998   AIC:                         6.533e+04
Df Residuals:                    7949   BIC:                         6.567e+04
Df Model:                          48                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                  100.6644 

## LASSO Regression

In [99]:
# train test split the pocket and econ + covariate data
X_pocket_cov_train, X_pocket_cov_test, y_train, y_test = train_test_split(X_pocket_cov, y, test_size=0.2, random_state=25)
X_econ_cov_train, X_econ_cov_test, y_train, y_test = train_test_split(X_econ_cov, y, test_size=0.2, random_state=25)

# fit LASSO models
lasso_pocket = Lasso(alpha=0.5, random_state=25)
lasso_pocket.fit(X_pocket_cov_train, y_train)

lasso_econ = Lasso(alpha=0.5, random_state=25)
lasso_econ.fit(X_econ_cov_train, y_train)

# compare the selected features for the two models
print('Selected features for pocketEval model:')
print(X_pocket_cov.columns[lasso_pocket.coef_ != 0])
print('Number of selected features:', sum(lasso_pocket.coef_ != 0))

print('Selected features for econEval model:')
print(X_econ_cov.columns[lasso_econ.coef_ != 0])
print('Number of selected features:', sum(lasso_econ.coef_ != 0))

Selected features for pocketEval model:
Index(['sp_serv', 'sp_serv_biden', 'sp_serv_trump', 'jobs_living',
       'jobs_living_trump', 'envir_business', 'envir_business_trump',
       'abortion', 'abortion_biden', 'covid_approval', 'covid_response_speed',
       'mail_in_attitude', 'police_treat', 'repro', 'mail_in', 'stateFIPS',
       'age', 'marital', 'income', 'ideology', 'trans', 'immigration',
       'gun_diff', 'offensive_language'],
      dtype='object')
Number of selected features: 24
Selected features for econEval model:
Index(['sp_serv', 'sp_serv_biden', 'sp_serv_trump', 'jobs_living',
       'jobs_living_trump', 'envir_business', 'envir_business_trump',
       'abortion', 'abortion_biden', 'covid_approval', 'covid_response_speed',
       'mail_in_attitude', 'police_treat', 'repro', 'mail_in', 'stateFIPS',
       'age', 'marital', 'income', 'ideology', 'trans', 'immigration',
       'gun_diff', 'offensive_language'],
      dtype='object')
Number of selected features: 24


## Neural Network (?)

## Random Forest

In [100]:
# run a Random Forest on pocketEval and covariates
rf_pocket = RandomForestRegressor(n_estimators=500, random_state=25)
rf_pocket.fit(X_pocket_cov, y)

feature_importance_pocket = pd.DataFrame({
    "Feature": X_pocket_cov.columns,
    "Importance": rf_pocket.feature_importances_
}).sort_values(by="Importance", ascending=False)
feature_importance_pocket.head(10)

Unnamed: 0,Feature,Importance
40,immigration,0.483845
20,covid_approval,0.346906
36,ideology,0.023719
12,envir_business_trump,0.012781
10,envir_business,0.007606
29,age,0.007359
6,sp_serv_trump,0.007226
24,repro,0.006808
23,police_treat,0.006047
27,stateFIPS,0.00588


In [101]:
# run a Random Forest on econEval and covariates
rf_econ = RandomForestRegressor(n_estimators=500, random_state=25)
rf_econ.fit(X_econ_cov, y)

feature_importance_econ = pd.DataFrame({
    "Feature": X_econ_cov.columns,
    "Importance": rf_econ.feature_importances_
}).sort_values(by="Importance", ascending=False)
feature_importance_econ.head(10)

Unnamed: 0,Feature,Importance
40,immigration,0.483851
20,covid_approval,0.346911
36,ideology,0.023738
12,envir_business_trump,0.012828
10,envir_business,0.007703
29,age,0.007447
6,sp_serv_trump,0.007284
23,police_treat,0.006106
27,stateFIPS,0.005959
35,income,0.005694
