In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import math

In [2]:
# Read in data
master = pd.read_csv("winners.csv", index_col = 0, parse_dates = True)
master = master.drop(labels = "Unnamed: 0.1", axis = 1)

In [3]:
# Eliminate third-party candidates and non-voting territories
master = master[master["Cand_Party_Affiliation"].isin(["DEM", "REP"])]
master = master[~master["Cand_Office_St"].isin(["AS", "GU", "MP", "VI", "PR", "DC"])]

In [4]:
# Check data
master.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Cand_Party_Affiliation,Year,recent_presidential_vote,percent_bachelor_or_above,gov_party,Party_Previous_Vote_Share,Incumbent,...,Presidential_Approval_Rating,Same_Party_As_President,seat_transition,Median_Income,name_commonness,Primary.Vote.Percent,labor_force,Raised,Spent,won
1,"CRAWFORD, HARRY T JR",AK,0,DEM,2010,,27.0,REP,0.451464,0,...,0.38,True,4.0,64576.0,200,100.0,78.1,240439.24,235571.43,0
3,"YOUNG, DONALD E",AK,0,REP,2010,,27.0,REP,0.503333,1,...,0.38,False,4.0,64576.0,42,70.35,78.1,1001015.37,887310.33,1
4,"BONNER, JO",AL,1,REP,2010,0.390789,20.5,REP,1.0,1,...,0.41,False,4.0,41172.0,500,75.25,79.8,913052.62,1101701.0,1
9,"BRIGHT SR, BOBBY NEAL",AL,2,DEM,2010,0.407024,19.3,REP,0.503119,0,...,0.41,True,0.0,40567.0,500,,80.0,1413031.76,1435526.44,0
12,"ROBY, MARTHA",AL,2,REP,2010,0.407024,19.3,REP,0.496881,0,...,0.41,False,0.0,40567.0,161,48.55,80.0,1253557.11,1240275.64,1


In [5]:
# Coerce objects to floats as necessary
master[["Primary.Vote.Percent", "labor_force"]] = master[["Primary.Vote.Percent", "labor_force"]].apply(pd.to_numeric, errors = "coerce")

In [6]:
# Check data types
master.dtypes

Cand_Name                        object
Cand_Office_St                   object
Cand_Office_Dist                  int64
Cand_Party_Affiliation           object
Year                              int64
recent_presidential_vote        float64
percent_bachelor_or_above       float64
gov_party                        object
Party_Previous_Vote_Share       float64
Incumbent                         int64
minority_percentage             float64
Gender                            int64
Presidential_Approval_Rating    float64
Same_Party_As_President            bool
seat_transition                 float64
Median_Income                   float64
name_commonness                   int64
Primary.Vote.Percent            float64
labor_force                     float64
Raised                          float64
Spent                           float64
won                               int64
dtype: object

In [7]:
# Check for missing values
master.isnull().sum()

Cand_Name                          0
Cand_Office_St                     0
Cand_Office_Dist                   0
Cand_Party_Affiliation             0
Year                               0
recent_presidential_vote        1185
percent_bachelor_or_above          0
gov_party                        138
Party_Previous_Vote_Share          0
Incumbent                          0
minority_percentage                0
Gender                             0
Presidential_Approval_Rating      49
Same_Party_As_President            0
seat_transition                  448
Median_Income                     22
name_commonness                    0
Primary.Vote.Percent             937
labor_force                        0
Raised                            51
Spent                             51
won                                0
dtype: int64

In [8]:
# Drop columns as necessary
master = master.drop(["gov_party", "seat_transition", "name_commonness", "Primary.Vote.Percent"], axis = 1)

In [9]:
# Function for filling missing data
def fill_na(df, columns):
    df_copy = df.copy()
    for col in columns:
        na_df = df[pd.isnull(df[col])]
        na_rows = na_df.iterrows()
        for _, na_row in na_rows:
            district_subset = df[(df["Cand_Office_St"] == na_row["Cand_Office_St"]) & (df["Cand_Office_Dist"] == na_row["Cand_Office_Dist"]) & (~pd.isnull(df[col]))]
            state_subset = df[(df["Cand_Office_St"] == na_row["Cand_Office_St"]) & (~pd.isnull(df[col]))]
            if len(district_subset) > 0:
                district_mean = np.mean(district_subset[col])
                df_copy.loc[(df_copy["Cand_Office_St"] == na_row["Cand_Office_St"]) & (df_copy["Cand_Office_Dist"] == na_row["Cand_Office_Dist"]) & (pd.isnull(df_copy[col])), col] = district_mean
            elif len(state_subset) > 0:
                state_mean = np.mean(state_subset[col])
                df_copy.loc[(df_copy["Cand_Office_St"] == na_row["Cand_Office_St"]) & (df_copy["Cand_Office_Dist"] == na_row["Cand_Office_Dist"]) & (pd.isnull(df_copy[col])), col] = state_mean
            else:
                ovr_mean = np.mean(df[~pd.isnull(df[col])][col])
                df_copy.loc[(df_copy["Cand_Office_St"] == na_row["Cand_Office_St"]) & (df_copy["Cand_Office_Dist"] == na_row["Cand_Office_Dist"]) & (pd.isnull(df_copy[col])), col] = ovr_mean
    return df_copy

In [10]:
# Fill in missing data
missing_columns = master.columns.drop(["Cand_Name", "Cand_Office_St", "Cand_Party_Affiliation"])
master = fill_na(master, missing_columns)

In [11]:
# Function for Dem polling data
def project_D (row):
    if row['Year'] == 2010 :
        return 44.8
    if row['Year'] == 2012 :
        return 47.3
    if row['Year'] == 2014 :
        return 45.7
    if row['Year'] == 2016 :
        return 46.0
    if row['Year'] == 2018 :
        return 49.4
    else:
        return 0

In [12]:
# Function for Rep polling data
def project_R (row):
    if row['Year'] == 2010 :
        return 51.6
    if row['Year'] == 2012 :
        return 47.5
    if row['Year'] == 2014 :
        return 51.4
    if row['Year'] == 2016 :
        return 45.4
    if row['Year'] == 2018 :
        return 41.9
    else:
        return 0

In [13]:
# Append polling data
master['projected_D'] = master.apply(lambda row: project_D (row),axis=1)
master['projected_R'] = master.apply(lambda row: project_R (row),axis=1)

In [14]:
# Create dummy variables and join
master = pd.concat([master, pd.get_dummies(master["Cand_Party_Affiliation"], prefix = "cand_indicator", drop_first = True)], axis = 1)
master = master.drop("Cand_Party_Affiliation", axis = 1)
master.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Year,recent_presidential_vote,percent_bachelor_or_above,Party_Previous_Vote_Share,Incumbent,minority_percentage,Gender,Presidential_Approval_Rating,Same_Party_As_President,Median_Income,labor_force,Raised,Spent,won,projected_D,projected_R,cand_indicator_REP
1,"CRAWFORD, HARRY T JR",AK,0,2010,0.525392,27.0,0.451464,0,0.32569,1,0.38,True,64576.0,78.1,240439.24,235571.43,0,44.8,51.6,0
3,"YOUNG, DONALD E",AK,0,2010,0.525392,27.0,0.503333,1,0.32569,1,0.38,False,64576.0,78.1,1001015.37,887310.33,1,44.8,51.6,1
4,"BONNER, JO",AL,1,2010,0.390789,20.5,1.0,1,0.32626,1,0.41,False,41172.0,79.8,913052.62,1101701.0,1,44.8,51.6,1
9,"BRIGHT SR, BOBBY NEAL",AL,2,2010,0.407024,19.3,0.503119,0,0.35261,1,0.41,True,40567.0,80.0,1413031.76,1435526.44,0,44.8,51.6,0
12,"ROBY, MARTHA",AL,2,2010,0.407024,19.3,0.496881,0,0.35261,0,0.41,False,40567.0,80.0,1253557.11,1240275.64,1,44.8,51.6,1


In [15]:
# Check that all missing values are removed
sum(master.isnull().sum() > 0)

0

In [16]:
# View columns
for col in master.columns:
    print(col)

Cand_Name
Cand_Office_St
Cand_Office_Dist
Year
recent_presidential_vote
percent_bachelor_or_above
Party_Previous_Vote_Share
Incumbent
minority_percentage
Gender
Presidential_Approval_Rating
Same_Party_As_President
Median_Income
labor_force
Raised
Spent
won
projected_D
projected_R
cand_indicator_REP


In [17]:
# Check number of observations
len(master)

4076

In [18]:
# Reset index of dataframe
master = master.reset_index(drop = True)

In [19]:
# Write data to CSV
master.to_csv("late_forecast_data.csv", index = False)

In [20]:
# Function for cross-validation without iid data
def get_cv_iterables(df):
    cv_iterables = []
    years = [2012, 2014]
    for year in years:
        train_indices = df.index[(df["Year"] == year - 2) | (df["Year"] == year - 4)].tolist()
        test_indices = df.index[(df["Year"] == year)].tolist()
        cv_iterables.append((train_indices, test_indices))
    return cv_iterables

In [21]:
# Get indices for cross-validation
indices = get_cv_iterables(master)

In [22]:
# Set predictors and response variable
columns = master.columns
predictors = columns.drop(["Cand_Name", "Cand_Office_St", "Cand_Office_Dist", "won"])
response = "won"

In [23]:
# Perform custom CV to get best hyperparameters
rf = RandomForestClassifier(random_state = 69)
cv_rf = GridSearchCV(rf, param_grid = {"n_estimators": [100, 250, 500], "max_depth": [2, 4, 6, 8]}, cv = indices)

In [24]:
# Establish training and testing data
x_train = master[master["Year"] != 2016][predictors]
y_train = master[master["Year"] != 2016][response]
x_test = master[master["Year"] == 2016][predictors]
y_test = master[master["Year"] == 2016][response]

In [25]:
# Fit new random forest using tuned hyperparameters
final_rf = cv_rf.fit(x_train, y_train)
final_rf.best_params_

{'max_depth': 2, 'n_estimators': 500}

In [26]:
# Print accuracies
print("Training accuracy: ", final_rf.score(x_train, y_train))
print("Testing accuracy: ", final_rf.score(x_test, y_test))

Training accuracy:  0.9194240196078431
Testing accuracy:  0.9519704433497537


In [27]:
# Read in 2018 data and drop columns
full2018 = pd.read_csv("full2018.csv")
full2018 = full2018.drop(["Unnamed: 0", "Lose Prob", "Win Prob"], axis = 1)

In [28]:
# Check data
full2018.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Cand_Party_Affiliation,Year,recent_presidential_vote,percent_bachelor_or_above,gov_party,Party_Previous_Vote_Share,Incumbent,...,Gender,Presidential_Approval_Rating,Same_Party_As_President,seat_transition,Median_Income,name_commonness,Primary.Vote.Percent,labor_force,Raised,Spent
0,"GALVIN, ALYSE",AK,0,IND,2018,0.0,28.8,IND,0.2,0,...,0,0.48,False,0.0,73181.0,500,54.1,76.1,0.0,0.0
1,"YOUNG, DONALD E",AK,0,REP,2018,0.0,28.8,IND,0.503,1,...,1,0.48,True,0.0,73181.0,42,70.9,76.1,1003579.47,960955.52
2,"BYRNE, BRADLEY ROBERTS",AL,1,REP,2018,0.352134,23.3,REP,1.0,1,...,1,0.62,True,0.0,47984.0,57,100.0,74.3,1235570.11,645776.76
3,"KENNEDY, ROBERT JR.",AL,1,DEM,2018,0.352134,23.3,REP,0.0,0,...,1,0.62,False,0.0,47984.0,7,80.8,74.3,76091.03,31564.26
4,"ISNER, TABITHA KAY",AL,2,DEM,2018,0.380884,21.8,REP,0.453655,0,...,0,0.62,False,4.0,46579.0,500,60.4,76.3,405460.57,267916.56


In [29]:
# Eliminate non-voting territories and non major party candidates
full2018 = full2018[~full2018["Cand_Office_St"].isin(["AS", "GU", "MP", "VI", "PR", "DC"])]

In [30]:
# Append polling data
full2018['projected_D'] = full2018.apply(lambda row: project_D (row),axis=1)
full2018['projected_R'] = full2018.apply(lambda row: project_R (row),axis=1)

In [31]:
# Create dummy variables and join
full2018 = pd.concat([full2018, pd.get_dummies(full2018["Cand_Party_Affiliation"], prefix = "cand_indicator", drop_first = True)], axis = 1)
full2018.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Cand_Party_Affiliation,Year,recent_presidential_vote,percent_bachelor_or_above,gov_party,Party_Previous_Vote_Share,Incumbent,...,cand_indicator_IND,cand_indicator_LIB,cand_indicator_NOP,cand_indicator_NPA,cand_indicator_OTH,cand_indicator_REC,cand_indicator_REF,cand_indicator_REP,cand_indicator_UNK,cand_indicator_WF
0,"GALVIN, ALYSE",AK,0,IND,2018,0.0,28.8,IND,0.2,0,...,1,0,0,0,0,0,0,0,0,0
1,"YOUNG, DONALD E",AK,0,REP,2018,0.0,28.8,IND,0.503,1,...,0,0,0,0,0,0,0,1,0,0
2,"BYRNE, BRADLEY ROBERTS",AL,1,REP,2018,0.352134,23.3,REP,1.0,1,...,0,0,0,0,0,0,0,1,0,0
3,"KENNEDY, ROBERT JR.",AL,1,DEM,2018,0.352134,23.3,REP,0.0,0,...,0,0,0,0,0,0,0,0,0,0
4,"ISNER, TABITHA KAY",AL,2,DEM,2018,0.380884,21.8,REP,0.453655,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
full2018.isnull().sum()

Cand_Name                       0
Cand_Office_St                  0
Cand_Office_Dist                0
Cand_Party_Affiliation          0
Year                            0
recent_presidential_vote        0
percent_bachelor_or_above       0
gov_party                       0
Party_Previous_Vote_Share       0
Incumbent                       0
minority_percentage             0
Gender                          0
Presidential_Approval_Rating    0
Same_Party_As_President         0
seat_transition                 0
Median_Income                   0
name_commonness                 0
Primary.Vote.Percent            0
labor_force                     0
Raised                          0
Spent                           0
projected_D                     0
projected_R                     0
cand_indicator_DEM              0
cand_indicator_GRE              0
cand_indicator_IDP              0
cand_indicator_IND              0
cand_indicator_LIB              0
cand_indicator_NOP              0
cand_indicator

In [33]:
# Generate 2018 predicted probabilities
preds = final_rf.predict_proba(full2018[predictors])

In [34]:
# Append win probability to dataframe
full2018["win_prob"] = np.nan
full2018["win_prob"] = preds
full2018["soft_max_win_prob"] = np.nan
full2018["predicted_winner"] = 0

In [35]:
# Check data
full2018.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Cand_Party_Affiliation,Year,recent_presidential_vote,percent_bachelor_or_above,gov_party,Party_Previous_Vote_Share,Incumbent,...,cand_indicator_NPA,cand_indicator_OTH,cand_indicator_REC,cand_indicator_REF,cand_indicator_REP,cand_indicator_UNK,cand_indicator_WF,win_prob,soft_max_win_prob,predicted_winner
0,"GALVIN, ALYSE",AK,0,IND,2018,0.0,28.8,IND,0.2,0,...,0,0,0,0,0,0,0,0.970253,,0
1,"YOUNG, DONALD E",AK,0,REP,2018,0.0,28.8,IND,0.503,1,...,0,0,0,0,1,0,0,0.768168,,0
2,"BYRNE, BRADLEY ROBERTS",AL,1,REP,2018,0.352134,23.3,REP,1.0,1,...,0,0,0,0,1,0,0,0.763612,,0
3,"KENNEDY, ROBERT JR.",AL,1,DEM,2018,0.352134,23.3,REP,0.0,0,...,0,0,0,0,0,0,0,0.969572,,0
4,"ISNER, TABITHA KAY",AL,2,DEM,2018,0.380884,21.8,REP,0.453655,0,...,0,0,0,0,0,0,0,0.94617,,0


In [36]:
# Function for calculating softmax by district
def calculate_soft_max(master_df, row):
    cand = row["Cand_Name"]
    win_prob = row["win_prob"]
    if not pd.isnull(cand) and not pd.isnull(win_prob):
        candidate_df = master_df[(master_df["Year"] == row["Year"]) & (master_df["Cand_Office_St"] == row["Cand_Office_St"]) & (master_df["Cand_Office_Dist"] == row["Cand_Office_Dist"])]
        num = math.exp(win_prob)
        denom = sum([math.exp(cand_row["win_prob"]) for _, cand_row in candidate_df.iterrows()])
        master_df.loc[(master_df["Year"] == row["Year"]) & (master_df["Cand_Office_St"] == row["Cand_Office_St"]) & (master_df["Cand_Office_Dist"] == row["Cand_Office_Dist"]) & (master_df["Cand_Name"] == cand), "soft_max_win_prob"] = num / denom
    return 0

In [37]:
# Function for assigning winner by district
def assign_winner(master_df, row):
    cand = row["Cand_Name"]
    soft_max_win_prob = row["soft_max_win_prob"]
    if not pd.isnull(cand) and not pd.isnull(soft_max_win_prob):
        candidate_df = master_df[(master_df["Year"] == row["Year"]) & (master_df["Cand_Office_St"] == row["Cand_Office_St"]) & (master_df["Cand_Office_Dist"] == row["Cand_Office_Dist"])]
        if soft_max_win_prob == max(candidate_df[~candidate_df["soft_max_win_prob"].isnull()]["soft_max_win_prob"]):
            if candidate_df["predicted_winner"].sum() == 0:
                master_df.loc[(master_df["Year"] == row["Year"]) & (master_df["Cand_Office_St"] == row["Cand_Office_St"]) & (master_df["Cand_Office_Dist"] == row["Cand_Office_Dist"]) & (master_df["Cand_Name"] == cand), "predicted_winner"] = 1
    return 0

In [38]:
# Apply functions as necessary
full2018.apply(lambda x: calculate_soft_max(full2018, x), axis = 1)
full2018.apply(lambda x: assign_winner(full2018, x), axis = 1)

0      0
1      0
2      0
3      0
4      0
5      0
6      0
7      0
8      0
9      0
10     0
11     0
12     0
13     0
14     0
15     0
16     0
17     0
18     0
19     0
20     0
21     0
22     0
23     0
24     0
25     0
26     0
27     0
28     0
29     0
      ..
805    0
806    0
807    0
808    0
809    0
810    0
811    0
812    0
813    0
814    0
815    0
816    0
817    0
818    0
819    0
820    0
821    0
822    0
823    0
824    0
825    0
826    0
827    0
828    0
829    0
830    0
831    0
832    0
833    0
834    0
Length: 835, dtype: int64

In [39]:
full2018.head()

Unnamed: 0,Cand_Name,Cand_Office_St,Cand_Office_Dist,Cand_Party_Affiliation,Year,recent_presidential_vote,percent_bachelor_or_above,gov_party,Party_Previous_Vote_Share,Incumbent,...,cand_indicator_NPA,cand_indicator_OTH,cand_indicator_REC,cand_indicator_REF,cand_indicator_REP,cand_indicator_UNK,cand_indicator_WF,win_prob,soft_max_win_prob,predicted_winner
0,"GALVIN, ALYSE",AK,0,IND,2018,0.0,28.8,IND,0.2,0,...,0,0,0,0,0,0,0,0.970253,0.55035,1
1,"YOUNG, DONALD E",AK,0,REP,2018,0.0,28.8,IND,0.503,1,...,0,0,0,0,1,0,0,0.768168,0.44965,0
2,"BYRNE, BRADLEY ROBERTS",AL,1,REP,2018,0.352134,23.3,REP,1.0,1,...,0,0,0,0,1,0,0,0.763612,0.448691,0
3,"KENNEDY, ROBERT JR.",AL,1,DEM,2018,0.352134,23.3,REP,0.0,0,...,0,0,0,0,0,0,0,0.969572,0.551309,1
4,"ISNER, TABITHA KAY",AL,2,DEM,2018,0.380884,21.8,REP,0.453655,0,...,0,0,0,0,0,0,0,0.94617,0.546186,1


In [40]:
# Check that 435 winners have been assigned
full2018["predicted_winner"].sum()

435

In [41]:
# Write all predictions to CSV
full2018.to_csv("2018_predictions.csv", index = False)

In [42]:
# Add new column to define state and district
full2018["state_district"] = np.nan
full2018["state_district"] = full2018["Cand_Office_St"] + " " + full2018["Cand_Office_Dist"].astype(str)

In [43]:
# Create desired dataframe
final_df = full2018[full2018["predicted_winner"] == 1][["state_district", "Cand_Name", "soft_max_win_prob"]]
final_df.columns = ["District", "Projected Winner", "Win Probability"]
final_df.head()

Unnamed: 0,District,Projected Winner,Win Probability
0,AK 0,"GALVIN, ALYSE",0.55035
3,AL 1,"KENNEDY, ROBERT JR.",0.551309
4,AL 2,"ISNER, TABITHA KAY",0.546186
6,AL 3,"HAGAN, MALLORY",0.550559
9,AL 4,"AUMAN, JAMES LEE",0.551281


In [44]:
# Write to CSV
final_df.to_csv("seat_predictions.csv", index = False)