In [1]:
import sys
import json
import re
import pandas as pd
import pickle
import numpy as np

# Predicting election outcomes

We have a map from candidate name to Twitter handle, and the csv with district, state, chamber, party affiliation, and incumbency data for each candidate. We want a dataframe with district, state, chamber, projected winner's name, party, incumbency for social influence model, and projected winner's name, party, incumbency for voter count model.

input: dictionary that maps name to handle

In [2]:
candidate_data = pd.read_csv('../candidate_data_final.csv')

In [3]:
candidate_data

Unnamed: 0,district,name,state,chamber,party,incumbent
0,District 1,Robert Kennedy Jr.,Alabama,house,Democratic Party,False
1,District 1,Bradley Byrne,Alabama,house,Republican Party,True
2,District 2,Tabitha Isner,Alabama,house,Democratic Party,False
3,District 2,Martha Roby,Alabama,house,Republican Party,True
4,District 3,Mallory Hagan,Alabama,house,Democratic Party,False
5,District 3,Mike Rogers,Alabama,house,Alabama Party,True
6,District 4,Lee Auman,Alabama,house,Democratic Party,False
7,District 4,Robert Aderholt,Alabama,house,Republican Party,True
8,District 5,Peter Joffrion,Alabama,house,Democratic Party,False
9,District 5,Mo Brooks,Alabama,house,Republican Party,True


In [4]:
with open('candidate_names_to_handles.pkl', 'rb') as f:
    candidate_names_to_handles = pickle.load(f)

In [5]:
candidate_data

Unnamed: 0,district,name,state,chamber,party,incumbent
0,District 1,Robert Kennedy Jr.,Alabama,house,Democratic Party,False
1,District 1,Bradley Byrne,Alabama,house,Republican Party,True
2,District 2,Tabitha Isner,Alabama,house,Democratic Party,False
3,District 2,Martha Roby,Alabama,house,Republican Party,True
4,District 3,Mallory Hagan,Alabama,house,Democratic Party,False
5,District 3,Mike Rogers,Alabama,house,Alabama Party,True
6,District 4,Lee Auman,Alabama,house,Democratic Party,False
7,District 4,Robert Aderholt,Alabama,house,Republican Party,True
8,District 5,Peter Joffrion,Alabama,house,Democratic Party,False
9,District 5,Mo Brooks,Alabama,house,Republican Party,True


In [6]:
len(candidate_names_to_handles)

1180

In [7]:
candidate_names_to_handles['Robert Kennedy Jr.']

'Kennedy4Alabama'

In [8]:
def map_names_to_handles(row):
    try:
        return candidate_names_to_handles[row['name']].lower()
    except:
        pass

In [9]:
candidate_data['handles'] = candidate_data.apply(map_names_to_handles, axis=1)

In [10]:
candidate_data[candidate_data['handles'] == None]

Unnamed: 0,district,name,state,chamber,party,incumbent,handles


In [11]:
len(candidate_data[candidate_data["handles"].isnull()])

61

In [12]:
races = candidate_data.astype(str).groupby(['district', 'state', 'chamber'])
races

<pandas.core.groupby.DataFrameGroupBy object at 0x10615c630>

In [15]:
len(races)

439

In [29]:
new_df = pd.DataFrame(columns=['district','state', 'chamber', 'winner_predicted_by_total', 'winner_predicted_by_unique'])
i = 0
for race, df in races:
    predictions = get_winner(df, partisan_tweets_per_candidate)
    new_df = new_df.append({
        'district': race[0], 
        'state': race[1], 
        'chamber': race[2], 
        'winner_predicted_by_total': predictions['winner_predicted_by_total'],
        'margin_total': predictions['margin_total'],
        'winner_predicted_by_unique': predictions['winner_predicted_by_unique'],
        'margin_unique': predictions['margin_unique']
    }, ignore_index = True)
    i += 1

In [30]:
new_df

Unnamed: 0,district,state,chamber,winner_predicted_by_total,winner_predicted_by_unique,margin_total,margin_unique
0,District 1,Alabama,house,Robert Kennedy Jr.,Robert Kennedy Jr.,0.280000,0.090909
1,District 1,Arizona,house,Wendy Rogers,Wendy Rogers,,
2,District 1,Arkansas,house,Rick Crawford,Elvis Presley,1.000000,
3,District 1,California,house,Audrey Denney,Audrey Denney,0.891304,0.943396
4,District 1,Colorado,house,Casper Stockham,Casper Stockham,1.000000,0.666667
5,District 1,Connecticut,house,John Larson,John Larson,0.945946,1.000000
6,District 1,Georgia,house,not enough info,not enough info,,
7,District 1,Hawaii,house,Cam Cavasso,Cam Cavasso,1.000000,1.000000
8,District 1,Idaho,house,Russ Fulcher,Russ Fulcher,,
9,District 1,Illinois,house,Bobby Rush,Bobby Rush,1.000000,1.000000


In [61]:
new_df.to_csv('midterm_predictions.csv')

In [32]:
i

439

In [21]:
with open('partisan_tweets_per_candidate_final.pkl', 'rb') as f:
    partisan_tweets_per_candidate = pickle.load(f)

In [23]:
def get_winner(df, partisan_tweets_per_candidate):
    """
    return candidate to win under total, candidate to win 
    under unique, margin under total, margin under unique
    """
    results = {
        'winner_predicted_by_total': '', 
        'margin_total': 0, 
        'winner_predicted_by_unique':'', 
        'margin_unique': 0
    }
    metrics = ["total", "unique"]
    for metric in metrics:
        winner, margin = get_winner_by_metric(df, partisan_tweets_per_candidate, metric)
        results['winner_predicted_by_'+metric] = winner
        results['margin_'+metric] = margin
    return results


In [25]:
def get_winner_by_metric(df, partisan_tweets_per_candidate, metric):

    num_of_neutral_tweets, score = {}, dict()
    for handle in df['handles']:
        try:
            num_of_neutral_tweets[handle] = partisan_tweets_per_candidate[handle]['neutral'][metric]
        except:
            pass
    # Get data of top two candidates in number of total mentions
    top_two_handles = sorted(num_of_neutral_tweets, key=num_of_neutral_tweets.get, reverse=True)
    affiliation_to_sentiment = {
        'Democratic Party': 'liberal',
        'Republican Party': 'conservative',
        'Libertarian Party': 'conservative',
        'Green Party': 'liberal'
    }
    # Case where we only have data on one candidate in the race
    if len(top_two_handles) < 2:
        return "not enough info", np.nan
    else:
        total_tweets = 0
        for i in range(2):
            handle = top_two_handles[i]
            affiliation = df.loc[df['handles'] == handle]['party'].values[0]
            try: 
                num_of_tweets = partisan_tweets_per_candidate[handle][affiliation_to_sentiment[affiliation]][metric] \
                    + partisan_tweets_per_candidate[handle][affiliation_to_sentiment[affiliation]][metric]
                score[handle] = num_of_tweets
                total_tweets += num_of_tweets
            except:
                pass
        try:
            margin = abs(score[top_two_handles[1]] - score[top_two_handles[0]]) / total_tweets
        except:
            margin = np.nan
        if score:
            return df.loc[df['handles'] == max(score, key=score.get)]['name'].values[0], margin
        else:
            return None, np.nan

We want the numerator to be the number of liberal/conservative tweets, the denominator will be the total number of tweets in the top two