In [257]:
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import sys
from scipy.stats import dirichlet
import seaborn as sns
sns.set(style='whitegrid')

In [288]:
# First date used as a starting point for the analysis below.
first_valid_day = datetime.date(2019, 3, 15)

# Both polling tables use their first CSV column as the row index.
best_guess_polling_df = pd.read_csv('best_guess_polling.csv', index_col=0)
ground_truth_polls_df = pd.read_csv('ground_truth_polling.csv', index_col=0)

In [289]:
# Tweet-count table, indexed by the CSV's 'Date' column.
twitter_counts_df = pd.read_csv('FINALLL.csv', index_col='Date')

In [290]:
# Parse every table's row labels into datetimes so all three frames share
# the same kind of date index.
for frame in (ground_truth_polls_df, best_guess_polling_df, twitter_counts_df):
    frame.index = pd.to_datetime(frame.index)

In [291]:
# Date of the final row of the tweet table. This takes the last index entry,
# so it presumes the rows are in chronological order -- TODO confirm.
final_timestamp = twitter_counts_df.index[-1]
last_valid_day = final_timestamp.to_pydatetime().date()

In [292]:
def get_tweet_columns(threshold):
    """Build the tweet-count column names for a given threshold.

    One name is produced per column of ``ground_truth_polls_df``, in the same
    order, formatted as ``'Number of tweets >= <threshold>_<column>'``.

    Parameters
    ----------
    threshold : any
        Value embedded in the column name via ``str(threshold)``
        (presumably a sentiment-score cutoff -- confirm against the CSV).

    Returns
    -------
    list of str
    """
    prefix = 'Number of tweets >= ' + str(threshold) + '_'
    column_names = []
    for candidate in ground_truth_polls_df.columns:
        column_names.append(prefix + candidate)
    return column_names

In [293]:
# get tweets between latest_poll_date and number of days in advance from that date
def get_tweets(latest_poll_date, days_in_advance, columns):
    """Estimate per-column tweet totals for the days after ``latest_poll_date``.

    The window covers ``latest_poll_date + 1 day`` through
    ``latest_poll_date + days_in_advance`` (inclusive). For each column, days
    whose value is NaN are skipped; the mean of the remaining days is scaled
    back up to the full window length. A column with no observed day yields 0.

    Parameters
    ----------
    latest_poll_date : datetime.date, datetime.datetime or pd.Timestamp
        Day immediately before the window.
    days_in_advance : int
        Number of days in the window.
    columns : list of str
        Columns of ``twitter_counts_df`` to read.

    Returns
    -------
    numpy.ndarray
        One value per entry of ``columns``.

    Raises
    ------
    KeyError
        If a day in the window (or a column) is absent from
        ``twitter_counts_df``.
    """
    # Normalise to Timestamp first: a plain datetime.date key is not matched
    # against a DatetimeIndex consistently across pandas versions.
    window_start = pd.Timestamp(latest_poll_date) + pd.Timedelta(days=1)
    window_days = pd.date_range(start=window_start, periods=max(days_in_advance, 0), freq='D')

    # Rows are days, columns are the requested tweet-count columns.
    window = twitter_counts_df.loc[window_days, columns].to_numpy(dtype=float)
    observed = ~np.isnan(window)

    days_not_nan = observed.sum(axis=0)
    tweet_tot = np.where(observed, window, 0.0).sum(axis=0)

    # Avoid 0/0 for columns with no data at all; their total is already 0.
    days_not_nan = np.where(days_not_nan == 0, 1, days_not_nan)
    return days_in_advance * (tweet_tot / days_not_nan)

In [301]:
# update prior distribution using Twitter info and calculate posterior likelihood of ground truth polling 
def obtain_posterior_likelihood(latest_poll_date, days_in_advance, poll_scaling_factor, decay_factor,
                                tweet_scaling_factor, sentiment_threshold=0.6):
    """Dirichlet density of the ground-truth poll under the tweet-updated belief.

    The prior is a Dirichlet whose concentration parameters are the
    ``best_guess_polling_df`` row for ``latest_poll_date``, normalised to sum
    to 1 and scaled to a total of
    ``poll_scaling_factor * decay_factor ** days_in_advance``. Scaled tweet
    totals from ``get_tweets`` are added to obtain the posterior parameters.
    The returned value is the posterior density evaluated at the
    ``ground_truth_polls_df`` row for ``latest_poll_date + days_in_advance``
    days, itself normalised to sum to 1.

    Parameters
    ----------
    latest_poll_date : datetime.date, datetime.datetime or pd.Timestamp
        Date of the poll used as the prior.
    days_in_advance : int
        Prediction horizon in days.
    poll_scaling_factor : float
        Total prior concentration before decay.
    decay_factor : float
        Per-day multiplicative decay applied to the prior concentration.
    tweet_scaling_factor : float
        Weight applied to the tweet totals before adding them to the prior.
    sentiment_threshold : float, optional
        Threshold passed to ``get_tweet_columns`` to select the tweet-count
        columns. Defaults to 0.6, the value previously hard-coded here.

    Returns
    -------
    float
        ``scipy.stats.dirichlet.pdf`` of the normalised ground-truth poll.
        This is a density, not a probability, and can be far greater than 1.

    Raises
    ------
    KeyError
        If either date is missing from the corresponding polling frame.
    """
    prediction_date = latest_poll_date + datetime.timedelta(days=days_in_advance)
    sum_of_alphas = poll_scaling_factor * decay_factor ** days_in_advance

    # Look rows up by Timestamp: a plain datetime.date key is not matched
    # against a DatetimeIndex consistently across pandas versions.
    polling_prior_belief = np.asarray(
        best_guess_polling_df.loc[pd.Timestamp(latest_poll_date)], dtype=float)

    # Renormalise so the prior shares sum to 1 before scaling.
    proportion_top_5 = np.sum(polling_prior_belief)
    prior_alphas = (polling_prior_belief / proportion_top_5) * sum_of_alphas

    tweet_data = get_tweets(latest_poll_date, days_in_advance,
                            get_tweet_columns(sentiment_threshold))
    posterior_alphas = prior_alphas + tweet_data * tweet_scaling_factor

    ground_truth_polling = np.asarray(
        ground_truth_polls_df.loc[pd.Timestamp(prediction_date)], dtype=float)
    ground_truth_polling_sum_to_1 = ground_truth_polling / np.sum(ground_truth_polling)

    return dirichlet.pdf(ground_truth_polling_sum_to_1, posterior_alphas)
    

In [302]:
# performing grid search to find optimal values of poll_scaling_factor and tweet_scaling_factor
poll_scaling_factors = [10000 * x for x in range(5, 12)]
# np.linspace requires an integer sample count; a float expression such as
# ((2 - 0.1) / 0.1) + 2 is rejected by current numpy.
# 20 points give 0.1, 0.2, ..., 2.0 (then divided by 100).
tweet_scaling_factors = np.linspace(0.1, 2, 20) / 100
# 10 points give 0.995, 0.990, ..., 0.950. Not searched in this cell: the
# loop below holds the decay factor at 1.
decay_factors = np.linspace(0.995, 0.95, 10)
# NOTE: this cell does not pass sentiment_threshold to obtain_posterior_likelihood.
sentiment_threshold = 0.6

days_in_advance = 5

# Candidate starting days: every `days_in_advance` days from first_valid_day,
# strictly before last_valid_day.
starting_days = []
curr_day = first_valid_day
while curr_day < last_valid_day:
    starting_days.append(curr_day)
    curr_day += datetime.timedelta(days=days_in_advance)

# Drop the final start: its prediction date (start + days_in_advance) is not
# earlier than last_valid_day, so a full window is not guaranteed.
starting_days = starting_days[:-1]
if not starting_days:
    raise ValueError('no starting day leaves a full prediction window before last_valid_day')

last_starting_day = starting_days[-1]
last_starting_day_days_in_advance = (last_valid_day - last_starting_day).days

# Collect one record per hyperparameter pair and build the frame once,
# instead of growing it row by row.
cv_records = []
for poll_scaling_factor in poll_scaling_factors:
    for tweet_scaling_factor in tweet_scaling_factors:
        likelihood_lst = [
            obtain_posterior_likelihood(starting_day, days_in_advance, poll_scaling_factor,
                                        1, tweet_scaling_factor)
            for starting_day in starting_days
        ]
        cv_records.append([poll_scaling_factor, tweet_scaling_factor, np.mean(likelihood_lst)])

cv_df = pd.DataFrame(cv_records,
                     columns=['poll_scaling_factors', 'tweet_scaling_factors', 'likelihood|model'])
            

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [296]:
# extract optimal hyperparameters from dataframe:
# the row with the largest mean likelihood
best_row_label = cv_df['likelihood|model'].idxmax()
cv_df.loc[best_row_label]

poll_scaling_factors     1.100000e+05
tweet_scaling_factors    2.000000e-03
likelihood|model         1.605145e+08
Name: 121, dtype: float64