# Data Collection for Hype-Machine

#### Setup and Dependancies

In [39]:
import os
import sys
import json
import math
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from nltk.metrics import *
from linearmodels import PanelOLS
from linearmodels import RandomEffects
import warnings
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import time
from datetime import datetime, timedelta
import re
import ast

warnings.filterwarnings('ignore')
%matplotlib inline

#### Directories and Names

In [53]:
rep16_candidates = [
    ("Jeb", "Bush"),
    ("Ben", "Carson"),
    ("Chris", "Christie"),
    ("Ted", "Cruz"),
    ("Carly", "Fiorina"),
    ("Jim", "Gilmore"),
    ("Lindsey", "Graham"),
    ("Mike", "Huckabee"),
    ("Bobby", "Jindal"),
    ("John", "Kasich"),
    ("George", "Pataki"),
    ("Randal", "Paul"),
    ("Rick", "Perry"),
    ("Marco", "Rubio"),
    ("Rick", "Santorum"),
    ("Donald", "Trump"),
    ("Scott", "Walker")
]

dem20_candidates = [
    ('Michael', 'Bennet'),
    ('Joe', 'Biden'),
    ('Corey', 'Booker'),
    ('Steve', 'Bullock'),
    ('Pete', 'Buttigieg'),
    ('Julian', 'Castro'),
    ('Bill', 'de Blasio'),
    ('John', 'Delaney'),
    ('Tulsi', 'Gabbard'),
    ('Kirsten', 'Gillibrand'),
    ('Kamala', 'Harris'),
    ('John', 'Hickenlooper'),
    ('Jay', 'Inslee'),
    ('Amy', 'Klobuchar'),
    ('Beto', 'Orourke'),
    ('Andrew', 'Yang'),
    ('Bernie', 'Sanders'),
    ('Eric', 'Swalwell'),
    ('Elizabeth', 'Warren'),
    ('Marianne', 'Williamson'),
    ('Andrew', 'Yang')
]

# make 2016 republican fec folders
rep16_fec_path = os.path.join("..","data","fec_new","2016")
if(not os.path.isdir(rep16_fec_path)):
    os.mkdir(rep16_fec_path)

rep16_fec_path = os.path.join(rep16_fec_path,"republican")
if(not os.path.isdir(rep16_fec_path)):
    os.mkdir(rep16_fec_path)
    
for candid in rep16_candidates:
    candid_path = os.path.join(rep16_fec_path,candid[1].lower())
    if(not os.path.isdir(candid_path)):
        os.mkdir(candid_path)
        
# make 2020 democrat fec folders
dem20_fec_path = os.path.join("..","data","fec_new","2020")
if(not os.path.isdir(dem20_fec_path)):
    os.mkdir(dem20_fec_path)

dem20_fec_path = os.path.join(dem20_fec_path,"democrat")
if(not os.path.isdir(dem20_fec_path)):
    os.mkdir(dem20_fec_path)
    
for candid in dem20_candidates:
    candid_path = os.path.join(dem20_fec_path,candid[1].lower())
    if(not os.path.isdir(candid_path)):
        os.mkdir(candid_path)
        
# polls folders
rep16_polls_path = os.path.join("..","data","polls")
if(not os.path.isdir(rep16_polls_path)):
    os.mkdir(rep16_polls_path)

rep16_polls_path = os.path.join(rep16_polls_path,"2016")
if(not os.path.isdir(rep16_polls_path)):
    os.mkdir(rep16_polls_path)
    
dem20_polls_path = os.path.join("..","data","polls","2020")
if(not os.path.isdir(dem20_polls_path)):
    os.mkdir(dem20_polls_path)
    
# gdelt folders
rep16_gdelt_path = os.path.join("..","data","gdelt")
if(not os.path.isdir(rep16_gdelt_path)):
    os.mkdir(rep16_gdelt_path)

rep16_gdelt_path = os.path.join(rep16_gdelt_path,"2016")
if(not os.path.isdir(rep16_gdelt_path)):
    os.mkdir(rep16_gdelt_path)
    
rep16_gdelt_path = os.path.join(rep16_gdelt_path,"republican")
if(not os.path.isdir(rep16_gdelt_path)):
    os.mkdir(rep16_gdelt_path)
    
dem20_gdelt_path = os.path.join(os.path.join("..","data","gdelt","2020"))
if(not os.path.isdir(dem20_gdelt_path)):
    os.mkdir(dem20_gdelt_path)
    
dem20_gdelt_path = os.path.join(dem20_gdelt_path,"democrat")
if(not os.path.isdir(dem20_gdelt_path)):
    os.mkdir(dem20_gdelt_path)

## Collect NLP Data

In [65]:
nlp_raw = {}
nlp_dir = os.path.join("..","data","candidate_aggregation")
for ii,fname in enumerate(os.listdir(nlp_dir)):
    tmp_df = pd.read_csv(os.path.join(nlp_dir, fname))
    candidate = os.path.splitext(fname)[0]
    nlp_raw[candidate] = {}
    for ii,row in tmp_df.iterrows():
        daily_nlp = {}
        nlp_list = ast.literal_eval(row["topic_titles"])
        for tup in nlp_list:
            daily_nlp[tup[0]] = tup[1]
        nlp_raw[candidate][row["day"]] = daily_nlp
        
dem20_nlp = pd.DataFrame.from_dict(
    {(i,j): nlp_raw[i][j] for i in nlp_raw.keys() for j in nlp_raw[i].keys()},
    orient='index').fillna(0)

dem20_nlp = dem20_nlp.reset_index()
dem20_nlp = dem20_nlp.rename({"level_0":"candidate", "level_1":"date"}, axis="columns")
dem20_nlp["date"] = pd.to_datetime(dem20_nlp["date"], format='%Y-%m-%d')

dem20_nlp.head()

Unnamed: 0,candidate,date,climate change,mueller report,mccain,sanders,inslee,2020 democractic primary,warren,congress,...,booker,gun laws,mcconnell,yang,prison reform,lgbtq,buttigieg,puerto rico,assange + stone,williamson
0,bennett,2019-01-06,0.0,0.0,0.072656,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bennett,2019-01-07,0.0,0.069143,0.0,0.002481,0.0,0.0,0.0,0.04163,...,0.0,0.0,0.008585,0.0,0.0,0.0,0.0,0.0,0.009514,0.005925
2,bennett,2019-01-08,0.0,0.34976,0.016031,0.017082,0.0,0.0,0.0,0.0,...,0.0,0.022525,0.0,0.0,0.0,0.0,0.019011,0.0,0.0,0.0
3,bennett,2019-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13824,0.0
4,bennett,2019-01-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040574,0.0


## Collect Polling Data

#### 2016 Polling Data
There are very few good aggregates of 2016 primary polls, and for consistency we'd like to use 538 as a data source if possible. We pulled the code below from [a github repository from sgodfrey66](https://github.com/sgodfrey66/Polls_and_press/blob/master/code/Poll_data_2016.ipynb). The code scrapes [this 538 webpage](https://projects.fivethirtyeight.com/election-2016/national-primary-polls/republican/).

In [42]:
class Scrape538PollData:
    # Attributes of the data retrieval
    url = 'https://projects.fivethirtyeight.com/election-2016/national-primary-polls/republican/'
    chrome_driver = os.path.join(os.getcwd(),"..","data","polls","chromedriver")
    status_code_ = None
    html_ = None
    soup_ = None
    candidates_ = []
    polls_ = []
    
    # Initialization method
    def __init__(self, url = None, chrome_driver = None):
        # If url != None then reset self.url
        if url:
            self.url = url
        # If chrome_driver != None reset self.chrome_driver
        if chrome_driver:
            self.chrome_driver = chrome_driver
    
    # method to collect data from posts
    def collect_page_data(self):
        # Set the selenium driver
        try:
            driver = webdriver.Chrome(self.chrome_driver)
            driver.get(self.url)
        except:
            self.status_code_ = driver.error_handler.check_response
            raise ValueError('Error retrieving the web page; see WebExceptionError for details.')
            
        # Find the read more polls button and click it
        driver.find_element_by_css_selector('.more-polls').click()
        
        # Return the html of the new page
        self.html_ = driver.page_source.encode('utf-8')
        
        # Return the soup version of the page
        self.soup_ = bs(self.html_, 'lxml')

    # method to collect data from posts
    def extract_polls(self):
        # Check to see that self.soup_ has data, if not run collect_page_data
        if self.soup_ == None or len(self.soup_) == 0:
            self.collect_page_data()
            
        # Find the list of candidates
        table = self.soup_.find('table')
        self.candidates_ = [c.text for c in table.find_all('th', {'class':'th th-rotate'})]

        # Find the poll data
        body = self.soup_.find('tbody')
        polls = body.find_all('tr',{'class': 't-row'})

        # For each poll extract information related to the poll
        for poll in polls:
            pl_d = {}
            pl_d['dates'] = poll.find('td', {'class':'t-dates'}).text
            pl_d['pollster_url'] = poll.find('a', href = True)
            pl_d['pollster'] = poll.find('td', {'class': 't-pollster t-left-margin'}).text
            pl_d['sample'] = poll.find('td', {'class': 't-sample t-left-margin t-right-align only-full'}).text
            pl_d['weight'] = poll.find('td', \
                                        {'class': 't-weight t-left-margin t-right-margin double-l-margin t-right-border-dark'}).text
            try:
                pl_d['leader'] = poll.find('td', \
                                        {'class':'t-leader t-left-margin t-right-margin only-full color-text-rep'}).text
            except:
                pl_d['leader'] = ''

            # Get the odds for each candidate except for the last candidate in the table
            for i, odds in enumerate(poll.find_all('td', {'class':'t-center-align td-cand-odds td-block t-right-border'})):
                # This tag is present if a value exists in the poll for that candidate
                if odds.find('div', {'class':'t-cand-odds heat-map-blocks'}):
                    pl_d[self.candidates_[i]] = float(odds.text.replace('%','').strip())/100

            # Get the odds for the last candidate 
            odds = poll.find('td', {'class':'t-center-align td-cand-odds td-block'})
            # This tag is present if a value exists in the poll for that candidate
            if odds.find('div', {'class':'t-cand-odds heat-map-blocks'}):
                pl_d[self.candidates_[len(self.candidates_) - 1]] = float(odds.text.replace('%','').strip())/100

            self.polls_.append(pl_d)
    
'''
# Walk through DataFrame to assign dates
def assign_dates(df = None):
    # Loop through each row in this DataFrame to create 
    #  a datetime object for the beginnging and end of the poll
    start_year = '2016'
    end_year = '2016'
    
    for idx in df.index:
        # Find the months and set them equal to start and end month
        months = re.findall(r'[\w]{3}',df.loc[idx,'dates'])
        if len(months)==1:
            start_month = months[0]
            end_month = months[0]
        else:
            start_month = months[0]
            end_month = months[1]
            
        # Find the months and set them equal to start and end month            
        dates = re.findall(r'[\d]{1,}',df.loc[idx,'dates'])
        if len(dates)==1:
            start_date = dates[0]
            end_date = dates[0]
        else:
            start_date = dates[0]
            end_date = dates[1]       
        
        # Figure out if the year needs to be changed
        if start_month=='Dec' and end_month=='Jan':
            start_year = '2015'
            end_year = '2016'
        elif start_month=='Jan' and end_month=='Jan':
            start_year = '2016'
            end_year = '2016'           
        elif start_month=='Dec' and end_month=='Dec':
            start_year = '2015'
            end_year = '2015'
        
        start_time=start_year+' '+start_month+' '+start_date
        end_time=end_year+' '+end_month+' '+end_date
        
        dt_start = datetime.strptime(start_time,'%Y %b %d')
        dt_end = datetime.strptime(end_time,'%Y %b %d')
        
        df.loc[idx,'start_time'] = dt_start 
        df.loc[idx,'end_time'] = dt_end
'''    
        
# Instantiate and a poll object and get poll data
rep16_polls = Scrape538PollData()
rep16_polls.extract_polls()

The scraped data needs munged to be compatible with the 2020 dataset.

In [43]:
def date_range_str_to_tuple(arg):
    arg = re.sub('[^0-9a-zA-Z]+', ' ', arg)
    arg = arg.split()
    if(len(arg)==3):
        mm = arg[0]
        dd = arg[2]
    elif(len(arg)==4):
        mm = arg[2]
        dd = arg[3]
    return (dd,mm)

# drop columns and filter data
rep16_polls_df = pd.DataFrame(rep16_polls.polls_)
rep16_polls_df = rep16_polls_df.loc[rep16_polls_df["pollster"]=="Morning Consult"]
rep16_polls_df = rep16_polls_df.drop(["leader","pollster_url","weight","sample","pollster"], axis=1)
rep16_polls_df = rep16_polls_df.rename({'dates':'date'}, axis='columns')

# date conversion
yy = 2016
for index, row in rep16_polls_df.iterrows():
    dd, mm = date_range_str_to_tuple(row["date"])
    if(mm=="Dec"):
        yy = 2015
    rep16_polls_df.loc[index, "date"] = datetime.strptime("{:2d} {:s} {:02d}".format(yy,mm,int(dd)), "%Y %b %d")

rep16_polls_df = rep16_polls_df.set_index("date")
rep16_polls_df.to_csv(os.path.join(rep16_polls_path,"president_primary_polls.csv"))
rep16_polls_df.head()

Unnamed: 0_level_0,Bush,Carson,Christie,Cruz,Fiorina,Huckabee,Kasich,Paul,Rubio,Santorum,Trump
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-05-02,,,,0.2,,,0.13,,,,0.56
2016-04-29,,,,0.27,,,0.12,,,,0.48
2016-04-22,,,,0.24,,,0.14,,,,0.49
2016-04-17,,,,0.26,,,0.13,,,,0.46
2016-04-11,,,,0.26,,,0.13,,,,0.48


#### 2020 Polling Data
538 aggregates the 2020 polling data [here](https://data.fivethirtyeight.com/) under "Latest Polls", which makes it easy to import.

In [44]:
dem20_polls_df = pd.read_csv(os.path.join(dem20_polls_path,"president_primary_polls.csv"))
dem20_polls_df = dem20_polls_df.rename({"created_at":"date"}, axis='columns')
dem20_polls_df = dem20_polls_df.loc[dem20_polls_df["cycle"]==2020]
dem20_polls_df = dem20_polls_df.loc[dem20_polls_df["stage"]=="primary"]
dem20_polls_df = dem20_polls_df.loc[dem20_polls_df["party"]=="DEM"]
dem20_polls_df = dem20_polls_df.loc[dem20_polls_df["pollster"]=="Morning Consult"]
dem20_polls_df = dem20_polls_df.groupby(by=["candidate_name","date"])["pct"].mean()
dem20_polls_df = dem20_polls_df.unstack(level="candidate_name")
dem20_polls_df.head()

candidate_name,Amy Klobuchar,Andrew Cuomo,Andrew Yang,Bernard Sanders,Beto O'Rourke,Bill de Blasio,Cory A. Booker,Deval Patrick,Elizabeth Warren,Eric Garcetti,...,Michael Bloomberg,Michael F. Bennet,Pete Buttigieg,Seth Moulton,Sherrod Brown,Steve Bullock,Terry R. McAuliffe,Tim Ryan,Tom Steyer,Tulsi Gabbard
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1/16/19 08:16,1.0,,,15.0,8.0,,2.0,,9.0,0.0,...,1.0,,,,1.0,0.0,0.0,,,
1/23/19 08:15,2.0,,,16.0,6.0,,4.0,,6.0,0.0,...,2.0,,,,1.0,1.0,0.0,,,1.0
1/29/19 23:17,1.0,,,15.0,6.0,,3.0,,6.0,0.0,...,2.0,,0.0,,1.0,1.0,0.0,,,0.0
1/9/19 13:53,2.0,0.0,,16.0,7.0,,3.0,,4.0,0.0,...,1.0,,,,1.0,0.0,0.0,,,
12/27/18 03:57,1.0,1.0,,19.0,8.0,,3.0,0.0,5.0,1.0,...,2.0,,,,1.0,1.0,,,,


In [45]:
dem20_polls_df.columns

Index(['Amy Klobuchar', 'Andrew Cuomo', 'Andrew Yang', 'Bernard Sanders',
       'Beto O'Rourke', 'Bill de Blasio', 'Cory A. Booker', 'Deval Patrick',
       'Elizabeth Warren', 'Eric Garcetti', 'Eric H. Holder', 'Eric Swalwell',
       'Gavin Newsom', 'Hillary Rodham Clinton', 'Howard Schultz',
       'Jay Robert Inslee', 'John Hickenlooper', 'John K. Delaney',
       'John Kerry', 'Joseph R. Biden Jr.', 'Julián Castro',
       'Kamala D. Harris', 'Kirsten E. Gillibrand', 'Marianne Williamson',
       'Michael Avenatti', 'Michael Bloomberg', 'Michael F. Bennet',
       'Pete Buttigieg', 'Seth Moulton', 'Sherrod Brown', 'Steve Bullock',
       'Terry R. McAuliffe', 'Tim Ryan', 'Tom Steyer', 'Tulsi Gabbard'],
      dtype='object', name='candidate_name')

## Collect Donnation Data

#### 2016 Donnations

In [46]:
# find the path to each fec file, store paths in a nested dict
rep16_fec_file_map = {}
for cand_dir in os.listdir(rep16_fec_path):
    if(cand_dir[0]!="."):
        rep16_fec_file_map[cand_dir] = {}
        for csv_path in os.listdir(os.path.join(rep16_fec_path,cand_dir)):
            if(csv_path.find("schedule_a")>=0):
                rep16_fec_file_map[cand_dir]["donations"] = \
                    os.path.join(rep16_fec_path,cand_dir,csv_path)
            elif(csv_path.find("schedule_b")>=0):
                rep16_fec_file_map[cand_dir]["spending"] = \
                    os.path.join(rep16_fec_path,cand_dir,csv_path)
print(json.dumps(rep16_fec_file_map, indent=4))

{
    "fiorina": {
        "donations": "../data/fec_new/2016/republican/fiorina/schedule_a-2019-07-29T21_48_31.csv"
    },
    "carson": {
        "donations": "../data/fec_new/2016/republican/carson/schedule_a-2019-07-28T15_31_03.csv",
        "spending": "../data/fec_new/2016/republican/carson/schedule_b-2019-07-28T15_31_06.csv"
    },
    "paul": {
        "donations": "../data/fec_new/2016/republican/paul/schedule_a-2019-07-29T22_22_04.csv"
    },
    "cruz": {
        "donations": "../data/fec_new/2016/republican/cruz/schedule_a-2019-07-29T22_30_15.csv"
    },
    "rubio": {
        "donations": "../data/fec_new/2016/republican/rubio/schedule_a-2019-07-29T22_25_26.csv"
    },
    "huckabee": {
        "donations": "../data/fec_new/2016/republican/huckabee/schedule_a-2019-07-29T22_18_27.csv"
    },
    "jindal": {
        "donations": "../data/fec_new/2016/republican/jindal/schedule_a-2019-07-29T22_19_05.csv"
    },
    "graham": {
        "donations": "../data/fec_new/2016/republ

#### 2020 Donnations

In [54]:
# find the path to each fec file, store paths in a nested dict
dem20_fec_file_map = {}
for cand_dir in os.listdir(dem20_fec_path):
    if(cand_dir[0]!="."):
        dem20_fec_file_map[cand_dir] = {}
        for csv_path in os.listdir(os.path.join(dem20_fec_path,cand_dir)):
            if(csv_path.find("schedule_a")>=0):
                dem20_fec_file_map[cand_dir]["donations"] = \
                    os.path.join(dem20_fec_path,cand_dir,csv_path)
            elif(csv_path.find("schedule_b")>=0):
                dem20_fec_file_map[cand_dir]["spending"] = \
                    os.path.join(dem20_fec_path,cand_dir,csv_path)
print(json.dumps(dem20_fec_file_map, indent=4))

{
    "swalwell": {
        "donations": "../data/fec_new/2020/democrat/swalwell/schedule_a-2019-07-28T18_15_11.csv",
        "spending": "../data/fec_new/2020/democrat/swalwell/schedule_b-2019-07-20T11_47_42.csv"
    },
    "williamson": {
        "spending": "../data/fec_new/2020/democrat/williamson/schedule_b-2019-07-20T11_48_37.csv",
        "donations": "../data/fec_new/2020/democrat/williamson/schedule_a-2019-07-28T18_15_58.csv"
    },
    "biden": {
        "spending": "../data/fec_new/2020/democrat/biden/schedule_b-2019-07-20T09_27_56.csv",
        "donations": "../data/fec_new/2020/democrat/biden/schedule_a-2019-07-28T18_01_42.csv"
    },
    "harris": {
        "donations": "../data/fec_new/2020/democrat/harris/schedule_a-2019-07-28T17_28_33.csv",
        "spending": "../data/fec_new/2020/democrat/harris/schedule_b-2019-07-20T09_58_19.csv"
    },
    "buttigieg": {
        "spending": "../data/fec_new/2020/democrat/buttigieg/schedule_b-2019-07-20T09_51_13.csv",
        "donat

## Collect Cable TV Mentions
Uses the [gdelt TV API](https://blog.gdeltproject.org/gdelt-2-0-television-api-debuts/) to collect mentions of primary cadidates on cable television.

#### Generic API Request

In [48]:
def cable_mentions(candidate_list, start_year=2019, end_year=2019):

    # api setup
    Market = ['National']
    tv_api_base   = "https://api.gdeltproject.org/api/v2/tv/"
    tv_api_params = {
        "mode":"timelinevol",
        "format":"csv",
        "STARTDATETIME":"{:04d}0101000000".format(start_year),
        "ENDDATETIME":"{:04d}1231115959".format(end_year)
    }
    tv_api_params_string = "&".join(["{:s}={:s}".format(key,val) for key,val in tv_api_params.items()])

    # send requests and combine into dataframe
    data = pd.DataFrame()
    for ii, candid in enumerate(candidate_list):
        last_name = candid[1]
        full_name = " ".join(candid)
        print("\r{:0.0f}% {:s}{:s}".format(100*ii/len(candidate_list), full_name, " "*20), end="")
        for location in Market:
            api_call = "{:s}tv?query=%22{:s}%22%20market:%22{:s}%22&{:s}".format(
                tv_api_base,
                last_name,
                location,
                tv_api_params_string
            )
            try:
                temp_data = pd.read_csv(api_call)
                temp_data['Candidate'] = full_name
                temp_data['Market'] = location

                data = data.append(temp_data, ignore_index = True)
            except Exception as e:
                print(e, end="")
    print("\r100% {:s}".format(" "*50))

    # fix date column name
    find_colname_str = "date"
    for colname in data.columns:
        if(colname[:len(find_colname_str)].lower()==find_colname_str):
            found_colname_str = colname
    data.rename({found_colname_str:find_colname_str.title()},axis="columns", inplace=True)
    data["Date"] = pd.to_datetime(data["Date"], format='%Y-%m-%d')

    # mung tv data
    data = data[["Candidate","Date","Series","Value"]]
    tv_mentions = data.groupby(["Candidate","Date"])["Value"].mean()
    tv_mentions = tv_mentions.unstack(level="Candidate")
    
    return tv_mentions

#### 2016 Mentions

In [49]:
rep16_mentions = cable_mentions(rep16_candidates, 2015, 2016)
rep16_mentions.to_csv(os.path.join(rep16_gdelt_path,"tv_mentions.csv"))

100%                                                   


#### 2020 Mentions

In [50]:
dem20_mentions = cable_mentions(dem20_candidates)
dem20_mentions.to_csv(os.path.join(dem20_gdelt_path,"tv_mentions.csv"))

100%                                                   


## Combining Datasets

#### 2016 Name Mapping

In [55]:
rep16_name_mapping = {}
poll_names = rep16_polls_df.columns
media_names = list(rep16_mentions.columns)
for candid in rep16_fec_file_map.keys():
    # poll data
    comparison_scores = [edit_distance(x[-len(candid):].lower(), candid.lower()) for x in poll_names]
    val, idx = min((val, idx) for (idx, val) in enumerate(comparison_scores))
    poll_name_map = poll_names[idx]
    # media data
    comparison_scores = [edit_distance(x[-len(candid):].lower(), candid.lower()) for x in media_names]
    val, idx = min((val, idx) for (idx, val) in enumerate(comparison_scores))
    media_name_map = media_names[idx]
    # output
    rep16_name_mapping[candid] = (poll_name_map, media_name_map)
    print("{:s} -> {:s} (score: {:d})".format(candid, str(rep16_name_mapping[candid]), val))

fiorina -> ('Fiorina', 'Carly Fiorina') (score: 0)
carson -> ('Carson', 'Ben Carson') (score: 0)
paul -> ('Paul', 'Randal Paul') (score: 0)
cruz -> ('Cruz', 'Ted Cruz') (score: 0)
rubio -> ('Rubio', 'Marco Rubio') (score: 0)
huckabee -> ('Huckabee', 'Mike Huckabee') (score: 0)
jindal -> ('Christie', 'Bobby Jindal') (score: 0)
graham -> ('Bush', 'Lindsey Graham') (score: 0)
santorum -> ('Santorum', 'Rick Santorum') (score: 0)
kasich -> ('Kasich', 'John Kasich') (score: 0)
perry -> ('Cruz', 'Rick Perry') (score: 0)
christie -> ('Christie', 'Chris Christie') (score: 0)
bush -> ('Bush', 'Jeb Bush') (score: 0)
gilmore -> ('Carson', 'Jim Gilmore') (score: 0)
pataki -> ('Paul', 'George Pataki') (score: 0)
trump -> ('Trump', 'Donald Trump') (score: 0)
walker -> ('Carson', 'Scott Walker') (score: 0)


#### 2020 Name Mapping

In [56]:
dem20_name_mapping = {}
dem20_name_mapping_scores = {}
poll_names = dem20_polls_df.columns
media_names = list(dem20_mentions.columns)
nlp_names = list(dem20_nlp["candidate"].unique())
for candid in dem20_fec_file_map.keys():
    # poll data
    comparison_scores = [edit_distance(x[-len(candid):].lower(), candid.lower()) for x in poll_names]
    poll_score, idx = min((val, idx) for (idx, val) in enumerate(comparison_scores))
    poll_name_map = poll_names[idx]
    # media data
    comparison_scores = [edit_distance(x[-len(candid):].lower(), candid.lower()) for x in media_names]
    media_score, idx = min((val, idx) for (idx, val) in enumerate(comparison_scores))
    media_name_map = media_names[idx]
    # nlp data
    comparison_scores = [edit_distance(x[-len(candid):].lower(), candid.lower()) for x in nlp_names]
    nlp_score, idx = min((val, idx) for (idx, val) in enumerate(comparison_scores))
    nlp_name_map = nlp_names[idx]
    # output
    dem20_name_mapping[candid] = (poll_name_map, media_name_map, nlp_name_map)
    dem20_name_mapping_scores[candid] = (poll_score, media_score, nlp_score)
    
# special case
dem20_name_mapping["ryan"] = ("Tim Ryan", "", "")
dem20_name_mapping["biden"] = ("Joseph R. Biden Jr.", dem20_name_mapping["biden"][1], dem20_name_mapping["biden"][2])

# output
print("fec_name -> polling_name (edit_distance), media_name (edit_distance), nlp_name (edit distance)")
for candid in sorted(dem20_name_mapping.keys()):
    print("{:s} -> {:s} ({:d}), {:s} ({:d}), {:s} ({:d})".format(
        candid,
        dem20_name_mapping[candid][0],
        dem20_name_mapping_scores[candid][0],
        dem20_name_mapping[candid][1],
        dem20_name_mapping_scores[candid][1],
        dem20_name_mapping[candid][2],
        dem20_name_mapping_scores[candid][2]))

fec_name -> polling_name (edit_distance), media_name (edit_distance), nlp_name (edit distance)
bennet -> Michael F. Bennet (0), Michael Bennet (0), bennett (2)
biden -> Joseph R. Biden Jr. (3), Joe Biden (0), biden (0)
booker -> Cory A. Booker (0), Corey Booker (0), booker (0)
bullock -> Steve Bullock (0), Steve Bullock (0), bullock (0)
buttigieg -> Pete Buttigieg (0), Pete Buttigieg (0), buttigieg (0)
castro -> Julián Castro (0), Julian Castro (0), castro (0)
de blasio -> Bill de Blasio (0), Julian Castro (6), de_blasio (1)
deblasio -> Bill de Blasio (2), John Delaney (5), de_blasio (2)
delaney -> John K. Delaney (0), John Delaney (0), delaney (0)
gabbard -> Tulsi Gabbard (0), Tulsi Gabbard (0), gabbard (0)
gillibrand -> Kirsten E. Gillibrand (0), Kirsten Gillibrand (0), gillibrand (0)
harris -> Kamala D. Harris (0), Kamala Harris (0), harris (0)
hickenlooper -> John Hickenlooper (0), John Hickenlooper (0), hickenlooper (0)
inslee -> Jay Robert Inslee (0), Jay Inslee (0), inslee (0)
k

#### Joins and Normalization

In [86]:
def make_normalized_col(df, col_name):
    new_col_name = "{:s}_normalized".format(col_name)
    daily_sums = df.groupby("date")[col_name].sum()
    dates = df.index.get_level_values('date')
    df[new_col_name] = daily_sums.loc[dates].values
    df[new_col_name] = df[col_name]/df[new_col_name]
    
def read_donnation_csv(path, candid):
    df = pd.read_csv(path)
    df["contribution_receipt_date"] = pd.to_datetime(df["contribution_receipt_date"]).dt.date
    df = df.loc[df["entity_type"]=="IND"]
    df = df.loc[np.logical_and(df["contribution_receipt_amount"]>0,df["contribution_receipt_amount"]<=2800)]
    # donation count
    df_count = df.groupby(by="contribution_receipt_date")["contribution_receipt_amount"].count()
    df_count.name = "individual_donations"
    # donation sum
    df_sum = df.groupby(by="contribution_receipt_date")["contribution_receipt_amount"].sum()
    df_sum.name = "individual_donation_amount"
    # donations under $200
    df = df.loc[df["contribution_receipt_amount"]<=200]
    df_small = df.groupby(by="contribution_receipt_date")["contribution_receipt_amount"].count()
    df_small.name = "small_donor_count"
    df = pd.concat([df_count, df_sum, df_small], axis=1)
    df["candidate"] = candid
    return df

def combine_campaign(
    fec_paths,
    polls,
    tv_mentions,
    name_mapping,
    nlp = None):
    
    dataset = pd.DataFrame()
    
    for candid in fec_paths.keys():
        if("donations" in fec_paths[candid].keys()):
            
            # donnations data
            df = read_donnation_csv(fec_paths[candid]["donations"], candid)
            
            # polling data
            if(name_mapping[candid][0] in polls.columns):
                candid_polls = polls[name_mapping[candid][0]]
                candid_polls.name = "polling_percentage"
                candid_polls.index = pd.to_datetime(candid_polls.index).date
                df = df.join(candid_polls, how="left")
                df["polling_percentage"] = df["polling_percentage"].interpolate(limit_direction='both').fillna(0.0)
                df = df.rename_axis("date").reset_index()

            # process media dataset
            df["date"] = pd.to_datetime(df["date"], format='%Y-%m-%d')
            if(name_mapping[candid][1] in tv_mentions.columns):
                candid_tv = tv_mentions[name_mapping[candid][1]]
                candid_tv.name = "tv_mentions"
                df = df.merge(candid_tv, how="left", left_on="date", right_index=True)
                
            # process nlp data
            if(nlp is not None):
                nlp_candid = nlp.loc[nlp["candidate"]==name_mapping[candid][2]]
                nlp_candid = nlp_candid.drop("candidate", axis=1)
                df = df.merge(nlp_candid, how="left", left_on="date", right_on="date")
                
            # append to main df
            dataset = dataset.append(df)
    
    # munge data
    #dataset = dataset.reset_index()
    #dataset = dataset.rename(columns={"index": "date"})
    dataset = dataset.set_index(["candidate","date"])
    dataset["individual_donations"] = dataset["individual_donations"].fillna(0.0)
    make_normalized_col(dataset, "individual_donations")
    dataset["individual_donation_amount"] = dataset["individual_donation_amount"].fillna(0.0)
    make_normalized_col(dataset, "individual_donation_amount")
    dataset["small_donor_count"] = dataset["small_donor_count"].fillna(0.0)
    make_normalized_col(dataset, "small_donor_count")
    dataset["tv_mentions"] = dataset["tv_mentions"].fillna(0.0)
    make_normalized_col(dataset, "tv_mentions")
    dataset["polling_percentage"] = dataset["polling_percentage"].fillna(0.0)
    make_normalized_col(dataset, "polling_percentage")
    
    return dataset

def weekly_aggregate(df):
    df['date'] = pd.to_datetime(df['date']) - pd.to_timedelta(7, unit='d')
    df = df.groupby(['candidate', pd.Grouper(key='date', freq='W-TUE')]).mean()
    df = df.reset_index()
    return df

In [89]:
dem20_df = combine_campaign(dem20_fec_file_map, dem20_polls_df, dem20_mentions, dem20_name_mapping, dem20_nlp)
dem20_weekly_df = weekly_aggregate(dem20_df.reset_index())

# write data
dem20_df.to_csv("../data/dem20_dataset.csv")
dem20_df.to_pickle("../data/dem20_dataset.pkl")
dem20_weekly_df.to_csv("../data/dem20_weekly_dataset.csv")

# check data
dem20_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,2018 congress,2020 democractic party,2020 democractic primary,2020 democratic primary,2020 election,aoc,assange,assange + stone,biden busing,booker,...,us economy (socialism vs captialism),venezuela,warren,williamson,yang,individual_donations_normalized,individual_donation_amount_normalized,small_donor_count_normalized,tv_mentions_normalized,polling_percentage_normalized
candidate,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
swalwell,2019-04-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.016582,0.022695,0.014047,0.015357,0.0
swalwell,2019-04-09,0.136851,0.0,0.0,0.110916,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.011609,0.0,0.0,0.027352,0.034873,0.022388,0.162691,0.0
swalwell,2019-04-10,0.0,0.038388,0.03322,0.0,0.337605,0.058141,0.012057,0.024866,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.007437,0.009403,0.004323,0.033627,0.0
swalwell,2019-04-11,0.194413,0.0,0.0,0.051672,0.0,0.060707,0.005744,0.0,0.005341,0.014486,...,0.017295,0.0,0.0,0.0,0.0,0.007039,0.022345,0.0033,0.0869,0.0
swalwell,2019-04-12,,,,,,,,,,,...,,,,,,0.000519,0.001233,0.0,0.020355,0.0
swalwell,2019-04-14,,,,,,,,,,,...,,,,,,0.012328,0.015399,0.009928,0.030431,0.0
swalwell,2019-04-15,,,,,,,,,,,...,,,,,,0.004417,0.002454,0.0039,0.053133,0.0
swalwell,2019-04-16,,,,,,,,,,,...,,,,,,0.005499,0.014746,0.001775,0.089081,0.0
swalwell,2019-04-17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.002435,0.002073,0.002,0.027264,0.0
swalwell,2019-04-18,0.0,0.0,0.0,0.052425,0.0,0.03792,0.0,0.0,0.0,0.006953,...,0.074782,0.00802,0.084694,0.0,0.0,0.001852,0.013743,0.000854,0.070388,0.0


In [168]:
rep16_df = combine_campaign(rep16_fec_file_map, rep16_polls_df, rep16_mentions, rep16_name_mapping)

# write data
rep16_df.to_csv("../data/rep16_dataset.csv")
rep16_df.to_pickle("../data/rep16_dataset.pkl")

rep16_df.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,individual_donations,individual_donation_amount,small_donor_count,polling_percentage,tv_mentions,individual_donations_normalized,individual_donation_amount_normalized,small_donor_count_normalized,tv_mentions_normalized,polling_percentage_normalized
candidate,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
fiorina,2015-05-04,43,43772.78,16.0,0.02,0.735856,0.031548,0.133269,0.016343,0.242804,0.036364
fiorina,2015-05-05,22,10285.0,15.0,0.02,0.449933,0.018033,0.021112,0.023256,0.11664,0.036364
fiorina,2015-05-06,24,25596.0,7.0,0.02,0.273656,0.046243,0.12535,0.020958,0.09508,0.035088
fiorina,2015-05-07,7,698.0,6.0,0.02,0.069189,0.009174,0.003456,0.011516,0.050342,0.035088
fiorina,2015-05-08,4,3400.0,2.0,0.02,0.085989,0.004053,0.014086,0.002594,0.105922,0.036364
fiorina,2015-05-09,11,5100.0,3.0,0.02,0.141933,0.065868,0.09135,0.028037,0.094622,0.055556
fiorina,2015-05-10,5,635.0,4.0,0.02,0.307578,0.04386,0.021106,0.052632,0.253987,0.055556
fiorina,2015-05-11,25,8700.0,15.0,0.02,0.188567,0.027027,0.033058,0.022026,0.168063,0.035088
fiorina,2015-05-12,4,350.0,4.0,0.02,0.019678,0.006483,0.001153,0.010811,0.022897,0.036364
fiorina,2015-05-13,4,1445.0,2.0,0.02,0.015589,0.006814,0.007854,0.004866,0.033701,0.036364
