Script to preprocess the poll data. It is a Python port of the R script I wrote earlier, which is reproduced below for reference:
```
# Turn poll date strings into zero-padded "mm-dd-yyyy" strings.
#
# Each entry is either "M/D" or a range "M/D - M/D"; for a range only the part
# after " - " is kept. The entries carry no year, so it is inferred from the
# ordering: the vector is assumed to run from newest to oldest, and every time
# the month number increases from one entry to the next the year steps back
# by one.
#
# Args:
#   input.date.vect: character vector of poll dates, newest first.
#   start.year: year assigned to the first entry (defaults to 2016).
#
# Returns:
#   A character vector the same length as the input, formatted "mm-dd-yyyy".
process.dates <- function(input.date.vect, start.year = 2016) {
  # Drop the "<start> - " prefix of a date range, keeping the end date.
  end.dates <- gsub(".{3,} - ", "", input.date.vect)

  # Compare months as integers: as strings, "9" > "10" is TRUE, which would
  # wrongly roll the year back when stepping from October to September.
  month <- as.integer(sub("/.*$", "", end.dates))
  day <- as.integer(sub("^[^/]*/", "", end.dates))

  # TRUE wherever the month is larger than in the preceding (newer) entry.
  # The subscript trims the padding element again when the input is empty.
  rolled.back <- c(FALSE, diff(month) > 0)[seq_along(month)]
  year <- as.integer(start.year) - cumsum(rolled.back)

  sprintf("%02d-%02d-%d", month, day, year)
}

# Parse the poll date strings into POSIXct timestamps so that they can be
# compared and subtracted below.
reps$Date <- as.POSIXct(as.Date(process.dates(reps$Date), "%m-%d-%Y"))
dems$Date <- as.POSIXct(as.Date(process.dates(dems$Date), "%m-%d-%Y"))

# Earliest and latest poll date per candidate; rows whose value for that
# candidate is "--" are excluded.
min.clinton <- min(dems$Date[dems$Clinton != "--"])
max.clinton <- max(dems$Date[dems$Clinton != "--"])
min.bernie <- min(dems$Date[dems$Sanders != "--"])
max.bernie <- max(dems$Date[dems$Sanders != "--"])
min.trump <- min(reps$Date[reps$Trump != "--"])
max.trump <- max(reps$Date[reps$Trump != "--"])
min.cruz <- min(reps$Date[reps$Cruz != "--"])
max.cruz <- max(reps$Date[reps$Cruz != "--"])
min.kasich <- min(reps$Date[reps$Kasich != "--"])
max.kasich <- max(reps$Date[reps$Kasich != "--"])

# Span between first and last poll, always in days. A bare `max - min` on
# POSIXct picks its units automatically for each pair, so the five values
# would not be guaranteed to share a unit once converted to plain numbers.
len.clinton <- difftime(max.clinton, min.clinton, units = "days")
len.bernie <- difftime(max.bernie, min.bernie, units = "days")
len.trump <- difftime(max.trump, min.trump, units = "days")
len.cruz <- difftime(max.cruz, min.cruz, units = "days")
len.kasich <- difftime(max.kasich, min.kasich, units = "days")

# One row per candidate; campaign.length is the span above, in days.
candidate.data.frame <- data.frame(
  campaign.length = as.numeric(
    c(len.clinton, len.bernie, len.trump, len.cruz, len.kasich)
  )
)
rownames(candidate.data.frame) <- c("Clinton", "Sanders", "Trump", "Cruz", "Kasich")
# take the lyric data and add the mentions per candidate for data to create a scatterplot

# Row order must match the rownames set above.
candidate.data.frame$mentions <- NA
candidate.data.frame$mentions[1] <- sum(lyrics$candidate == "Hillary Clinton")
candidate.data.frame$mentions[2] <- sum(lyrics$candidate == "Bernie Sanders")
candidate.data.frame$mentions[3] <- sum(lyrics$candidate == "Donald Trump")
candidate.data.frame$mentions[4] <- sum(lyrics$candidate == "Ted Cruz")
candidate.data.frame$mentions[5] <- sum(lyrics$candidate == "John Kasich")
```

In [62]:
# author: John Boudreaux

import numpy as np
import pandas as pd
import re

# Load the raw 2016 Democratic and Republican poll tables.
dems_2016 = pd.read_csv("../data/democrats2016.csv")
reps_2016 = pd.read_csv("../data/republicans2016.csv")
# Independent copy of the Democratic table; it is not referenced again in the
# code visible in this cell.
dems = dems_2016.copy()


def get_current_month(string_val):
    '''Return the month number that starts a "M/D" or "MM/D" date string.'''
    # A slash in the second position means a one-digit month; otherwise the
    # first two characters are taken as the month.
    month_width = 1 if string_val[1] == "/" else 2
    return int(string_val[:month_width])
    
def get_day(string_val):
    '''Return the day of a preprocessed "month/day" string as an int.'''
    # The day is everything that follows the first slash.
    first_slash = string_val.index("/")
    day_text = string_val[first_slash + 1:]
    return int(day_text)
    
    
 

def fix_dates(old_df, start_year=2016):
    '''Replace the free-text ``Date`` column of a poll table with real datetimes.

    Each ``Date`` entry is either "M/D" or a range "M/D - M/D"; for a range
    only the end date is kept. The strings carry no year, so it is inferred
    from row order: rows are assumed to run from newest to oldest, and every
    time the month number increases from one row to the next the year steps
    back by one.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Poll table with a string ``Date`` column. It is not modified.
    start_year : int, default 2016
        Year assigned to the first row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``old_df`` without ``Date`` and with a trailing ``newDate``
        datetime column.

    Entries that do not look like "M/D" after stripping the range prefix make
    the integer conversion fail with an error rather than being guessed at.
    '''
    # Work on a copy so the caller's raw table keeps its Date column.
    df = old_df.copy()

    # regex=True is required: newer pandas treats the pattern literally by
    # default, which would leave the "<start> - " prefix in place.
    end_dates = df["Date"].str.replace(".{3,} - ", "", regex=True)
    parts = end_dates.str.extract(r"^\s*(\d{1,2})/(\d{1,2})\s*$")
    month = parts[0].astype(int)
    day = parts[1].astype(int)

    # A month larger than the one in the previous (newer) row means we crossed
    # a year boundary going backwards. diff() is positional, so this works for
    # any index, not only 0..n-1.
    year = start_year - month.diff().gt(0).cumsum()

    df["newDate"] = pd.to_datetime(
        pd.DataFrame({"year": year, "month": month, "day": day})
    )
    return df.drop(columns=["Date"])

def replace_dashes(df):
    '''Swap every '--' cell of ``df`` for NaN, in place, and return ``df``.'''
    df.replace(to_replace='--', value=np.nan, inplace=True)
    return df

def preprocess_poll_data(df):
    '''Run the full cleaning pipeline on a raw poll table.

    The table is passed through ``fix_dates`` and then ``replace_dashes``;
    the result of that second step is returned.
    '''
    return replace_dashes(fix_dates(df))

# Run the preprocessing pipeline on each party's raw poll table.
new_dems = preprocess_poll_data(dems_2016)
new_reps = preprocess_poll_data(reps_2016)

# Save the cleaned tables next to the raw CSVs in ../data/.
new_dems.to_csv("../data/PreProcessedDemPolls.csv")
new_reps.to_csv("../data/PreProcessedRepPolls.csv")

[2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2016,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2015,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,
 2014,

168

6

5