src/panel_models.rmd

---
title: "panel_model"
author: "Andrew Walters"
date: "6/23/2019"
output: pdf_document
---

## Libraries
```{r message=FALSE, warning=FALSE}
# visual
library(knitr)
library(kableExtra)
library(ggplot2)
library(ggpubr)
library(stargazer)
# data manipulation
library(dplyr)
library(reshape)
library(MASS)
# modeling
library(stats)
library(matrixStats)
library(plm)
library(lmtest)
library(summarytools)
```

## Load Data
```{r message=FALSE, warning=FALSE}
# NOTE: the workspace is intentionally NOT cleared with rm(list = ls()) here.
# Every object used below is created in this document, so nothing depends on
# an empty workspace. When the document is rendered into an existing
# environment (the default for rmarkdown::render() called from the console),
# clearing it would delete the caller's own objects.

# Parse the `date` column and keep only the rows inside a date window.
#
# df:         data frame with a `date` column formatted as "%Y-%m-%d"
# start_date: first date to keep (inclusive)
# end_date:   last date to keep (inclusive)
#
# Returns `df` with `date` converted to Date and every row outside
# [start_date, end_date] removed. Rows whose date cannot be parsed (NA) are
# removed as well.
data_cleanup <- function(df, start_date, end_date) {
  df$date <- as.Date(df$date, format = "%Y-%m-%d")
  in_window <- df$date >= start_date & df$date <= end_date
  # which() keeps only the TRUE positions, so NA comparisons drop the row
  df[which(in_window), , drop = FALSE]
}

# Split a data set into the best-polling candidates and everyone else.
#
# df:           data frame with `candidate` and
#               `polling_percentage_normalized` columns
# n_candidates: number of candidates to place in the top group
#
# Candidates are ranked by their mean normalized polling percentage (NAs
# ignored), highest first. Returns list(top = <rows of the n_candidates
# best-ranked candidates>, bottom = <rows of all remaining candidates>).
# If n_candidates is at least the number of candidates, `bottom` has zero
# rows. Rows whose `candidate` is NA are not ranked and land in neither group.
top_candidates <- function(df, n_candidates) {
  candidate_ids <- as.character(df$candidate)
  avg_polls <- tapply(
    df$polling_percentage_normalized, candidate_ids, mean,
    na.rm = TRUE
  )
  # Candidates with no usable polls (NaN mean) sort to the end of the ranking
  ranked <- names(avg_polls)[order(-avg_polls)]

  # The original index `n_candidates+1:nrow(avg_polls)` parsed as
  # `n_candidates + (1:nrow(avg_polls))` because `:` binds tighter than `+`,
  # so it ran past the end of the table. A logical mask avoids the
  # precedence trap and also behaves for n_candidates == 0 or > length.
  is_top <- seq_along(ranked) <= n_candidates
  top_names <- ranked[is_top]
  bottom_names <- ranked[!is_top]

  list(
    top = df[candidate_ids %in% top_names, , drop = FALSE],
    bottom = df[candidate_ids %in% bottom_names, , drop = FALSE]
  )
}

# 2020 Democratic primary data set, restricted to the first half of 2019,
# plus its split into the 7 best-polling candidates and the rest
dem20 <- data_cleanup(
  read.csv("../data/dem20_dataset.csv"),
  start_date = as.Date("2019-01-01"),
  end_date = as.Date("2019-06-30")
)
dem20_split <- top_candidates(dem20, n_candidates = 7)

# Weekly version of the 2020 Democratic data set, same date window
dem20_weekly <- data_cleanup(
  read.csv("../data/dem20_weekly_dataset.csv"),
  start_date = as.Date("2019-01-01"),
  end_date = as.Date("2019-06-30")
)

# 2016 Republican primary data set (2015-2016), split at the top 5
rep16 <- data_cleanup(
  read.csv("../data/rep16_dataset.csv"),
  start_date = as.Date("2015-01-01"),
  end_date = as.Date("2016-12-31")
)
rep16_split <- top_candidates(rep16, n_candidates = 5)

# Quick look at what was loaded
head(dem20)
head(dem20_split$top)
```

## Including Plots

```{r}
# List the candidate names that ended up in each polling tier
print("bottom")
unique(dem20_split[["bottom"]][["candidate"]])
print("top")
unique(dem20_split[["top"]][["candidate"]])
```

Time-series plots of normalized polling, donations, and TV mentions for the top 7 and the remaining 2020 Democratic candidates:

```{r pressure, echo=FALSE}
# Build one time-series line plot per metric for a group of candidates.
#
# df:          data frame with `date`, `candidate` and the five
#              `*_normalized` metric columns referenced below
# cycle:       election cycle label used in the plot titles (e.g. "2020")
# party:       party label used in the plot titles
# description: label for the candidate group (e.g. "Top 7")
#
# Returns a named list of ggplot objects:
#   poll, dcount, dsum, sdcount, tv
make_ggplots <- function(df, cycle, party, description) {

  # Shared recipe: one coloured line per candidate for a single metric.
  # `mapping` supplies the x/y aesthetics; `metric` completes the title.
  metric_plot <- function(mapping, metric, y_label) {
    ggplot(df, mapping) +
      geom_line(aes(colour = candidate)) +
      ggtitle(sprintf("%s %s %s %s", cycle, description, party, metric)) +
      xlab("Date") +
      ylab(y_label)
  }

  # The legend belongs to the `colour` aesthetic; the previous
  # guides(fill = ...) had no effect because nothing is mapped to `fill`.
  gg_poll <- metric_plot(
    aes(x = date, y = polling_percentage_normalized),
    "Polling", "Normalized Polls"
  ) +
    guides(colour = guide_legend(ncol = 2))

  gg_dcount <- metric_plot(
    aes(x = date, y = individual_donations_normalized),
    "Donation Count", "Normalized Donation Count"
  )

  gg_dsum <- metric_plot(
    aes(x = date, y = individual_donation_amount_normalized),
    "Donation Sum", "Normalized Donation Sum"
  )

  gg_sdcount <- metric_plot(
    aes(x = date, y = small_donor_count_normalized),
    "Small Donation Count (less than $200)", "Normalized Small Donation Count"
  )

  gg_tv <- metric_plot(
    aes(x = date, y = tv_mentions_normalized),
    "TV Mention Count", "Normalized Cable TV Mentions"
  )

  list(
    poll = gg_poll,
    dcount = gg_dcount,
    dsum = gg_dsum,
    sdcount = gg_sdcount,
    tv = gg_tv
  )
}


# Build the plot sets for the top-7 group and for the remaining candidates
dem20_ggtop <- make_ggplots(
  dem20_split$top,
  cycle = "2020", party = "Democrat", description = "Top 7"
)
dem20_ggbottom <- make_ggplots(
  dem20_split$bottom,
  cycle = "2020", party = "Democrat", description = "Bottom"
)

# display plots: one figure per metric, top group stacked above bottom group
ggarrange(dem20_ggtop[["poll"]], dem20_ggbottom[["poll"]], nrow = 2, ncol = 1)
ggarrange(dem20_ggtop[["dcount"]], dem20_ggbottom[["dcount"]], nrow = 2, ncol = 1)
ggarrange(dem20_ggtop[["dsum"]], dem20_ggbottom[["dsum"]], nrow = 2, ncol = 1)
ggarrange(dem20_ggtop[["sdcount"]], dem20_ggbottom[["sdcount"]], nrow = 2, ncol = 1)
ggarrange(dem20_ggtop[["tv"]], dem20_ggbottom[["tv"]], nrow = 2, ncol = 1)
```


#### Print out all nlp features for formula
```{r}
# Print every column name of `dem20` as "+"-separated terms, four per line,
# so the feature names can be pasted into a model formula.
formula_terms <- colnames(dem20)
formula_rows <- split(formula_terms, ceiling(seq_along(formula_terms) / 4))
formula_agg <- paste(
  vapply(formula_rows, paste, character(1), collapse = "+"),
  collapse = "+\n"
)
# cat() instead of print(): print() shows the line breaks as a literal "\n"
cat(formula_agg, "\n", sep = "")
```

## Panel Model: all candidates, weekly aggregate, NLP features on polls
Takeaway: polls are easier to predict at the weekly level than at the daily level, but the fit is still not great with all candidates.
```{r}
# Weekly panel: `candidate` is the individual index, `date` the time index.
# Only the first row is kept for any repeated row name.
dem20_weekly.panel <- pdata.frame(dem20_weekly, index = c("candidate", "date"))
dem20_weekly.panel <- dem20_weekly.panel[
  !duplicated(rownames(dem20_weekly.panel)),
]

# "within" model of normalized polling on all NLP topic features
dem20.nlp.fit <- plm(
  formula = polling_percentage_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20_weekly.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```

## Panel Model: all candidates, daily aggregate, NLP features on polls
Takeaway: the daily model is worse than the weekly model.
```{r}
# Daily panel: `candidate` is the individual index, `date` the time index.
# Only the first row is kept for any repeated row name.
dem20.panel <- pdata.frame(dem20, index = c("candidate", "date"))
dem20.panel <- dem20.panel[
  !duplicated(rownames(dem20.panel)),
]

# "within" model of normalized polling on all NLP topic features
dem20.nlp.fit <- plm(
  formula = polling_percentage_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```

## Panel Model: all candidates, weekly aggregate, NLP features on donation count
Takeaway: donation count is slightly harder to predict than polls.
```{r}
# "within" model of normalized individual donation count on all NLP topic
# features, fitted on the weekly panel
dem20.nlp.fit <- plm(
  formula = individual_donations_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20_weekly.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```

## Panel Model: all candidates, weekly aggregate, NLP features on TV mentions
takeaway: tv mentions are about the same as polling in prediction quality
```{r}
# "within" model of normalized TV mentions on all NLP topic features,
# fitted on the weekly panel
dem20.nlp.fit <- plm(
  formula = tv_mentions_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20_weekly.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```


## Panel Model: all candidates, weekly aggregate, NLP features on small donor count
Takeaway: small donor count behaves the same as regular donor count.
```{r}
# "within" model of normalized small-donor count on all NLP topic features,
# fitted on the weekly panel
dem20.nlp.fit <- plm(
  formula = small_donor_count_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20_weekly.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```

## Panel Model: all candidates, NLP features on donation amount
Takeaway: donation amount is slightly harder to predict than donation count.
```{r}
# "within" model of normalized individual donation amount on all NLP topic
# features, fitted on the weekly panel
dem20.nlp.fit <- plm(
  formula = individual_donation_amount_normalized ~
    X2018.congress + X2020.democratic.primary + X2020.election + aoc +
    assange + biden.busing + booker + border + busing + buttigieg +
    civil.rights + climate.change + congress + de.blasio + drugs +
    economic.foreign.policy + electoral.college + female.congresswomen +
    first.debate + gun.laws + harris + healthcare + hickenlooper +
    impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
    kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
    north.korea + o.rourke + prison.reform + puerto.rico +
    russian.interference + sanders + student.debt + supreme.court +
    tax.return + tech + trump + trump.racist + unions +
    us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
    yang,
  data = dem20_weekly.panel,
  model = "within"
)

summary(dem20.nlp.fit)
```

## Hunting for Significance
Looking at linear models for single_candidate_polling ~ single_nlp_feature. Use the resulting p-values and r-squared to decide which to explore further
```{r}
# For each top-tier candidate:
#   1. regress weekly polling on all NLP features and print the r-squared;
#   2. regress weekly polling on each NLP feature separately and print the
#      fits whose slope has p-value < 0.1.
dem20_fits <- c()
for (candid in unique(dem20_split$top$candidate)) {

  # get candidate data
  candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
  candid_df <- within(candid_df, rm("candidate"))
  candid_panel <- pdata.frame(candid_df, c("date"))
  candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

  # run candidate total regression
  candid_fit <- lm(
    polling_percentage_normalized ~
      X2018.congress + X2020.democratic.primary + X2020.election + aoc +
      assange + biden.busing + booker + border + busing + buttigieg +
      civil.rights + climate.change + congress + de.blasio + drugs +
      economic.foreign.policy + electoral.college + female.congresswomen +
      first.debate + gun.laws + harris + healthcare + hickenlooper +
      impeachment + inslee + iowa.caucus + iran + iran.afghanistan + israel +
      kavanaugh + lgbtq + marijuana + mccain + mcconnell + mueller.report +
      north.korea + o.rourke + prison.reform + puerto.rico +
      russian.interference + sanders + student.debt + supreme.court +
      tax.return + tech + trump + trump.racist + unions +
      us.economy..socialism.vs.captialism. + venezuela + warren + williamson +
      yang,
    data = candid_panel
  )
  print(paste("*****", candid, "r-squared:", summary(candid_fit)$r.squared, sep = " "))

  # run independent regressions on each variable
  for (colname in colnames(candid_panel)) {
    # Columns shared with the rep16 data set, and the "time" column, are not
    # tested as features.
    if (colname %in% c(colnames(rep16), "time")) {
      next
    }

    tmpdf <- candid_panel[, c("polling_percentage_normalized", colname)]
    colnames(tmpdf) <- c("y", "x")
    fit <- lm(y ~ x, data = tmpdf)
    fit_summary <- summary(fit)

    # A constant or all-NA feature leaves the slope unestimable, in which
    # case the coefficient table has no second row; skip instead of erroring.
    if (nrow(fit_summary$coefficients) < 2) {
      next
    }
    fit_coef <- fit_summary$coefficients[2, ]

    # The p-value can be NA/NaN (e.g. a perfect fit); treat that as "not
    # significant" rather than letting if() fail on a missing value.
    p_value <- fit_coef[[4]]
    if (!is.na(p_value) && p_value < 0.1) {
      print(paste(candid, colname, "coef:", fit_coef[1], "intercept:", fit_summary$coefficients[1, 1], "std-error:", fit_coef[2], "p-value:", fit_coef[4], "r-squared:", fit_summary$r.squared, sep = " "))
    }
  }
}
```

### Helper Function for Linear Regression
```{r}
# Plot a candidate's fitted vs. actual polling together with the model's
# predictors over time.
#
# candid_panel: data the model was fitted on; must contain `date`,
#               `polling_percentage_normalized` and every predictor column
# candid_fit:   fitted lm object for that data
# candid:       candidate name, used in the plot title
#
# Returns a ggplot object with one point-and-path series per variable; the
# title carries the model's r-squared rounded to 3 digits.
regression_plot <- function(candid_panel, candid_fit, candid) {
  candid_panel$predicted_polling_percentage <- candid_fit$fitted.values

  # Every coefficient except the intercept is a predictor column. Dropping
  # the intercept by name (rather than by 2:length(...)) also works for an
  # intercept-only fit, where 2:1 would count backwards.
  predictor_names <- setdiff(names(candid_fit$coefficients), "(Intercept)")
  plot_cols <- c(
    "date",
    "predicted_polling_percentage",
    "polling_percentage_normalized",
    predictor_names
  )

  # Long format: one row per (date, variable) pair
  plot_panel <- melt(candid_panel[, plot_cols], id.vars = "date")
  fit_r_squared <- round(summary(candid_fit)$r.squared, 3)

  ggplot(plot_panel, aes(x = date, y = value, color = variable, group = variable)) +
    geom_point(aes(colour = variable), size = 0.5) +
    geom_path(aes(colour = variable)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    ggtitle(paste(candid, "regression ( r-squared:", fit_r_squared, ")")) +
    xlab("Date") +
    ylab("Normalized Value")
}
```

## Biden
Takeaway: very high predictive power.
```{r}
candid <- "biden"

# Weekly rows for this candidate; the `candidate` column is removed before
# the data is indexed by date
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df$candidate <- NULL
candid_panel <- pdata.frame(candid_df, index = "date")
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# Polling regressed on the NLP features chosen for this candidate
candid_fit <- lm(
  formula = polling_percentage_normalized ~
    buttigieg + economic.foreign.policy + mcconnell + tax.return + unions +
    us.economy..socialism.vs.captialism.,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```

## Harris
```{r}
candid <- "harris"

# get candidate data
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df <- within(candid_df, rm("candidate"))
# pdata.frame(), as in every other candidate chunk. The previous
# data.frame(candid_df, c("date")) only appended a constant "date" string
# column and left the data un-indexed.
candid_panel <- pdata.frame(candid_df, c("date"))
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# run candidate total regression
candid_fit <- lm(
  polling_percentage_normalized ~
    aoc + booker + buttigieg + harris + impeachment,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```

## Buttigieg
```{r}
candid <- "buttigieg"

# Weekly rows for this candidate; the `candidate` column is removed before
# the data is indexed by date
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df$candidate <- NULL
candid_panel <- pdata.frame(candid_df, index = "date")
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# Polling regressed on the NLP features chosen for this candidate
candid_fit <- lm(
  formula = polling_percentage_normalized ~
    X2018.congress + border + female.congresswomen + impeachment + trump,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```

## Warren Model

```{r}
candid <- "warren"

# Weekly rows for this candidate; the `candidate` column is removed before
# the data is indexed by date
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df$candidate <- NULL
candid_panel <- pdata.frame(candid_df, index = "date")
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# Polling regressed on the NLP features chosen for this candidate
candid_fit <- lm(
  formula = polling_percentage_normalized ~
    border + female.congresswomen + student.debt + warren,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```

## Sanders
```{r}
candid <- "sanders"

# Weekly rows for this candidate; the `candidate` column is removed before
# the data is indexed by date
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df$candidate <- NULL
candid_panel <- pdata.frame(candid_df, index = "date")
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# Polling regressed on the NLP features chosen for this candidate
candid_fit <- lm(
  formula = polling_percentage_normalized ~
    economic.foreign.policy + impeachment + sanders + student.debt,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```

## Booker
```{r}
candid <- "booker"

# Weekly rows for this candidate; the `candidate` column is removed before
# the data is indexed by date
candid_df <- dem20_weekly[dem20_weekly$candidate == candid, ]
candid_df$candidate <- NULL
candid_panel <- pdata.frame(candid_df, index = "date")
candid_panel <- candid_panel[!duplicated(rownames(candid_panel)), ]

# Polling regressed on the NLP features chosen for this candidate
candid_fit <- lm(
  formula = polling_percentage_normalized ~
    border + buttigieg + climate.change + sanders,
  data = candid_panel
)

summary(candid_fit)
regression_plot(candid_panel, candid_fit, candid)
```


```{r}
#warren_plot <- warren[c("warren","mcconnell","female.congresswomen","date","polling_percentage_normalized","border")]
#"tv_mentions_normalized"
#"individual_donations_normalized"
#warren_plot$israel <- warren_plot$israel * 10
#warren_plot = melt(warren_plot, id.vars = 'date')
#gg_warren <- ggplot(warren_plot, aes(x=date, y=value)) +
#    geom_line(aes(colour=variable)) +
#    ggtitle("Warren") +
#    xlab("Date") +
#    ylab("Y")
#gg_warren
```