In [1]:
import pandas as pd
import numpy as np
import csv
import json
import string
import random
import re

# Seed Python's and NumPy's global random number generators with one shared
# value so that draws from either generator are repeatable across runs.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

## Load Data, Clean, Define Document Level

In [14]:
# Read the raw speech table. header=None means no row is consumed as a
# header, so the columns are numbered 0..5 until they are given names.
CSpeech = pd.read_csv(filepath_or_buffer='CampaignSpeech.csv', header=None)
CSpeech.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [15]:
# Replace the six positional column labels with descriptive names.
speech_columns = ['Candidate', 'Title', 'Date', 'Content', 'Type', 'URL']
CSpeech.columns = speech_columns
CSpeech.head(n=5)

Unnamed: 0,Candidate,Title,Date,Content,Type,URL
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [16]:
# Dates look like "\nNovember 26, 2018"; the year is the text after the comma.
CSpeech['Year'] = [
    int(date_text.split(',')[1].strip())
    for date_text in CSpeech['Date']
]

In [17]:
# Dates look like "\nNovember 26, 2018". The part before the comma is split
# on whitespace rather than sliced at fixed offsets: slicing off the last
# three characters only yields the month when the day has exactly two digits
# and there is no stray whitespace (e.g. "July 4" would become "Jul").
# For well-formed, zero-padded dates the result is the same as before.
month_day_parts = [date_text.split(',')[0].split() for date_text in CSpeech['Date']]
CSpeech['Month'] = [parts[0] for parts in month_day_parts]
CSpeech['Day'] = [int(parts[-1]) for parts in month_day_parts]

In [19]:
# Label every speech "Speech<N>", where N is the speech's row position.
CSpeech['Speech_index'] = [
    'Speech' + str(position)
    for position, _ in enumerate(CSpeech.Title)
]

### Create Sub-speech

In [27]:
# Column-wise accumulator for the sub-speech table: one empty list per column.
sub_dict = {
    column: []
    for column in ('Subspeech_index', 'SubContent', 'Speech_index')
}

In [28]:
# Split every speech into sub-speeches of 10 lines each, e.g. lines [0, 10).
# The last sub-speech of a speech absorbs the remaining lines, so it holds
# between 11 and 20 lines; a speech of 10 lines or fewer stays whole.
# NOTE(review): a line count that is an exact multiple of 10 therefore ends
# with a 20-line sub-speech -- confirm this is intended.


def _subspeech_bounds(n_lines, size=10):
    """Return the (start, stop) line bounds of each sub-speech.

    Parameters
    ----------
    n_lines : int
        Number of lines in the speech.
    size : int, optional
        Number of lines in a regular sub-speech (default 10).

    Returns
    -------
    list of (int, int)
        Half-open line ranges. The final range runs from the second-to-last
        multiple of ``size`` to ``n_lines``; a speech with at most ``size``
        lines yields the single range ``(0, n_lines)``.
    """
    if n_lines <= size:
        return [(0, n_lines)]
    starts = list(range(0, n_lines, size))
    bounds = [(start, start + size) for start in starts[:-2]]
    # The last two start offsets are merged into one closing range.
    bounds.append((starts[-2], n_lines))
    return bounds


# Rebuild the accumulator in this cell so that re-running it does not append
# a second copy of every sub-speech.
sub_dict = {'Subspeech_index': [], 'SubContent': [], 'Speech_index': []}

# Iterate over the values directly instead of looking rows up by label with a
# positional counter, which only works while the index is 0..n-1.
for speech_id, text in zip(CSpeech['Speech_index'], CSpeech['Content']):
    speech_lines = text.split('\n')
    for start, stop in _subspeech_bounds(len(speech_lines)):
        sub_dict['Subspeech_index'].append(speech_id + '_' + str(start) + ':' + str(stop))
        sub_dict['SubContent'].append(speech_lines[start:stop])
        sub_dict['Speech_index'].append(speech_id)

In [25]:
# Scratch check: the start offsets range() yields for 15 lines in steps of 10.
# NOTE(review): the execution count (In [25]) is lower than the preceding
# cell's (In [28]), so this cell was run out of order.
list(range(0,15,10)) 

[0, 10]

In [33]:
# Turn the column-wise accumulator into a table with one row per sub-speech.
Sub_Speech = pd.DataFrame(data=sub_dict)
Sub_Speech.head(n=5)

Unnamed: 0,Speech_index,SubContent,Subspeech_index
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50


In [34]:
# Attach each speech's columns to its sub-speeches through the shared
# Speech_index key.
SubSpeech = pd.merge(Sub_Speech, CSpeech, on='Speech_index')
SubSpeech.head(n=5)

Unnamed: 0,Speech_index,SubContent,Subspeech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26


In [35]:
# Persist the merged sub-speech table to disk.
SubSpeech.to_csv(path_or_buf="SubSpeech.csv")

## Create Training Data for Annotation (Prodigy)

In [36]:
# Number of sub-speeches in the merged table.
SubSpeech['Subspeech_index'].shape[0]

11929

In [44]:
# Number of distinct values in the Year column.
# NOTE(review): the saved output (76) does not match the 38 distinct years
# listed in the saved value_counts output -- re-run on a fresh kernel.
len(CSpeech['Year'].unique())

76

In [42]:
# Speeches per year, most frequent year first.
CSpeech.loc[:, 'Year'].value_counts()

1960    881
2008    461
2012    343
2007    256
2016    204
2015    119
2011    105
2004     96
1996     81
2014     32
1968     29
1932     23
2000     21
1980     13
2013     12
2018     11
2009      8
1992      6
1995      6
2010      4
1988      3
2006      3
1984      2
1976      2
1972      2
1964      2
1999      2
2003      2
2002      1
1971      1
1979      1
1983      1
1985      1
1974      1
1991      1
1956      1
1952      1
1912      1
Name: Year, dtype: int64

In [50]:
# Speeches per candidate in 1960, the year with the most speeches.
from_1960 = CSpeech['Year'] == 1960
CSpeech.loc[from_1960, 'Candidate'].value_counts()

John F. Kennedy      600
Richard Nixon        280
Lyndon B. Johnson      1
Name: Candidate, dtype: int64

In [51]:
# Speeches per candidate over the whole corpus, most frequent first.
CSpeech['Candidate'].value_counts()

John F. Kennedy          600
Barack Obama             582
Richard Nixon            311
Hillary Clinton          196
John McCain              178
Mitt Romney              136
John F. Kerry             88
Donald J. Trump           86
Robert Dole               83
Bernie Sanders            60
Rudy Giuliani             38
John Edwards              37
Bill Richardson           31
Franklin D. Roosevelt     23
Joe Biden                 22
Rick Santorum             20
Sarah Palin               19
Newt Gingrich             17
Fred Thompson             16
Albert Gore, Jr.          16
Rick Perry                16
Mike Huckabee             16
Ronald Reagan             14
Ron Paul                  12
Martin O'Malley           11
William J. Clinton         9
George W. Bush             9
Michele Bachmann           8
Ted Cruz                   6
Jeb Bush                   6
                        ... 
Lincoln Chafee             3
Tim Pawlenty               3
Scott Walker               3
Lindsey Graham