In [1]:
import pandas as pd
import numpy as np
import csv
import json
import string
import random
import re

np.random.seed(1234)
random.seed(1234)

## Load Data, Clean, Define Document Level

In [21]:
CSpeech = pd.read_csv('CampaignSpeech.csv', header=None)
CSpeech.head()

Unnamed: 0,0,1,2,3,4,5
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [22]:
CSpeech.columns = ['Candidate','Title','Date','Content','Type','URL']
CSpeech.head()

Unnamed: 0,Candidate,Title,Date,Content,Type,URL
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [23]:
CSpeech['Year'] = [int(re.split(',',x)[1].strip()) for x in CSpeech.Date]

In [24]:
CSpeech['Month'] = [re.split(',',x)[0].strip('\n')[:-3] for x in CSpeech.Date]
CSpeech['Day'] = [int(re.split(',',x)[0].strip('\n')[-2:]) for x in CSpeech.Date]

In [25]:
CSpeech['Speech_index']= ['Speech'+ str(x) for x in range(0,len(CSpeech.Title))]

In [26]:
CSpeech.head()

Unnamed: 0,Candidate,Title,Date,Content,Type,URL,Year,Month,Day,Speech_index
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,Speech0
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,October,20,Speech1
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,September,29,Speech2
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,September,6,Speech3
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,August,30,Speech4


In [27]:
CSpeech.to_csv("CampaignSpeech.csv", index=False)

### Create Sub-speech

In [8]:
sub_dict = {'Subspeech_index':[], 'SubContent':[], 'Speech_index':[]}

In [9]:
# Save every 10 lines into a subspeech e.g. [0,10)

for i,text in enumerate(CSpeech['Content']): 
    splited = re.split('\n',text)
    length = len(re.split('\n',text))
    if length > 10:
        a = list(range(0,length,10)) 
        for j in a[:-2]:
            rowvalue = []
            upper=j+10
            sub_dict['Subspeech_index'].append(CSpeech['Speech_index'][i]+'_'+str(j)+ ':' + str(upper))
            sub_dict['SubContent'].append(splited[j: upper])
            sub_dict['Speech_index'].append(CSpeech['Speech_index'][i])
            
        sub_dict['Subspeech_index'].append(CSpeech['Speech_index'][i]+'_'+ str(a[-2])+ ':' + str(length))
        sub_dict['SubContent'].append(splited[a[-2]: length])
        sub_dict['Speech_index'].append(CSpeech['Speech_index'][i])
    else:
        sub_dict['Subspeech_index'].append(CSpeech['Speech_index'][i] +'_'+str(0)+ ':' + str(length))
        sub_dict['SubContent'].append(splited[: length])
        sub_dict['Speech_index'].append(CSpeech['Speech_index'][i])

In [None]:
Sub_Speech = pd.DataFrame.from_dict(sub_dict)
Sub_Speech.head()

In [None]:
SubSpeech = Sub_Speech.merge(CSpeech, left_on='Speech_index', right_on = 'Speech_index')
SubSpeech.head()

In [None]:
SubSpeech.to_csv("SubSpeech.csv")

## Create Training Data for Annotation (prodigy)

In [None]:
len(SubSpeech.Subspeech_index), len(CSpeech.Speech_index)

In [None]:
#CSpeech['Year'].value_counts()
#CSpeech.Candidate.value_counts()

In [None]:
#uniyear = pd.unique(CSpeech.Year)

In [None]:
speech_sample = np.random.choice(CSpeech['Speech_index'], 60, replace=False)

In [None]:
CSpeech.Candidate[CSpeech.Speech_index.isin(speech_sample)].value_counts()

In [None]:
subspeech_sample = SubSpeech[['Speech_index', 'Subspeech_index','SubContent']].loc[SubSpeech.Speech_index.isin(speech_sample)]

In [None]:
subspeech_sample['text'] = [subspeech_sample.Subspeech_index[i] + '## ' + ''.join(subspeech_sample.SubContent[i])
                           for i in subspeech_sample.Subspeech_index.index]

In [None]:
len(subspeech_sample.Subspeech_index)

In [None]:
subspeech_sample.to_csv('subspeech_sample.csv')

## Annotate additional data

In [4]:
#SubSpeech_predicted = pd.read_csv('SubSpeech_predicted.csv')

## Load and Merge Annotation

In [2]:
with open('Annotations.jsonl/test_annotation.jsonl', 'r') as annotation_jsonl:
    annotation_list = list(annotation_jsonl)

In [3]:
with open('Annotations.jsonl/pop_code1117.jsonl', 'r') as annotation_jsonl:
    annotation_list += list(annotation_jsonl)

In [5]:
with open('Annotations.jsonl/blind_test1117.jsonl', 'r') as annotation_jsonl:
    annotation_list += list(annotation_jsonl)

In [7]:
with open('Annotations.jsonl/blindtest2.jsonl', 'r') as annotation_jsonl:
    annotation_list += list(annotation_jsonl)

In [8]:
len(annotation_list)

419

In [9]:
coded = {}
for annotation in annotation_list:
    annotation_dict = json.loads(annotation)
    coded[annotation_dict['text'].split('##')[0]] = annotation_dict['answer']

In [10]:
len(coded)

419

In [11]:
coded_data = pd.DataFrame.from_dict(coded, orient='index')

In [12]:
coded_data = coded_data.rename(columns={0:'Pop'})

In [13]:
coded_data['Subspeech_index'] = coded_data.index

In [14]:
Sub_Speech = pd.read_csv('SubSpeech.csv')

In [15]:
SubSpeech_coded = Sub_Speech.merge(coded_data, how = 'outer', left_on='Subspeech_index', right_on = 'Subspeech_index')

In [16]:
len(SubSpeech_coded.Candidate)

11929

In [17]:
SubSpeech_coded.to_csv("SubSpeech_coded.csv")

In [None]:
#CSpeech.Candidate[CSpeech.Speech_index=='Speech1654']

In [19]:
SubSpeech_coded.Candidate[SubSpeech_coded.Pop =='accept'].value_counts()

Donald J. Trump    36
Barack Obama        8
John Edwards        6
Bernie Sanders      4
John McCain         3
Robert Dole         2
Ted Cruz            1
John F. Kerry       1
Newt Gingrich       1
Name: Candidate, dtype: int64

## Add more speech

In [28]:
Speech = pd.read_csv('CampaignSpeech.csv')

In [29]:
PresiDoc = pd.read_csv('PresidentDoc.csv')

In [100]:
SubSpeech_predicted = pd.read_csv('SubSpeech_predicted.csv')

In [101]:
SubSpeech_predicted.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Subspeech_index,SubContent,Speech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day,Pop,Pop_class,Pop_prob
0,0,0,Speech0_0:10,"['', ""PRESIDENT DONALD TRUMP: Thank you, thank...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,,0,0.225
1,1,1,Speech0_10:20,['The unemployment rate just hit the lowest le...,Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,,0,0.024194
2,2,2,Speech0_20:30,"[""And I'll tell you, a little—a little tricky ...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,,0,0.244048
3,3,3,Speech0_30:40,"['You are here, he is here to help elect Cindy...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,,0,0.175
4,4,4,Speech0_40:50,"[""What a great crowd we have tonight for you. ...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26,,0,0.244048


In [30]:
PresiDoc.head()

Unnamed: 0,Name,Title,Date,Type,URL,phrased_text,Year
0,Donald J. Trump,\nRemarks Announcing Candidacy for President i...,"\nJune 16, 2015","['Campaign Documents', 'Presidential Candidates']",/documents/remarks-announcing-candidacy-for-pr...,trump wow whoa that is some group of people th...,2015
1,Donald J. Trump,\nPress Release - Donald Trump: Obama Is A Hor...,"\nJune 16, 2015","['Elections and Transitions', 'Press Releases']",/documents/press-release-donald-trump-obama-ho...,the gateway pundit donald_trump says its time ...,2015
2,Donald J. Trump,\nPress Release - Trump on Hillary: I Was Watc...,"\nJune 16, 2015","['Elections and Transitions', 'Press Releases']",/documents/press-release-trump-hillary-i-was-w...,the gateway pundit this is why we love the don...,2015
3,Donald J. Trump,\nPress Release - Donald J. Trump Declares Can...,"\nJune 16, 2015","['Elections and Transitions', 'Press Releases']",/documents/press-release-donald-j-trump-declar...,new_york_ny june_16 2015today donald_j_trump a...,2015
4,Donald J. Trump,\nPress Release - Donald Trump: I Would Build ...,"\nJune 16, 2015","['Elections and Transitions', 'Press Releases']",/documents/press-release-donald-trump-i-would-...,the gateway pundit what a speechthe donald say...,2015


In [62]:
"'Press Releases'" in re.split(', ', PresiDoc.Type[1].strip(''' [] '''))

True

In [80]:
acceptance = []
for value in PresiDoc.Type:
    result = "'Convention Speeches'"  in re.split(', ', value.strip(''' [] '''))
    acceptance.append(result)

In [82]:
len(PresiDoc[acceptance].Name)

32

In [84]:
AcceptSpeech = PresiDoc[acceptance]

In [85]:
AcceptSpeech.head()

Unnamed: 0,Name,Title,Date,Type,URL,phrased_text,Year
290,Donald J. Trump,\nAddress Accepting the Presidential Nominatio...,"\nJuly 21, 2016","['Campaign Documents', 'Presidential Nominatio...",/documents/address-accepting-the-presidential-...,friends delegates and fellow_americans i humbl...,2016
3477,Barack Obama,\nAddress Accepting the Presidential Nominatio...,"\nAugust 28, 2008","['Elections and Transitions', 'Convention Spee...",/documents/address-accepting-the-presidential-...,to chairman dean and my great friend dick_durb...,2008
11289,Barack Obama,\nRemarks Accepting the Presidential Nominatio...,"\nSeptember 06, 2012","['Elections and Transitions', 'Convention Spee...",/documents/remarks-accepting-the-presidential-...,the first_lady thank you so much tonight i am ...,2012
19948,George W. Bush,\nAddress Accepting the Presidential Nominatio...,"\nAugust 03, 2000","['Elections and Transitions', 'Convention Spee...",/documents/address-accepting-the-presidential-...,mr chairman delegates and my_fellow_citizens i...,2000
26752,George W. Bush,\nRemarks Accepting the Presidential Nominatio...,"\nSeptember 02, 2004","['Elections and Transitions', 'Convention Spee...",/documents/remarks-accepting-the-presidential-...,the president thank you all mr chairman delega...,2004


In [None]:
AcceptSpeech

In [87]:
Speech.tail()

Unnamed: 0,Candidate,Title,Date,Content,Type,URL,Year,Month,Day,Speech_index
2734,Franklin D. Roosevelt,"\nAddress at Oglethorpe University in Atlanta,...","\nMay 22, 1932","\nPresident Jacobs, members and friends of Ogl...",Campaign Documents,/documents/address-oglethorpe-university-atlan...,1932,May,22,Speech2734
2735,Franklin D. Roosevelt,"\nAddress at Jefferson Day Dinner in St. Paul,...","\nApril 18, 1932",\nWhat is the real reason that Jefferson Day D...,Campaign Documents,/documents/address-jefferson-day-dinner-st-pau...,1932,April,18,Speech2735
2736,Franklin D. Roosevelt,"\nRadio Address From Albany, New York: ""The 'F...","\nApril 07, 1932",\nAlthough I understand that I am talking unde...,Campaign Documents,/documents/radio-address-from-albany-new-york-...,1932,April,7,Speech2736
2737,Franklin D. Roosevelt,\nThe Governor Enters the First Primary Campai...,"\nJanuary 22, 1932",\nMy dear Mr. McLean:\nIf it is the desire of ...,Campaign Documents,/documents/the-governor-enters-the-first-prima...,1932,January,22,Speech2737
2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1,Speech2738


In [88]:
PresiDoc.Type.unique()

array(["['Campaign Documents', 'Presidential Candidates']",
       "['Elections and Transitions', 'Press Releases']",
       "['Elections and Transitions', 'Statements']", '[]',
       "['Campaign Documents', 'Presidential Nomination Acceptance Addresses', 'Convention Speeches', 'Presidential Candidates']",
       "['Elections and Transitions', 'Transition Documents']",
       "['Written Statements', 'Statements', 'Presidential']",
       "['Written Presidential Orders', 'Executive Orders', 'Presidential', 'Executive Orders']",
       "['Written Presidential Orders', 'Proclamations', 'Presidential']",
       "['Miscellaneous', 'Miscellaneous Press Secretary']",
       "['Spoken Addresses and Remarks', 'Inaugural Addresses', 'Presidential', 'Inaugural']",
       "['Miscellaneous', 'Miscellaneous Remarks', 'Presidential']",
       "['Memoranda', 'Presidential']",
       "['Miscellaneous', 'Miscellaneous Remarks', 'Presidential', 'Executive Orders']",
       "['Written Statements', 'State

In [97]:
db = []
for value in PresiDoc.Type:
    result = "'Debates'"  in re.split(', ', value.strip(''' [] '''))
    db.append(result)

In [104]:
debate = PresiDoc[db]

In [108]:
debate.iloc[-1].phrased_text

'the moderator good_evening i am barbara_walters moderator of the last of the debates of 1976 between gerald_r_ford republican_candidate for president and jimmy_carter democratic_candidate for president welcome president ford welcome governor_carter and thank you for joining_us this evening this debate takes_place before an audience in phi_beta_kappa memorial_hall on the campus of the college of william and mary in historic williamsburg_virginia it is particularly appropriate in this bicentennial_year that we meet on these grounds to hear this debate two_hundred_years_ago five william and mary students met at nearby raleigh tavern to form phi_beta_kappa a fraternity designed they wrote to search out and dispel the clouds of falsehood by debating without reserve the issues of the day in that spirit of debatewithout reserve to dispel the clouds of falsehoodgentlemen let_us proceed the subject matter of this debate is open covering all issues and topics our questioners tonight are joseph 

## Add Debate

In [109]:
Sub_Speech = pd.read_csv('SubSpeech.csv')

In [111]:
Sub_Speech.tail()

Unnamed: 0.1,Unnamed: 0,Subspeech_index,SubContent,Speech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day
11924,11924,Speech2738_30:40,"[""Instead of giving us the benefit of any spec...",Speech2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1
11925,11925,Speech2738_40:50,['For the benefit of our own people and of the...,Speech2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1
11926,11926,Speech2738_50:60,['The dignity and effectiveness of the governm...,Speech2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1
11927,11927,Speech2738_60:70,['Under the provisions of the Payne bill I was...,Speech2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1
11928,11928,Speech2738_70:85,['The antitrust law was passed to provide agai...,Speech2738,William Howard Taft,\nRemarks Accepting the Presidential Nominatio...,"\nAugust 01, 1912",\nMR. ROOT AND CHAIRMEN OF THE NOTIFICATION CO...,Campaign Documents,/documents/remarks-accepting-the-presidential-...,1912,August,1


In [None]:
Sub_Speech 