In [1]:
import pandas as pd
import numpy as np
import csv
import json
import string
import random
import re

# Seed NumPy's global RNG and the stdlib RNG with one shared value so that
# random draws made further down are repeatable.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

## Load Data, Clean, Define Document Level

In [14]:
# header=None: the first row is read as data and the columns are labelled
# positionally (0, 1, 2, ...).
speech_csv_path = 'CampaignSpeech.csv'
CSpeech = pd.read_csv(speech_csv_path, header=None)
CSpeech.head()

Unnamed: 0,0,1,2,3,4,5
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [15]:
# Swap the positional column labels for descriptive names.
speech_columns = ['Candidate', 'Title', 'Date', 'Content', 'Type', 'URL']
CSpeech.columns = speech_columns
CSpeech.head()

Unnamed: 0,Candidate,Title,Date,Content,Type,URL
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [16]:
# The year is the piece of the date string that follows the first comma.
CSpeech['Year'] = [int(date_text.split(',')[1].strip()) for date_text in CSpeech.Date]

In [17]:
def _split_month_day(date_text):
    """Return ``(month_name, day)`` from a date such as "November 26, 2018".

    Only the part before the first comma is used. It is stripped of
    surrounding whitespace and split on its last whitespace run, so the
    result does not depend on the day being zero-padded to two digits
    (fixed-width slicing would turn "June 5" into month "Jun").
    """
    month_name, day_text = date_text.split(',')[0].strip().rsplit(None, 1)
    return month_name, int(day_text)


# Parse each date once and fan the pieces out into the two columns.
month_day_pairs = [_split_month_day(date_text) for date_text in CSpeech.Date]
CSpeech['Month'] = [month_name for month_name, _ in month_day_pairs]
CSpeech['Day'] = [day for _, day in month_day_pairs]

In [19]:
# Label every row "Speech<n>", where n is the row's position in the frame.
CSpeech['Speech_index'] = ['Speech{}'.format(position) for position in range(len(CSpeech))]

### Create Sub-speech

In [27]:
# Column-oriented accumulator: one independent, initially empty list per column.
sub_dict = {column: [] for column in ('Subspeech_index', 'SubContent', 'Speech_index')}

In [28]:
# Cut every speech into sub-speeches of 10 lines, e.g. lines [0, 10).
# The leftover tail is merged into the last chunk rather than kept as a short
# chunk of its own, so a speech longer than 10 lines ends with a chunk of
# 11-20 lines. Speeches of 10 lines or fewer become a single sub-speech.

def _chunk_bounds(length, size=10):
    """Return the ``(lower, upper)`` line bounds of each sub-speech.

    Parameters
    ----------
    length : int
        Number of lines in the speech.
    size : int, default 10
        Nominal number of lines per sub-speech.

    Returns
    -------
    list of (int, int)
        Half-open ranges that together cover ``[0, length)``. Every range is
        ``size`` lines long except the last, which also takes the remainder.
    """
    if length <= size:
        return [(0, length)]
    # length > size guarantees at least two start offsets (0 and size).
    starts = list(range(0, length, size))
    bounds = [(start, start + size) for start in starts[:-2]]
    # The second-to-last start opens the final chunk, which runs to the end.
    bounds.append((starts[-2], length))
    return bounds


# Iterate the two columns in lockstep instead of looking Speech_index up by
# enumerate position, which is only correct for a default 0..n-1 index.
for speech_id, text in zip(CSpeech['Speech_index'], CSpeech['Content']):
    lines = text.split('\n')
    for lower, upper in _chunk_bounds(len(lines)):
        sub_dict['Subspeech_index'].append(speech_id + '_' + str(lower) + ':' + str(upper))
        sub_dict['SubContent'].append(lines[lower:upper])
        sub_dict['Speech_index'].append(speech_id)

In [25]:
# Scratch check: the start offsets range(0, length, 10) yields for length = 15.
list(range(0,15,10)) 

[0, 10]

In [33]:
# Turn the parallel lists into a frame: one column per key, one row per entry.
Sub_Speech = pd.DataFrame(sub_dict)
Sub_Speech.head()

Unnamed: 0,Speech_index,SubContent,Subspeech_index
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50


In [34]:
# Inner-join the speech-level columns onto every sub-speech row.
SubSpeech = pd.merge(Sub_Speech, CSpeech, on='Speech_index')
SubSpeech.head()

Unnamed: 0,Speech_index,SubContent,Subspeech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26


In [35]:
# Persist the merged sub-speech table next to the notebook.
subspeech_csv_path = "SubSpeech.csv"
SubSpeech.to_csv(subspeech_csv_path)

## Create Training Data for Annotation (prodigy)

In [62]:
# Row counts of both tables plus 2% of the speech count (presumably a guide
# for how many speeches to sample for annotation). The speech count is taken
# from the data instead of a hard-coded number so it cannot go stale.
len(SubSpeech), len(CSpeech), len(CSpeech) * 0.02

(11929, 2739, 54.78)

In [68]:
#CSpeech['Year'].value_counts()
#CSpeech.Candidate.value_counts()

In [53]:
# Distinct years present in the corpus, in order of first appearance.
uniyear = CSpeech['Year'].unique()

In [63]:
# Draw distinct speech ids (no replacement) from NumPy's global RNG.
n_sampled_speeches = 60
speech_sample = np.random.choice(CSpeech['Speech_index'], size=n_sampled_speeches, replace=False)

In [67]:
# How many of the sampled speeches belong to each candidate.
in_sample = CSpeech.Speech_index.isin(speech_sample)
CSpeech.loc[in_sample, 'Candidate'].value_counts()

Barack Obama             12
John F. Kennedy           9
Richard Nixon             6
Mitt Romney               6
John F. Kerry             5
Hillary Clinton           5
John McCain               5
Robert Dole               3
Donald J. Trump           2
Fred Thompson             1
Bernie Sanders            1
Albert Gore, Jr.          1
John Anderson             1
Martin O'Malley           1
Jeb Bush                  1
Franklin D. Roosevelt     1
Name: Candidate, dtype: int64

In [106]:
# Keep only the sub-speeches of the sampled speeches. The explicit .copy()
# makes this an independent frame, so adding columns to it later neither
# touches SubSpeech nor triggers pandas' SettingWithCopyWarning.
sampled_rows = SubSpeech.Speech_index.isin(speech_sample)
subspeech_sample = SubSpeech.loc[
    sampled_rows, ['Speech_index', 'Subspeech_index', 'SubContent']
].copy()

In [129]:
# Build the annotation text as "<sub-speech id>## <content>".
# - The content lines are joined with a space; joining with '' fused the last
#   word of one line with the first word of the next.
# - assign() returns a new frame, so this never writes into a slice of
#   another frame (no SettingWithCopyWarning).
# - zip() walks both columns in row order, avoiding a label lookup per row.
# NOTE(review): the separator changes the generated text, so texts exported
# before this change will not match byte-for-byte; the id before '##' is
# unchanged.
subspeech_sample = subspeech_sample.assign(
    text=[
        subspeech_id + '## ' + ' '.join(content_lines)
        for subspeech_id, content_lines in zip(subspeech_sample.Subspeech_index,
                                               subspeech_sample.SubContent)
    ]
)

In [130]:
# Export the sampled sub-speeches to CSV.
sample_csv_path = 'subspeech_sample.csv'
subspeech_sample.to_csv(sample_csv_path)

## Load and Merge Annotation

In [152]:
# Read the annotation export as raw lines (one JSON document per line).
# The encoding is pinned to UTF-8: without it open() uses the platform's
# locale encoding, which can fail or mis-decode non-ASCII characters (the
# speeches contain curly quotes) on systems whose default is not UTF-8.
with open('USPOPannotations.jsonl/test_annotation.jsonl', 'r', encoding='utf-8') as annotation_jsonl:
    annotation_list = annotation_jsonl.readlines()

In [153]:
# Map each sub-speech id (the part of 'text' before the first '##') to the
# annotator's answer. If an id occurs more than once, the last line wins.
parsed_annotations = (json.loads(raw_line) for raw_line in annotation_list)
coded = {
    record['text'].split('##')[0]: record['answer']
    for record in parsed_annotations
}

In [154]:
# Display the sub-speech id -> annotation answer mapping.
coded

{'Speech5_0:10': 'reject',
 'Speech5_10:20': 'accept',
 'Speech5_20:30': 'accept',
 'Speech5_30:40': 'reject'}

In [161]:
# One row per annotated sub-speech: the dict keys become the index and the
# answers land in a single column.
coded_data = pd.DataFrame.from_dict(data=coded, orient='index')

In [171]:
# Give the answer column (labelled 0) a descriptive name.
column_names = {0: 'Pop'}
coded_data = coded_data.rename(columns=column_names)

In [169]:
# Expose the index (the sub-speech ids) as a regular column to merge on.
coded_data = coded_data.assign(Subspeech_index=coded_data.index)

In [182]:
# Outer join: every sub-speech is kept (Pop is NaN where nothing was
# annotated), and so is any annotation whose id matches no sub-speech.
new = pd.merge(Sub_Speech, coded_data, how='outer', on='Subspeech_index')
new.head()

Unnamed: 0,Speech_index,SubContent,Subspeech_index,Pop
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10,
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20,
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30,
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40,
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50,
