In [1]:
import pandas as pd
import numpy as np
import csv
import json
import string
import random
import re

# Fix the seed of both random number generators so random draws are repeatable.
SEED = 1234
np.random.seed(SEED)
random.seed(SEED)

## Load Data, Clean, Define Document Level

In [2]:
# Read the raw speech table. header=None tells pandas not to treat the first row
# as a header, so the columns come back labelled 0, 1, 2, ...
speech_csv_path = 'CampaignSpeech.csv'
CSpeech = pd.read_csv(speech_csv_path, header=None)
CSpeech.head()

Unnamed: 0,0,1,2,3,4,5
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [3]:
# Replace the positional column labels with descriptive names, then preview.
speech_columns = ['Candidate', 'Title', 'Date', 'Content', 'Type', 'URL']
CSpeech.columns = speech_columns
CSpeech.head()

Unnamed: 0,Candidate,Title,Date,Content,Type,URL
0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
1,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nOctober 20, 2018","\nThe President: Thank you very much, and hell...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
2,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 29, 2018","\n[music: John Denver, ""Take Me Home, Country...",Campaign Documents,/documents/remarks-make-america-great-again-ra...
3,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nSeptember 06, 2018",\nPRESIDENT DONALD TRUMP: Thank you. Thank yo...,Campaign Documents,/documents/remarks-make-america-great-again-ra...
4,Donald J. Trump,\nRemarks at a “Make America Great Again” Rall...,"\nAugust 30, 2018","\nPRESIDENT DONALD TRUMP: Evansville, you are ...",Campaign Documents,/documents/remarks-make-america-great-again-ra...


In [4]:
# Year is the integer that follows the comma in Date,
# e.g. "\nNovember 26, 2018" -> 2018.
CSpeech['Year'] = CSpeech['Date'].str.split(',').str[1].str.strip().astype('int64')

In [5]:
# Month and Day come from the part of Date before the comma ("\nNovember 26").
# Split on the last run of whitespace instead of slicing fixed character widths:
# slicing only works when the day has exactly two characters ("May 6" would
# otherwise yield the month "Ma").
month_day_parts = [x.split(',')[0].strip().rsplit(None, 1) for x in CSpeech.Date]
CSpeech['Month'] = [parts[0] for parts in month_day_parts]
CSpeech['Day'] = [int(parts[1]) for parts in month_day_parts]

In [6]:
# Label every speech "Speech<row position>", counting from 0.
CSpeech['Speech_index'] = ['Speech' + str(position) for position in range(len(CSpeech))]

In [8]:
#CSpeech.head()

### Create Sub-speech

In [9]:
# Column-wise accumulator for the sub-speech table: one independent empty list per column.
sub_dict = {column: [] for column in ('Subspeech_index', 'SubContent', 'Speech_index')}

In [10]:
# Save every LINES_PER_SUBSPEECH lines into a subspeech, e.g. [0,10).
# The last window of a long speech absorbs the leftover lines, so it holds
# between 11 and 20 lines; a speech of at most 10 lines stays in one piece.
LINES_PER_SUBSPEECH = 10


def subspeech_bounds(n_lines, size=LINES_PER_SUBSPEECH):
    """Return the (start, stop) line ranges used to cut a speech into sub-speeches.

    Parameters
    ----------
    n_lines : int
        Number of lines in the speech.
    size : int
        Number of lines in a regular sub-speech.

    Returns
    -------
    list of (int, int)
        Half-open ranges covering lines 0..n_lines without overlap. When
        n_lines exceeds size, the final range starts at the second-to-last
        multiple of size and runs to n_lines.
    """
    if n_lines <= size:
        return [(0, n_lines)]
    starts = list(range(0, n_lines, size))
    bounds = [(start, start + size) for start in starts[:-2]]
    bounds.append((starts[-2], n_lines))
    return bounds


# Pair ids with contents by position instead of looking the id up by label with
# an enumerate counter, which is only correct for a default 0..n-1 index.
for speech_id, text in zip(CSpeech['Speech_index'], CSpeech['Content']):
    lines = text.split('\n')
    for start, stop in subspeech_bounds(len(lines)):
        sub_dict['Subspeech_index'].append(speech_id + '_' + str(start) + ':' + str(stop))
        sub_dict['SubContent'].append(lines[start:stop])
        sub_dict['Speech_index'].append(speech_id)

In [11]:
# Turn the accumulated column lists into a DataFrame and preview it.
Sub_Speech = pd.DataFrame(sub_dict)
Sub_Speech.head()

Unnamed: 0,Subspeech_index,SubContent,Speech_index
0,Speech0_0:10,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0
1,Speech0_10:20,[The unemployment rate just hit the lowest lev...,Speech0
2,Speech0_20:30,"[And I'll tell you, a little—a little tricky b...",Speech0
3,Speech0_30:40,"[You are here, he is here to help elect Cindy ...",Speech0
4,Speech0_40:50,[What a great crowd we have tonight for you. I...,Speech0


In [12]:
# Attach the speech-level columns to every sub-speech row. Both frames use the
# same key name, so a single `on=` is enough.
SubSpeech = Sub_Speech.merge(CSpeech, on='Speech_index')
SubSpeech.head()

Unnamed: 0,Subspeech_index,SubContent,Speech_index,Candidate,Title,Date,Content,Type,URL,Year,Month,Day
0,Speech0_0:10,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
1,Speech0_10:20,[The unemployment rate just hit the lowest lev...,Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
2,Speech0_20:30,"[And I'll tell you, a little—a little tricky b...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
3,Speech0_30:40,"[You are here, he is here to help elect Cindy ...",Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26
4,Speech0_40:50,[What a great crowd we have tonight for you. I...,Speech0,Donald J. Trump,"\nRemarks at a ""Make America Great Again"" Rall...","\nNovember 26, 2018","\nPRESIDENT DONALD TRUMP: Thank you, thank you...",Campaign Documents,/documents/remarks-make-america-great-again-ra...,2018,November,26


In [13]:
# Persist the merged sub-speech table next to the notebook.
subspeech_csv_path = "SubSpeech.csv"
SubSpeech.to_csv(subspeech_csv_path)

## Create Training Data for Annotation (prodigy)

In [14]:
# (number of sub-speeches, number of speeches)
SubSpeech.shape[0], CSpeech.shape[0]

(11929, 2739)

In [68]:
#CSpeech['Year'].value_counts()
#CSpeech.Candidate.value_counts()

In [53]:
#uniyear = pd.unique(CSpeech.Year)

In [15]:
# Draw distinct speech ids (no replacement) with NumPy's global random generator.
N_SAMPLED_SPEECHES = 60
speech_sample = np.random.choice(CSpeech['Speech_index'], size=N_SAMPLED_SPEECHES, replace=False)

In [29]:
# Second batch: the last 30 entries of the sampled speech ids.
speech_sample1 = speech_sample[-30:]

In [67]:
# How many sampled speeches each candidate contributes.
is_sampled_speech = CSpeech['Speech_index'].isin(speech_sample)
CSpeech.loc[is_sampled_speech, 'Candidate'].value_counts()

Barack Obama             12
John F. Kennedy           9
Richard Nixon             6
Mitt Romney               6
John F. Kerry             5
Hillary Clinton           5
John McCain               5
Robert Dole               3
Donald J. Trump           2
Fred Thompson             1
Bernie Sanders            1
Albert Gore, Jr.          1
John Anderson             1
Martin O'Malley           1
Jeb Bush                  1
Franklin D. Roosevelt     1
Name: Candidate, dtype: int64

In [17]:
# Sub-speech rows that belong to any of the sampled speeches.
# NOTE(review): the following cell assigns `subspeech_sample` again from
# `speech_sample1`, so in top-to-bottom order this value is replaced right away.
sample_columns = ['Speech_index', 'Subspeech_index', 'SubContent']
in_full_sample = SubSpeech['Speech_index'].isin(speech_sample)
subspeech_sample = SubSpeech[sample_columns].loc[in_full_sample]

In [30]:
# Sub-speech rows that belong to the speeches listed in speech_sample1.
batch_columns = ['Speech_index', 'Subspeech_index', 'SubContent']
in_second_batch = SubSpeech['Speech_index'].isin(speech_sample1)
subspeech_sample = SubSpeech[batch_columns].loc[in_second_batch]

In [31]:
# Build the `text` field as "<Subspeech_index>## <lines of the sub-speech>".
# NOTE(review): the lines are joined with an empty separator, so adjacent lines
# run together in the text — confirm this is intended.
joined_lines = subspeech_sample['SubContent'].map(''.join)
subspeech_sample['text'] = subspeech_sample['Subspeech_index'] + '## ' + joined_lines

In [32]:
# Number of sub-speech rows in the current sample.
subspeech_sample.shape[0]

119

In [189]:
# Number of sub-speech rows in the current sample.
# NOTE(review): this repeats the previous cell's expression; the execution
# counts suggest it was run against a different `subspeech_sample`.
subspeech_sample.shape[0]

280

In [130]:
# Write the sample for annotation.
# NOTE(review): when the notebook runs top to bottom, `subspeech_sample` has
# already been rebuilt from `speech_sample1`, so this file receives the same
# rows as 'subspeech_sample1.csv'.
subspeech_sample.to_csv(path_or_buf='subspeech_sample.csv')

In [33]:
# Write the current sample (built from speech_sample1) for annotation.
subspeech_sample.to_csv(path_or_buf='subspeech_sample1.csv')

## Load and Merge Annotation

In [192]:
# Read the annotation export, keeping one string per non-empty line.
# The encoding is passed explicitly because the platform default is not always
# UTF-8 (assumes the export is UTF-8 — confirm against the tool that wrote it).
# Blank lines are dropped so they cannot break JSON parsing of the list later.
with open('USPOPannotations.jsonl/test_annotation.jsonl', 'r', encoding='utf-8') as annotation_jsonl:
    annotation_list = [line for line in annotation_jsonl if line.strip()]

In [193]:
# Map each annotated sub-speech id (the part of `text` before '##') to its answer.
# If the same id appears on several lines, the last line wins.
parsed_annotations = (json.loads(line) for line in annotation_list)
coded = {
    record['text'].partition('##')[0]: record['answer']
    for record in parsed_annotations
}

In [194]:
# Number of distinct sub-speech ids that carry an annotation.
len(coded)

81

In [196]:
# One row per annotated sub-speech id (as the index), with the answer in a
# single column labelled 0.
coded_data = pd.DataFrame(list(coded.values()), index=list(coded.keys()))

In [197]:
# Give the answer column (labelled 0) the name 'Pop'.
answer_column_names = {0: 'Pop'}
coded_data = coded_data.rename(columns=answer_column_names)

In [198]:
# Copy the index (the sub-speech ids) into a regular column to merge on.
coded_data = coded_data.assign(Subspeech_index=coded_data.index)

In [199]:
# Outer-join the annotations onto the sub-speech table: every sub-speech is kept,
# and so is any annotation id that matches no sub-speech.
new = Sub_Speech.merge(coded_data, how='outer', on='Subspeech_index')
new.head()

Unnamed: 0,Speech_index,SubContent,Subspeech_index,Pop
0,Speech0,"[, PRESIDENT DONALD TRUMP: Thank you, thank yo...",Speech0_0:10,
1,Speech0,[The unemployment rate just hit the lowest lev...,Speech0_10:20,
2,Speech0,"[And I'll tell you, a little—a little tricky b...",Speech0_20:30,
3,Speech0,"[You are here, he is here to help elect Cindy ...",Speech0_30:40,
4,Speech0,[What a great crowd we have tonight for you. I...,Speech0_40:50,


In [39]:
# Look up which candidate gave the speech labelled 'Speech1654'.
CSpeech.loc[CSpeech['Speech_index'] == 'Speech1654', 'Candidate']

1654    John Edwards
Name: Candidate, dtype: object

In [36]:
# Number of sub-speech rows contributed by each sampled speech.
subspeech_sample['Speech_index'].value_counts()

Speech284     11
Speech397      8
Speech417      8
Speech414      8
Speech506      7
Speech1618     7
Speech208      6
Speech1819     6
Speech830      6
Speech541      5
Speech1654     5
Speech1515     5
Speech2725     5
Speech739      5
Speech77       4
Speech1482     4
Speech356      2
Speech702      2
Speech1922     2
Speech672      2
Speech2475     2
Speech2458     1
Speech762      1
Speech2559     1
Speech634      1
Speech1979     1
Speech2312     1
Speech2398     1
Speech1927     1
Speech1909     1
Name: Speech_index, dtype: int64

In [40]:
# Number of sub-speech rows in the current sample.
subspeech_sample.shape[0]

119