# Presidential Debate Text Processing
### Anlan Du
11-10-17
Open Project  
Here, I'll walk through the steps of processing the presidential debates! I break down data into three main sets: speech data, which calculates information speech-by-speech (a speech is delineated by a new person talking); speaker data, which aggregates all speech data for a given speaker (either Clinton, Trump, or a Moderator); and dictionaries, which provide the most frequently used words per speaker.  
[Source of Debate 1 Transcript](https://www.washingtonpost.com/news/the-fix/wp/2016/09/26/the-first-trump-clinton-presidential-debate-transcript-annotated/)  
[Source of Debate 2 Transcript](https://medium.com/@PolitiFact/politifacts-annotated-transcript-of-the-second-presidential-debate-b54f45edeb99#.6sodpyhkw)  
[Source of Debate 3 Transcript](https://www.washingtonpost.com/news/the-fix/wp/2016/10/19/the-final-trump-clinton-debate-transcript-annotated/)  
(I intentionally selected transcripts with structurally similar motifs--names in all caps, interruptions indicated by ellipses--so that I could reduce my analysis to one function, to be repeated thrice.)

In [1]:
#Standard import statements
import json
import string
import re
import csv
from nltk import RegexpTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk



The below function processes each speech and returns data on each one (see speechCols for the specifics of the data). It's unfortunate I had to create such a massive cell, but necessary in order to call it as a function (so I can call it for each debate).

In [2]:
def process_speeches(i):
    """Parse debate transcript number *i* into per-speech rows and per-speaker word counts.

    Reads ``debate<i>.txt`` from the working directory, splits it into speeches
    at every all-caps speaker label followed by a colon, and computes one row
    of statistics per speech.

    Each row of the returned speech data is a list laid out as:
        [0] speaker label, [1] cleaned speech text,
        [2] index in the transcript where the speaker label starts,
        [3] index where the speech ends,
        [4] 1 if the speech ends in an ellipsis (interrupted), else 0,
        [5] number of words, [6] total characters in those words,
        [7] number of sentences, [8] average word length,
        [9] average sentence length in words,
        [10] VADER compound sentiment score.

    Averages are 0.0 when a speech has no words or no counted sentences.

    Returns:
        ``[fullDict, speechData]``. ``fullDict`` holds three lists of
        ``(word, count)`` pairs: the 60 most common non-stopwords for Trump,
        for Clinton, and for every other speaker, in that order.
        ``speechData`` is the list of rows described above.
    """
    #imported here because the notebook's import cell does not provide them
    from collections import Counter
    from nltk.corpus import stopwords

    # NOTE(review): the file is read with the platform default encoding; the
    # transcripts contain curly quotes, so confirm they decode as intended.
    with open("debate%s.txt"% i, "r") as f:
        debate = f.read()

    #regex to delineate speeches
    # NOTE(review): the 'TRUMP'/'CLINTON' comparisons below imply that tokenize()
    # yields the captured name without the trailing colon -- confirm against the
    # installed NLTK version.
    re_tokenizer = RegexpTokenizer("([A-Z][A-Z]+ ?)+:")

    #list of (speaker label, (label start, label end)) pairs
    speechNameIndices = list(zip(re_tokenizer.tokenize(debate), re_tokenizer.span_tokenize(debate)))
    numSpeeches = len(speechNameIndices)
    speechData = [[0]*11 for _ in range(numSpeeches)]

    for idx, (name, (nameStart, nameEnd)) in enumerate(speechNameIndices):
        row = speechData[idx]
        row[0] = name
        # for purposes of determining speech's x positioning in the visualization,
        # store start and end indices with the speech info
        row[2] = nameStart
        if idx + 1 < numSpeeches:
            row[3] = speechNameIndices[idx+1][1][0] - 1
        else:
            #final speech runs to the end of the transcript
            row[3] = len(debate)

        #text lies between the end of this label and the end index; for the
        #final speech this keeps the transcript's last character
        speechText = debate[nameEnd:row[3]]
        #annoying special chars we want to get rid of.
        speechText = speechText.replace("\n", " ").replace("\r", "").replace("’","'").replace("“","\"").replace("”","\"").strip()
        #all-caps words in parentheses are superfluous notes (e.g. "APPLAUSE")
        row[1] = re.sub(r"\([A-Z][A-Z]+\)", "", speechText)

    #punctuation (to get rid of)
    puncFinal=string.punctuation+")’(,�...``''--“”"
    #words[speech][word], with punctuation tokens removed
    words = []

    #one analyzer is enough; it is reused for every speech
    sentAnalyzer = SentimentIntensityAnalyzer()

    for row in speechData:
        #add sentiment score to speech data
        row[10] = sentAnalyzer.polarity_scores(row[1]).get("compound")
        tokens = nltk.word_tokenize(row[1])

        #ellipses indicate interruption; increment "interrupt" and "number of sentences" data as such.
        if tokens[-1:] == ['...']:
            row[4] += 1
            row[7] += 1

        kept = []
        for token in tokens:
            #puncFinal is a string, so this is a substring test; that is what
            #lets multi-character tokens such as '...' or '--' match
            if token in puncFinal:
                if token in ('.', '?', '!'):
                    row[7] += 1
            #add non-punctuation words to data (including word length)
            else:
                row[5] += 1
                row[6] += len(token)
                kept.append(token)
        words.append(kept)

        #average word length = num chars/num words; guard against empty speeches
        row[8] = row[6]/row[5] if row[5] else 0.0
        #average sentence length = num words/num sentences; a speech may
        #contain no sentence-ending punctuation at all
        row[9] = row[5]/row[7] if row[7] else 0.0

    #the word no is important. contraction fragments aren't. modifying stopwords as such
    stopset = set(stopwords.words('english'))
    stopset.discard('no')
    stopset.update(["'s", "'ve", "'re", "n't", "'m"])

    trumpDict = Counter()
    clintonDict = Counter()
    modDict = Counter()
    #add words to each speaker's dictionary, then collect the 60 most common
    #words for each speaker in the nested array "fullDict"
    for row, tokens in zip(speechData, words):
        toBeAdded = [token.lower() for token in tokens if token.lower() not in stopset]
        if row[0] == 'TRUMP':
            trumpDict.update(toBeAdded)
        elif row[0] == 'CLINTON':
            clintonDict.update(toBeAdded)
        else:
            modDict.update(toBeAdded)
    fullDict = [
        trumpDict.most_common(60),
        clintonDict.most_common(60),
        modDict.most_common(60),
    ]

    #because we can only return one object, return array of dicts and speech data
    return [fullDict,speechData]

Solid! So we've got our speech data worked out; now we just need to work out our data for each speaker. We'll go back through our already-collected data and do calculations with those numbers.

In [3]:
def process_speakers(speechData):
    """Aggregate per-speech rows into one summary row per speaker.

    Args:
        speechData: list of per-speech rows. This function reads index 0
            (speaker label), 4 (interrupted flag), 5 (word count),
            6 (character count), 7 (sentence count) and 10 (sentiment).

    Returns:
        Three rows, for Trump, Clinton and Moderator in that order, each laid
        out as [name, number of speeches, times interrupted, average words per
        speech, average word length, average sentence length, average
        sentiment]. An average whose denominator is zero (for example a
        speaker with no speeches) is reported as 0 instead of raising
        ZeroDivisionError.
    """
    def safeDiv(numerator, denominator):
        #0 matches the placeholder the row was initialised with
        return numerator/denominator if denominator else 0

    speakerData = [[name, 0, 0, 0, 0, 0, 0] for name in ("Trump", "Clinton", "Moderator")]
    #running totals per speaker; not part of the final dataset
    totalWords = [0, 0, 0]
    totalChars = [0, 0, 0]
    totalSents = [0, 0, 0]
    totalSentiment = [0, 0, 0]

    #tally data from each speech in appropriate speaker's arrays
    for speech in speechData:
        sInd = speakInd(speech[0])
        speakerData[sInd][1] += 1
        speakerData[sInd][2] += speech[4]
        totalWords[sInd] += speech[5]
        totalChars[sInd] += speech[6]
        totalSents[sInd] += speech[7]
        totalSentiment[sInd] += speech[10]

    for sInd, row in enumerate(speakerData):
        #calc words per speech
        row[3] = safeDiv(totalWords[sInd], row[1])
        #calc avg word length per speaker
        row[4] = safeDiv(totalChars[sInd], totalWords[sInd])
        #calc avg sentence length per speaker
        row[5] = safeDiv(totalWords[sInd], totalSents[sInd])
        #calc avg sentiment
        row[6] = safeDiv(totalSentiment[sInd], row[1])
    return speakerData

In [4]:
#helper function used when tallying per-speaker totals
def speakInd(speaker):
    """Return the speaker's row index in the speaker data array.

    0 is Trump, 1 is Clinton; any other label counts as a moderator (2).
    """
    for position, label in enumerate(('TRUMP', 'CLINTON')):
        if speaker == label:
            return position
    return 2

Now we just need to save everything and we're done!

In [5]:
#header rows for the three kinds of CSV written below
speakerDataCols=["speaker","numSpeeches","interrupted","avgWordsPerSpeech","avgWordLen","avgSentLen","sentiment"]
speechCols=["speaker","speechText","startInd","endInd","interrupted","numWords","numChars","numSents","avgWordLen","avgSentLen","sentiment"]
dictCols=["word","freq"]

#process each of the three debates and write its five CSV files
for i in range(1,4):
    #data[0] holds the word-frequency lists per speaker; data[1] holds the per-speech rows
    data=process_speeches(i)
    #speaker summary is derived from the per-speech rows
    speakData=process_speakers(data[1])

    #per-speech data
    with open('debate%s.csv'% i, 'w',newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(speechCols)
        writer.writerows(data[1])

    #word-frequency list per speaker, in the order they appear in data[0]
    for position, template in enumerate(('trumpDict%s.csv', 'clintonDict%s.csv', 'moderatorDict%s.csv')):
        with open(template % i, 'w',newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(dictCols)
            writer.writerows([word, freq] for word, freq in data[0][position])

    #per-speaker summary
    with open('speakData%s.csv'% i, 'w',newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(speakerDataCols)
        writer.writerows(speakData)
# grab all lines that are all caps
