<strong>Part 1: extracting direct quotes<strong>

In [1]:
#import modules
import spacy
import re
from spacy.matcher import Matcher
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

In [2]:
#save file paths
import os
fileids = os.listdir("texts")
fileids = [("texts/" + fileid) for fileid in fileids]
fileids

['texts/5c1452701e67d78e276ee126.txt',
 'texts/5c146e42795bd2fcce2ea8e5.txt',
 'texts/5c149ffc1e67d78e276fbd44.txt',
 'texts/5c15488f1e67d78e277161d7.txt',
 'texts/5c1548a31e67d78e2771624f.txt']

In [3]:
#open and merge files
textlist = []
for fileid in fileids:
    with open (fileid, mode ="r", encoding = "utf-8") as f:
        text = f.read()
        textlist.append(text)

In [None]:
textlist

In [4]:
#standardize quotation marks- changed to list comprehension
pipeline = [('“', '"'), ('´´', '"'), ('”', '"'), ('’’', '"')]
for old, new in pipeline:
    textlist = [(x.replace(old, new)) for x in textlist]

In [5]:
#remove html- changed to list comprehension
#for some reason there are no random \s in the list so removed that line
textlist = [(x.replace("\n", "")) for x in textlist]

In [None]:
textlist

In [6]:
#nlp without sentences. yay for list comprehensions!
#it would save trouble down the line to call nlp on a merged string but then it causes other problems
textdoc = [nlp(x) for x in textlist]

In [7]:
#quotes function without space
def get_quotes(text):
    quotes = re.findall(r'"(.*?)"', text)
    return(quotes)

In [8]:
#print quotes and save as .txt file
#note: every time you run this it will add to the .txt, so delete the old one first
#it also adds the quote to a list
quoteslistlist = []
for text in textdoc:
    str_text = str(text)
    found_quotes = get_quotes(str_text)
    if len(found_quotes) > 0:
        print(found_quotes)
    with open ("newapproachoutput.txt", "a") as x:
        for quote in found_quotes:
            print(quote, file = x)
    quoteslistlist.append(found_quotes)

['I was clear when I was mayor – I don’t support Uber at all,', 'It was a twinkle in some engineer’s eye some years ago.', 'Mayor McCallum’s statements vary greatly from truth,', "There’s a tried-and-true method in Canadian politics: after an election a new government takes office and says, ‘Oh my gosh, the cupboards are bare.’ Or, ‘We’re much deeper in debt than I thought we were, and now I’ve seen the real books.' So I think there’s an element of that kind of gamesmanship going on,", 'Then there’s the fact that McCallum has been out of office for quite some time, thinking he knew the job, but some things have changed,', 'If you take Fraser Highway SkyTrain and if we’re building that seven days a week around the clock, we probably can save, and this is TransLink’s figures, we can probably save $2-300 million,', 'TransLink has not conducted any detailed study on potential construction methods for a SkyTrain route from Surrey to Langley. The most recent cost estimate (2017 Hatch report)

In [None]:
quoteslistlist

In [14]:
#quoteslistlist is a list of lists so it has to be flattened into one list
#we all know what that means...
#list comprehension!
quoteslist = [item for sublist in quoteslistlist for item in sublist]

<strong>Part 2: extracting speakers<strong>

In [42]:
#please don't ask me why i'm doing it this way because I don't know
textstring = str(textlist)
textstring = textstring.replace("\\n", "")
textstring = textstring.replace("\\", "")
textstring = textstring.replace("\'", "")

In [43]:
textstring

'[The question was common for mayoral candidates to hear back in the October municipal election: what do you think of ride-hailing and should it come to Metro Vancouver?.  "I was clear when I was mayor – I don’t support Uber at all," then-Surrey mayoral candidate Doug McCallum responded at one debate.  It was an odd thing to say: McCallum’s last term as Surrey mayor ended in 2005, while Uber as a company began in 2009, briefly entering the Vancouver market only in 2012. There was no Uber not to support in 2005.  Rival candidate Bruce Hayne joked in his turn to answer the question, "It was a twinkle in some engineer’s eye some years ago.".  McCallum didn’t correct himself.  But the strange remark may have foreshadowed a growing number of curious statements from the returning mayor of B.C.’s second-largest, rapidly growing city, which is wrestling with big changes after his come-from-behind election win.  CTV News has analyzed about three weeks worth of McCallums speeches, statements at 

In [44]:
#def function to find people and orgs with spaCy
speakerlistlist = []
def findspeakers(doc):
    speakers = [ent.text for ent in doc.ents if (ent.label_ == "PERSON" or ent.label_ == "ORG")]
    speakerlistlist.append(speakers)

In [45]:
#do that
for doc in textdoc:
    findspeakers(doc)

In [46]:
#flatten the list again
speakerlist = [item for sublist in speakerlistlist for item in sublist]

In [80]:
#is this elegant? no
#but does it work? maybe
#basically it gives the character index (instead of the token index that spaCy would give)
#the issue is that if a speaker occurs multiple times it always takes the first index
#there's probably a way to fix this but I don't know what it is and I have no brain cells left
speakerindexlist = []
for text in textdoc:
    str_text = str(text)
    for speaker in speakerlist:
        if speaker in str_text:
            speakercharindex = (speaker, str_text.index(speaker))
            speakerindexlist.append(speakercharindex)

In [81]:
speakerindexlist

[('Uber', 218),
 ('Doug McCallum', 262),
 ('McCallum', 267),
 ('Surrey', 237),
 ('Uber', 218),
 ('Bruce Hayne', 532),
 ('McCallum', 267),
 ('CTV News', 915),
 ('McCallum', 267),
 ('Cloverdale Sport and Ice Complex', 1350),
 ('Grandview Heights Community Centre', 1571),
 ('Library', 1610),
 ('SkyTrain', 1652),
 ('Langley', 1674),
 ('McCallum', 267),
 ('SkyTrain', 1652),
 ('McCallum', 267),
 ('Cindy Dalglish', 2195),
 ('Surrey', 237),
 ('McCallum', 267),
 ('Hamish Telford', 2705),
 ('McCallum', 267),
 ('Telford', 2712),
 ('McCallum', 267),
 ('Surrey', 237),
 ('CTV News', 915),
 ('CTV News', 915),
 ('the Mayors Council', 3473),
 ('McCallum', 267),
 ('Newton', 3634),
 ('SkyTrain', 1652),
 ('Langley', 1674),
 ('Langley SkyTrain', 3768),
 ('LRT', 3658),
 ('McCallum', 267),
 ('CTV News', 915),
 ('SkyTrain', 1652),
 ('Surrey', 237),
 ('Langley', 1674),
 ('SkyTrain', 1652),
 ('LRT', 3658),
 ('Surrey', 237),
 ('SkyTrain', 1652),
 ('Evergreen Line', 5029),
 ('the Evergreen Line', 5288),
 ('the Ev

In [50]:
textstring.index('Uber')

219

In [59]:
#there are some issues here
#as in life, sextortion is a big problem
#but repeated ones are probably overgeneralization anyway
quoteindexlist = []
for text in textdoc:
    str_text = str(text)
    found_quotes = get_quotes(str_text)
    for quote in found_quotes:
        quotecharindex = (quote, str_text.index(quote))
        quoteindexlist.append(quotecharindex)

In [82]:
def nearestspeaker(quotetext, quoteindex, listofspeakers):
    for speaker, speakerindex in listofspeakers:
        nearestspeaker = min(speakerindex, key=lambda x:abs(x - aquoteindex))
        return speaker

In [83]:
for index, value in enumerate(quoteindexlist):
    print(index, value)

0 ('I was clear when I was mayor – I don’t support Uber at all,', 171)
1 ('It was a twinkle in some engineer’s eye some years ago.', 587)
2 ('Mayor McCallum’s statements vary greatly from truth,', 2141)
3 ("There’s a tried-and-true method in Canadian politics: after an election a new government takes office and says, ‘Oh my gosh, the cupboards are bare.’ Or, ‘We’re much deeper in debt than I thought we were, and now I’ve seen the real books.' So I think there’s an element of that kind of gamesmanship going on,", 2343)
4 ('Then there’s the fact that McCallum has been out of office for quite some time, thinking he knew the job, but some things have changed,', 2728)
5 ('If you take Fraser Highway SkyTrain and if we’re building that seven days a week around the clock, we probably can save, and this is TransLink’s figures, we can probably save $2-300 million,', 3267)
6 ('TransLink has not conducted any detailed study on potential construction methods for a SkyTrain route from Surrey to Lang

In [87]:
#does it defeat the purpose to divide them manually? absolutely
#but if I tried to combine the texts before getting the indices it just didn't work
quotes1 = quoteindexlist[0:27]
quotes2 = quoteindexlist[27:38]
quotes3 = quoteindexlist[38:53]
quotes4 = quoteindexlist[53:57]
quotes5 = quoteindexlist[57:]

In [88]:
for index, value in enumerate(speakerindexlist):
    print(index,value)

0 ('Uber', 218)
1 ('Doug McCallum', 262)
2 ('McCallum', 267)
3 ('Surrey', 237)
4 ('Uber', 218)
5 ('Bruce Hayne', 532)
6 ('McCallum', 267)
7 ('CTV News', 915)
8 ('McCallum', 267)
9 ('Cloverdale Sport and Ice Complex', 1350)
10 ('Grandview Heights Community Centre', 1571)
11 ('Library', 1610)
12 ('SkyTrain', 1652)
13 ('Langley', 1674)
14 ('McCallum', 267)
15 ('SkyTrain', 1652)
16 ('McCallum', 267)
17 ('Cindy Dalglish', 2195)
18 ('Surrey', 237)
19 ('McCallum', 267)
20 ('Hamish Telford', 2705)
21 ('McCallum', 267)
22 ('Telford', 2712)
23 ('McCallum', 267)
24 ('Surrey', 237)
25 ('CTV News', 915)
26 ('CTV News', 915)
27 ('the Mayors Council', 3473)
28 ('McCallum', 267)
29 ('Newton', 3634)
30 ('SkyTrain', 1652)
31 ('Langley', 1674)
32 ('Langley SkyTrain', 3768)
33 ('LRT', 3658)
34 ('McCallum', 267)
35 ('CTV News', 915)
36 ('SkyTrain', 1652)
37 ('Surrey', 237)
38 ('Langley', 1674)
39 ('SkyTrain', 1652)
40 ('LRT', 3658)
41 ('Surrey', 237)
42 ('SkyTrain', 1652)
43 ('Evergreen Line', 5029)
44 ('t

In [85]:
speakers1 = 
speakers2 = 
speakers3 = 
speakers4 = 
speakers5 = 

[('I was clear when I was mayor – I don’t support Uber at all,', 171),
 ('It was a twinkle in some engineer’s eye some years ago.', 587),
 ('Mayor McCallum’s statements vary greatly from truth,', 2141),
 ("There’s a tried-and-true method in Canadian politics: after an election a new government takes office and says, ‘Oh my gosh, the cupboards are bare.’ Or, ‘We’re much deeper in debt than I thought we were, and now I’ve seen the real books.' So I think there’s an element of that kind of gamesmanship going on,",
  2343),
 ('Then there’s the fact that McCallum has been out of office for quite some time, thinking he knew the job, but some things have changed,',
  2728),
 ('If you take Fraser Highway SkyTrain and if we’re building that seven days a week around the clock, we probably can save, and this is TransLink’s figures, we can probably save $2-300 million,',
  3267),
 ('TransLink has not conducted any detailed study on potential construction methods for a SkyTrain route from Surrey to

<strong>giving up and just showing context<strong>

In [2]:
def showcontext(text, indexedquote):
    for quote, index in indexedquote:
        context = text[(index-100):(index+100)]