In [1]:
from citation_utils import *

## spaCy docs
https://spacy.io/

In [2]:
# pip install -U spacy
# python -m spacy download en_core_web_sm
import spacy

In [3]:
# Load spaCy's small English pipeline (tokenizer, tagger, parser and NER).
# The model must be installed first: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
# Related upstream issue: https://github.com/explosion/spaCy/issues/4577
# NOTE(review): the reason this issue is linked is not recorded here — confirm and annotate.

In [4]:
# Process whole documents: sample passage used to demonstrate the pipeline.
# The adjacent string literals inside the parentheses are joined by Python
# into one single-line string (no newlines are inserted between them).
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")

In [5]:
# Run the loaded pipeline over the sample passage; the resulting `doc` is what
# the following cells query for noun chunks, verbs and named entities.
doc = nlp(text)

In [6]:
# Syntax analysis: show the text of every noun chunk found in `doc`
print(
    "Noun phrases:",
    [phrase.text for phrase in doc.noun_chunks],
)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']


In [7]:
# Show the lemma of every token whose part-of-speech tag is VERB
print(
    "Verbs:",
    [tok.lemma_ for tok in doc if tok.pos_ == "VERB"],
)

Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']


In [8]:
# Find named entities, phrases and concepts:
# print each entity's surface text followed by its label (e.g. PERSON, ORG, DATE).
for entity in doc.ents:
    print(entity.text, entity.label_)

Sebastian Thrun PERSON
Google ORG
2007 DATE
American NORP
Thrun PERSON
Recode ORG
earlier this week DATE


In [None]:
# TODO: perform analysis on a paragraph from a word doc

In [2]:
# vanilla_fixer = CitationFixer()
# vanilla_fixer.read_document("AA final paper copy.docx")
# vanilla_fixer.get_citations()
# vanilla_fixer.fix_citations()
# vanilla_fixer.save_document("AA final paper copy_fixed.docx")

## Using NLP to analyze paragraphs from the document

In [21]:
# Open the Word document with the baseline CitationFixer (from citation_utils)
# so that its paragraphs can be handed to spaCy in the cells below.
vanilla_fixer = CitationFixer()
vanilla_fixer.read_document("AA final paper copy.docx")
# NOTE(review): the base get_citations() is defined in citation_utils and not shown
# here; presumably it populates the fixer's citation list — confirm.
vanilla_fixer.get_citations()

In [22]:
# Paragraph objects of the loaded document; each one's raw string is read via `.text`
paragraphs = vanilla_fixer.document.paragraphs

In [7]:
# Number of paragraphs in the loaded document
len(paragraphs)

54

In [10]:
# Load the small English spaCy pipeline for the paragraph analysis below.
# NOTE(review): this repeats the earlier `spacy.load` call with the same model;
# presumably kept so this section can be run after a kernel restart — confirm or remove.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Pick one paragraph (index 7) to experiment on. As the printed output shows,
# it holds two quotations, each attributed in the form "– Name (year)".
paragraph = paragraphs[7].text
print(paragraph)

“The Endless Frontiers [sic] Act is a downpayment for future generations of American technological leadership, and I'm proud to introduce it on a bipartisan basis.” – Representative Mike Gallagher (2020).

“The point is that founding and growing a company is fundamentally an act of exploration and colonization… Google took web search… Twitter colonized real-time status updates. Quora is attempting to colonize Q&A… Facebook of course colonized online identity.” – Kevin Simler (2012).


In [15]:
# Named entity recognition on the selected paragraph.
# NOTE: this rebinds `doc`, replacing the Doc built from the sample passage earlier.
doc = nlp(paragraph)
# Print each entity with its character offsets into `paragraph` and its label
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
    
# observation: a citation often occurs with PERSON followed by DATE
# (in the output: "Mike Gallagher" PERSON then "2020" DATE,
#  and "Kevin Simler" PERSON then "2012" DATE)

The Endless Frontiers 1 22 WORK_OF_ART
American 76 84 NORP
Mike Gallagher 182 196 PERSON
2020 198 202 DATE
Google 313 319 ORG
Twitter 337 344 PERSON
Quora 381 386 PERSON
Q&A 413 416 ORG
Kevin Simler 467 479 PERSON
2012 481 485 DATE


In [16]:
# Total number of entities spaCy detected in the selected paragraph
print(len(doc.ents))

10


In [24]:
# print(type(doc.ents[0].label_))  # `label_` is the string name (matches the <class 'str'> output); `.label` is the integer ID

<class 'str'>


In [11]:
class SimpleNLPCitationFixer(CitationFixer):
    """CitationFixer variant that locates citations with spaCy NER.

    A citation is recognised when a PERSON entity is directly followed (in
    entity order) by a DATE entity whose preceding character is '(' —
    for example ``Mike Gallagher (2020)``.
    """

    def __init__(self):
        """Initialise the base fixer, then load the spaCy English pipeline."""
        # Chain to the base initialiser so any state it sets up is not skipped.
        # CitationFixer is constructed with no arguments elsewhere in this
        # notebook, so the no-argument call is valid.
        super().__init__()
        # Paragraphs shorter than this many characters are not analysed
        self.min_paragraph_length = 10
        self.nlp = spacy.load("en_core_web_sm")

    def get_citations(self):
        """Record the character span of every NER-detected citation.

        For paragraph ``i`` of ``self.document``, each hit is appended to
        ``self.citations[i]`` as a ``(start_char, end_char)`` tuple, where
        ``start_char`` is the first character of the PERSON entity and
        ``end_char`` is spaCy's exclusive end offset of the DATE entity.
        Treated as an inclusive interval, ``end_char`` therefore points at the
        character right after the year (the closing ')' in a well-formed
        citation), i.e. the cited text is ``text[start_char:end_char + 1]``.

        Assumes ``self.document`` and ``self.citations`` (one list per
        paragraph) already exist — presumably set up by ``read_document()``,
        which is defined in the base class and not shown here.
        """
        print("inside get_citations nlp")
        print("len[self.citations]:", len(self.citations))
        for i, paragraph in enumerate(self.document.paragraphs):
            text = paragraph.text
            if len(text) < self.min_paragraph_length:
                continue
            entities = self.nlp(text).ents
            # Walk consecutive entity pairs: (e0, e1), (e1, e2), ...
            for person, date in zip(entities, entities[1:]):
                if person.label_ != 'PERSON' or date.label_ != 'DATE':
                    continue
                # Require '(' immediately before the year, to tell a citation
                # apart from a year that merely appears in running text.
                # date.start_char is always > 0 here because a PERSON entity
                # precedes it, so the index cannot wrap around to the end.
                if text[date.start_char - 1] == '(':
                    # spaCy's end offset is exclusive; it is deliberately NOT
                    # decremented so the inclusive interval takes in ')'.
                    self.citations[i].append((person.start_char, date.end_char))
                
                

In [12]:
# Exercise the NLP-based fixer on the same document: load it, then detect citations.
# get_citations() here is the SimpleNLPCitationFixer override, which prints debug
# lines before scanning the paragraphs.
nlp_fixer = SimpleNLPCitationFixer()
nlp_fixer.read_document("AA final paper copy.docx")
nlp_fixer.get_citations()

reading document
inside get_citations nlp
len[self.citations]: 54
citation: Mike Gallagher (2020)
citation: Kevin Simler (2012)
