In [3]:
import pandas as pd 
import spacy
import os
from collections import defaultdict
import jellyfish
from spacy import displacy
from  more_itertools import unique_everseen
from tqdm import tqdm_notebook as tqdm
nlp = spacy.load('en_core_web_sm')

In [4]:
# import neuralcoref
# neuralcoref.add_to_pipe(nlp)
# nlp.remove_pipe("neuralcoref")  # This remove the current neuralcoref instance from SpaCy pipe
# coref = neuralcoref.NeuralCoref(nlp.vocab)

In [13]:
import sys
sys.path.append('../scripts/')

In [14]:
import util

In [16]:
def process_one_body(row):
    """Take one body and extract the people with their descriptions."""
    article_id, body = row
    
    output = []
    doc = util.preprocess(body)
    entities = util.get_quotes_method_1(doc)
    if len(entities)> 0:
        # quoted_entities = dict(filter(lambda x: len(x[1]['quote sentence']) > 0, entities_clustered.items()))
        entity_df = pd.DataFrame.from_dict(entities, orient='index')
        quoted_ent_df = entity_df.loc[lambda df: df['quote sentence'].str.len() > 0]
        quoted_ent_df['article_id'] = article_id
        output.append(quoted_ent_df)
    return output

# Parse data

In [6]:
full_df = pd.read_csv(
    '../../newspaper-pages/nyt_corpus/unprocessed_full_data_df.csv',
    nrows=10000
)

In [7]:
a1_df = (full_df
         .loc[lambda df: df['print_section_and_page'] == 'A-001']
         .loc[lambda df: df['body'].notnull()]
          .set_index('id')['body']
        )

In [10]:
# a1_df.to_csv("../data/a1_df.csv")

In [6]:
a1_df = pd.read_csv("../data/a1_df.csv", index_col=0, header=-1, squeeze=True,nrows=1000)

In [7]:
doc = nlp(a1_df.iloc[0])
sentences = [sent for sent in doc.sents]

In [23]:
def get_quotes_method_2(doc):
    """Get quoted people by finding the nsubj of a 'say', 'said' or 'according to' verb."""
    ## extract quotes
    entities = defaultdict(lambda: {'background sentence': [], 'quote sentence': []})

    signifiers = [' say ', ' said ', ' says ']#, ' according to ']
    seen = set()
    ## get quotes
    for s_idx, sent in enumerate(doc.sents):
        ## 
        text_sentence = ' '.join([word.text for word in sent]).strip()

        ## hack to pick up common phrasal signifiers
#         if 'according to' in text_sentence:
#             sent = nlp(text_sentence.replace('according to', 'said'))

        ## extract all nsubj of VERB if VERB is 'said', 'says' or 'say'
        nsubjs = []
        for possible_subject in sent:
            if (
                possible_subject.dep_ == 'nsubj' and 
                possible_subject.head.pos_ == 'VERB' and 
                possible_subject.head.text in ('say', 'says', 'said')
            ):
                nsubjs.append(possible_subject.text)
                entities[possible_subject.text]['quote sentence'].append((s_idx, text_sentence))
                
        for noun_phrase in sent.noun_chunks:
            for nsubj in nsubjs:
                if nsubj in noun_phrase.text:
                    entities[noun_phrase.text]['quote sentence'].append((s_idx, text_sentence))
                    seen.add(s_idx)

    ## get background
    for s_idx, sent in enumerate(doc.sents):
        if s_idx not in seen:
            ## get person-entities
            for ent in sent.ents:
                if ent.label_ == 'PERSON':
                    entities[ent.text]['background sentence'].append((s_idx, text_sentence))
    
    return util.cluster_entities(entities)

In [27]:
all_quotes = []
for text in a1_df.head():
    doc = util.preprocess(text, nlp=nlp)
    quotes = get_quotes_method_2(doc)
    all_quotes.append(quotes)

In [42]:
t= pd.DataFrame.from_dict(all_quotes[0], orient='index').loc['Surplus Termed \'Pot']['quote sentence'][0][1]

# Rule-based matching

In [52]:
%%latex
\begin{enumerate}
\item quote-mark QUOTE quote-mark [,] verb [modifier]
[determiner] [title] name

e.g. "blah blah", said again the journalist John Smith.
\item name [, up to 60 characters ,] verb [:|that] quote-mark
QUOTE quote-mark

e.g. John Smith, supporting AFG, said: "blah blah".
\item quote-mark QUOTE quote-mark [; or ,] [title] name
[modifier] verb

\end{enumerate}

<IPython.core.display.Latex object>

In [19]:
all_quotes_df = pd.concat(quoted_dfs_method_1)


In [189]:
import pickle
all_quotes_df = pd.concat(quoted_dfs_method_1)
pickle.dump(all_quotes_df, open('../data/2019-09-16__parse-df-method-1.pkl', 'wb'))

In [31]:
import pickle
import pandas as pd 
all_quotes_df = pickle.load(open('../data/2019-09-16__parse-df-method-1.pkl', 'rb'))

## get the sentence with the full name 
usable_people = (
    all_quotes_df
        .reset_index()
        .loc[lambda df: df['index'].str.split(' ').str.len() > 1]
)

## get the main sentences
main_sentences = (
    usable_people
        .apply(lambda x: (
            x['index'],
            list(filter(lambda y: x['index'] in y[1], x['background sentence'] + x['quote sentence'])),
            x['background sentence'] + x['quote sentence']
        ), axis=1)
)

In [32]:
from IPython.display import clear_output
from IPython.display import HTML, display

In [33]:
## apply labels
labels = []
for sentences in main_sentences:
    person, first_sents, sents = sentences 
    first_sentences = ' '.join(list(map(lambda x: x[1], first_sents)))
    ## replace person
    first_sentences = first_sentences.replace(person, '<span style="background-color: #FFFF00">' + person + '</span>')
    ## replace mentions of person
    next_sentences = ' '.join(list(map(lambda x: x[1], sents[:3]))) + '...'
    for name_part in person.split(' '):
        next_sentences = next_sentences.replace(name_part, '<span style="background-color: #FFFF00">' + name_part + '</span>')
    
    html = HTML('person: ' + person + '<br><br>main sentence:<br>' + first_sentences + '<br><br>others<br>' + next_sentences)
    display(html)
    label = input()
    labels.append(label)
    clear_output() ## clear ipython output

x


KeyboardInterrupt: 

In [None]:
## w: witness
##### wc: casual witness
##### wi: insider witness
##### wl: witness lawyer

## o: official
##### og: government official
##### ol: official lawyer

## e: expert
##### ea: academic expert
##### en: ngo/advocacy expert
##### eo: other expert


## a: actor
##### ai: insider actor
##### 

## x: error

In [63]:
displacy.render(t2, style='dep')

# Look at parse trees

In [224]:
sample_anonymous_sentence = """“We are encountering gut-wrenching delays,” said one key manager with an extensive background in civil infrastructure projects. “Nowhere have I ever worked where I had to keep going back to the same owners for more land.”"""
sample_anon_doc = nlp(sample_anonymous_sentence)

In [285]:
sample_named_sentence = 'Representative David R. Obey , a Wisconsin Democrat who serves on the Appropriations Committee , said , " The President and other officials discredit the budget process by not sending us serious proposals .'
sample_named_doc = nlp(sample_named_sentence)

sample_according_to_sentence = 'At least 43 people died in the fire , at the Dupont Plaza Hotel , and more than 100 were injured as 1,000 holiday guests attempted to flee flames and smoke in the 22-story beachfront structure , according to Police Superintendent Carlos Lopez Feliciano of San Juan .'
sample_according_to_sentence = sample_according_to_sentence.replace('according to', 'said') 
sample_according_to_doc = nlp(sample_according_to_sentence)

In [286]:
doc = sample_according_to_doc

names = defaultdict(list)
for possible_subject in doc:
    if possible_subject.dep_ == 'nsubj' and possible_subject.head.pos_ == 'VERB':
        verbs[possible_subject.head.text].append(possible_subject.text)
        
verb_to_phrases = defaultdict(list)
for noun_phrase in doc.noun_chunks:
    for verb, nsubjs in verbs.items():
        for nsubj in nsubjs:
            if nsubj in noun_phrase.text:
                verb_to_phrases[verb].append(noun_phrase.text)

In [287]:
verb_to_phrases

defaultdict(list,
            {'died': ['At least 43 people', 'At least 43 people'],
             'said': ['Police Superintendent Carlos Lopez Feliciano']})

In [282]:
displacy.render(sample_according_to_doc, style='dep')

In [284]:
for word in sample_according_to_doc:
    print('word: %s, pos: %s' % (word.text, word.pos_))

word: At, pos: ADV
word: least, pos: ADV
word: 43, pos: NUM
word: people, pos: NOUN
word: died, pos: VERB
word: in, pos: ADP
word: the, pos: DET
word: fire, pos: NOUN
word: ,, pos: PUNCT
word: at, pos: ADP
word: the, pos: DET
word: Dupont, pos: PROPN
word: Plaza, pos: PROPN
word: Hotel, pos: PROPN
word: ,, pos: PUNCT
word: and, pos: CCONJ
word: more, pos: ADJ
word: than, pos: ADP
word: 100, pos: NUM
word: were, pos: VERB
word: injured, pos: VERB
word: as, pos: ADP
word: 1,000, pos: NUM
word: holiday, pos: NOUN
word: guests, pos: NOUN
word: attempted, pos: VERB
word: to, pos: PART
word: flee, pos: VERB
word: flames, pos: NOUN
word: and, pos: CCONJ
word: smoke, pos: VERB
word: in, pos: ADP
word: the, pos: DET
word: 22-story, pos: ADJ
word: beachfront, pos: ADJ
word: structure, pos: NOUN
word: ,, pos: PUNCT
word: according, pos: VERB
word: to, pos: ADP
word: Police, pos: PROPN
word: Superintendent, pos: PROPN
word: Carlos, pos: PROPN
word: Lopez, pos: PROPN
word: Feliciano, pos: PROPN
wor

In [269]:
from collections import Counter
def get_list_count(series):
    c = Counter()
    for x in series:
        c.update(x)
    return c