In [140]:
import pandas as pd 
import spacy
import os
from collections import defaultdict
from itertools import product
import jellyfish
from spacy import displacy
nlp = spacy.load('en_core_web_sm')
from  more_itertools import unique_everseen

In [4]:
# import neuralcoref
# neuralcoref.add_to_pipe(nlp)

In [145]:
def preprocess(text):
    """Remove duplicate and short sentences."""
    text = text.replace('\n', ' ').replace("''", '"').strip()
    doc = nlp(text)
    unique_sentences = list(unique_everseen(map(lambda x: x.text.strip(), doc.sents)))
    text = ' '.join(unique_sentences)
    return nlp(text)

In [46]:
def get_quotes_method_1(doc):
    ## extract quotes
    entities = defaultdict(lambda: {'background sentence': [], 'quote sentence': []})

    signifiers = [' say ', ' said ', ' says ', ' according to ', ' described ', ' describes ']
    for s_idx, sent in enumerate(doc.sents):
        ## 
        text_sentence = ' '.join([word.text for word in sent]).strip()

        ## get person-entities
        for ent in sent.ents:
            if ent.label_ == 'PERSON':
                is_quote = False
                ## find quote sentence
                for sig in signifiers:
                    if sig in text_sentence:
                        is_quote = True
                entities[ent.text][['background sentence', 'quote sentence'][is_quote]].append((s_idx, text_sentence))
    
    return cluster_entities(entities)


def get_quotes_method_2(doc):
    """Get quoted people by finding the nsubj of a 'say', 'said' or 'according to' verb."""
    ## extract quotes
    entities = defaultdict(lambda: {'background sentence': [], 'quote sentence': []})

    signifiers = [' say ', ' said ', ' says ', ' according to ']
    seen = set()
    ## get quotes
    for s_idx, sent in enumerate(doc.sents):
        ## 
        text_sentence = ' '.join([word.text for word in sent]).strip()

        ## hack to pick up common phrasal signifiers
        if 'according to' in text_sentence:
            sent = nlp(text_sentence.replace('according to', 'said'))

        ## extract all nsubj of VERB if VERB is 'said', 'says' or 'say'
        nsubjs = []
        for possible_subject in sent:
            if (
                possible_subject.dep_ == 'nsubj' and 
                possible_subject.head.pos_ == 'VERB' and 
                possible_subject.head.text in ('say', 'says', 'said')
            ):
                nsubjs.append(possible_subject.text)
                entities[possible_subject.text]['quote sentence'].append((s_idx, text_sentence))
                
        for noun_phrase in sent.noun_chunks:
            for nsubj in nsubjs:
                if nsubj in noun_phrase.text:
                    entities[noun_phrase.text]['quote sentence'].append((s_idx, text_sentence))
                    seen.add(s_idx)

    ## get background
    for s_idx, sent in enumerate(doc.sents):
        if s_idx not in seen:
            ## get person-entities
            for ent in sent.ents:
                if ent.label_ == 'PERSON':
                    entities[ent.text]['background sentence'].append((s_idx, text_sentence))
    
    return cluster_entities(entities)


def cluster_entities(entities, sim=.95):
    ## append clusters together
    name_mapper = defaultdict(set)
    mapped = defaultdict(bool)

    entity_list = list(entities)
    n_ent = len(entity_list)

    clusters = []
    for i in range(n_ent):
        n1 = entity_list[i]

        if not mapped[n1]:
            ## new cluster
            n_cluster = [n1]
            mapped[n1] = True
            ## 
            for j in range(i, n_ent):
                n2 = entity_list[j]
                if not mapped[n2]:

                    ## get similarites 
                    name_parts = []
                    for w_i in n1.split():
                        for w_j in n2.split():
                            dist = jellyfish.jaro_winkler(w_i, w_j)
                            name_parts.append(dist)

                    ## append to cluster
                    if max(name_parts) > sim:
                        n_cluster.append(n2)
                        mapped[n2] = True
            ## record
            clusters.append(n_cluster)

    cluster_mapper = {}
    for cluster in clusters:
        key = max(cluster, key=lambda x: len(x))
        cluster_mapper[key] = cluster

    ## group for output
    entities_clustered = defaultdict(lambda: {'background sentence': [], 'quote sentence': []})
    for c_key, cluster in cluster_mapper.items():
        for c_i in cluster:
            for part in ['background sentence', 'quote sentence']:
                entities_clustered[c_key][part].extend(entities[c_i][part])
    
    ## 
    return entities_clustered

# Parse data

In [5]:
from tqdm import tqdm_notebook as tqdm

In [6]:
full_df = pd.read_csv('../../newspaper-pages/nyt_corpus/unprocessed_full_data_df.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
a1_df = full_df.loc[lambda df: df['print_section_and_page'] == 'A-001']

In [192]:
a1_df['body'].isnull().value_counts()

False    45527
True      2890
Name: body, dtype: int64

In [188]:
quoted_dfs_method_1 = []
for article_id, body in tqdm(
    a1_df
    .set_index('id')['body']
    .iteritems()
):
    doc = preprocess(body)
    entities = get_quotes_method_1(doc)
    if len(entities)> 0:
        # quoted_entities = dict(filter(lambda x: len(x[1]['quote sentence']) > 0, entities_clustered.items()))
        entity_df = pd.DataFrame.from_dict(entities, orient='index')
        quoted_ent_df = entity_df.loc[lambda df: df['quote sentence'].str.len() > 0]
        quoted_ent_df['article_id'] = article_id
        quoted_dfs_method_1.append(quoted_ent_df)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


AttributeError: 'float' object has no attribute 'replace'

In [189]:
import pickle
all_quotes_df = pd.concat(quoted_dfs_method_1)
pickle.dump(all_quotes_df, open('../data/2019-09-16__parse-df-method-1.pkl', 'wb'))

In [150]:
# quoted_dfs_method_2 = []
# for article_id, body in tqdm(a1_df.head(50).set_index('id')['body'].iteritems()):
#     doc = preprocess(body)
#     entities = get_quotes_method_2(doc)
#     if len(entities)> 0:
#         # quoted_entities = dict(filter(lambda x: len(x[1]['quote sentence']) > 0, entities_clustered.items()))
#         entity_df = pd.DataFrame.from_dict(entities, orient='index')
#         quoted_ent_df = entity_df.loc[lambda df: df['quote sentence'].str.len() > 0]
#         quoted_ent_df['article_id'] = article_id
#         quoted_dfs_method_2.append(quoted_ent_df)

In [151]:
all_quotes_df = pd.concat(quoted_dfs_method_1)

In [152]:
all_quotes_df

Unnamed: 0,background sentence,quote sentence,article_id
David R. Obey,[],"[(21, Representative David R. Obey , a Wiscons...",1dab5d59-c916-11e9-a6c2-b831b5755f6c
Gary D. Bass,[],"[(26, Surplus Termed ' Pot of Gold ' Gary D. B...",1dab5d59-c916-11e9-a6c2-b831b5755f6c
Reagan,"[(1, President Reagan 's new budget proposes $...","[(16, Some Increases Are Planned In the Medica...",1dab5d59-c916-11e9-a6c2-b831b5755f6c
Abraham,[],"[(50, "" May the God of Abraham , Isaac and Jac...",1dab5d62-c916-11e9-ba39-b831b5755f6c
Chaim Herzog,[],"[(3, It is fitting and it behooves me to say t...",1dab5d62-c916-11e9-ba39-b831b5755f6c
Fellini,[],"[(58, James McCarthy , said later , "" It was a...",1dab5d62-c916-11e9-ba39-b831b5755f6c
Isaac,[],"[(50, "" May the God of Abraham , Isaac and Jac...",1dab5d62-c916-11e9-ba39-b831b5755f6c
Jacob,[],"[(50, "" May the God of Abraham , Isaac and Jac...",1dab5d62-c916-11e9-ba39-b831b5755f6c
James Beltretti,"[(36, The motorcade went directly to St. Savio...","[(58, James McCarthy , said later , "" It was a...",1dab5d62-c916-11e9-ba39-b831b5755f6c
Joseph,"[(39, He spoke of the homelessness of Joseph a...","[(40, Joseph and Mary , he said , had to flee ...",1dab5d62-c916-11e9-ba39-b831b5755f6c


In [153]:
t = all_quotes_df.loc['Gonzalez Perez']['background sentence'][0][1]
t2 = nlp(t)

In [174]:
(all_quotes_df
 .pipe(lambda df: df['background sentence'] + df['quote sentence'])
 .reset_index()
 .loc[lambda df: df['index'].str.split(' ').str.len() > 1]
 .apply(lambda x: list(filter(lambda y: x['index'] in y[1], x[0])) , axis=1)
 .iloc[12]
#  .str.len()
#  .iloc[-6]
)

[(27,
  'A spokesman for the bureau , Jerry Rudden , said two additional specialists , a forensic chemist and an expert in fires and explosives , had also been sent .')]

In [70]:
t3 = all_quotes_df.loc['Dominick P. Pannunzio'].iloc[0]
t4 = list(set(list(map(lambda x: x[1], t3))))

In [89]:
list(nlp(t4[0]).ents)

[four and a half hours,
 Wednesday,
 Dominick P. Pannunzio,
 19,
 Dupont Plaza Hotel,
 16th]

In [63]:
displacy.render(t2, style='dep')

# Look at parse trees

In [224]:
sample_anonymous_sentence = """“We are encountering gut-wrenching delays,” said one key manager with an extensive background in civil infrastructure projects. “Nowhere have I ever worked where I had to keep going back to the same owners for more land.”"""
sample_anon_doc = nlp(sample_anonymous_sentence)

In [285]:
sample_named_sentence = 'Representative David R. Obey , a Wisconsin Democrat who serves on the Appropriations Committee , said , " The President and other officials discredit the budget process by not sending us serious proposals .'
sample_named_doc = nlp(sample_named_sentence)

sample_according_to_sentence = 'At least 43 people died in the fire , at the Dupont Plaza Hotel , and more than 100 were injured as 1,000 holiday guests attempted to flee flames and smoke in the 22-story beachfront structure , according to Police Superintendent Carlos Lopez Feliciano of San Juan .'
sample_according_to_sentence = sample_according_to_sentence.replace('according to', 'said') 
sample_according_to_doc = nlp(sample_according_to_sentence)

In [286]:
doc = sample_according_to_doc

names = defaultdict(list)
for possible_subject in doc:
    if possible_subject.dep_ == 'nsubj' and possible_subject.head.pos_ == 'VERB':
        verbs[possible_subject.head.text].append(possible_subject.text)
        
verb_to_phrases = defaultdict(list)
for noun_phrase in doc.noun_chunks:
    for verb, nsubjs in verbs.items():
        for nsubj in nsubjs:
            if nsubj in noun_phrase.text:
                verb_to_phrases[verb].append(noun_phrase.text)

In [287]:
verb_to_phrases

defaultdict(list,
            {'died': ['At least 43 people', 'At least 43 people'],
             'said': ['Police Superintendent Carlos Lopez Feliciano']})

In [282]:
displacy.render(sample_according_to_doc, style='dep')

In [284]:
for word in sample_according_to_doc:
    print('word: %s, pos: %s' % (word.text, word.pos_))

word: At, pos: ADV
word: least, pos: ADV
word: 43, pos: NUM
word: people, pos: NOUN
word: died, pos: VERB
word: in, pos: ADP
word: the, pos: DET
word: fire, pos: NOUN
word: ,, pos: PUNCT
word: at, pos: ADP
word: the, pos: DET
word: Dupont, pos: PROPN
word: Plaza, pos: PROPN
word: Hotel, pos: PROPN
word: ,, pos: PUNCT
word: and, pos: CCONJ
word: more, pos: ADJ
word: than, pos: ADP
word: 100, pos: NUM
word: were, pos: VERB
word: injured, pos: VERB
word: as, pos: ADP
word: 1,000, pos: NUM
word: holiday, pos: NOUN
word: guests, pos: NOUN
word: attempted, pos: VERB
word: to, pos: PART
word: flee, pos: VERB
word: flames, pos: NOUN
word: and, pos: CCONJ
word: smoke, pos: VERB
word: in, pos: ADP
word: the, pos: DET
word: 22-story, pos: ADJ
word: beachfront, pos: ADJ
word: structure, pos: NOUN
word: ,, pos: PUNCT
word: according, pos: VERB
word: to, pos: ADP
word: Police, pos: PROPN
word: Superintendent, pos: PROPN
word: Carlos, pos: PROPN
word: Lopez, pos: PROPN
word: Feliciano, pos: PROPN
wor

In [269]:
from collections import Counter
def get_list_count(series):
    c = Counter()
    for x in series:
        c.update(x)
    return c