***installing Spacy 2.1.0 and neuralcoref***

> neuralcoref only works with spaCy 2.1.0. Whilst spaCy can be installed using pip or conda, to install the spaCy large English model and neuralcoref:
- https://v2.spacy.io/models/en
- manually download the model tar https://github.com/explosion/spacy-models/releases/tag/en_core_web_lg-2.3.1#
- install the model locally https://github.com/explosion/spaCy/issues/4577
- install neuralcoref from source
    - https://github.com/huggingface/neuralcoref/issues/197#issuecomment-534028423
    - https://github.com/huggingface/neuralcoref#install-neuralcoref-from-source



In [1]:
import re
import pandas as pd
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

import spacy
import neuralcoref #
from spacy import displacy
try:
    nlp = spacy.load('en_core_web_lg')
except:
    !python -m spacy download en_core_web_lg
    nlp = spacy.load('en_core_web_lg')
from spacy.matcher import Matcher 
from spacy.tokens import Span 

## Load Data

In [2]:
# Read the articles; the first two rows of the sheet are skipped.
df = pd.read_excel('tesco.xlsx', skiprows=2)
# Preview the first two records.
df.head(2)

Unnamed: 0,id,text
0,0,"Tesco has been hit by hackers, leaving thousan..."
1,1,The outage leaves its grocery website and app ...


## Model

- [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities)
- [coreference resolution](https://spacy.io/universe/project/neuralcoref)

 
<img src="./model_architecture.svg" width=600 height=400>

In [3]:
# Toy passage used to exercise NER and coreference resolution.
# NOTE(review): 'lagest' is a typo for 'largest'. It is kept as-is because the
# recorded outputs further down were produced from this exact string.
example_text = ' '.join([
    'Johnny is a big fan of skiing.',
    'He is a man.',
    'Lydia is a woman who likes cycling.',
    'He also enjoys cycling but she is a fan of music.',
    'Tesco Plc is the lagest supermarket chain in the UK.',
    'Their customers shop on average three times per week.',
    'But they buy fuel from other places.',
])
example_text

'Johnny is a big fan of skiing. He is a man. Lydia is a woman who likes cycling. He also enjoys cycling but she is a fan of music. Tesco Plc is the lagest supermarket chain in the UK. Their customers shop on average three times per week. But they buy fuel from other places.'

## Named Entity Recognition
To extract the named entities

In [4]:
# custom NER
# Make the cell safe to re-run: drop a previously registered ruler (if any) and
# always rebuild it. The old try/except acted as a toggle, so a second run
# removed the ruler without adding it back, and the bare `except` hid errors.
if 'entity_ruler' in nlp.pipe_names:
    nlp.remove_pipe('entity_ruler')

config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,  # ruler matches may replace overlapping entities set earlier in the pipeline
   "ent_id_sep": "||",
}
ruler = nlp.create_pipe("entity_ruler", config=config)
# Token patterns: case-insensitive singular/plural nouns, all labelled GROUP.
patterns = [
            {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(customer)s?' }} ]},
            {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(shopper)s?' }} ]},
            {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(hacker)s?' }} ]},
            ]
ruler.add_patterns(patterns)
# Register the ruler at the end of the pipeline (after 'ner').
# NOTE(review): in spaCy 2.x add_pipe does not appear to return the component,
# so its result is not assigned back to `ruler` - confirm against the API docs.
nlp.add_pipe(ruler)

In [5]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7fd4ebe2c2d0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fd4ebe7b7c0>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7fd4ebe7b830>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x7fd4e095b790>)]

In [6]:
# Parse the example passage, then show the dependency parse followed by the
# recognised entities, separated by a few blank lines.
doc = nlp(example_text)
displacy.render(doc, style='dep')
print('\n\n')
displacy.render(doc, style='ent')






In [8]:
# pre-processing
df['text-clean'] = df['text'].apply(lambda x: re.sub(r"(\w+)'s", r'\1s', x))
# apply nlp
df['doc'] = df['text-clean'].apply(lambda x: nlp(x))

df[:2]

Unnamed: 0,id,text,text-clean,doc
0,0,"Tesco has been hit by hackers, leaving thousan...","Tesco has been hit by hackers, leaving thousan...","(Tesco, has, been, hit, by, hackers, ,, leavin..."
1,1,The outage leaves its grocery website and app ...,The outage leaves its grocery website and app ...,"(The, outage, leaves, its, grocery, website, a..."


### Coreference Resolution

To identify mentions of the same entities

In [18]:
# Register neuralcoref once. Re-running the cell must not fail on the
# already-registered component, but genuine errors should no longer be
# silently swallowed by a bare `except`.
if 'neuralcoref' not in nlp.pipe_names:
    neuralcoref.add_to_pipe(nlp)
print(nlp.pipeline)

[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7fd4ebe2c2d0>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7fd4ebe7b7c0>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7fd4ebe7b830>), ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler object at 0x7fd4e095b790>), ('neuralcoref', <neuralcoref.neuralcoref.NeuralCoref object at 0x7fd4e0877cb0>)]


In [28]:
# NOTE(review): this raw string is never parsed - the next cell rebinds `doc`
# to nlp(example_text). It also differs from example_text ("largest", "UK, their").
doc = """Johnny is a big fan of skiing. He is a man. Lydia is a woman who likes cycling. He also enjoys cycling but she is a fan of music. Tesco Plc is the largest supermarket chain in the UK, their customers shop on average three times per week. But they buy fuel from other places."""

In [29]:
# Parse the example passage with the pipeline that now includes neuralcoref.
doc = nlp(example_text)
# NOTE: evaluated but not displayed - a notebook cell only shows its last expression.
doc._.has_coref
# Clusters of mentions that refer to the same entity.
doc._.coref_clusters

[Johnny: [Johnny, He, He],
 Lydia: [Lydia, she],
 Their customers: [Their customers, they]]

In [30]:
doc.ents

(Johnny, Lydia, Tesco Plc, UK, customers, three)

In [31]:
doc._.coref_resolved

'Johnny is a big fan of skiing. Johnny is a man. Lydia is a woman who likes cycling. Johnny also enjoys cycling but Lydia is a fan of music. Tesco Plc is the lagest supermarket chain in the UK. Their customers shop on average three times per week. But Their customers buy fuel from other places.'

In [32]:
# NOTE: evaluated but not displayed or stored - only the last expression of the cell is shown.
doc._.coref_scores
k = doc._.coref_clusters
# Show the first two clusters.
k[0],k[1]

(Johnny: [Johnny, He, He], Lydia: [Lydia, she])

In [33]:
# mentions
k[0].main.start_char,k[0].main.end_char
a = k[0].mentions
print(a)

for i in a:
    print(i,i.start_char,i.end_char)

[Johnny, He, He]
Johnny 0 6
He 31 33
He 80 82


In [34]:
# Re-parse the coreference-resolved text (mentions replaced by their main mention),
# so the following cells work on explicit names instead of pronouns.
# `doc` is rebound to this new parse.
doc = nlp(doc._.coref_resolved)
spacy.displacy.render(doc, style='ent')

### Relation Extraction

> **NOTE: we apply both CoRef and NER to identify related entities. The issue with the current code is that I only examine relations between successive entity pairs, and not those separated by one or more gaps. The other issue is boiling the named entities down to a unique list within a text.**

In [82]:
# get entities
def get_entities(doc, verbose=True):
    """Collect the tokens of *doc* that are treated as entities.

    A token is kept when it is a proper noun or common noun, or when it
    carries any named-entity label (pronouns are deliberately not included).

    Args:
        doc: Iterable of tokens exposing ``pos_``, ``ent_type_`` and ``i``
            (plus ``text``, ``dep_`` and ``idx`` when *verbose* is true).
        verbose: When true, print a
            ``(text, pos_, dep_, ent_type_, i, idx)`` tuple per kept token.

    Returns:
        dict mapping each kept token's position in the doc (``token.i``) to
        the token itself, in document order.
    """
    noun_tags = ['PROPN', 'NOUN']
    found = dict()
    for token in doc:
        if not (token.pos_ in noun_tags or token.ent_type_ != ''):
            continue
        found[token.i] = token  # token.i is the token's integer position in the doc
        if verbose:
            print((token.text, token.pos_, token.dep_, token.ent_type_, token.i, token.idx))
    return found

# Collect the entity tokens of the current `doc` (prints one tuple per kept token).
entities = get_entities(doc)
entities

('Johnny', 'PROPN', 'nsubj', 'PERSON', 0, 0)
('fan', 'NOUN', 'attr', '', 4, 16)
('skiing', 'NOUN', 'pobj', '', 6, 23)
('Johnny', 'PROPN', 'nsubj', 'PERSON', 8, 31)
('man', 'NOUN', 'attr', '', 11, 43)
('Lydia', 'PROPN', 'nsubj', 'PERSON', 13, 48)
('woman', 'NOUN', 'attr', '', 16, 59)
('cycling', 'NOUN', 'xcomp', '', 19, 75)
('Johnny', 'PROPN', 'nsubj', 'PERSON', 21, 84)
('cycling', 'NOUN', 'dobj', '', 24, 103)
('Lydia', 'PROPN', 'nsubj', 'PERSON', 26, 115)
('fan', 'NOUN', 'attr', '', 29, 126)
('music', 'NOUN', 'pobj', '', 31, 133)
('Tesco', 'PROPN', 'compound', 'ORG', 33, 140)
('Plc', 'PROPN', 'nsubj', 'ORG', 34, 146)
('lagest', 'PROPN', 'amod', '', 37, 157)
('supermarket', 'NOUN', 'compound', '', 38, 164)
('chain', 'NOUN', 'attr', '', 39, 176)
('UK', 'PROPN', 'pobj', 'GPE', 42, 189)
('customers', 'NOUN', 'nsubj', 'GROUP', 45, 199)
('three', 'NUM', 'nummod', 'CARDINAL', 49, 225)
('times', 'NOUN', 'npadvmod', '', 50, 231)
('week', 'NOUN', 'pobj', '', 52, 241)
('customers', 'NOUN', 'nsubj

{0: Johnny,
 4: fan,
 6: skiing,
 8: Johnny,
 11: man,
 13: Lydia,
 16: woman,
 19: cycling,
 21: Johnny,
 24: cycling,
 26: Lydia,
 29: fan,
 31: music,
 33: Tesco,
 34: Plc,
 37: lagest,
 38: supermarket,
 39: chain,
 42: UK,
 45: customers,
 49: three,
 50: times,
 52: week,
 56: customers,
 58: fuel,
 61: places}

>[***spacy vectors***](https://ashutoshtripathi.com/2020/09/04/word2vec-and-semantic-similarity-using-spacy-nlp-spacy-series-part-7/)
>    ```
>    from scipy.spatial import distance
>    def cosine_similarity(x,y):
>        return 1 - distance.cosine(x,y)
>    queen = nlp.vocab['queen'].vector
>    vqueen = nlp.vocab['king'].vector - nlp.vocab['man'].vector + nlp.vocab['woman'].vector
>    cosine_similarity(queen,vqueen)
>    |0.7880843877792358
>    ```  
>    ```
>
>    # similarity in spacy
>    t1 = nlp('king')
>    t2 = nlp('queen')
>    t1.similarity(t2)
>    ```

In [36]:
# get ordered entity pairs
def get_entity_pairs(iterable):
    """Pair every item of *iterable* with its successor.

    s -> (s0,s1), (s1,s2), (s2, s3), ...

    Args:
        iterable: Any iterable; it is consumed once.

    Returns:
        dict keyed by the running pair number (starting at 0), each value
        being ``{'from': item, 'to': next_item}``. Empty when the input has
        fewer than two items.
    """
    leading, trailing = itertools.tee(iterable)
    next(trailing, None)  # shift the second iterator one step ahead
    pairs = {}
    for position, (source, target) in enumerate(zip(leading, trailing)):
        pairs[position] = {'from': source, 'to': target}
    return pairs
# Pair each entity token index with the next entity token index.
entity_pairs = get_entity_pairs(entities.keys())
entity_pairs

{0: {'from': 0, 'to': 4},
 1: {'from': 4, 'to': 6},
 2: {'from': 6, 'to': 8},
 3: {'from': 8, 'to': 11},
 4: {'from': 11, 'to': 13},
 5: {'from': 13, 'to': 16},
 6: {'from': 16, 'to': 17},
 7: {'from': 17, 'to': 19},
 8: {'from': 19, 'to': 21},
 9: {'from': 21, 'to': 24},
 10: {'from': 24, 'to': 26},
 11: {'from': 26, 'to': 29},
 12: {'from': 29, 'to': 31},
 13: {'from': 31, 'to': 33},
 14: {'from': 33, 'to': 34},
 15: {'from': 34, 'to': 37},
 16: {'from': 37, 'to': 38},
 17: {'from': 38, 'to': 39},
 18: {'from': 39, 'to': 42},
 19: {'from': 42, 'to': 45},
 20: {'from': 45, 'to': 49},
 21: {'from': 49, 'to': 50},
 22: {'from': 50, 'to': 52},
 23: {'from': 52, 'to': 56},
 24: {'from': 56, 'to': 58},
 25: {'from': 58, 'to': 61}}

In [None]:
# get edge text
def get_entity_edges(entity_pairs):
    """Attach connecting text to each entity pair, in place.

    For every pair, the tokens of the module-level ``doc`` from the 'from'
    index through the 'to' index (inclusive) are scanned. The texts of the
    verbs, adpositions and adjectives among them are joined with spaces and
    stored under the pair's 'edge' key. Pairs with no such text are removed
    from *entity_pairs*.

    Args:
        entity_pairs: dict of ``{'from': int, 'to': int}`` dicts; mutated.

    Returns:
        None.
    """
    edge_tags = ['VERB', 'ADP', 'ADJ']
    # Iterate over a snapshot of the keys so entries can be deleted safely.
    for pair_id in list(entity_pairs.keys()):
        pair = entity_pairs[pair_id]
        span = doc[pair['from']:pair['to'] + 1]
        label = ' '.join([token.text for token in span if token.pos_ in edge_tags])
        if label:
            pair['edge'] = label
        else:
            del entity_pairs[pair_id]
    
# Annotate the pairs in place; pairs without any connecting text are dropped.
get_entity_edges(entity_pairs)
entity_pairs

## view relations between entities and nouns

In [None]:
entities

In [None]:
# create graph
# create graph
G = nx.Graph(name='simple KG')

# One node per entity token, keyed by the token's index in the doc.
G.add_nodes_from(
    (token_index, {'name': token.text.title(), 'ent_typ': token.ent_type_, 'pos': token.pos_})
    for token_index, token in entities.items()
)
node_labels = nx.get_node_attributes(G, 'name')

# One unit-weight edge per remaining entity pair, labelled with its edge text.
edges = []
edge_labels = {}
for pair in entity_pairs.values():
    endpoints = (pair['from'], pair['to'])
    edges.append(endpoints + ({'weight': 1},))
    edge_labels[endpoints] = pair['edge']
G.add_edges_from(edges)

In [None]:
# plot graph
fig, ax = plt.subplots(figsize=(10, 10))
# A fixed seed keeps the random node placement the same between runs.
pos = nx.random_layout(G, seed=13)
draw_options = dict(ax=ax, pos=pos, labels=node_labels, width=0.1)
nx.draw(G, **draw_options)

# Overlay the connecting text of each pair on its edge, in red.
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')
plt.tight_layout()