***installing Spacy 2.1.0***

neuralcoref only works with spaCy 2.1.0. Whilst spaCy itself can be installed using pip or conda, to install the large English model (`en_core_web_lg`) I had to:
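- find the model on https://v2.spacy.io/models/en
- manually download the tar from https://github.com/explosion/spacy-models/releases/tag/en_core_web_lg-2.3.1 (note: 2.3.x models are built for spaCy 2.3.x, so check that the model version matches the installed spaCy version)
- and install it locally as described here https://github.com/explosion/spaCy/issues/4577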


In [3]:
import re
import pandas as pd
import itertools
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

import spacy
import neuralcoref #
from spacy import displacy
try:
    nlp = spacy.load('en_core_web_lg')
except:
    !python -m spacy download en_core_web_lg
    nlp = spacy.load('en_core_web_lg')
from spacy.matcher import Matcher 
from spacy.tokens import Span 

## Load Data

In [5]:
# Read the sentences from the workbook, skipping the first two rows of the sheet.
df = pd.read_excel('tesco.xlsx', skiprows=2)
# Preview the first two rows.
df.head(2)

Unnamed: 0,id,text
0,0,"Tesco has been hit by hackers, leaving thousan..."
1,1,The outage leaves its grocery website and app ...


## Model
- [coreference resolution](https://spacy.io/universe/project/neuralcoref)
- [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities)
 
<img src="./model_architecture.svg" width=400 height=400>

### Coreference Resolution

In [None]:
# Register the coreference component only once: adding a second component
# with the same name raises, which would break a re-run of this cell.
if not nlp.has_pipe('neuralcoref'):
    neuralcoref.add_to_pipe(nlp)

## Named Entity Recognition
Extract the named entities, adding a custom `GROUP` label for mentions of customers, shoppers and hackers.

In [None]:
# pre-processing: drop the apostrophe from possessives ("Tesco's" -> "Tescos").
# The vectorised .str.replace keeps missing values as NaN, whereas calling
# re.sub on each row raises a TypeError on them.
df['text-clean'] = df['text'].str.replace(r"(\w+)'s", r'\1s', regex=True)

df[:2]

In [None]:
# custom NER: tag mentions of customers / shoppers / hackers as GROUP entities.
# Remove any ruler left over from an earlier run and then ALWAYS add a fresh
# one, so re-running this cell never leaves the pipeline without the ruler.
if nlp.has_pipe('entity_ruler'):
    nlp.remove_pipe('entity_ruler')

config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": True,
   "ent_id_sep": "||",
}

if int(spacy.__version__.split('.')[0]) >= 3:
    # spaCy v3: built-in components are added by their registered name.
    ruler = nlp.add_pipe("entity_ruler", config=config)
else:
    # spaCy v2 (the series neuralcoref needs): add_pipe takes the component
    # object and has no `config` argument. Only `overwrite_ents` is passed
    # because every v2 release accepts it.
    from spacy.pipeline import EntityRuler
    ruler = EntityRuler(nlp, overwrite_ents=config["overwrite_ents"])
    nlp.add_pipe(ruler)

# Case-insensitive single-token patterns (singular or plural).
patterns = [
    {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(customer)s?'}}]},
    {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(shopper)s?'}}]},
    {'label': 'GROUP', 'pattern': [{'TEXT': {'REGEX': r'(?i)(hacker)s?'}}]},
]
ruler.add_patterns(patterns)

In [None]:
# apply nlp: run the pipeline over every cleaned sentence, one Doc per row
df['doc'] = [nlp(sentence) for sentence in df['text-clean']]

# Show the row labelled 1 twice: entity highlights, then the dependency parse.
displacy.render(df['doc'][1], style='ent')
displacy.render(df['doc'][1], style='dep')

## Get the entities and nouns

In [None]:
# example doc: list every token, indenting the ones that are not noun-like
doc = df['doc'][0]
# alternative toy input: doc = nlp("Ben was born in Hawaii. He owns two cats")
for tok in doc:
    row = (tok.text.title(), tok.pos_, tok.dep_, tok.ent_type_, tok.i, tok.idx)
    if tok.pos_ not in ['PROPN', 'NOUN', 'PRON']:
        print('\t', row)
    else:
        print(row)

In [None]:
# get entities
def get_entities(doc):
    """Collect the noun-like tokens of ``doc``, keyed by token position.

    Parameters
    ----------
    doc : iterable of tokens exposing ``pos_`` and ``i``.

    Returns
    -------
    dict
        ``{tok.i: tok}`` for every token whose ``pos_`` is PROPN, NOUN or
        PRON, in the order the tokens are iterated.
    """
    noun_like = ['PROPN', 'NOUN', 'PRON']
    return {tok.i: tok for tok in doc if tok.pos_ in noun_like}

# Noun-like tokens (PROPN / NOUN / PRON) of the example doc, keyed by token position.
entities = get_entities(doc)
entities

In [None]:
# get ordered entity pairs
def get_entity_pairs(iterable):
    """Pair each item with its successor: s -> (s0,s1), (s1,s2), (s2,s3), ...

    Returns a dict keyed by the pair number (0, 1, ...) whose values are
    ``{'from': earlier_item, 'to': later_item}``. An input with fewer than
    two items yields an empty dict.
    """
    items = list(iterable)
    pairs = {}
    for number, (earlier, later) in enumerate(zip(items, items[1:])):
        pairs[number] = {'from': earlier, 'to': later}
    return pairs
# Consecutive (from, to) pairs of the token positions collected in `entities`.
entity_pairs = get_entity_pairs(entities.keys())
entity_pairs

In [None]:
# get edge text
def get_entity_edges(entity_pairs, source_doc=None):
    """Attach the connecting words to each entity pair, in place.

    For every pair the tokens from position ``from`` to ``to`` (inclusive)
    are scanned, and the text of those tagged VERB, ADP or ADJ is joined with
    spaces and stored under ``'edge'``. Pairs with no such token are deleted
    from ``entity_pairs``.

    Parameters
    ----------
    entity_pairs : dict
        Mapping of pair number -> ``{'from': int, 'to': int}``; modified in place.
    source_doc : sliceable sequence of tokens exposing ``text`` and ``pos_``, optional
        Document the positions refer to. When omitted, the notebook-level
        ``doc`` variable is used, which keeps existing one-argument calls working.

    Returns
    -------
    None
    """
    if source_doc is None:
        source_doc = doc  # fall back to the notebook-level example document

    # Iterate over a copy of the keys because entries are deleted in the loop.
    for key in list(entity_pairs):
        pair = entity_pairs[key]
        span = source_doc[pair['from']:pair['to'] + 1]
        edge_text = ' '.join(t.text for t in span if t.pos_ in ('VERB', 'ADP', 'ADJ'))
        if edge_text:
            pair['edge'] = edge_text
        else:
            del entity_pairs[key]
    
# Modifies entity_pairs in place: adds an 'edge' text to each pair and drops
# the pairs that have no VERB / ADP / ADJ token between their two positions.
get_entity_edges(entity_pairs)
entity_pairs

## View relations between entities and nouns

In [None]:
# create graph
G = nx.Graph(name='simple KG')

# One node per noun-like token, keyed by its position in the doc.
G.add_nodes_from(
    (position, {'name': tok.text.title(), 'ent_typ': tok.ent_type_, 'pos': tok.pos_})
    for position, tok in entities.items()
)
node_labels = dict(G.nodes(data='name'))

# One unit-weight edge per remaining pair, labelled with the connecting words.
edge_labels = {(link['from'], link['to']): link['edge'] for link in entity_pairs.values()}
edges = [(link['from'], link['to'], {'weight': 1}) for link in entity_pairs.values()]
G.add_edges_from(edges)

In [None]:
# plot graph
fig, ax = plt.subplots(figsize=(10, 10))
pos = nx.spring_layout(G, seed=10)  # fixed seed, so the layout is the same on every run

# Nodes with their text labels; edges are drawn very thin (width 0.1).
nx.draw(G, pos=pos, ax=ax, labels=node_labels, width=0.1)

# Connecting words in red on top of the edges.
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red')
fig.tight_layout()