In [1]:
import networkx as nx
from docx import Document
from io import StringIO, BytesIO
import os
import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler

In [2]:
path = "/Users/Public/Desktop/SOPs/"
SOPs = os.listdir(path)

In [3]:
def para_to_text(p):
    """
    A function to find every texts in the paragraph

    params
    ----
    p : docx.Document.Paragraph object

    returns
    ----
    str 

    """
    rs = p._element.xpath(".//w:t")
    return "".join([r.text for r in rs])


def sop_to_text(file_path):
    """
    Converts SOP.docx into plain text

    params
    ----
    file_path : str (path to the SOP document) 

    returns
    ----
    str
    """
    text = []
    with open(file_path, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()

    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        text.append(para_to_text(p))
            
    text = " ".join(text).strip()
    return text

## Knowledge Based Graph

### Entity Extraction Status

In [4]:
nlp = spacy.load("./entity_train/ecomm_ner/")

In [5]:
colors = {"EVENT": "#FFCB41",
         "SITUATION": "#C7FF7E",
         "ACTION": "#FF625F",
         "QUESTION": "#D89CFF",
         "ROLE": "#7DC9FF" ,
         "CONDITION": "#FBB9F6" ,
         }
options = {"colors":colors}

In [6]:
file_path = path + "BI - SHOPL - Shoplifter.docx"

text = sop_to_text(file_path)

doc = nlp(text)
displacy.render(nlp(str(doc)), jupyter=True, style='ent', options=options)

### Entity Linking/Pairing

In [7]:
import pandas as pd 

ent_df = pd.DataFrame(columns = ["ENTITY","TYPE"])
ent_df["ENTITY"] = [ent.text for ent in doc.ents]
ent_df["TYPE"] = [ent.label_ for ent in doc.ents]

In [8]:
# Distinguish Main EVENT Type from other referred EVENT Types
event_inds = list(ent_df[ent_df["TYPE"] == "EVENT"].index)
ent_df.iloc[event_inds[0]]["TYPE"] = "MAIN_EVENT"
ent_df.iloc[event_inds[1:]]["TYPE"] = "ADR_EVENT"

In [9]:
def sop_entity_hierarchy(entity_type):
    """
    Determine entity's hierarchy by its type
    
    Params
    ----
    Entity - spacy.Document.Entity's type
    
    Returns
    ----
    int
    """
    rank_dict = {"MAIN_EVENT": 1,
                 "ROLE": 2,
                 "SITUATION": 3,
                 "QUESTION": 4,
                 "CONDITION": 4,
                 "ACTION": 5,
                 "ADR_EVENT": 6} # referred event
    
    return rank_dict[entity_type]

In [10]:
ent_df["RANKING"] = [sop_entity_hierarchy(x) for x in ent_df["TYPE"]]

In [11]:
ent_df

Unnamed: 0,ENTITY,TYPE,RANKING
0,SHOPL – Shoplifter,MAIN_EVENT,1
1,call taker,ROLE,2
2,Adult or juvenile - Trouble holding,SITUATION,3
3,Create,ACTION,5
4,Upgrade,ACTION,5
5,Where is the suspect being held?,QUESTION,4
6,Run,ACTION,5
7,Attach,ACTION,5
8,Juvenile – No trouble holding,SITUATION,3
9,Create,ACTION,5


In [12]:
index_by_type = ent_df.groupby(["TYPE"]).groups
index_by_type

{'ACTION': Int64Index([ 3,  4,  6,  7,  9, 11, 12, 14, 16, 17, 18, 20, 22, 23, 25, 27, 29,
             31, 34, 36, 38, 40, 42],
            dtype='int64'),
 'CONDITION': Int64Index([26, 30], dtype='int64'),
 'MAIN_EVENT': Int64Index([0], dtype='int64'),
 'QUESTION': Int64Index([5, 10, 15, 21, 28], dtype='int64'),
 'ROLE': Int64Index([1, 32], dtype='int64'),
 'SITUATION': Int64Index([2, 8, 13, 19, 24, 33, 35, 37, 39, 41], dtype='int64')}

In [13]:
ent_pairs = []

In [14]:
# MAIN_EVENT - ROLE
for i in index_by_type["ROLE"]:
    ent_pairs.append((ent_df["ENTITY"][0] , ent_df["ENTITY"][i]))

In [15]:
# ROLE - SITUATION
checkpoint = len(ent_df)

role_ind = index_by_type["ROLE"]
sit_ind = index_by_type["SITUATION"]

for i in range(0,len(role_ind)-1):
    start = role_ind[i]
    end = role_ind[i+1]
    for j in range(0,len(sit_ind)):
        if start < sit_ind[j] < end:
            ent_pairs.append([ent_df["ENTITY"][role_ind[i]], ent_df["ENTITY"][sit_ind[j]]])
        else:
            break
            
for k in sit_ind[j:]:
    ent_pairs.append((ent_df["ENTITY"][end], ent_df["ENTITY"][k]))

In [16]:
# SITUATION - ACTION/CONDITION/QUESTION


In [17]:
ent_pairs

[('SHOPL – Shoplifter', 'call taker'),
 ('SHOPL – Shoplifter', 'Dispatcher'),
 ['call taker', 'Adult or juvenile - Trouble holding'],
 ['call taker', 'Juvenile – No trouble holding'],
 ['call taker', 'No Photo ID – No trouble holding'],
 ['call taker', 'Adult – No trouble holding and LPO pressing charges'],
 ['call taker', 'Adult – No trouble holding and LPO not pressing charges'],
 ('Dispatcher', 'Adult or juvenile - Trouble holding'),
 ('Dispatcher', 'Juvenile – No trouble holding'),
 ('Dispatcher', 'No Photo ID – No trouble holding'),
 ('Dispatcher', 'Adult – No trouble holding and LPO pressing charges'),
 ('Dispatcher', 'Adult – No trouble holding and LPO not pressing charges')]

## Drawing Knowledge Graph with NetworkX

In [18]:
G = nx.DiGraph()

In [19]:
G.add_edges_from(ent_pairs)

In [20]:
color_map = []
color_map.extend(["red"]*len(index_by_type["MAIN_EVENT"]))
color_map.extend(["blue"]*len(index_by_type["ROLE"]))
color_map.extend(["yellow"]*len(set(ent_df.iloc[index_by_type["SITUATION"]]["ENTITY"])))

In [21]:
from networkx.drawing.nx_pydot import pydot_layout

plt.figure(figsize=(30,10))
nx.nx_pydot.write_dot(G, "example.dot")

plt.title(file_path.split("/")[-1])
pos = pydot_layout(G, prog='dot')
nx.draw_networkx_labels(G, pos, font_size=20)
nx.draw(G, pos, with_labels=False, node_color = color_map)

NameError: name 'plt' is not defined

In [66]:
!dot -Tpng example.dot -o example.png






