# NER Training Data Creation (Training-based)

In [18]:
def para_to_text(p):
    """
    A function to find every texts in the paragraph

    params
    ----
    p : docx.Document.Paragraph object

    returns
    ----
    str 

    """
    rs = p._element.xpath(".//w:t")
    return u"".join([r.text for r in rs])


def sop_to_text(file_path):
    """
    Converts SOP.docx into plain text

    params
    ----
    file_path : str (path to the SOP document) 

    returns
    ----
    str
    """
    text = []
    with open(file_path, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()

    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        text.append(para_to_text(p))
            
    text = " ".join(text).strip()
    return text

#### Spacy requires training data to be in the following format:

```python

train_data = [
    
    ("SENTENCES BLABLABLA", {entities : [(entity start index, entity end index, "LABEL A"),
                                         (entity start index, entity end index, "LABEL B")]})
    
    
]

```


# Event type entity training (Rule-based)

In [19]:
import os
import json
from docx import Document
from io import StringIO, BytesIO
import re

In [20]:
path = "/Users/Public/Desktop/SOPs/"
SOPs = os.listdir(path)

In [21]:
events = []
code_pattern = r"^[A-Z0-9][A-Z0-9]+[\- ]+.+"
for sop in SOPs:
    filepath = path+sop
    with open(filepath, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()
    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        style = p.style.name
        text = para_to_text(p)
        
        if "Normal" in style:
            code = text.split(" event type")[0].split("for the ")[-1]
            r = re.findall(code_pattern, code)
            if r:
                events.append(r[0])

In [24]:
raw_event_list = []
event_code = []
pattern = r"[A-Z0-9][A-Z0-9]+"
inconsistency = set()
for sop in SOPs:
    event_des = "".join(sop.split("-")[1:]).strip().split(".")[0]
    raw_event_list.append(event_des)
    
    p = re.findall(pattern, event_des)
    if p:
        event_code.append(p[0])
    else:
        event_code.append(event_des)

In [26]:
list(set(event_code))

['Bylaw',
 'MISSIC',
 'Misch',
 'MARINE',
 'SUSPC',
 'PARK',
 'ROBBI',
 'MVI12345',
 'AIREM',
 'SCOLL',
 'FIREAR',
 'MISCH',
 'ALRM123',
 'ASSOA',
 'BYLAW',
 'DEMON',
 'WEAPON',
 'RECVEH',
 '1033',
 'ALARM',
 'SIP',
 'PROP',
 'DNA',
 'GAS1234',
 'CHECK',
 'FRAUD',
 'TRAFF',
 'ABDUC',
 'NOK',
 'WARRAN',
 'WIRE12',
 'RES123M',
 'COUNTERFEIT',
 'THEFTV',
 'INDEC',
 'EXP123',
 'PANHA',
 'ANNOY',
 'EXPLOS',
 'ELEC12',
 'MED12A',
 'RUBSH',
 'ASLT',
 'HAZARD',
 'MVIHR',
 'TEST',
 'ACF',
 'MVIINJ',
 'BNEI',
 '911',
 'ASLTSX',
 'MVI',
 'ASSPFA',
 'HARASS',
 'Theft',
 'KPEACE',
 'PURSUE',
 'STOVE',
 'ROBB',
 'SPAT',
 'SHOTS',
 'ARREST',
 'BAIT',
 'SHOPL',
 'PROST',
 'AB911',
 'HOMEIN',
 'INTELL',
 'BOMB',
 'BAITB',
 'THEFTI',
 'MUAID',
 'SCREAM',
 'ALARMS',
 'ALARMH',
 'THEFT',
 'MISCHI',
 'WILD123',
 'ACRA',
 'DOMRPT',
 'STALK',
 'TRAFFS',
 'SUICID',
 'ASSGP',
 'THREATS',
 'FOUNDP',
 'COUNT',
 'SUDDEN',
 'FIGHT',
 'ASLTI',
 'PROWL',
 'MAND',
 'STAFF',
 'VAULT',
 'ILBRN',
 'BORDR',
 'EXTORT',
 '

In [29]:
events

['1033 – Officer in trouble',
 '911 – Any police call',
 'AB911 – Abandoned and Accidental 911 cellular phone calls',
 'AB911 – Abandoned and Accidental 911 landline phone calls',
 'ABANDV - Abandoned vehicle',
 'ABANDV - Abandoned vehicle',
 'ABDUC – Abduction',
 'AIREM – Air Emergency',
 'ALARMD – Domestic Violence Emergency Response System Alarm (DVERS)',
 'ALARMH – Hold up alarm',
 'ALARMS – Silent or Panic Alarm',
 'ARSON – Arson',
 'ASLT – Assault',
 'ASLTI – Assault in Progress',
 'ASSGP – Assist General Public',
 'ASSMHA – Assist Mental Health Act',
 'ASSOA – Assist Other Agency',
 'BAIT – Bait Car Activations',
 'BNEI – Break and enter in progress',
 'BOMB – Bomb Threats',
 'BORDR – Border Runner',
 'BREACH – Breach of conditions, undertaking or restraining order',
 'BYLAW - Bylaws',
 'CHECK - Check wellbeing',
 'COUNT – Counterfeit Currency',
 'DEMON - Demonstration',
 'DISTB – Disturbance',
 'DOMRPT - Domestic report',
 'DRUGS – Drugs',
 'EXPLOS – Explosion/Explosive',
 'EXT

In [61]:
pattern = r'^([A-Z]+)[ -]+([\w ]*)-([\w ,-]*).docx'
events_code = []
for sop in SOPs:
    match = re.findall(pattern, sop, re.IGNORECASE)
    event_parts = [x.strip() for x in list(filter(None, match[0][1:]))]
    events_code.extend(event_parts)
    events_code.append(" - ".join(event_parts))

In [56]:
import pandas as pd
df = pd.read_csv("../data/interim/sop_types_valid.csv")
df

Unnamed: 0,type,juri,filename,juri_count
0,1033,"['AB', 'BI', 'BU', 'DE', 'DFPF', 'NW', 'PO', '...","['AB - 1033 - Officer in trouble.docx', 'BI - ...",16
1,DOMI,"['AB', 'BI', 'BU', 'DE', 'DFPF', 'NW', 'PO', '...","['AB - DOMI - Domestic in progress.docx', 'BI ...",16
2,FOUNDP,"['AB', 'BI', 'BU', 'DE', 'DFPF', 'NW', 'PO', '...","['AB - FOUNDP - Found Person.docx', 'BI - FOUN...",16
3,HAZARD,"['AB', 'BI', 'BU', 'DE', 'DFPF', 'NW', 'PO', '...","['AB - HAZARD - Hazard.docx', 'BI - HAZARD - H...",16
4,IMPAIR,"['AB', 'BI', 'BU', 'DE', 'DFPF', 'NW', 'PO', '...","['AB - IMPAIR - Impaired driver.docx', 'BI - I...",16
...,...,...,...,...
92,INDUST,"['AB', 'BI', 'BU', 'DE', 'NW', 'PO', 'RI', 'RM...","['AB - INDUST - Industrial Accident.docx', 'BI...",15
93,ASSMHA,"['AB', 'BI', 'BU', 'DE', 'NW', 'PO', 'RI', 'RM...",['AB - ASSMHA - Assist Mental Health Act.docx'...,15
94,SPAT,"['BI', 'BU', 'DE', 'NW', 'PO', 'RI', 'RM', 'SC...",['BI - SPAT - Special Attention Detail Event.d...,14
95,ALARMD,"['AB', 'BI', 'BU', 'DE', 'NW', 'PO', 'RI', 'RM...","['AB - ALARMD - Domestic violence alarm.docx',...",14


In [84]:
for x in df["juri"]:
    x = 

['[', "'", 'A', 'B', "'", ',', ' ', "'", 'B', 'I', "'", ',', ' ', "'", 'B', 'U', "'", ',', ' ', "'", 'D', 'E', "'", ',', ' ', "'", 'D', 'F', 'P', 'F', "'", ',', ' ', "'", 'N', 'W', "'", ',', ' ', "'", 'P', 'O', "'", ',', ' ', "'", 'R', 'I', "'", ',', ' ', "'", 'R', 'M', "'", ',', ' ', "'", 'S', 'C', "'", ',', ' ', "'", 'S', 'Q', "'", ',', ' ', "'", 'S', 'X', "'", ',', ' ', "'", 'U', 'N', "'", ',', ' ', "'", 'V', 'A', "'", ',', ' ', "'", 'W', 'P', "'", ',', ' ', "'", 'W', 'V', "'", ']']
['[', "'", 'A', 'B', "'", ',', ' ', "'", 'B', 'I', "'", ',', ' ', "'", 'B', 'U', "'", ',', ' ', "'", 'D', 'E', "'", ',', ' ', "'", 'D', 'F', 'P', 'F', "'", ',', ' ', "'", 'N', 'W', "'", ',', ' ', "'", 'P', 'O', "'", ',', ' ', "'", 'R', 'I', "'", ',', ' ', "'", 'R', 'M', "'", ',', ' ', "'", 'S', 'C', "'", ',', ' ', "'", 'S', 'Q', "'", ',', ' ', "'", 'S', 'X', "'", ',', ' ', "'", 'U', 'N', "'", ',', ' ', "'", 'V', 'A', "'", ',', ' ', "'", 'W', 'P', "'", ',', ' ', "'", 'W', 'V', "'", ']']
['[', "'", 'A', 'B

In [11]:
len(event_code)

1989

In [12]:
inconsistency

{'Bylaw', 'Misch', 'Pursue', 'Theft'}

## SITUATION

In [13]:
situations = []

for sop in SOPs:
    filepath = path+sop
    with open(filepath, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()
    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        style = p.style.name
        text = para_to_text(p)
        if "heading 2" in style.lower():
            situations.append(text)

In [14]:
situations = set(situations)

## ROLES

In [15]:
roles = []

for sop in SOPs:
    filepath = path+sop
    with open(filepath, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()
    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        style = p.style.name
        text = para_to_text(p)
        if "heading 1" in style.lower():
            roles.append(text.strip())
            
roles = list(set(roles))[1:]

### Extract data from acronym

In [8]:
import os
import json
from docx import Document
from io import StringIO, BytesIO
import re

In [9]:
file_path = "/Users/flu/Desktop/capstone-2020/utils/acronyms.docx"

In [18]:
with open(file_path, 'rb') as f:
    source_stream = BytesIO(f.read())
f.close()
doc = Document(source_stream)
paras = doc.paragraphs

FileNotFoundError: [Errno 2] No such file or directory: '/Users/flu/Desktop/capstone-2020/utils/acronyms.docx'

In [None]:
import pandas as pd

df = pd.DataFrame(columns=["jargon","meaning"])
jargon = []
meaning = []

In [None]:
for p in paras:
    parsed = p.text.strip().split("\t")
    if len(parsed) == 2:
        jargon.append(parsed[0].strip())
        meaning.append(parsed[1].strip())

In [None]:
tables = doc.tables
for t in tables:
    for r in t.rows:
        i = 0
        for c in r.cells:
            if i % 2 == 0:
                i += 1
                jargon.append(c.text.strip())
            else:
                meaning.append(c.text.strip())

df["jargon"] = jargon
df["meaning"] = meaning                

In [None]:
import numpy as np

df["jargon"].replace("",np.nan, inplace=True)
df["meaning"].replace("",np.nan, inplace=True)

In [None]:
df = df.dropna().reset_index()[["jargon","meaning"]]

In [None]:
terms = list(df["jargon"])
for term in terms:
    print(term)

In [None]:
definitions = list(df["meaning"])
definitions

#### Organization pattern

In [None]:
import spacy
import numpy as np

nlp = spacy.load("en_core_web_sm")

ind = 0
orgs = []

for d in definitions:
    doc = nlp(d)
    for ent in doc.ents:
        if ent.label_ == "ORG":
            orgs.append(terms[ind])
    ind += 1
orgs = list(set(orgs))

In [None]:
filtered = []
for org in orgs:
    if org.upper() != org:
        pass
    else:
        filtered.append(org)

In [None]:
filtered

In [None]:
org_cand = []

glossary = zip(terms, definitions)
for org, mean in glossary:
    if org in filtered:
        org_cand.append((org, mean))

In [None]:
org_key = ["British","Columbia",
           "Service","Services",
           "Police","Institute",
           "Ltd.", "Association",
           "Ministry","National",
           "Unit","Incorporated",
           "Corporation"]

In [None]:
organization = []

for x,y in org_cand:
    if len(set(org_key + y.split())) < len(org_key + y.split()):
        organization.append(x)

In [None]:
organization = list(set(organization))

## SITUATION, ACTION, QUSETION, CONDITION

In [None]:
raw_texts = []

for sop in SOPs:
    filepath = path+sop
    with open(filepath, 'rb') as f:
        source_stream = BytesIO(f.read())
    f.close()
    doc = Document(source_stream)
    paras = doc.paragraphs
    for p in paras:
        style = p.style.name
        text = para_to_text(p)
        if "style1" in style.lower():
            if text.strip() == "":
                pass
            else:
                raw_texts.append(text.strip())

In [None]:
questions = []
conditions = []
actions = []
others = []

for t in raw_texts:
    if t.endswith("?"):
        questions.append(t)
    elif t.startswith("If"):
        conditions.append(t)
    elif nlp(t)[0].pos_ == "VERB":
        actions.append(nlp(t)[0])
    else:
        others.append(t)

In [None]:
questions = list(set(questions))
conditions = list(set(conditions))
actions = list(set(actions))

In [None]:
filtered_conditions = []
for c in conditions:
    filtered_conditions.append(c.replace("\\","").replace("/","or").replace("\t","").replace(":","").replace(";",""))

In [None]:
others[1300:1380]

## Agency, Jurisdiction

In [17]:
jurisdiction = ['AB', 'BI', 'BU', 'DE', 'DFPF', 
            'NW', 'PO', 'RI', 'RM', 'SC', 
            'SQ', 'SX', 'UN', 'VA', 'WP', 
            'WV', 'DF PF']

## Pattern Based Approach

## Regex Based Approach

# Writing Entities

In [None]:
with open("./entity_train/PATTERNS.JSONL","w", encoding="utf-8") as f:
    
    # EVENT CODE EXTRACTION
    
    for ec in event_code:
        f.write('{"label":"EVENT", "pattern":"%s"}\n' %ec)
    for ic in inconsistency:
        f.write('{"label":"EVENT", "pattern":"%s"}\n' %ic)
    for e in events:
        f.write('{"label":"EVENT", "pattern":"%s"}\n' %e)
        
    # SITUATION EXTRACTION
    for s in situations:
        f.write('{"label":"SITUATION", "pattern":"%s"}\n' %s)
        
    # ROLE EXTRACTION
    for r in roles:
        f.write('{"label":"ROLE", "pattern":"%s"}\n' %r)
        
    # QUESTION EXTRACTION
    for q in questions:
        f.write('{"label":"QUESTION", "pattern":"%s"}\n' %q)   
        
    # CONDITION EXTRACTION
    for c in filtered_conditions:
        f.write('{"label":"CONDITION", "pattern":"%s"}\n' %c)  
        
    # ACTION EXTRACTION
    for a in actions:
        f.write('{"label":"ACTION", "pattern":"%s"}\n' %a)    
    
    # ORGANIZATION EXTRACTION
    for o in organization:
        f.write('{"label":"ORG", "pattern":"%s"}\n' %o)
    
    # JURISDICTION / AGENCY
    for j in jurisdiction:
        regex_pattern = "^({})[ -]+".format(j)
        f.write('{"label":"JURI", "pattern":[{"TEXT":{"REGEX": %s }}]}\n' %regex_pattern)
        
f.close()