In [1]:
import ast
import os
import json
from docx import Document
from io import StringIO, BytesIO
import re
import time
import datetime

import pandas as pd
import json
import spacy
from nltk.corpus import stopwords

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim import matutils, models
from gensim.models import CoherenceModel, TfidfModel, HdpModel
from gensim.models.phrases import Phrases, Phraser
import pyLDAvis.gensim

from sklearn.cluster import KMeans
from scipy.sparse import csc_matrix
from gensim.matutils import corpus2csc
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

from docx import Document
from io import StringIO, BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'font.size': 14, 'lines.linewidth': 3})

In [2]:
# Load spaCy's small English pipeline; `preprocess` relies on it for
# part-of-speech tags, lemmas and stop-word flags.
nlp = spacy.load("en_core_web_sm")
# stop_words = set(stopwords.words('english'))

In [3]:
notebook_dir = os.getcwd()
# The 'sop' column is parsed back into a Python object on load (it is displayed
# as a list of steps below). `ast.literal_eval` only accepts Python literals,
# whereas the built-in `eval` would execute any expression found in the CSV.
situ_df = pd.read_csv('../data/interim/calltaker_situation.csv',
                      keep_default_na = False,  # do not turn empty / NA-like strings into NaN
                      converters = {'sop': ast.literal_eval})

In [4]:
# situ_df = situ_df[situ_df['situation'].str.len() > 0].reset_index(drop = True)
# Display the loaded frame (the rendered output elides the middle rows).
situ_df

Unnamed: 0,type,juri,role,situation,sop,filename
0,1033,DE,call taker,CBSA alarm policy,"[Listen to alarm, Acknowledge the alarm by pre...",DE - 1033 - Officer in trouble.docx
1,DOMI,AB,call taker,All other DOMI reports,[Create a call in every instance. Do not cance...,AB - DOMI - Domestic in progress.docx
2,DOMI,AB,call taker,GPS Panic Alarms,"[Create a call, Remain on the line recording a...",AB - DOMI - Domestic in progress.docx
3,FOUNDP,AB,call taker,Address Obtained,"[Run address on QBA:, If the person is negativ...",AB - FOUNDP - Found Person.docx
4,FOUNDP,AB,call taker,Name or medic alert is known,"[Run name on CPIC, If the person is negative o...",AB - FOUNDP - Found Person.docx
...,...,...,...,...,...,...
3589,ALARMD,AB,call taker,All Other DVERS personal residential alarms,"[Create a call, Check hazards which will show ...",AB - ALARMD - Domestic violence alarm.docx
3590,ALARMD,AB,call taker,DVERS Alarm maintenance,[See “Assist GP” for a stand by keep the peac...,AB - ALARMD - Domestic violence alarm.docx
3591,ALARMD,AB,call taker,GPS tracked Panic Alarms,[See AB-DOMI],AB - ALARMD - Domestic violence alarm.docx
3592,DNA,NW,call taker,DNA Warrant:,[See WARRAN (Warrants) SOP],NW - DNA - DNA collection .docx


In [5]:
def preprocess(strlist,
               min_token_len = 2,
               allowed_pos = ['ADV', 'ADJ', 'VERB', 'NOUN', 'PART', 'NUM', 'PROPN']):
    """Lemmatize a list of strings into one space-separated string.

    Every string is lower-cased, has hyphens and "i.e." replaced by a space,
    and is then parsed with the module-level spaCy pipeline `nlp`. The lemmas
    of all retained tokens, across all strings, are joined in order.

    Parameters
    ----------
    strlist : iterable of str
        Texts to process.
    min_token_len : int
        A lemma is kept only if it is strictly longer than this.
    allowed_pos : collection of str
        spaCy coarse part-of-speech tags that are kept.

    Returns
    -------
    str
        The retained lemmas separated by single spaces.
    """
    removal_pattern = r"|".join(['-', r'i\.e\.'])
    # spaCy considers 'call' a stop word, which is not suitable for our case.
    always_keep = {'call'}

    def _is_kept(token):
        # Same three conditions as a single boolean chain, spelled out as guards.
        if token.pos_ not in allowed_pos:
            return False
        if token.text not in always_keep and token.is_stop:
            return False
        return len(token.lemma_) > min_token_len

    lemmas = []
    for raw in strlist:
        cleaned = re.sub(removal_pattern, ' ', raw.lower())
        lemmas.extend(token.lemma_ for token in nlp(cleaned) if _is_kept(token))

    return ' '.join(lemmas)

In [6]:
def get_dct_dtmatrix(sops):
    """Tokenize documents and build their gensim dictionary and bag-of-words.

    Parameters
    ----------
    sops : iterable
        One entry per document; each entry is passed as-is to `preprocess`,
        so it must be an iterable of strings.

    Returns
    -------
    tuple
        `(doc_term_matrix, corpus, dictionary)`: the bag-of-words form of every
        document, the token lists they were built from, and the
        `corpora.Dictionary` mapping tokens to ids.
    """
    # `preprocess` returns a single string per document; split it back into tokens.
    corpus = [preprocess(sop).split() for sop in sops]
    # Bigram phrase detection (gensim Phrases / Phraser) is not applied here.
    dictionary = corpora.Dictionary(corpus)
    doc_term_matrix = list(map(dictionary.doc2bow, corpus))
    return doc_term_matrix, corpus, dictionary

In [7]:
# Tokenize every row's SOP steps and build the bag-of-words corpus and vocabulary.
doc_term_bow, corpus, dictionary = get_dct_dtmatrix(situ_df['sop'])

In [8]:
def bow2tfidf(doc_term_bow, corpus_tfidf):
    """Turn bag-of-words documents into a dense TF-IDF array.

    Parameters
    ----------
    doc_term_bow : list
        Documents in gensim bag-of-words form.
    corpus_tfidf : TfidfModel
        Fitted model; its `idfs` mapping fixes the number of term columns.

    Returns
    -------
    numpy.ndarray
        Dense array with one row per document and one column per term.
    """
    vocab_size = len(corpus_tfidf.idfs)
    weighted_docs = corpus_tfidf[doc_term_bow]
    # corpus2csc puts documents in columns, so transpose to get one row per document.
    term_by_doc = csc_matrix(corpus2csc(weighted_docs, num_terms = vocab_size))
    return term_by_doc.T.toarray()

In [9]:
# Fit TF-IDF weights on the SOP bag-of-words corpus, then build the dense
# matrix that KMeans is fitted on below.
tfidf_situ = TfidfModel(doc_term_bow)
tfidf_mtx = bow2tfidf(doc_term_bow, tfidf_situ)

In [10]:
def save_df(df, name, prefix = '../data/interim/'):
    """Write `df` as CSV to the path `prefix + name`, omitting the index.

    `prefix` is concatenated verbatim, so it must end with a path separator
    when it names a directory.
    """
    df.to_csv(prefix + name, index = False)

### Finding the best `n_clusters` for KMeans
> Use the code below to fit one KMeans model for each candidate `n_clusters`:
```python
kms = list()
for k in range(1, 300):
    km = KMeans(n_clusters = k).fit(tfidf_mtx)
    kms.append(km)
```
  
> Use the code below to plot the inertias and locate the elbow:
```python
fig, ax = plt.subplots(1, 1, figsize = (12, 4))
ax.plot(range(1, 300), 
        [km.inertia_ for km in kms])
ax.set_xlabel('k')
ax.set_ylabel('inertia')
ax.set_title(f'Inertia vs Number of Clusters')
ax.xaxis.set_major_locator(MaxNLocator(integer = True))
ax.grid()
plt.show()
```
![](img/kmeans_inertias.PNG)

> Use the code below to plot the Silhouette scores and locate the shoulder:
```python
fig, ax = plt.subplots(1, 1, figsize = (12, 4))
ax.plot(range(2, 300), 
        [silhouette_score(tfidf_mtx, kms[k-1].labels_) for k in range(2, 300)])
ax.set_xlabel('k')
ax.set_ylabel('silhouette_score')
ax.set_title(f'Silhouette Score vs Number of Clusters')
ax.xaxis.set_major_locator(MaxNLocator(integer = True))
ax.grid()
plt.show()
```
![](img/kmeans_silhouette.PNG)

#### Based on the plots of inertias and Silhouette scores, we choose k = 190

In [11]:
# Fit the final model with the chosen k = 190; random_state is pinned so the
# fit can be repeated with the same seed.
km_190 = KMeans(n_clusters = 190, random_state = 2020).fit(tfidf_mtx)

In [12]:
# Attach each row's KMeans label to a new frame (situ_df itself is left
# untouched) and order the rows by cluster, then type, then juri.
situ_topics_kmeans_tfidf = (
    situ_df
    .assign(cluster = km_190.labels_)
    .sort_values(by = ['cluster', 'type', 'juri'], ignore_index = True)
)

In [13]:
# Inspect the clustered frame; the commented-out suffix narrows it to the
# situation titles of a single cluster.
situ_topics_kmeans_tfidf#[situ_topics_kmeans_tfidf['cluster'] == 91]['situation'].tolist()

Unnamed: 0,type,juri,role,situation,sop,filename,cluster
0,FRAUD,AB,call taker,Not in progress – All reports,[Transfer to a Report Agent],AB - FRAUD.docx,0
1,PROP,AB,call taker,Lost license plates and validation tags,[Transfer call to a Report Agent],"AB - PROP - Property lost, found or seized.docx",0
2,PROP,BI,call taker,Lost license plates and/or validation tags,[Transfer call to a Report Agent],"BI - PROP - Property lost, found or seized.docx",0
3,PROP,BU,call taker,Lost licence plate and/or validation tag,[Transfer to a Report Agent],"BU - PROP - Property lost, found or seized.docx",0
4,PROP,DE,call taker,Lost license plates and validation tags,[Transfer call to a Report Agent],"DE - PROP - Property lost, found or seized.docx",0
...,...,...,...,...,...,...,...
3589,ANIMAL,SX,call taker,Deceased animals,"[Refer caller to Ministry of Highways, Works y...",SX - ANIMAL.docx,188
3590,ANIMAL,UN,call taker,Deceased animals,"[Refer caller to Ministry of Highways, Works y...",UN - ANIMAL.docx,188
3591,ASSOA,DE,call taker,General Procedure,"[Create a call, Obtain the type of investigati...",DE - ASSOA - Assist Other Agency.docx,189
3592,ASSOA,SX,call taker,General Procedure,"[Create a call, Obtain the type of investigati...",SX - ASSOA - Assist Other Agency.docx,189


In [14]:
# Wrap each situation title in a one-element list: `get_dct_dtmatrix` hands
# every entry to `preprocess`, which iterates over a list of strings.
situ_topics_kmeans_tfidf['situ_lst'] = situ_topics_kmeans_tfidf['situation'].apply(lambda x: [x])

In [15]:
# Persist the clustered frame through the shared helper, which writes to
# '../data/interim/' without the index, instead of repeating the path here.
save_df(situ_topics_kmeans_tfidf, 'situ_topics_kmeans_tfidf.csv')

In [16]:
# situ_clusters = situ_topics_kmeans_tfidf.groupby('cluster').agg({
#     'situation': lambda x: list(x)
# })
# situ_clusters

In [17]:
# Build a second corpus and vocabulary from the situation titles ('situ_lst'),
# separate from the SOP-text corpus built earlier.
cluster_bow, cluster_corpus, cluster_dict = get_dct_dtmatrix(situ_topics_kmeans_tfidf['situ_lst'])

In [18]:
# Number of distinct tokens in the situation-title vocabulary.
len(cluster_dict.token2id)

743

In [19]:
# TF-IDF model and dense matrix for the situation titles; queries are
# compared against this matrix in `query_situation`.
tfidf_cluster = TfidfModel(cluster_bow)
tfidf_mtx_cluster = bow2tfidf(cluster_bow, tfidf_cluster)

In [20]:
def show_result(query_mtx, corpus_mtx, N = 20, min_sim = 0.1, situations = None):
    """Return the rows most similar to a query, best match first.

    Parameters
    ----------
    query_mtx : 2-D array
        TF-IDF matrix of the query; only its first row is used.
    corpus_mtx : 2-D array
        TF-IDF matrix with one row per candidate. Row `i` must describe
        row `i` of `situations`, because matches are looked up by position.
    N : int
        Maximum number of rows to return.
    min_sim : float
        Candidates whose cosine similarity is below this value are dropped.
        Defaults to 0.1, the cut-off that used to be hard-coded.
    situations : pandas.DataFrame, optional
        Frame to take the matching rows from. Defaults to the module-level
        `situ_topics_kmeans_tfidf`.

    Returns
    -------
    pandas.DataFrame
        Copy of the matching rows in descending similarity order, without the
        helper column 'situ_lst' (if present).
    """
    if situations is None:
        situations = situ_topics_kmeans_tfidf
    sim = cosine_similarity(query_mtx, corpus_mtx)[0]
    # Sort once and read the scores back by index rather than sorting the
    # scores a second time.
    ranked = sim.argsort()[::-1]
    idx = list()
    for cnt, i in enumerate(ranked):
        # Scores are visited in descending order, so the first one below the
        # cut-off ends the search.
        if sim[i] < min_sim or cnt >= N:
            break
        idx.append(i)
    return situations.iloc[idx].copy().drop(columns = 'situ_lst', errors = 'ignore')

In [21]:
def query_situation(query, corpus_tfidf, corpus_dct, corpus_mtx, N = 20):
    """Find the situations that best match a free-text query.

    Parameters
    ----------
    query : str
        Free-text search phrase; it is run through `preprocess`.
    corpus_tfidf : TfidfModel
        TF-IDF model fitted on the corpus being searched.
    corpus_dct : Dictionary
        Vocabulary of that corpus, used to encode the query tokens.
    corpus_mtx : 2-D array
        Dense TF-IDF matrix of the corpus, as produced by `bow2tfidf`.
    N : int
        Maximum number of matches, forwarded to `show_result`.

    Returns
    -------
    pandas.DataFrame
        Whatever `show_result` returns for the encoded query.
    """
    query_tokens = preprocess([query]).split()
    # A single-document corpus: bow2tfidf expects a list of BoW documents.
    query_bow = [corpus_dct.doc2bow(query_tokens)]
    query_vec = bow2tfidf(query_bow, corpus_tfidf)
    return show_result(query_vec, corpus_mtx, N = N)

In [28]:
# Example lookup against the situation-title TF-IDF matrix.
# NOTE(review): `query1` is assigned but not used in this cell.
query = 'missing child'
query1 = 'lost passport'
query_result = query_situation(query, tfidf_cluster, cluster_dict, tfidf_mtx_cluster)
query_result

Unnamed: 0,type,juri,role,situation,sop,filename,cluster
853,ASLTSX,SC,call taker,Child Abuse,[Create a call if there is immediate risk to a...,SC - ASLTSX - Sexual assault.docx,14
2980,ASLT,NW,call taker,Child Abuse,"[Create a call, Questions, Time delay?, What a...",NW - ASLT - Assault.docx,147
836,ASLTI,RI,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,RI - ASLTI - Assault in Progress.docx,14
835,ASLTI,PO,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,PO - ASLTI - Assault in Progress.docx,14
834,ASLTI,NW,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,NW - ASLTI - Assault in Progress.docx,14
833,ASLTI,DE,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,DE - ASLTI - Assault in Progress.docx,14
831,ASLTI,BI,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,BI - ASLTI - Assault in Progress.docx,14
830,ASLTI,AB,call taker,Child Abuse,[When there is immediate risk to a child’s saf...,AB - ASLTI - Assault in Progress.docx,14
2977,ASLT,BI,call taker,Child Abuse,"[Create a call, Questions, Time delay?, What a...",BI - ASLT - Assault.docx,147
2978,ASLT,BU,call taker,Child Abuse,"[Create a call, Questions, Time delay?, What a...",BU - ASLT - Assault.docx,147


In [23]:
# query = 'lost license plate'
# query_corpus = [preprocess([query]).split()]
# query_bow = [cluster_dict.doc2bow(doc) for doc in query_corpus]
# query_tfidf_mtx = bow2tfidf(query_bow, tfidf_cluster)
# sim = cosine_similarity(query_tfidf_mtx, tfidf_mtx_cluster)[0]
# cluster_sorted = zip(sim.argsort()[::-1], sorted(sim)[::-1])
# idx = list()
# for cnt, (i, prob) in enumerate(cluster_sorted):
#     if prob < 0.05 or cnt > 10:
#         break
#     idx.append(i)
# idx

In [24]:
# situ_topics_kmeans_tfidf.iloc[idx]

In [25]:
# Deliberate hard stop: raising here interrupts a top-to-bottom run —
# presumably so the scratch cell below is not executed.
raise Exception('Stop here')

Exception: Stop here

In [None]:
def func1(patn, lst):
    """Return True if the regex `patn` matches anywhere in the strings of `lst`.

    The strings are joined with single spaces before searching, so a match
    may span two adjacent entries. Matching is case-insensitive.

    Parameters
    ----------
    patn : str
        Regular expression to look for.
    lst : iterable of str
        Strings to search.

    Returns
    -------
    bool
    """
    # Only existence matters, so stop at the first match instead of
    # collecting every match with findall.
    return re.search(patn, ' '.join(lst), re.IGNORECASE) is not None
    
def find_srs_pattern(patn, pd_srs):
    """Return one boolean per element of `pd_srs`: whether `func1` finds `patn` in it.

    Each element is passed to `func1` as its list of strings, so the result
    can be used as a boolean mask over the series.
    """
    return [func1(patn, entry) for entry in pd_srs]

# Keep the rows whose 'sop' entries contain 'domi' (case-insensitive).
# NOTE(review): `situ_lda_tfidf_topics` is not defined in any visible cell of
# this notebook — confirm where it is supposed to come from.
situ_lda_tfidf_topics[find_srs_pattern('domi', situ_lda_tfidf_topics['sop'])]