In [1]:
import os
import json
from docx import Document
from io import StringIO, BytesIO
import re
import time
import datetime

import pandas as pd
import json
import spacy
from nltk.corpus import stopwords

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim import matutils, models
from gensim.models import CoherenceModel, TfidfModel, HdpModel
from gensim.models.phrases import Phrases, Phraser
import pyLDAvis.gensim

from sklearn.cluster import KMeans
from scipy.sparse import csc_matrix
from gensim.matutils import corpus2csc
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

from docx import Document
from io import StringIO, BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'font.size': 14, 'lines.linewidth': 3})

In [2]:
t0 = time.time()

In [3]:
nlp = spacy.load("en_core_web_sm")
# stop_words = set(stopwords.words('english'))

In [4]:
# Working directory at load time; not referenced again by the cells shown in this notebook.
notebook_dir = os.getcwd()
# The 'juri' and 'filename' cells are stored as text and turned back into Python
# objects by the converters (presumably list literals, judging by the preview below).
# NOTE(review): eval executes whatever expression a cell contains, so this is only
# safe for trusted CSV files; ast.literal_eval would parse plain literals without
# running code.
sop_df = pd.read_csv('../data/interim/sop_types_valid.csv', 
                     converters = {'juri': eval, 
                                   'filename': eval})

In [5]:
sop_df.head(3)

Unnamed: 0,type,juri,filename,juri_count
0,1033,"[AB, BI, BU, DE, DFPF, NW, PO, RI, RM, SC, SQ,...","[AB - 1033 - Officer in trouble.docx, BI - 103...",16
1,DOMI,"[AB, BI, BU, DE, DFPF, NW, PO, RI, RM, SC, SQ,...","[AB - DOMI - Domestic in progress.docx, BI - D...",16
2,FOUNDP,"[AB, BI, BU, DE, DFPF, NW, PO, RI, RM, SC, SQ,...","[AB - FOUNDP - Found Person.docx, BI - FOUNDP ...",16


In [6]:
type_list = sop_df['type']

In [7]:
def load_event_role(event_type, role):
    """Collect the SOP entries of one role for a single event type.

    Reads ``../data/sop_jsons/{event_type}.txt`` as JSON, iterates its
    ``juri -> {role: sop}`` items and keeps the entries that contain ``role``.
    File names come from the row of the module-level ``sop_df`` whose ``type``
    equals ``event_type``.

    Returns a DataFrame with the columns ``type``, ``juri``, ``role``, ``sop``
    and ``filename`` (one row per matching ``juri``).
    """
    with open(F'../data/sop_jsons/{event_type}.txt') as fh:
        sops_by_juri = json.load(fh)

    # The matching sop_df row stores parallel 'juri' / 'filename' sequences.
    row = sop_df[sop_df['type'] == event_type]
    filename_of = dict(zip(row['juri'].values[0], row['filename'].values[0]))

    # (juri, sop) pairs for every juri whose entry defines the requested role.
    matched = [(juri, role_sop[role])
               for juri, role_sop in sops_by_juri.items()
               if role in role_sop]

    df = pd.DataFrame({
        'type': [event_type] * len(matched),
        'juri': [juri for juri, _ in matched],
        'role': [role] * len(matched),
        'sop': [sop for _, sop in matched],
    })
    df['filename'] = df['juri'].apply(lambda juri: filename_of[juri])
    return df

In [8]:
def load_event_types_for_role(types, role):
    """Stack the per-event-type frames of ``load_event_role`` for one role.

    Parameters
    ----------
    types : iterable
        Event type identifiers; each is passed to ``load_event_role``.
    role : str
        Role whose SOP entries are collected.

    Returns
    -------
    pandas.DataFrame
        All per-type frames concatenated in input order, with a fresh
        0..n-1 index. An empty DataFrame when ``types`` is empty.
    """
    # Build every frame first and concatenate once: DataFrame.append copied the
    # accumulated frame on each iteration and no longer exists in pandas >= 2.0.
    frames = [load_event_role(event_type, role) for event_type in types]
    if not frames:
        # pd.concat raises on an empty list; keep the old empty-frame result.
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop = True)

In [9]:
try:
    # Reuse the cached frame when it has already been written to disk.
    # NOTE(review): eval executes the text stored in the 'sop' column; only safe
    # for a trusted cache file (ast.literal_eval would avoid code execution).
    calltaker_all = pd.read_csv(
        '../data/interim/calltaker_all.csv', 
        converters = {'sop': eval}
    )
except FileNotFoundError:
    # No cache yet: rebuild from the per-event-type JSON files. Any other failure
    # (corrupt cache, interrupt, bad cell content) is allowed to surface instead
    # of being silently hidden behind a rebuild.
    calltaker_all = load_event_types_for_role(type_list, 'call taker')

In [10]:
calltaker_all.head()#['sop'].tolist()[0]

Unnamed: 0,type,juri,role,sop,filename
0,1033,AB,call taker,"[Create a call, Questions, Are there weapons i...",AB - 1033 - Officer in trouble.docx
1,1033,BI,call taker,"[Create a call, Questions, Are there weapons i...",BI - 1033 - Officer in trouble.docx
2,1033,BU,call taker,"[Create a call, Questions, Are there weapons i...",BU - 1033 - Officer in trouble.docx
3,1033,DE,call taker,"[Create a call, Questions, Are there weapons i...",DE - 1033 - Officer in trouble.docx
4,1033,DFPF,call taker,"[Create a call, Questions, Are there weapons i...",DFPF - 1033 - Officer in trouble.docx


In [11]:
calltaker_all.shape

(1468, 5)

In [12]:
def preprocess(strlist,
               min_token_len = 2,
               allowed_pos = ['ADV', 'ADJ', 'VERB', 'NOUN', 'PART', 'NUM', 'PROPN']):
    """Turn a list of strings into one space-separated string of lemmas.

    Each string is lower-cased, has '-' and 'i.e.' replaced by a space, and is
    parsed with the module-level spaCy pipeline ``nlp``. A token's lemma is
    kept when its part-of-speech tag is in ``allowed_pos``, it is not a stop
    word (except for 'call', see below) and the lemma is strictly longer than
    ``min_token_len`` characters.
    """
    strip_pattern = re.compile(r"|".join(['-', r'i\.e\.']))
    # Spacy considers 'call' as a stop word, which is not suitable for our case
    keep_despite_stoplist = {'call'}

    def wanted(token):
        # Reject on the first failing criterion.
        if token.pos_ not in allowed_pos:
            return False
        if token.is_stop and token.text not in keep_despite_stoplist:
            return False
        return len(token.lemma_) > min_token_len

    lemmas = []
    for string in strlist:
        doc = nlp(strip_pattern.sub(' ', string.lower()))
        lemmas.extend(token.lemma_ for token in doc if wanted(token))

    return ' '.join(lemmas)

In [13]:
def get_dct_dtmatrix(sops):
    """Build the bag-of-words representation of a collection of SOPs.

    Every element of ``sops`` is run through ``preprocess`` and split on
    whitespace into tokens. A gensim ``Dictionary`` is fitted on the tokenised
    documents and each document is converted with ``doc2bow``.

    Returns the tuple ``(doc_term_matrix, corpus, dictionary)``: the list of
    bag-of-words documents, the tokenised documents and the fitted dictionary.
    """
    tokenized_docs = [preprocess(sop).split() for sop in sops]
    vocab = corpora.Dictionary(tokenized_docs)
    bows = [vocab.doc2bow(tokens) for tokens in tokenized_docs]
    return bows, tokenized_docs, vocab

In [14]:
doc_term_bow, corpus, dictionary = get_dct_dtmatrix(calltaker_all['sop'])

### wait here

In [15]:
def bow2tfidf(doc_term_bow, corpus_tfidf):
    """Convert bag-of-words documents into a dense tf-idf array.

    ``corpus_tfidf`` is applied to ``doc_term_bow`` by indexing, the result is
    turned into a sparse matrix with ``corpus2csc`` (number of terms taken
    from ``len(corpus_tfidf.idfs)``), transposed and densified. The shape of
    the resulting array is printed before it is returned.
    """
    weighted_docs = corpus_tfidf[doc_term_bow]
    vocab_size = len(corpus_tfidf.idfs)
    sparse_tfidf = corpus2csc(weighted_docs, num_terms = vocab_size)
    # Transpose before densifying, exactly as the callers expect.
    tfidf_mtx = csc_matrix(sparse_tfidf).T.toarray()
    print(f'The dimensions of tfidf matrix = {tfidf_mtx.shape}')
    return tfidf_mtx

In [16]:
tfidf_type = TfidfModel(doc_term_bow)
tfidf_mtx = bow2tfidf(doc_term_bow, tfidf_type)

The dimensions of tfidf matrix = (1468, 1954)


In [17]:
def save_df(df, name, prefix = '../data/interim/'):
    """Write ``df`` as CSV to the path ``prefix + name``, omitting the index."""
    df.to_csv(prefix + name, index = False)

In [18]:
save_df(calltaker_all, 'calltaker_all.csv')

### Finding the best `n_clusters` for KMeans
> Use the code below to fit multiple models with different `n_clusters`:
```python
kms = list()
for k in range(1, 400):
    km = KMeans(n_clusters = k).fit(tfidf_mtx)
    kms.append(km)
```
  
> Use the code below to find the elbow of the inertia curve:
```python
fig, ax = plt.subplots(1, 1, figsize = (12, 4))
ax.plot(range(1, 200), 
        [km.inertia_ for km in kms[:199]])
ax.set_xlabel('k')
ax.set_ylabel('inertia')
ax.set_title(f'Inertia vs Number of Clusters')
ax.xaxis.set_major_locator(MaxNLocator(integer = True))
ax.grid()
plt.axvline(x = 102, c = 'red', lw = 1)
plt.show()
```
![](img/type_inertias.PNG)

> Use the code below to find the shoulder of the silhouette scores:
```python
fig, ax = plt.subplots(1, 1, figsize = (12, 4))
ax.plot(range(2, 200), 
        [silhouette_score(tfidf_mtx, kms[k-1].labels_) for k in range(2, 200)])
ax.set_xlabel('k')
ax.set_ylabel('silhouette_score')
ax.set_title(f'Silhouette Score vs Number of Clusters')
ax.xaxis.set_major_locator(MaxNLocator(integer = True))
ax.grid()
plt.axvline(x = 102, c = 'red', lw = 1)
plt.show()
```
![](img/type_silhouettes.PNG)

#### Based on the plots of inertias and Silhouette scores, we choose k = 102 for cross-type clustering

In [19]:
# kms = list()
# for k in range(1, 400):
#     km = KMeans(n_clusters = k).fit(tfidf_mtx)
#     kms.append(km)

In [20]:
# k = 102 was picked from the inertia / silhouette plots described above.
# The seed is fixed because KMeans initialisation is random: without it the
# cluster ids written to type_topics_kmeans_tfidf.csv change on every run.
km_alltype = KMeans(n_clusters = 102, random_state = 42).fit(tfidf_mtx)

In [21]:
# Attach each row's KMeans label on a new frame (calltaker_all is left untouched)
# and order the rows by cluster, then type, then juri, with a fresh 0..n-1 index.
type_topics_kmeans_tfidf = (
    calltaker_all
    .assign(cluster = km_alltype.labels_)
    .sort_values(by = ['cluster', 'type', 'juri'], ignore_index = True)
)

In [22]:
type_topics_kmeans_tfidf#[situ_topics_kmeans_tfidf['cluster'] == 91]['situation'].tolist()

Unnamed: 0,type,juri,role,sop,filename,cluster
0,HOSTAG,AB,call taker,"[Create a call, If speaking with the suspect:,...",AB - HOSTAG - Hostage Situation.docx,0
1,HOSTAG,BI,call taker,"[Create a call, If speaking with the suspect:,...",BI - HOSTAG - Hostage Situation.docx,0
2,HOSTAG,BU,call taker,"[Create a call, If speaking with the suspect:,...",BU - HOSTAG - Hostage Situation.docx,0
3,HOSTAG,DE,call taker,"[Create a call, If speaking with the suspect:,...",DE - HOSTAG - Hostage Situation.docx,0
4,HOSTAG,NW,call taker,"[Create a call, If speaking with the suspect:,...",NW - HOSTAG - Hostage Situation.docx,0
...,...,...,...,...,...,...
1463,ABANDV,SX,call taker,[Query the plate (if supplied) on cpic and con...,SX - ABANDV - Abandoned vehicle.docx,101
1464,ABANDV,UN,call taker,[Query the plate (if supplied) on CPIC and con...,UN - ABANDV - Abandoned vehicle.docx,101
1465,ABANDV,VA,call taker,[Query the plate (if supplied) on CPIC and con...,VA - ABANDV - Abandoned vehicle.docx,101
1466,ABANDV,WP,call taker,"[Pemberton, Query the plate (if supplied) on C...",WP - ABANDV - Abandoned vehicle.docx,101


In [23]:
# Same destination and options as before, written through the notebook's save_df helper.
save_df(type_topics_kmeans_tfidf, 'type_topics_kmeans_tfidf.csv')

In [24]:
# situ_clusters = situ_topics_kmeans_tfidf.groupby('cluster').agg({
#     'situation': lambda x: list(x)
# })
# situ_clusters

In [25]:
cluster_bow, cluster_corpus, cluster_dict = get_dct_dtmatrix(type_topics_kmeans_tfidf['sop'])

In [26]:
len(cluster_dict.token2id)

1954

In [27]:
len(cluster_bow)

1468

In [28]:
tfidf_cluster = TfidfModel(cluster_bow)
tfidf_mtx_cluster = bow2tfidf(cluster_bow, tfidf_cluster)

The dimensions of tfidf matrix = (1468, 1954)


In [29]:
def show_result(query_mtx, corpus_mtx, N = 20):
    """Return the rows of ``type_topics_kmeans_tfidf`` most similar to a query.

    Cosine similarity is computed between the first row of ``query_mtx`` and
    every row of ``corpus_mtx``. Rows are visited from most to least similar
    and collected until ``N`` rows have been taken or a similarity below 0.02
    is reached. The selection is returned as a copy, in ranked order.
    """
    scores = cosine_similarity(query_mtx, corpus_mtx)[0]
    ranked_rows = scores.argsort()[::-1]  # best match first

    selected = []
    for rank, row in enumerate(ranked_rows):
        if rank >= N or scores[row] < 0.02:
            break
        selected.append(row)
    return type_topics_kmeans_tfidf.iloc[selected].copy()

In [30]:
def query_situation(query, corpus_tfidf, corpus_dct, corpus_mtx, N = 20):
    """Look up the SOP rows that best match a free-text query.

    The query string goes through ``preprocess``, is mapped to a bag of words
    with ``corpus_dct.doc2bow``, converted to tf-idf by ``bow2tfidf`` using
    ``corpus_tfidf``, and ranked against ``corpus_mtx`` by ``show_result``
    (at most ``N`` rows).
    """
    query_tokens = preprocess([query]).split()
    query_bow = [corpus_dct.doc2bow(query_tokens)]
    query_vec = bow2tfidf(query_bow, corpus_tfidf)
    return show_result(query_vec, corpus_mtx, N = N)

In [31]:
# doc_term_bow, corpus, dictionary = get_dct_dtmatrix(calltaker_all['sop'])
# tfidf_type = TfidfModel(doc_term_bow)
# tfidf_mtx = bow2tfidf(doc_term_bow, tfidf_type)

In [32]:
query = 'missing passport'
query1 = 'lost passport'
query_result = query_situation(query, tfidf_cluster, cluster_dict, tfidf_mtx_cluster)
query_result

The dimensions of tfidf matrix = (1, 1954)


Unnamed: 0,type,juri,role,sop,filename,cluster
1131,PROP,AB,call taker,"[Located international passport, Caller wants ...","AB - PROP - Property lost, found or seized.docx",74
1133,PROP,BU,call taker,"[Located international passport, Caller wants ...","BU - PROP - Property lost, found or seized.docx",74
1140,PROP,SQ,call taker,"[Located international passport, Caller wants ...","SQ- PROP - Property lost, found or seized.docx",74
1132,PROP,BI,call taker,"[Located international passport, Caller wants ...","BI - PROP - Property lost, found or seized.docx",74
1139,PROP,SC,call taker,"[Located international passport, Caller wants ...","SC - PROP - Property lost, found or seized.docx",74
1142,PROP,UN,call taker,"[Located international passport, Caller wants ...","UN - PROP - Property lost, found or seized.docx",74
1141,PROP,SX,call taker,"[Located international passport, Caller wants ...","SX - PROP - Property lost, found or seized.docx",74
1144,PROP,WP,call taker,"[Located international passport, Caller wants ...","WP - PROP - Property lost, found or seized.docx",74
1135,PROP,NW,call taker,"[Located international passport, Caller wants ...","NW - PROP - Property lost, found or seized.docx",74
1134,PROP,DE,call taker,"[Located international passport, Caller wants ...","DE - PROP - Property lost, found or seized.docx",74


In [33]:
# query = 'lost license plate'
# query_corpus = [preprocess([query]).split()]
# query_bow = [cluster_dict.doc2bow(doc) for doc in query_corpus]
# query_tfidf_mtx = bow2tfidf(query_bow, tfidf_cluster)
# sim = cosine_similarity(query_tfidf_mtx, tfidf_mtx_cluster)[0]
# cluster_sorted = zip(sim.argsort()[::-1], sorted(sim)[::-1])
# idx = list()
# for cnt, (i, prob) in enumerate(cluster_sorted):
#     if prob < 0.05 or cnt > 10:
#         break
#     idx.append(i)
# idx

In [34]:
# situ_topics_kmeans_tfidf.iloc[idx]

In [35]:
raise Exception('Stop here')

Exception: Stop here

In [None]:
def func1(patn, lst):
    reg = re.findall(patn, ' '.join(lst), re.IGNORECASE)
    if reg:
        return True
    else:
        return False
    
def find_srs_pattern(patn, pd_srs):
    """Apply ``func1(patn, item)`` to every item of ``pd_srs``.

    Returns the results as a list of booleans in iteration order, suitable as
    a boolean mask for the frame the items came from.
    """
    return [func1(patn, item) for item in pd_srs]

# Rows whose SOP text mentions 'domi' (case-insensitive).
# The original cell indexed `situ_lda_tfidf_topics`, which is never defined in
# this notebook; the clustered call-taker frame is the one here with a 'sop' column.
type_topics_kmeans_tfidf[find_srs_pattern('domi', type_topics_kmeans_tfidf['sop'])]