In [1]:
import os
import json
from docx import Document
from io import StringIO, BytesIO
import re
import time
import datetime

import pandas as pd
import json
import spacy
from nltk.corpus import stopwords

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim import matutils, models
from gensim.models import CoherenceModel, TfidfModel, HdpModel
from gensim.models.phrases import Phrases, Phraser
import pyLDAvis.gensim

from docx import Document
from io import StringIO, BytesIO
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams.update({'font.size': 14, 'lines.linewidth': 3})

In [2]:
nlp = spacy.load("en_core_web_sm")
# stop_words = set(stopwords.words('english'))

In [3]:
notebook_dir = os.getcwd()
situ_df = pd.read_csv('../data/interim/all_situation.csv', 
                      keep_default_na = False, 
                     converters = {#'situation': eval, 
                                   'sop': eval})

In [4]:
situ_df = situ_df[ 
    (situ_df['role'] == 'call taker') 
    & (situ_df['situation'].str.len() > 0)
].reset_index(drop = True)
situ_df

Unnamed: 0,role,situation,sop,filename
0,call taker,CBSA alarm policy,"[Listen to alarm, Acknowledge the alarm by pre...",DE - 1033 - Officer in trouble.docx
1,call taker,All other DOMI reports,[Create a call in every instance. Do not cance...,AB - DOMI - Domestic in progress.docx
2,call taker,GPS Panic Alarms,"[Create a call, Remain on the line recording a...",AB - DOMI - Domestic in progress.docx
3,call taker,Address Obtained,"[Run address on QBA:, If the person is negativ...",AB - FOUNDP - Found Person.docx
4,call taker,Name or medic alert is known,"[Run name on CPIC, If the person is negative o...",AB - FOUNDP - Found Person.docx
...,...,...,...,...
3589,call taker,All Other DVERS personal residential alarms,"[Create a call, Check hazards which will show ...",AB - ALARMD - Domestic violence alarm.docx
3590,call taker,DVERS Alarm maintenance,[See “Assist GP” for a stand by keep the peac...,AB - ALARMD - Domestic violence alarm.docx
3591,call taker,GPS tracked Panic Alarms,[See AB-DOMI],AB - ALARMD - Domestic violence alarm.docx
3592,call taker,DNA Warrant:,[See WARRAN (Warrants) SOP],NW - DNA - DNA collection .docx


In [5]:
' '.join(situ_df.iloc[0, :]['sop'])

'Listen to alarm Acknowledge the alarm by pressing 911 on the telephone Create a call at 4 56 Street'

In [6]:
def preprocess(text,
               min_token_len = 2,
               allowed_pos = ['ADV', 'ADJ', 'VERB', 'NOUN', 'PART', 'PROPN']): 
    removal = ['-', r'i\.e\.']
    res = list()

    text = re.sub(r"|".join(removal), ' ', text.lower())
    doc = nlp(text)
    res += [token.lemma_ for token in doc \
           if token.pos_ in allowed_pos \
           # Spacy considers 'call' as a stop word, which is not suitable for our case
           and not token.is_stop \
#                and token.text not in stop_words \              
#                and token.is_alpha \
           and len(token.lemma_) > min_token_len
           ]
    
    return ' '.join(res)

In [7]:
def get_dct_dtmatrix(sops):
    corpus = [sop.split() for sop in map(preprocess, sops)]
#     phrases = Phrases(corpus, min_count = 1, threshold = 1)
#     bigram = Phraser(phrases)
#     corpus = bigram(corpus)
    dictionary = corpora.Dictionary(corpus)
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]
    return doc_term_matrix, corpus, dictionary

In [8]:
doc_term_bow, corpus, dictionary = get_dct_dtmatrix(situ_df['situation'])

In [9]:
tfidf_mod = TfidfModel(doc_term_bow)
doc_term_tfidf = tfidf_mod[doc_term_bow]

In [10]:
def save_df(df, name):
    filename = '../data/interim/' + name
    df.to_csv(filename, index = False)

In [11]:
hdp_tfidf_k15_t150_a1_g1 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, random_state = 2020)

In [12]:
hdp_tfidf_k15_t150_a1_g01 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, gamma = 0.1, random_state = 2020)

In [13]:
hdp_tfidf_k15_t150_a05_g01 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, 
                                      alpha = 0.5, gamma = 0.1, random_state = 2020)

In [14]:
hdp_tfidf_k15_t300_a05_g01 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, T = 300, 
                                      alpha = 0.5, gamma = 0.1, random_state = 2020)

In [15]:
hdp_tfidf_k30_t300_a05_g01 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, K = 30, T = 300, 
                                      alpha = 0.5, gamma = 0.1, random_state = 2020)

In [16]:
hdp_tfidf_k30_t300_a01_g01 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, K = 30, T = 300, 
                                      alpha = 0.1, gamma = 0.1, random_state = 2020)

In [17]:
hdp_tfidf_k30_t300_a1_g1 = HdpModel(corpus = doc_term_tfidf, id2word=dictionary, K = 30, T = 300, 
                                      random_state = 2020)

In [18]:
# coherence_hdp_tfidf = CoherenceModel(model=hdp_tfidf, texts=corpus, dictionary=dictionary, coherence='c_v')

In [19]:
# coherence_hdp_tfidf.get_coherence()

In [20]:
def get_topic(model, doc, md_type):
    ppdoc = preprocess(doc)
    doc_term_arr = dictionary.doc2bow(ppdoc.split())
    if md_type == 'tfidf':
        doc_term_arr = tfidf_mod[doc_term_arr]
    try:
        res = sorted(model[doc_term_arr], 
                      key = lambda x: x[1], 
                      reverse = True)[0][0]
    except:
        print(doc)
        raise
    return res

In [21]:
def get_topic2(model, doc, md_type):
    ppdoc = preprocess(doc)
    doc_term_arr = dictionary.doc2bow(ppdoc.split())
    if md_type == 'tfidf':
        doc_term_arr = tfidf_mod[doc_term_arr]
    return sorted(model[doc_term_arr], 
                  key = lambda x: x[1], 
                  reverse = True)

In [25]:
def get_topic_clusters(model, md_type):
    df = situ_df.copy()
    df['topic_id'] = list(map(lambda x: get_topic(model, x, md_type), 
                              df['situation'].values.tolist()))
    return df

In [34]:
situ_df['situation'].str.contains(r'Over \$5000').sum()

18

In [35]:
preprocess('Over $5000')

''

In [29]:
situ_df[situ_df['situation'] == 'Over $5000']

Unnamed: 0,role,situation,sop,filename
2161,call taker,Over $5000,[Create a call],BI - THEFT.docx
2182,call taker,Over $5000,[Create a call],DE - THEFT.docx
2206,call taker,Over $5000,[Create a call],PO - THEFT.docx
2227,call taker,Over $5000,[Create a call],RM - THEFT.docx
2242,call taker,Over $5000,[Create a call],SC - THEFT.docx
2257,call taker,Over $5000,[Create a call],SQ - THEFT.docx
2272,call taker,Over $5000,[Create a call],SX - THEFT.docx
2287,call taker,Over $5000,[Create a call],UN - THEFT.docx
2310,call taker,Over $5000,[Create a call],WP - THEFT.docx
2325,call taker,Over $5000,[Create a call],WV - THEFT.docx


In [26]:
situ_topics_hdp_tfidf = get_topic_clusters(hdp_tfidf_k30_t300_a1_g1, 'tfidf')
situ_topics_hdp_tfidf

Over $5000


IndexError: list index out of range

In [None]:
situ_topics_hdp_tfidf = situ_topics_hdp_tfidf \
                            .sort_values(by = ['topic_id'], ignore_index = True)
situ_topics_hdp_tfidf#[situ_topics_hdp_tfidf['filename'].str.contains('Hit and Run')]

In [None]:
raise Exception('stop here')

## Do not change anything below

In [None]:
raise Exception('Stop here')

In [None]:
calltaker_topic = calltaker_all.copy()
calltaker_topic['topic_id'] = list(map(lambda x: get_topic(lda_20, x), 
                                                        calltaker_topic['sop'].values.tolist()))
calltaker_topic[calltaker_topic['type'] == '1033']

In [None]:
calltaker_topic = calltaker_topic.sort_values(by = ['topic_id', 'type', 'juri'], ignore_index = True)
calltaker_topic

In [None]:
call_6 = calltaker_topic[calltaker_topic['topic_id'] == 6]
call_6

In [None]:
calltaker_topic['topic_id'].unique()

In [None]:
unwant = calltaker_topic[calltaker_topic['type'] == 'UNWANT']
unwant

In [None]:
unwant['sop'].values.tolist()[-2:]

In [None]:
call_6['sop'].values.tolist()[0]

In [None]:
sents = call_6['sop'].tolist()[2]
sents[1:3]

In [None]:
def get_entities(sent):
    ent1 = ''
    ent2 = ''
    prv_tok_dep = ''
    prv_tok_txt = ''
    prefix = ''
    mod = ''
    for tok in nlp(sent):
        if tok.dep_ != 'punct':
            if tok.dep_ == 'compound':
                prefix = tok.text
                if prv_tok_dep == 'compound':
                    prefix = prv_tok_text + ' ' + tok.text
            if tok.dep_.endswith('mod'):
                modifier = tok.text
                if prv_tok_dep == 'compound':
                    modifier = prv_tok_text + ' ' + tok.text
            if tok.dep_.find('sub'):
                ent1 = modifier + ' ' + prefix + ' ' + tok.text
                prefix = ''
                modifier = ''
                prv_tok_dep = ''
                prv_tok_text = ''
            if tok.dep_.find('obj'):
                ent2 = modifier + ' ' + prefix + ' ' + tok.text
            
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
        return ent1.strip(), ent2.strip()

In [None]:
# df_call_withtopic = df_dispatcher.copy()
# df_call_withtopic.loc[:, 'topic_id'] = list(map(lambda x: get_topic(call_model_cv, x), 
#                                                 df_calltaker['sop'].values.tolist()))
# df_call_withtopic = df_call_withtopic.sort_values(by = ['topic_id', 'juri'], ignore_index = True)

In [None]:
# df_call_withtopic

In [None]:
# empty = pd.DataFrame()
# df1 = pd.DataFrame({'type': ['type1', 'type2'], 'value': [1, 2]})
# empty = empty.append(df1)
# empty = empty.append(df1)
# empty

#### Reflection of DRUGS coherence score
- the coherence score is very high for the one-topic model
- this makes sense, because we are looking at docs under the same type "DRUGS"

#### Question
- While the model assigns the documents with the correct topic, does this necessarily mean the documents are similar enough to be consolicated?
- LDA in not stable.  How will this instability affect us?

In [None]:
type_list = sop_df['type'].values.tolist()
type_list[0]

In [None]:
type_list = sop_df['type']
res = pd.DataFrame()
for event_type in type_list:
    dct = load_event(event_type)
    event_row = sop_df[sop_df['type'] == event_type]
    juri_to_filename = dict(zip(event_row['juri'].values[0], 
                            event_row['filename'].values[0]))
    juris, roles, sops, types = list(), list(), list(), list()
    for juri, role_sop in dct.items():
        for role, sop in role_sop.items():
            juris.append(juri)
            roles.append(role)
            sops.append(sop)
            types.append(event_type)
    typedf = pd.DataFrame({'type': types, 'juri': juris, 'role': roles, 'sop': sops})
    typedf['filename'] = typedf['juri'].apply(lambda x: juri_to_filename[x])
    df_calltaker = typedf[typedf['role'] == 'call taker']
    df_dispatcher = typedf[typedf['role'] == 'dispatcher']
    print(df_calltaker.shape)
    print(df_dispatcher.shape)

    for df in [df_calltaker, df_dispatcher]:
        if len(df) == 0:
            continue
        print('Start working on:', event_type, df['role'].unique())
        doc_term_matrix, corpus, dictionary = get_dct_dtmatrix(df['sop'])
        coherence_cv = topics_with_coherence(doc_term_matrix, corpus, dictionary, 
                                            df['sop'].values.tolist())
        best_model_cv = coherence_cv.iloc[1:, :].sort_values('coherence_score')['model'].tolist()[-1]
        df_with_topic = df.copy()
        df_with_topic.loc[:, 'topic_id'] = list(map(lambda x: get_topic(best_model_cv, x), 
                                                        df['sop'].values.tolist()))
        df_with_topic = df_with_topic.sort_values(by = ['topic_id', 'juri'], ignore_index = True)
        res = res.append(df_with_topic)
        print('Finish working on:', event_type, df['role'].unique())

In [None]:
ress = res.reset_index(drop = True)
ress

In [None]:
from datetime import datetime 
dt = datetime.now().strftime('%Y-%m-%dT%H_%M_%S')
cwd = os.getcwd()
os.chdir(notebook_dir)
ress.to_csv(f'../data/interim/sop_topics_{dt}.csv', index = False)
os.chdir(cwd)

In [None]:
print(type_list.values.tolist())

In [None]:
ress[ (ress['type'] == 'MISCH') & (ress['role'] == 'call taker')]

In [None]:
ress[ (ress['type'] == 'MISCH') & (ress['role'] == 'dispatcher')]

In [None]:
ress[ (ress['type'] == 'ANIMAL') & (ress['role'] == 'call taker')]

In [None]:
ress[ (ress['type'] == 'DRUGS') & (ress['role'] == 'call taker')]

In [None]:
ress[ (ress['type'] == 'DRUGS') & (ress['role'] == 'call taker')]['sop'].values.tolist()[0]

In [None]:
# all_coherence = topics_with_coherence(dt_matrix_all, corpus_all, dictionary_all, N = 20)

In [None]:
# all_coherence

In [None]:
# plt.figure(figsize = (12, 8))
# plt.plot(all_coherence.loc[:, 'num_topic'].values, all_coherence.loc[:, 'coherence_score'].values)
# plt.show()