In [1]:
import pandas as pd
import pydot
import numpy as np
from gensim.models import TfidfModel
import src.situ_helper as situ
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the per-document cluster assignments.
# The 'sop' column is dropped immediately, so it is read as plain text instead
# of being run through `eval`: evaluating file contents that are then thrown
# away is wasted work and executes arbitrary expressions for no benefit.
type_df = pd.read_csv('data/interim/type_topics_kmeans_tfidf.csv').drop(columns='sop')
# type_df

In [3]:
# Load every situation/SOP record; empty cells stay as empty strings
# (keep_default_na=False) so they can be relabelled below.
# NOTE(review): `eval` executes whatever expression is stored in the 'sop'
# column. If this CSV can ever come from an untrusted source, switch to
# `ast.literal_eval` — confirm the stored format first.
situ_df = pd.read_csv(
    'data/interim/all_situation.csv',
    keep_default_na=False,
    converters={'sop': eval},
)

# Give rows without a situation title an explicit placeholder label.
situ_df['situation'] = situ_df['situation'].apply(
    lambda text: 'situation NA' if len(text) == 0 else text
)

# Keep only the call-taker rows and drop the columns that are not needed later.
is_call_taker = situ_df['role'] == 'call taker'
call_situ = situ_df[is_call_taker].drop(columns=['role', 'filename'])
call_situ = call_situ.copy().reset_index(drop=True)
del situ_df
call_situ

Unnamed: 0,type,juri,situation,sop
0,1033,AB,situation NA,"[Create a call, Questions, Are there weapons i..."
1,1033,BI,situation NA,"[Create a call, Questions, Are there weapons i..."
2,1033,BU,situation NA,"[Create a call, Questions, Are there weapons i..."
3,1033,DE,situation NA,"[Create a call, Questions, Are there weapons i..."
4,1033,DE,CBSA alarm policy,"[Listen to alarm, Acknowledge the alarm by pre..."
...,...,...,...,...
4529,DNA,VA,DNA Warrant:,[See WARRAN (Warrants) SOP]
4530,DNA,WP,situation NA,[This file type is not used.]
4531,DNA,WP,DNA Warrant:,[See WARRAN (Warrants) SOP]
4532,DNA,WV,situation NA,[This file type is not used.]


In [4]:
# Documents assigned to cluster 11 (the single cluster explored in the next cells).
in_cluster_11 = type_df['cluster'] == 11
type_0 = type_df[in_cluster_11]
type_0

Unnamed: 0,type,juri,role,filename,cluster
235,INTELL,BI,call taker,BI - INTELL - Intelligence Information.docx,11
236,INTELL,DFPF,call taker,DFPF - INTELL - Intelligence Information.docx,11
237,INTELL,NW,call taker,NW - INTELL - Intelligence Information.docx,11
238,INTELL,PO,call taker,PO - INTELL - Intelligence Information.docx,11
239,INTELL,RI,call taker,RI - INTELL - Intelligence Information.docx,11
240,INTELL,RM,call taker,RM - INTELL - Intelligence Information.docx,11
241,INTELL,SC,call taker,SC - INTELL - Intelligence Information.docx,11
242,INTELL,SQ,call taker,SQ - INTELL - Intelligence Information.docx,11
243,INTELL,SX,call taker,SX - INTELL - Intelligence Information.docx,11
244,INTELL,UN,call taker,UN - INTELL - Intelligence Information.docx,11


In [5]:
# For every (type, juri) document in the selected cluster, collect the matching
# call-taker situation rows, in the same order as the documents appear in type_0.
# The pieces are gathered in a list and concatenated once: `DataFrame.append`
# was removed in pandas 2.0, and growing a frame row-by-row is quadratic.
matches = []
for doc_type, juri in zip(type_0['type'], type_0['juri']):
    same_doc = (call_situ['type'] == doc_type) & (call_situ['juri'] == juri)
    matches.append(call_situ[same_doc])
df3 = pd.concat(matches).reset_index(drop=True)

# 'sop' holds a list of steps per row; join it into one newline-separated string.
df3['sop_str'] = df3['sop'].apply(lambda steps: '\n'.join(steps))
# Wrap each situation title in a one-element list (the form passed to
# situ.preprocess in the TF-IDF cell).
df3['situ_lst'] = df3['situation'].apply(lambda title: [title])
df3 = df3[['type', 'juri', 'situation', 'sop_str', 'situ_lst', 'sop']]
df3

Unnamed: 0,type,juri,situation,sop_str,situ_lst,sop
0,INTELL,BI,situation NA,Call type not used,[situation NA],[Call type not used]
1,INTELL,DFPF,situation NA,Call type not used,[situation NA],[Call type not used]
2,INTELL,NW,situation NA,Call type not used by call taker,[situation NA],[Call type not used by call taker]
3,INTELL,PO,situation NA,Call type is not used,[situation NA],[Call type is not used]
4,INTELL,RI,situation NA,Call type not used,[situation NA],[Call type not used]
5,INTELL,RM,situation NA,Call type not used,[situation NA],[Call type not used]
6,INTELL,SC,situation NA,Call type not used,[situation NA],[Call type not used]
7,INTELL,SQ,situation NA,Call type not used,[situation NA],[Call type not used]
8,INTELL,SX,situation NA,Call type not used,[situation NA],[Call type not used]
9,INTELL,UN,situation NA,Call type not used,[situation NA],[Call type not used]


In [6]:
# Situation titles: the vectorizer is fitted on the output of situ.preprocess
# (project helper, not shown here) and then applied to the raw title strings.
situ_tfidf = TfidfVectorizer()
situ_tfidf.fit(df3['situ_lst'].apply(situ.preprocess))
situ_tfidf_mtx = situ_tfidf.transform(df3['situation'])

# SOP text: same scheme — fit on the preprocessed step lists, transform the
# newline-joined SOP strings.
sop_tfidf = TfidfVectorizer()
sop_tfidf.fit(df3['sop'].apply(situ.preprocess))
sop_tfidf_mtx = sop_tfidf.transform(df3['sop_str'])

In [7]:
# Pairwise cosine similarity between the rows of each TF-IDF matrix.
situ_similarity, sop_similarity = (
    cosine_similarity(tfidf_mtx) for tfidf_mtx in (situ_tfidf_mtx, sop_tfidf_mtx)
)

In [8]:
# Similarity of the first situation against every situation in the cluster.
situ_similarity[0, :]

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [9]:
# Greedy grouping: walk the rows in order; every row not yet claimed becomes a
# group leader, and all rows whose similarity to it reaches the threshold take
# the leader's text as their group label (a later leader may relabel a row that
# an earlier leader already claimed).
# NOTE(review): the per-cluster loop in this notebook uses 0.45 — confirm that
# 0.245 is intentional here.
similarity_threshold = 0.245

# (similarity matrix, column holding the raw text, column receiving the label)
grouping_plan = (
    (situ_similarity, 'situation', 'situ_group'),
    (sop_similarity, 'sop_str', 'sop_group'),
)
for sim_mtx, text_col, group_col in grouping_plan:
    df3[group_col] = df3[text_col]
    claimed = set()
    for leader, sim_row in enumerate(sim_mtx):
        if leader in claimed:
            continue
        members = np.where(sim_row >= similarity_threshold)[0]
        claimed |= set(members)
        leader_text = df3.iloc[leader][text_col]
        for member in members:
            df3.at[int(member), group_col] = leader_text

# NOTE(review): df3 is overwritten with a reduced column set, so this cell
# cannot be re-run without first re-running the cell that builds df3.
df3 = df3[['type', 'situ_group', 'sop_group']]

# One edge per adjacent column pair of every row: type -> situ_group -> sop_group.
graph = pydot.Dot(graph_type='graph', concentrate=True, rankdir='LR')
for row_pos in range(len(df3)):
    path = df3.iloc[row_pos]
    for parent, child in zip(path[:-1], path[1:]):
        graph.add_edge(pydot.Edge(parent, child))
# NOTE(review): type_0 is filtered on cluster 11 in this notebook, yet the file
# is named type_cluster_1.png — confirm the intended file name.
graph.write_png("img/flowcharts/type_cluster_1.png")

In [10]:
def _matching_situations(docs, situations):
    """Collect the rows of `situations` that match each (type, juri) row of `docs`.

    Rows are returned in document order with a fresh 0..n-1 index. The pieces
    are concatenated once with `pd.concat`; `DataFrame.append` was removed in
    pandas 2.0.
    """
    pieces = [
        situations[(situations['type'] == doc_type) & (situations['juri'] == juri)]
        for doc_type, juri in zip(docs['type'], docs['juri'])
    ]
    return pd.concat(pieces).reset_index(drop=True)


def _group_labels(similarity, labels, threshold):
    """Greedily group rows of a square similarity matrix.

    Rows are visited in order. Each row that has not been claimed yet becomes a
    leader, and every row whose similarity to it is >= `threshold` receives the
    leader's label (a later leader may relabel a row claimed earlier). Rows
    that are never claimed keep their own label.

    Returns a list with one group label per row.
    """
    groups = list(labels)
    claimed = set()
    for leader in range(similarity.shape[0]):
        if leader in claimed:
            continue
        members = np.flatnonzero(similarity[leader] >= threshold)
        claimed.update(members.tolist())
        for member in members:
            groups[member] = labels[leader]
    return groups


similarity_threshold = 0.45

# Build one flowchart (type -> situation group -> SOP group) per cluster.
for cluster_id, cluster_docs in type_df.groupby('cluster'):
    cluster = int(cluster_id)

    df3 = _matching_situations(cluster_docs, call_situ)
    df3['sop_str'] = df3['sop'].apply(lambda steps: '\n'.join(steps))
    df3['situ_lst'] = df3['situation'].apply(lambda title: [title])
    df3 = df3[['type', 'juri', 'situation', 'sop_str', 'situ_lst', 'sop']]

    try:
        situ_tfidf = TfidfVectorizer().fit(df3['situ_lst'].apply(situ.preprocess))
        situ_tfidf_mtx = situ_tfidf.transform(df3['situation'])
        sop_tfidf = TfidfVectorizer().fit(df3['sop'].apply(situ.preprocess))
        sop_tfidf_mtx = sop_tfidf.transform(df3['sop_str'])
        situ_similarity = cosine_similarity(situ_tfidf_mtx)
        sop_similarity = cosine_similarity(sop_tfidf_mtx)
    except Exception as err:
        # Report and skip this cluster. Falling through would group its rows
        # with the similarity matrices left over from a previous cluster,
        # whose shape does not match this cluster's rows.
        print(f'Exception at cluster {cluster}')
        print(f'')
        print(df3['situ_lst'])
        print(f'{type(err).__name__}: {err}')
        continue

    df3 = df3.assign(
        situ_group=_group_labels(
            situ_similarity, df3['situation'].tolist(), similarity_threshold
        ),
        sop_group=_group_labels(
            sop_similarity, df3['sop_str'].tolist(), similarity_threshold
        ),
    )[['type', 'situ_group', 'sop_group']]

    # One edge per adjacent column pair of every row.
    graph = pydot.Dot(graph_type='graph', concentrate=True, rankdir='LR')
    for path in df3.itertuples(index=False, name=None):
        for parent, child in zip(path[:-1], path[1:]):
            graph.add_edge(pydot.Edge(parent, child))
    graph.write_png(f"img/flowcharts/type_cluster_{cluster}.png")

In [11]:
# Deliberate hard stop: raising here aborts execution at this cell, so the
# cells below it are not executed as part of a "Run All".
raise Exception('Stop here')

Exception: Stop here

In [None]:
# for i, df in df2.groupby('cluster'):
#     graph = pydot.Dot(graph_type='graph', rankdir='LR')
#     cluster = int(df.iloc[0]['cluster'])
#     df = df.drop(columns = 'cluster')
#     nrows, _ = df.shape
#     for i in range(nrows):
#         row = df.iloc[i]
#         for p, c in zip(row[:-1], row[1:]):
#             edge = pydot.Edge(p, c)
#             graph.add_edge(edge)
#     graph.write_png(f"img/flowcharts/situ_cluster_{cluster}.png")

In [None]:
# nrows, ncols = df2.shape
# for i in range(nrows):
#     row = df2.iloc[i]
#     for p, c in zip(row[:-1], row[1:]):
#         edge = pydot.Edge(p, c)
#         graph.add_edge(edge)

In [None]:
# graph.write_png('img/situ_clusters/graph_cluster1.png')

In [None]:
# for df in df2.groupby('cluster'):
#     print(df)

In [None]:
import flask
