In [None]:
import pip


def import_or_install(package):
    """Import ``package``; if the import fails, try to install it with pip.

    Args:
        package: Importable module path, e.g. ``'pandas'`` or
            ``'gensim.models'``.

    Returns:
        None. The function is best-effort: a failed installation is reported
        on stdout but never raised, so a bootstrap loop over many packages
        keeps going.
    """
    # Local imports keep this helper self-contained in the bootstrap cell.
    import subprocess
    import sys

    try:
        __import__(package)
    except ImportError:
        # pip installs distributions, not dotted module paths, so only the
        # top-level name is handed over ('gensim.models' -> 'gensim').
        distribution = package.split('.')[0]
        # `pip.main` is no longer part of pip's public API; running
        # `python -m pip` with the kernel's own interpreter is the supported
        # way to install into the active environment.
        exit_code = subprocess.call([sys.executable, '-m', 'pip', 'install', distribution])
        if exit_code != 0:
            print('could not install ' + distribution + ' (pip exit code ' + str(exit_code) + ')')


# Module paths that are import-checked (and pip-installed on ImportError) before
# the real import cell runs. Order is kept as-is because installs happen in
# list order.
# NOTE(review): each entry is handed to pip unchanged by `import_or_install`,
# so dotted entries and entries whose PyPI distribution name differs from the
# import name may not install as intended - verify. `sklearn`, `textblob` and
# `dash_cytoscape` are imported further down but are not listed here.
packages = [
    'PyQt6',
    'nltk',
    'pyvis',
    'matplotlib_inline',
    'matplotlib',
    'gensim.corpora',
    'gensim.utils',
    'gensim.models',
    'matplotlib_inline.pyplot',
    'pandas',
    'numpy',
    'umap',
    'sqlite3',
    'spacy',
    'win32com.client',
    'datetime',
    'pyvis.network',
    'matplotlib.pyplot',
    'plotly.graph_objects',
    'scipy',
    'networkx',
    'dash',
    'gensim',
    'logging',
    'warnings',
    'nltk.corpus',
]

# Try every entry once; anything already importable is skipped by the helper.
for package in packages:
    import_or_install(package)

print('done with package installation')

import nltk
import sqlite3

nltk.download('stopwords')
import pandas as pd
from pandas import Series
import numpy as np
import win32com.client
from datetime import datetime, timedelta
from dash import html
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from textblob import TextBlob
import random
import dash
import json
from dash.dependencies import Input, Output
from dash import dcc
from dash import html
from dash import dash_table
import dash_cytoscape as cyto

nltk.download('omw-1.4')

# Stop-word lists used by later cells.
# `stop_words`: NLTK's English list plus a few e-mail specific extras.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']
# NOTE(review): this rebinds the name `stopwords` from the NLTK corpus reader
# imported above to a plain list of English stop words (without the extras).
# `NetworkMessages.normalize_df_col` filters against this list, not against
# `stop_words` - confirm that is intended.
stopwords = nltk.corpus.stopwords.words('english')


In [None]:

def drop_nas_df(df):
    """Return a copy of ``df`` without rows that contain empty strings or NaN.

    Empty strings are treated as missing values. The input frame is left
    untouched, so calling this function has no side effect on the caller's
    data.

    Args:
        df: pandas DataFrame to clean.

    Returns:
        A new DataFrame holding only the rows in which every column has a
        non-empty, non-missing value.
    """
    return df.replace("", float("NaN")).dropna()


class NetworkMessages:
    """Topic-model a table of messages and derive graph node/edge tables.

    The class itself only initialises ``messages`` (to an empty list). Every
    method below treats ``messages`` and ``recipients`` as pandas DataFrames,
    so a subclass or caller has to assign them before the pipeline is run.

    Attributes written by the methods:
        messages: message table; gains ``topic`` (and later ``color`` /
            ``topic_text``) columns.
        recipients: sender/recipient table, later aggregated per
            sender/recipient/topic/color.
        topic_results: document-topic matrix returned by the NMF model.
        topic_color_df: one row per topic with ``topic``, ``color`` and
            ``topic_text``.
        df_nodes / df_edges: tables consumed by the graph builders.
    """

    def __init__(self):
        self.messages = []

    def lemmatize_with_postag(self, sentence):
        """Lemmatize each word of ``sentence`` using its part-of-speech tag.

        The first letter of the tag reported by TextBlob is mapped to a
        lemmatizer tag (adjective, noun, verb, adverb); unknown tags are
        treated as nouns.

        Args:
            sentence: text to lemmatize.

        Returns:
            The lemmatized words joined by single spaces.
        """
        tag_dict = {"J": 'a',
                    "N": 'n',
                    "V": 'v',
                    "R": 'r'}
        return " ".join(
            word.lemmatize(tag_dict.get(pos[0], 'n'))
            for word, pos in TextBlob(sentence).tags
        )

    def normalize_df_col(self, col):
        """Lower-case, de-punctuate, stop-word filter and lemmatize a text column.

        Args:
            col: name of the text column in ``self.messages``.

        Returns:
            ``self.messages`` with ``col`` replaced by the normalised text.
        """
        # Set membership instead of scanning the module-level list per word.
        stop_set = set(stopwords)

        text = self.messages[col].str.lower()
        # regex=True is required: since pandas 2.0 `str.replace` treats the
        # pattern as a literal string by default, which would leave all
        # punctuation in place.
        text = text.str.replace(r'[^\w\s]+', '', regex=True)
        text = text.str.replace(r'[\n\r]', ' ', regex=True)
        # Underscores count as word characters above, so drop them separately.
        text = text.str.replace('_', '', regex=False)
        text = text.str.strip()
        # should look at tokenizing instead of splitting maybe?
        text = text.apply(lambda doc: ' '.join(word for word in doc.split() if word not in stop_set))
        self.messages[col] = text.apply(self.lemmatize_with_postag)
        return self.messages

    def build_topics(self, topic_count, col):
        """Fit a TF-IDF + NMF topic model and label every message with a topic.

        Args:
            topic_count: number of NMF components (topics).
            col: name of the (normalised) text column in ``self.messages``.

        Returns:
            Tuple ``(messages, topic_color_df, topic_results)``: the message
            table with a new ``topic`` column, the per-topic colour/description
            table and the document-topic matrix.
        """
        tfidf = TfidfVectorizer(ngram_range=(1, 2), sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                                stop_words='english')
        dtm = tfidf.fit_transform(self.messages[col])
        nmf_model = NMF(n_components=topic_count, random_state=42)
        nmf_model.fit(dtm)

        # Fetch the vocabulary once instead of once per listed term.
        feature_names = tfidf.get_feature_names_out()
        # NOTE(review): the number of terms listed per topic is also
        # `topic_count`; kept as in the original - confirm this is intended.
        topics_dict = {
            index: str([feature_names[i] for i in topic.argsort()[-topic_count:]])
            for index, topic in enumerate(nmf_model.components_)
        }
        topic_text_df = pd.DataFrame({
            'topic': list(topics_dict.keys()),
            'topic_text': list(topics_dict.values()),
        })

        self.topic_results = nmf_model.transform(dtm)
        # Each message is assigned the topic with the largest weight.
        self.messages['topic'] = self.topic_results.argmax(axis=1)

        # One random hex colour per topic (not seeded here, so colours differ
        # between runs unless the caller seeds `random`).
        topic_colors = {
            topic_id: "#" + "%06x" % random.randint(0, 0xFFFFFF)
            for topic_id in range(topic_count)
        }
        self.topic_color_df = pd.DataFrame(list(topic_colors.items()), columns=['topic', 'color'])
        self.topic_color_df = self.topic_color_df.merge(topic_text_df, on='topic', how='left')
        return self.messages, self.topic_color_df, self.topic_results

    def merge_topic_colors(self):
        """Attach topic colours to the messages and join them to the recipients.

        Returns:
            Tuple ``(messages, recipients)`` after the merges. ``recipients``
            becomes the message table left-joined with the recipient rows on
            ``entryID``.
        """
        print('merging topic colors')
        self.messages = self.messages.merge(self.topic_color_df, on='topic', how='left')
        self.recipients = self.messages.merge(self.recipients, on='entryID', how='left')
        self.recipients = self.recipients.reset_index(drop=True)
        return self.messages, self.recipients

    def group_recipients(self):
        """Aggregate recipient rows into weighted edges.

        Rows are counted per (sender, recipient, topic, color); groups that
        occur exactly once are dropped.

        Returns:
            The aggregated table with an added ``size`` column.
        """
        # must include color
        grouped = (
            self.recipients
            .groupby(['sender', 'recipient', 'topic', 'color'])
            .size()
            .reset_index(name='size')
        )
        self.recipients = grouped.loc[grouped['size'] != 1]
        return self.recipients

    def pipeline_topic_colors(self):
        """Run colour merge, edge aggregation and missing-value cleanup in order.

        Returns:
            Tuple ``(messages, recipients)``.
        """
        self.messages, self.recipients = self.merge_topic_colors()
        self.recipients = self.group_recipients()
        self.recipients = drop_nas_df(self.recipients)
        return self.messages, self.recipients

    def generate_node_dataframe(self):
        """Build the node table from the sender and recipient columns.

        Returns:
            DataFrame with columns ``individual`` and ``count``, where
            ``count`` is the number of rows of ``self.recipients`` in which
            the individual appears as sender or recipient.
        """
        print("Generating node DF")
        # `Series.append` no longer exists in pandas 2.x; `pd.concat` works on
        # every supported version.
        individuals = pd.concat(
            [self.recipients['sender'], self.recipients['recipient']],
            ignore_index=True,
        )
        # Name index and values explicitly so the result does not depend on
        # how the installed pandas version names `value_counts` output.
        self.df_nodes = (
            individuals
            .value_counts()
            .rename_axis('individual')
            .reset_index(name='count')
        )
        return self.df_nodes

    def generate_edges_dataframe(self):
        """Build the edge table from ``self.recipients``.

        Returns:
            DataFrame with columns ``source``, ``target``, ``topic``,
            ``color`` and ``size``.
        """
        print("Generating edges DF")
        edge_columns = ['sender', 'recipient', 'topic', 'color', 'size']
        self.df_edges = self.recipients[edge_columns].rename(
            columns={'sender': 'source', 'recipient': 'target'}
        )
        return self.df_edges

    def generate_node_and_edge_tables(self):
        """Build both graph tables.

        Returns:
            Tuple ``(df_nodes, df_edges)``.
        """
        self.generate_node_dataframe()
        self.generate_edges_dataframe()
        return self.df_nodes, self.df_edges


class OutlookEmails(NetworkMessages):
    """Load messages from a default Outlook folder through the MAPI COM interface.

    After construction ``self.messages`` holds the folder's Outlook ``Items``
    collection. ``build_network_df`` replaces it with a pandas DataFrame, so
    ``filter_days_back`` has to be called before ``build_network_df``.
    """

    def __init__(self, folder_number):
        """Connect to Outlook and grab the items of the requested folder.

        Args:
            folder_number: value passed to ``GetDefaultFolder`` to select the
                Outlook default folder.
        """
        super().__init__()
        self.folder_number = folder_number
        self.outlook_connection = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
        folder_contents = self.outlook_connection.GetDefaultFolder(self.folder_number)
        self.messages = folder_contents.Items

    def build_network_df(self):
        """Convert the Outlook items into message and recipient DataFrames.

        Items that raise while being read are reported on stdout and skipped
        entirely, so a message never contributes recipient rows without a
        matching message row.

        Returns:
            Tuple ``(messages, recipients)`` of DataFrames; both are also
            stored on the instance.
        """
        print('building core network arrays')
        # Build core tables
        pd_messages = []
        pd_recipients = []
        for message in list(self.messages):
            try:
                entry_id = str(message.EntryID)
                message_recipients = [
                    {
                        "entryID": entry_id,
                        "sender": str(message.Sender),
                        "recipient": str(recipient),
                    }
                    for recipient in message.Recipients
                ]
                message_row = {
                    "entryID": entry_id,
                    "conversationID": str(message.ConversationID),
                    "conversationIndex": str(message.ConversationIndex),
                    "createTime": str(message.CreationTime),
                    "recievedTime": str(message.ReceivedTime),
                    "ConversationTopic": str(message.ConversationTopic),
                    "subject": str(message.Subject),
                    "body": str(message.body)
                }
            except Exception as exc:
                # Best effort: items that lack one of the properties above are
                # skipped. The handler reports the exception itself because
                # the recipient list may not have been read when the failure
                # happened.
                print("error processing message =  " + repr(exc))
                continue

            # Only commit rows once the whole item was read successfully.
            pd_recipients.extend(message_recipients)
            pd_messages.append(message_row)

        self.messages = pd.DataFrame(pd_messages)
        self.recipients = pd.DataFrame(pd_recipients)
        return self.messages, self.recipients

    def filter_days_back(self, date_range):
        """Restrict the Outlook items to those received in the last N days.

        Args:
            date_range: number of days to look back from now.

        Returns:
            The restricted Outlook ``Items`` collection (also stored in
            ``self.messages``).
        """
        print('applying outlook filter')
        # filter messages processed by the last X days
        received_dt = datetime.now() - timedelta(days=date_range)
        # %I (12-hour clock) belongs with %p; pairing the 24-hour %H with
        # AM/PM produces strings such as "14:30 PM".
        received_dt = received_dt.strftime('%m/%d/%Y %I:%M %p')
        self.messages = self.messages.Restrict("[ReceivedTime] >= '" + received_dt + "'")
        return self.messages


def build_network_nodes(df_nodes):
    """Convert the node table into Cytoscape node elements.

    Args:
        df_nodes: DataFrame with an ``individual`` column (used as node id and
            label) and a numeric ``log_count`` column.

    Returns:
        List of ``{"data": {...}}`` dicts, one per row, whose ``weight`` is
        ``log_count * 10``.
    """
    print('building network node array')
    return [
        {"data": {"id": row['individual'], "label": row['individual'], 'weight': row['log_count'] * 10}}
        for _, row in df_nodes.iterrows()
    ]

def build_network_edges(df_edges):
    """Convert the edge table into Cytoscape edge elements.

    Args:
        df_edges: DataFrame with ``source``, ``target``, ``topic``, ``size``
            and ``color`` columns.

    Returns:
        List of ``{'data': {...}}`` dicts, one per row; ``size`` is exposed as
        ``weight`` and ``color`` as ``topic_color``.
    """
    print('building network edge array')
    return [
        {
            'data': {
                'source': edge['source'],
                'target': edge['target'],
                'topic': edge['topic'],
                'weight': edge['size'],
                'topic_color': edge['color'],
            }
        }
        for _, edge in df_edges.iterrows()
    ]


def generate_folder_network_tables(folder, days_back, topic_count=15, text_col='body'):
    """Run the full Outlook -> topics -> node/edge pipeline for one folder.

    Args:
        folder: Outlook default-folder number passed to ``OutlookEmails``.
        days_back: only messages received within this many days are used.
        topic_count: number of topics to extract (defaults to the previously
            hard-coded 15).
        text_col: message column that is normalised and topic-modelled
            (defaults to the previously hard-coded ``'body'``).

    Returns:
        The ``OutlookEmails`` instance with its message, recipient, topic,
        node and edge tables populated.
    """
    inbox_emails = OutlookEmails(folder)
    # The date filter works on the Outlook collection, so it has to run before
    # the items are converted into DataFrames.
    inbox_emails.filter_days_back(days_back)
    inbox_emails.build_network_df()
    inbox_emails.normalize_df_col(text_col)
    inbox_emails.build_topics(topic_count, text_col)
    inbox_emails.pipeline_topic_colors()
    inbox_emails.generate_node_and_edge_tables()
    return inbox_emails

In [None]:

# 6 is the Outlook default-folder number for the Inbox (olFolderInbox).
OUTLOOK_INBOX_FOLDER = 6
# How many days of mail to analyse.
DAYS_BACK = 100

inbox_six = generate_folder_network_tables(OUTLOOK_INBOX_FOLDER, DAYS_BACK)
# Preview only: the full table contains complete e-mail bodies.
print(inbox_six.messages.head())




# Cytoscape stylesheet: one rule for every node, one for every edge.
n_stylesheet = [
    {
        'selector': 'node',
        'style': {
            'opacity': 0.9,
            # Node size is driven by the element's `weight` data field.
            'height': 'data(weight)',
            'width': 'data(weight)',
            'label': 'data(label)',
            'background-color': '#07ABA0',
            # Label text colour.
            'color': '#008B80',
        },
    },
    {
        'selector': 'edge',
        'style': {
            'target-arrow-color': '#C5D3E2',
            'target-arrow-shape': 'triangle',
            # Edge colour comes from the element's `topic_color` data field.
            'line-color': 'data(topic_color)',
            'arrow-scale': 2,
            # Set explicitly; the original author noted that arrows are not
            # displayed with the default curve style.
            'curve-style': 'bezier',
        },
    },
]

# Shared inline styles for page elements.
page_styles = {
    'pre': {
        'border': 'thin lightgrey solid',
        'overflowX': 'scroll',
        'min-height': '50px',
    },
}

# Node size is based on a log-scaled edge count. log1p is used instead of log
# because an individual that appears only once would get log(1) == 0 and be
# drawn with zero height and width.
inbox_six.df_nodes['log_count'] = np.log1p(inbox_six.df_nodes['count'])

# Cytoscape elements.
cy_nodes = build_network_nodes(inbox_six.df_nodes)
cy_edges = build_network_edges(inbox_six.df_edges)

# Tables shown next to the graph: the full topic table and a topic/colour legend.
topic_color_df = inbox_six.topic_color_df
edge_legend = inbox_six.topic_color_df[['topic', 'color']]

# Registers the additional layouts (klay, cola, dagre, ...) offered in the
# layout dropdown.
cyto.load_extra_layouts()
app = dash.Dash(__name__)
server = app.server

def _color_swatch_table(frame):
    """Render ``frame`` as a DataTable whose ``color`` cells are filled with their own hex value."""
    return dash_table.DataTable(
        data=frame.to_dict('records'),
        columns=[{"name": column, "id": column} for column in frame.columns],
        style_cell={'textAlign': 'left'},
        # One rule per row: text and background share the colour, so the cell
        # renders as a solid swatch. Rows are addressed by position.
        style_data_conditional=[
            {'if': {'row_index': row_number, 'column_id': 'color'},
             'background-color': color,
             'color': color}
            for row_number, color in enumerate(frame['color'])
        ],
    )


# (label, value) pairs for the Cytoscape layout selector.
_LAYOUT_CHOICES = [
    ('random', 'random'),
    ('grid', 'grid'),
    ('circle', 'circle'),
    ('concentric', 'concentric'),
    ('breadthfirst - Hierarchy', 'breadthfirst'),
    ('klay - Force Directed', 'klay'),
    ('cose - Force Directed', 'cose'),
    ('cose-bilkent - Force Directed', 'cose-bilkent'),
    ('cola - Force Directed', 'cola'),
    ('spread - Force Directed', 'spread'),
    ('dagre - Hierarchy', 'dagre'),
]

app.layout = html.Div(children=[
    html.H4(children='Communication patterns'),

    html.Div(
        children=[
            # Left half: the network graph.
            html.Div(
                children=[
                    cyto.Cytoscape(
                        id='cytoscape',
                        elements=cy_edges + cy_nodes,
                        style={
                            'height': '95vh',
                            'width': '100%'
                        },
                        layout={'name': 'grid'},
                        stylesheet=n_stylesheet
                    ),
                ],
                style={'width': '50%'},
            ),
            # Right half: controls, selection details and legend.
            html.Div(
                children=[
                    # Options follow the topics that were actually modelled
                    # instead of a fixed 0-4 list.
                    dcc.Dropdown(
                        id='dropdown_topic',
                        options=[{'label': str(topic), 'value': str(topic)}
                                 for topic in topic_color_df['topic']],
                    ),
                    dcc.Dropdown(
                        id='dropdown-update-layout',
                        options=[{'label': label, 'value': value} for label, value in _LAYOUT_CHOICES],
                        value='circle',
                    ),
                    html.H4(children='NodeData'),
                    html.P(id='cytoscape_element_info_output'),
                    html.P(id='cytoscape-tapEdgeData-output'),
                    html.H4(children='Legend'),
                    _color_swatch_table(edge_legend),
                ],
                style={'width': '50%'},
            ),
        ],
        # html component styles use camelCased CSS property names.
        style={'display': 'flex', 'flexDirection': 'row'},
    ),

    # Full topic table (topic, colour, top terms) below the graph.
    _color_swatch_table(topic_color_df),
])


@app.callback(Output('cytoscape', 'layout'),
              Input('dropdown-update-layout', 'value'))
def update_layout(layout):
    """Switch the graph to the layout chosen in the dropdown.

    Args:
        layout: name of the selected layout, or ``None`` when the dropdown
            has been cleared.

    Returns:
        The Cytoscape ``layout`` dict, or ``dash.no_update`` to keep the
        current layout when nothing is selected.
    """
    if not layout:
        # A cleared dropdown would otherwise send {'name': None}, which is
        # not a valid layout name.
        return dash.no_update
    return {'name': layout}


@app.callback(Output('cytoscape_element_info_output', 'children'),
              Input('cytoscape', 'tapNodeData'))
def displayTapNodeData(data):
    """Describe the most recently tapped node.

    Args:
        data: the tapped node's data dict, or a falsy value if no node has
            been tapped.

    Returns:
        A sentence containing the node label, or ``None`` when there is no
        tapped node.
    """
    if not data:
        return None
    return "You recently clicked/tapped node: " + data['label']


@app.callback(Output('cytoscape-tapEdgeData-output', 'children'),
              Input('cytoscape', 'tapEdgeData'))
def displayTapEdgeData(data):
    """Show the most recently tapped edge as indented JSON.

    Args:
        data: the tapped edge's data dict, or a falsy value if no edge has
            been tapped.

    Returns:
        The JSON text of ``data``, or ``None`` when there is no tapped edge.
    """
    if not data:
        return None
    return json.dumps(data, indent=2)


In [None]:

if __name__ == '__main__':
    # `run_server` is the legacy entry point; newer Dash releases expose
    # `run` instead. Prefer `run` when it exists and fall back otherwise so
    # the cell works with whichever Dash version is installed.
    if hasattr(app, 'run'):
        app.run(debug=False)
    else:
        app.run_server(debug=False)