This notebook creates a person-to-person (P2P) matrix from MGDC (Microsoft Graph Data Connect) _email_ data.

In [4]:
# ---- Run parameters -------------------------------------------------------
# NOTE(review): StartDate/EndDate look like the analysis window, but no cell
# visible in this part of the notebook reads them — confirm where they apply.
StartDate = "2018-09-01"
EndDate = "2019-03-31"

# Relative output / logging locations.
Destination = "ona/test_20210730"
log_folder = "ona/job_logs/01/"

# Comma-separated list of HR attribute names.
hr_attributes = "FunctionType,LevelDesignation,Domain,Organization,Region"

# Numeric thresholds (semantics are defined by the downstream cells that read them).
minGroupSize = 5
meaningfulParticipantThreshold = 8

# Every metric switch starts enabled.
metric_partition = metric_clustering = metric_fluidity = True
metric_xy = metric_ari = metric_freedom = True

# becomes True if LCC = 0, LCC fails, data is less than 2 months, or modularity fails.
bad_flag = False

# Location the raw MGDC JSON is read from.
ResultBlobPath = 'abfss://users@dopsis.dfs.core.windows.net/rawdata/MGDC_data/'

StatementMeta(, , , SessionError, )

AVAILABLE_COMPUTE_CAPACITY_EXCEEDED: Livy session has failed. Error code: AVAILABLE_COMPUTE_CAPACITY_EXCEEDED. Your job requested 12 vcores. However, the pool only has 0 vcores available out of quota of 12 vcores. Try ending the running job(s) in the pool, reducing the numbers of vcores requested, increasing the pool maximum size or using another pool. Source: User.

In [None]:
from pyspark.sql.functions import to_date, date_format, ceil, year, lit, udf, explode, split, last_day, trunc, monotonically_increasing_id
import os
import pyspark.sql.functions as F
import datetime as dt
from pyspark.sql.types import *
from pyspark.sql.functions import col
from pyspark.sql.window import Window

import networkx as nx
import topologic as tc
import graspologic as gc

from datetime import datetime, timedelta
import pandas as pd
from notebookutils import mssparkutils

StatementMeta(, , , Cancelled, )

In [None]:
# Load the raw JSON export found under ResultBlobPath into a Spark DataFrame.
# NOTE(review): "header" is a CSV reader option; it presumably has no effect on
# the JSON source — confirm and drop it if so.
json_reader = spark.read.option("header", "true")
df = json_reader.json(ResultBlobPath)

StatementMeta(, , , Cancelled, )

In [None]:
# Print the inferred schema. printSchema is a method: without the call
# parentheses the cell only evaluates to the bound-method object.
df.printSchema()

StatementMeta(, , , Cancelled, )

'Attachments', 'BccRecipients', 'BodyPreview', 'Categories', 'CcRecipients', 'ChangeKey', 'ConversationId', 'ConversationIndex', 'CreatedDateTime', 'Flag', 'From', 'HasAttachments', 'Id', 'Importance', 'InferenceClassification', 'InternetMessageId', 'IsComplete', 'IsDeliveryReceiptRequested', 'IsDraft', 'IsRead', 'IsReadReceiptRequested', 'LastModifiedDateTime', 'LikesPreview', 'Mentions', 'MentionsPreview', 'MultiValueExtendedProperties', 'ParentFolderId', 'RawUniqueBody', 'ReceivedDateTime', 'ReplyTo', 'Sender', 'SentDateTime', 'SingleValueExtendedProperties', 'Subject', 'ToRecipients', 'UniqueBody', 'UnsubscribeData', 'UnsubscribeEnabled', 'UserEmailAddress', 'WebLink', 'folderDisplayName', 'ptenant', 'puser'


In [None]:
def email_extractor(s):
    """
    Extract email addresses from a stringified Sender / recipients value.

    The input is split on the ``u'`` marker; every resulting piece that
    contains ``@`` contributes the text up to its first ``'``.

    NOTE(review): splitting on ``u'`` assumes the value was stringified with
    Python-2 style unicode prefixes — confirm this matches the runtime in use.

    Args:
        s: String to scan (callers pass ``str(column_value)``).

    Returns:
        The extracted addresses joined with ", "; an empty string when no
        piece contains "@".
    """
    addresses = []
    for piece in s.split("u'"):
        if '@' not in piece:
            continue
        closing_quote = piece.find("'")
        # find() returns -1 when there is no closing quote; slicing with -1
        # would silently drop the last character, so keep the whole piece.
        addresses.append(piece if closing_quote == -1 else piece[:closing_quote])

    return ", ".join(addresses)

# Spark UDF wrappers around email_extractor.
# uuidUdf scans a single column value; recipient_udf scans the concatenation
# of the string forms of three column values.
uuidUdf = udf(lambda field: email_extractor(str(field)))
recipient_udf = udf(
    lambda to_field, cc_field, bcc_field: email_extractor(
        "".join(map(str, (to_field, cc_field, bcc_field)))
    )
)

# Create the P2P matrix from email data using time sent, senders and recipients.
#
# A UDF-based construction of `email2` used to precede this one; it was rebound
# by the assignment below before any Spark action ran, so it never contributed
# to the result and has been removed.
# NOTE(review): only ToRecipients produce edges here; Cc/Bcc recipients are not
# included — confirm this is intended.
email_edges = (df
    .select("SentDateTime","Sender","ToRecipients")
    .withColumn("Node1", df.Sender.EmailAddress.Address)
    .withColumn("Node2", df.ToRecipients.EmailAddress.Address)
    .select("SentDateTime","Node1","Node2")
    # One row per (sender, recipient) pair.
    .withColumn("Node2",explode(col("Node2")))
    .withColumn("MonthEndDate",last_day(col("SentDateTime")))
    .withColumn("MonthStartDate",trunc(col("SentDateTime"),"month"))
    .drop("SentDateTime"))

# Create a mapping of email addresses to unique integers/identifiers for quicker processing.
# union() is positional, so the combined column keeps the name "Node1".
# monotonically_increasing_id() yields unique but not consecutive ids.
participants_map = (email_edges.select("Node1").distinct()
    .union(email_edges.select("Node2").distinct())
    .distinct()
    .withColumn("PID",monotonically_increasing_id()+1)).cache()

# Create the Node1Pid, Node2Pid, Node1Phid, Node2Phid, WeightbyHours and WeightbyCount columns
email2 = (email_edges
          .groupby("MonthStartDate","MonthEndDate","Node1","Node2").count()
          .withColumnRenamed("count","WeightbyCount")
          # NOTE(review): count * 5 / 60 presumably assumes 5 minutes per email — confirm.
          .withColumn("WeightbyHours", col("WeightbyCount")*5/60)
          .join(participants_map,on="Node1",how="left").withColumnRenamed("PID","Node1Pid")
          .join(participants_map.withColumnRenamed("Node1","Node2"), on="Node2", how="left").withColumnRenamed("PID","Node2Pid")
          # The Phid columns are plain copies of the Pid columns.
          .withColumn("Node1Phid", col("Node1Pid"))
          .withColumn("Node2Phid", col("Node2Pid"))
          .drop("Node1","Node2")
          .select('Node1Pid','Node2Pid','Node1Phid','Node2Phid', 'WeightbyHours', 'WeightbyCount','MonthStartDate', 'MonthEndDate')).cache()

# Render the resulting edge list for data exploration
display(email2)

StatementMeta(, , , Cancelled, )

In [None]:
# Base location for exported results.
# NOTE(review): this rebinds ResultBlobPath, which the load cell earlier in the
# notebook uses as the *input* location; re-running that cell after this one
# would read from this output path instead.
ResultBlobPath = 'abfss://users@dopsis.dfs.core.windows.net/ona/Org_Insights/'
def df_output_blob(df, extension, outFolder):
    """
    Write ``df`` as a single file ``ResultBlobPath + outFolder + '.' + extension``.

    The frame is repartitioned to one partition and written into the work
    folder ``ResultBlobPath + outFolder + '/'``; the resulting ``part-00000``
    file is then moved next to that folder and the work folder is removed.
    The work folder is only removed after a file was actually moved, so a
    failed move no longer discards the freshly written output.

    Args:
        df: Spark DataFrame to export.
        extension: Output format, either 'csv' or 'json'.
        outFolder: Work-folder name, also used as the final file's base name.

    Raises:
        ValueError: If ``extension`` is neither 'csv' nor 'json' (previously
            nothing was written and the work folder was still deleted).
    """
    if extension not in ('csv', 'json'):
        raise ValueError("Unsupported extension: {!r} (expected 'csv' or 'json')".format(extension))

    outPath = ResultBlobPath + outFolder + "/"
    newFileLocation = ResultBlobPath + outFolder + '.' + extension

    single_partition = df.repartition(1)
    if extension == 'csv':
        single_partition.write.csv(outPath, header='true', mode='overwrite', escape="\"")
    else:
        # Overwrite like the csv branch so a re-run does not fail on an existing folder.
        single_partition.write.json(outPath, mode='overwrite')

    # Copy file from outFolder to central working directory
    moved = False
    try:
        for entry in mssparkutils.fs.ls(outPath):
            if 'part-00000' in entry.name:
                mssparkutils.fs.mv(entry.path, newFileLocation, True)
                print ('File moved successfully: ', newFileLocation)
                moved = True
    except Exception as e:
        print ("Error moving file. Error: ", e)

    if not moved:
        # Keep the work folder: it still holds the only copy of the output.
        print ('No file was moved; work folder kept: ', outPath)
        return

    # clean up old files
    try:
        mssparkutils.fs.rm(outPath , True)
        print ('Work Folder deleted: ', outPath)
    except Exception as e:
        print ("Error Deleting work File or Folder. Error: ", e)
      
# Tag the frame with a plain Python attribute (df_output_blob does not read it),
# then export it as CSV; the final file is ResultBlobPath + "MGDC.csv".
email2.name = "MGDC_contoso_P2P"
df_output_blob(df=email2, extension="csv", outFolder="MGDC")

StatementMeta(, , , Cancelled, )

In [None]:
# Sanity check: earliest MonthEndDate seen for each MonthStartDate.
# GroupedData.min() with no arguments only aggregates numeric columns, so the
# date column has to be aggregated explicitly.
display(email2.groupBy("MonthStartDate").agg(F.min("MonthEndDate")))

StatementMeta(, , , Cancelled, )