In [1]:
! wget https://storage.googleapis.com/reinfer-datasets/enron_mail_20150507.tar.gz

--2019-06-02 10:24:54--  https://storage.googleapis.com/reinfer-datasets/enron_mail_20150507.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.210.48, 2a00:1450:4009:807::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.210.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 443254787 (423M) [application/x-tar]
Saving to: ‘enron_mail_20150507.tar.gz’


2019-06-02 10:24:58 (93.0 MB/s) - ‘enron_mail_20150507.tar.gz’ saved [443254787/443254787]



In [21]:
!tar -xf enron_mail_20150507.tar.gz

## 1. Wrangle the data to get sender : reciepient mapping

In [None]:
import re
import glob
from collections import defaultdict, Counter
from mailbox import mboxMessage

# What we want to do here is create a data-structure that maps from: recieved : n
# to do this efficiently we use the defaultdict-counter combo
# I experimented a little using the 'mbox' class to do everything at once but couldn't make it work. If I have time i'll
# go back

# Certain emails don't have 'To' or 'From' usually ones that are going to a group list. 
# As we don't know who is on these lists lets ignore this class of message for now, even 
# though this isn't ideal, as it is conceivable that big influencers are much more likely to 
# emails these lists
            

# data structures:
# Effectively a graph, and in reality one would just build a graph using a standard library and run HITS on that. Here
# log both the incoming and outgoing edges independently for each of use in the custom HITS implementation
outgoing = defaultdict(Counter) # outgoingconnection counts
incoming = defaultdict(Counter) # incoming connection counts


previously_seen = set() # keeps a hash of the payloads of previously seen messages to avoid double counting

for f in glob.glob("maildir/**", recursive=True):
    try:
        with open(f) as mbox_file:
            msg = mboxMessage(mbox_file)
                
            payload = msg.get_payload()
            if  msg["From"] is not None and msg["To"] is not None and payload is not None:
                payload_hash = hash(payload)
                if payload_hash not in previously_seen:
                    fr = msg["From"]
                    to = re.sub('\ |\n|\t', '', msg["To"]).split(",") # remove special characters and spaces
                    outgoing[fr].update(to)
                    for person in to:
                        incoming[person].update([fr]) 
                    previously_seen.add(payload_hash)
            
    except (IsADirectoryError, UnicodeDecodeError) as e:
        pass

    

In [43]:
print ("{} unique senders".format(len(connections)))

19567 unique senders


## 2. Implementation of Hubs and Authorities

Just implement a vanilla HA without regards for performance. I'm not sure how long it will take to run, so if its slow i'll optimize it.

In [None]:
class Score:
    __slots__ = 'hub', 'auth'
    def __init__(self, hub=1, auth=1):
        self.hub = hub
        self.auth= auth ## todo: check the initialization of slots

ha_scores = {email:Score() for email in list(outgoing.keys()) + list(incoming.keys())}  # email: [hub score, authority scores]

def auth_update(scores, incoming_edges):
    norm = 0
    for person in scores.keys():
        scores[person].auth = 0
        for connection in incoming_edges[person].keys():
            
            scores[person].auth += scores[connection].hub
        norm += scores[person].auth**2
            
    norm = norm**0.5
    for person in scores.keys():
        scores[person].auth /= norm
        
def hub_update(scores, outgoing_edges):
    norm = 0
    for person in scores.keys():
        scores[person].hub = 0
        for connection in outgoing_edges[person].keys():
            scores[person].hub += scores[connection].auth
        norm += scores[person].hub**2
            
    norm = norm**0.5
    for person in scores.keys():
        scores[person].hub /= norm
        
        
def hits(scores, outgoing_edges, incoming_edges, max_iter=100):
    """
    Update HITS hubs and authorities values for nodes.
    
    :param scores : The hubs and authorities scores for each node
                    in the graph. Hubs score calculated on outgoing
                    connections and authorities score calculated
                    from incoming connections.
    
    :param incoming_edges: Incoming edges and associated weights 
    
    :param outgoing_edges: Out edges and associated weights
    
    :param max_iter: Number of iterations the algorithm runs for.
                     Note, we currently don't check for convergence
                     and an improvement to this algorithm could be
                     to perform such an action.       
    """
    for iteration in range(max_iter):
        print ("Running iteration",iteration,end="\r")
        auth_update(scores, incoming_edges)
        hub_update(scores, outgoing_edges)



87

# Run HITS and find the influention people in the organisation

In [78]:
hits(ha_scores, outgoing, incoming)
authority_scores = sorted([(person, score.auth) for person, score in ha_scores.items()], key=lambda x:x[1])
hub_scores = sorted([(person, score.hub) for person, score in ha_scores.items()], key=lambda x:x[1])

In [79]:
authority_scores[-10:]

[('mark.haedicke@enron.com', 0.0755408347767774),
 ('tana.jones@enron.com', 0.07640490087856729),
 ('mark.taylor@enron.com', 0.07950347900295653),
 ('tim.belden@enron.com', 0.08113851261947602),
 ('steven.kean@enron.com', 0.08213922127524408),
 ('elizabeth.sager@enron.com', 0.08288948053083063),
 ('sally.beck@enron.com', 0.0904662958015301),
 ('greg.whalley@enron.com', 0.09313554995012904),
 ('john.lavorato@enron.com', 0.10821510238262484),
 ('louise.kitchen@enron.com', 0.12346263617889451)]

In [80]:
hub_scores[-10:]

[('daniel.muschar@enron.com', 0.10228356793369467),
 ('louise.kitchen@enron.com', 0.10294998798518347),
 ('technology.enron@enron.com', 0.11945685305893489),
 ('nicki.daw@enron.com', 0.1203082352208114),
 ('billy.lemmons@enron.com', 0.12527074520888876),
 ('david.oxley@enron.com', 0.12846915088138788),
 ('outlook.team@enron.com', 0.14305557927351475),
 ('kenneth.lay@enron.com', 0.14857182614134243),
 ('sally.beck@enron.com', 0.15875307608568873),
 ('david.forster@enron.com', 0.1859679999086267)]