In [1]:
! wget https://storage.googleapis.com/reinfer-datasets/enron_mail_20150507.tar.gz

--2019-06-02 10:24:54--  https://storage.googleapis.com/reinfer-datasets/enron_mail_20150507.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.210.48, 2a00:1450:4009:807::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.210.48|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 443254787 (423M) [application/x-tar]
Saving to: ‘enron_mail_20150507.tar.gz’


2019-06-02 10:24:58 (93.0 MB/s) - ‘enron_mail_20150507.tar.gz’ saved [443254787/443254787]



In [21]:
!tar -xf enron_mail_20150507.tar.gz

## Wrangle the data to get sender : reciepient mapping

In [None]:
import re
import glob
from collections import defaultdict, Counter
from mailbox import mboxMessage

# What we want to do here is create a data-structure that maps from: recieved : n
# to do this efficiently we use the defaultdict-counter combo
# I experimented a little using the 'mbox' class to do everything at once but couldn't make it work. If I have time i'll
# go back

# Certain emails don't have 'To' or 'From' usually ones that are going to a group list. 
# As we don't know who is on these lists lets ignore this class of message for now, even 
# though this isn't ideal, as it is conceivable that big influencers are much more likely to 
# emails these lists
            

# data structures:
# Effectively a graph, and in reality one would just build a graph using a standard library and run HITS on that. Here
# log both the incoming and outgoing edges independently for each of use in the custom HITS implementation
outgoing = defaultdict(Counter) # outgoingconnection counts
incoming = defaultdict(Counter) # incoming connection counts


previously_seen = set() # keeps a hash of the payloads of previously seen messages to avoid double counting

for f in glob.glob("maildir/**", recursive=True):
    try:
        with open(f) as mbox_file:
            msg = mboxMessage(mbox_file)
                
            payload = msg.get_payload()
            if  msg["From"] is not None and msg["To"] is not None and payload is not None:
                payload_hash = hash(payload)
                if payload_hash not in previously_seen:
                    fr = msg["From"]
                    to = re.sub('\ |\n|\t', '', msg["To"]).split(",") # remove special characters and spaces
                    outgoing[fr].update(to)
                    for person in to:
                        incoming[person].update([fr]) 
                    previously_seen.add(payload_hash)
            
    except (IsADirectoryError, UnicodeDecodeError) as e:
        pass

    

In [43]:
print ("{} unique senders".format(len(connections)))

19567 unique senders


## Implementation of Hubs and Authorities

Just implement a vanilla HA without regards for performance. I'm not sure how long it will take to run, so if its slow i'll optimize it.

TypeError: list indices must be integers or slices, not ScoreType

In [58]:
class Score:
    __slots__ = 'hub', 'auth'
    def __init__(self, hub=1, auth=1):
        self.hub = hub
        self.auth= auth ## todo: check the initialization of slots

ha_scores = {email:Score() for email in connections.keys()}  # email: [hub score, authority scores]

def auth_update(scores, incoming_edges):
    norm = 0
    for person in scores.keys():
        scores[person].auth = 0
        for connection in incoming_edges[person].keys():
            
            scores[person].auth += scores[connection].hub
        norm += scores[person].auth**2
            
    norm = norm**0.5
    for person in scores.keys():
        scores[person].auth /= norm
        
def hub_update(scores, outgoing_edges):
    norm = 0
    for person in scores.keys():
        scores[person].hub = 0
        for connection in outgoing_edges[person].keys():
            scores[person].hub += scores[connection].auth
        norm += scores[person].hub**2
            
    norm = norm**0.5
    for person in scores.keys():
        scores[person].hub /= norm
        
        
def hits(scores, outgoing_edges, incoming_edges, max_iter=100,tol=1.0e-8):
    for iteration in range(max_iter):
        print (iteration)
        auth_update(scores, incoming_edges)
        hub_update(scores, incoming_edges)

hits(ha_scores, outgoing, incoming)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


{'storage@bdcimail.com': <__main__.Score at 0x7f0772612860>,
 'david.garrett@enron.com': <__main__.Score at 0x7f0772612898>,
 'rjs@suelthauswalsh.com': <__main__.Score at 0x7f07726128d0>,
 'randi.howard@enron.com': <__main__.Score at 0x7f0772612908>,
 'april.weatherford@enron.com': <__main__.Score at 0x7f0772612940>,
 'stathis.tompaidis@bus.utexas.edu': <__main__.Score at 0x7f07719d5b38>,
 'lisa.pitt@enron.com': <__main__.Score at 0x7f07726129b0>,
 'michaela.jim@worldnet.att.net': <__main__.Score at 0x7f07726129e8>,
 'diane.becker@enron.com': <__main__.Score at 0x7f0772612a20>,
 'stephen.thome@enron.com': <__main__.Score at 0x7f0772612a58>,
 'prod1@earthlink.net': <__main__.Score at 0x7f0771772860>,
 'certifieddealernetwork@carsdirect.com': <__main__.Score at 0x7f0772612ac8>,
 'holli.krebs@enron.com': <__main__.Score at 0x7f0772612b00>,
 'lbcunningham@mail.utexas.edu': <__main__.Score at 0x7f0772612b38>,
 'smitter@smitter.com': <__main__.Score at 0x7f0772612b70>,
 'raymond.bowen@enron.