# Analysis 2
* **Which people sent and recieved the most emails**

In [1]:
import re
import string
from collections import Counter
import glob
import os
from email.parser import Parser
import json


*This uses the EnronData.py to import all of the emails here*

In [2]:
emailPattern = re.compile('.+@enron.com')

## Check address and name to see if they're in the format we need
def skip(a,n):
    
    if emailPattern.search(n) :
        return True
    
    if a.startswith('enron'):
        return True
    
    if '<' in a :
        return True
    
    bad = ['enron', '_', 'employees', 'team', 'temp', 'administrator', 'management']
    
    for x in bad :
        if x in n.lower() :
            return True
        
    return False
   
## If the name is Last, First swap to First Last else return the string without doing anything
def swap(s) :
    if (',' in s) :
        x = s.split(',')
        return x[1].strip() + ' ' + x[0].strip()
    else :
        return s
    
# Takes emails and the names raw string, cleans it, and returns a dictionary with k = email v = corresponding name
def emailsAndNames(address, names) :
    
    regexp1 = re.compile('<[^>]+>|.+@.+')
    re2 = re.compile('<[^>]+>[,]?')
    
    if (address is None or names is None) :
        return None
    
    a = [x.strip() for x in address.split(',')]

    if (re2.search(names)) :
        n = re2.sub('|', names )
        n = [swap(x.strip()) for x in n.split('|')]
    else :
        n = [x.strip() for x in names.split(',')]
   
    d = {}
    x = 0
    while x < len(a) and x < len(n) :
        if skip(a[x], n[x]) :
            x = x + 1
            continue
        if emailPattern.match(a[x]) :
            d[a[x]] = n[x]
        x = x + 1
        
    if d :
        return d
    else :
        return None

In [3]:
## takes an email and returns a dictionary with keys = [to,from,cc] and values = [list of emails]
def getAddresses(email) :
    to = email['to']
    tox = email['X-To']
    f = email['from']
    fx = email['X-From']
    cc = email['cc']
    ccx = email['X-Cc']
    
    addresses = {'from' : emailsAndNames(f, fx), 'to' : emailsAndNames(to, tox), 'cc' : emailsAndNames(cc, ccx)} 
    
    if (not addresses['from'] and not addresses['to'] and not addresses['cc']):
        return None
    else :
        return addresses
    

In [4]:
#Takes the file and converts to email format
def parseEmailFile(fileName) :
    with open(fileName, "r") as f:
        data = f.read()

    email = Parser().parsestr(data)
    return email

# Get all emails in file directory
def getEmails(fileString) :
    l = []
    
    searchDirectory(fileString, l)
    
    return l

# recursive method that adds email addresses to list if it's a file or calls itself again if it's a directory
def searchDirectory(fileString,l) :
    names = glob.glob(fileString + '/*')
    for name in names :
        if (os.path.isdir(name)) :
            searchDirectory(name, l)
        elif (os.path.isfile(name)) :
            email = parseEmailFile(name)
            a = getAddresses(email)
            if a :
                l.append(getAddresses(email))

In [5]:
loc = '../../Data/maildir\\'
names = glob.glob('../../Data/maildir/*')


for name in names :
    
    fname = name.replace(loc, '')
    emails = getEmails(name)
    newloc = '../../Data/Addresses/' + fname + '.json'
    f = open(newloc, 'w+')
    f.write(json.dumps(emails, indent=4, sort_keys=True))
    f.close()
    


In [6]:
people = {}

lists = {'from' : [], 'to' : [], 'cc' : []}

names = glob.glob('../../Data/Addresses/*')
for name in names :
    
    with open(name, 'r') as file:    
        data = json.load(file)
        
    if not data :
        continue
        
    for x in data :
        if x['to'] is None or x['from'] is None :
            continue

        for y in ['from', 'to', 'cc'] :
            if x[y] is None :
                continue
        
            for k,v in x[y].items() :
                people[k] = v
        
            lists[y].extend(x[y].keys())
        
people
       

{'phillip.allen@enron.com': 'Phillip Allen',
 'keith.holst@enron.com': 'Keith Holst',
 'david.delainey@enron.com': "Stephens 'vickersore@aol.com'",
 'paula.harris@enron.com': 'Paula Harris',
 'ina.rangel@enron.com': 'Ina Rangel',
 'jeffrey.hodge@enron.com': 'Jeffrey T Hodge',
 'cindy.cicchetti@enron.com': 'Cindy Cicchetti',
 'christopher.calger@enron.com': 'Christopher F Calger',
 'kathy.moore@enron.com': 'Kathy M Moore',
 'john.lavorato@enron.com': 'John Lavorato',
 'kenneth.shulklapper@enron.com': 'Kenneth Shulklapper',
 'paul.lucci@enron.com': 'Paul T Lucci',
 'frank.hayden@enron.com': 'Frank Hayden',
 'frank.ermis@enron.com': 'Frank Ermis',
 'jay.reitmeyer@enron.com': 'Jay Reitmeyer',
 'mike.grigsby@enron.com': 'Mike Grigsby',
 'thomas.martin@enron.com': 'Thomas Martin',
 'cooper.richey@enron.com': 'Cooper Richey',
 'cindy.long@enron.com': 'Dan Masters@ect',
 'matthew.lenhart@enron.com': 'Matthew Lenhart',
 'suzanne.nicholie@enron.com': 'Suzanne Nicholie',
 'colleen.sullivan@enron.

In [7]:
inbox = Counter(lists['to']) + Counter(lists['cc'])

print("Recieved the most emails (including cc'd emails)")
for k,v in inbox.most_common(10) :
    print(people[k], ':', v)
    
print('------------------------------------------------------------')

nocc = Counter(lists['to'])
print("Recieved the most emails (not including cc'd emails)")
for  k,v in nocc.most_common(10) :
    print(people[k], ':', v)

print('------------------------------------------------------------')

print("Sent the most emails")
sent = Counter(lists['from'])
for k,v in sent.most_common(10) :
    print(people[k], ':', v)
    
onlycc = Counter(lists['cc'])

Recieved the most emails (including cc'd emails)
Richard Shapiro : 17074
Steven Kean : 15494
Jeff Dasovich : 13415
Tana Jones : 13169
Mark Taylor : 11662
James Steffes : 11660
Sara Shackleton : 11485
Vince J Kaminski : 9631
Susan Mara : 9329
Louise Kitchen : 9093
------------------------------------------------------------
Recieved the most emails (not including cc'd emails)
Richard Shapiro : 13069
Steven Kean : 12321
Tana Jones : 11246
Jeff Dasovich : 11110
Sara Shackleton : 9362
James Steffes : 9067
Mark Taylor : 8725
Susan Mara : 8011
Louise Kitchen : 7965
John Lavorato : 7030
------------------------------------------------------------
Sent the most emails
Emmanuel Mangin : 11390
Jeff Dasovich : 7836
Tana Jones : 7696
Vince J Kaminski : 7366
Sara Shackleton : 7333
Chris Germany : 6525
Steven Kean : 5702
Kate Symes : 5113
Sally Beck : 3848
Eric Bass : 3813


In [8]:
print(len(people.keys()))


18277


In [10]:
to = dict(nocc)
fr = dict(sent)
cc = dict(onlycc)


rows = [['Name', 'E-mail', 'Number-From', 'Number-To', 'Number-CC']]

for k,v in people.items() :

    row = [v, k, fr.get(k, 0), to.get(k,0), cc.get(k,0)]
    
    rows.append(row)
    
rows[:10]

[['Name', 'E-mail', 'Number-From', 'Number-To', 'Number-CC'],
 ['Phillip Allen', 'phillip.allen@enron.com', 1208, 2777, 475],
 ['Keith Holst', 'keith.holst@enron.com', 37, 3058, 58],
 ["Stephens 'vickersore@aol.com'",
  'david.delainey@enron.com',
  3007,
  3068,
  434],
 ['Paula Harris', 'paula.harris@enron.com', 7, 208, 3],
 ['Ina Rangel', 'ina.rangel@enron.com', 392, 1547, 349],
 ['Jeffrey T Hodge', 'jeffrey.hodge@enron.com', 181, 3563, 641],
 ['Cindy Cicchetti', 'cindy.cicchetti@enron.com', 0, 24, 19],
 ['Christopher F Calger', 'christopher.calger@enron.com', 144, 2475, 420],
 ['Kathy M Moore', 'kathy.moore@enron.com', 18, 66, 15]]

In [11]:
import csv
f = open('../../Output/Q1/messsagetotals.csv', 'w+', newline='')
try:
    writer = csv.writer(f)
    writer.writerows(rows)
finally:
    f.close()