### Enron Scandal Email Dataset Analysis

### Parse the emails and identify people linked to C-suite

In [1]:
from email.parser import Parser
import glob
import os
import datetime
import sys
from collections import Counter


### Function Definitions and Variables

In [2]:
# Change the value of this variable as per your directory structure
relative_path = '/Users/tanya/midterm/data/enron/maildir'


In [3]:
# Preparation for analysis
imp_emails_stage1 = []
imp_emails_stage2 = []
imp_emails_stage3 = []
unread_files = []

# The TRIO - CEO, COO and CFO
imp_people = ['kenneth.lay@enron.com', 'jeff.skilling@enron.com', 'andrew.fastow@enron.com']
start_tracking_date = datetime.date(1998, 1, 1)
resignation_date = datetime.date(2001, 8, 14)
bankruptcy_date = datetime.date(2001, 12, 2)

# Identified in previous analysis
linked_members_stage_1 = ['rob.bradley@enron.com', 'joe.hillings@enron.com', 
                  'jeffrey.garten@yale.edu', 'mark.lay@enron.com','sally.keepers@enron.com', 
                    'sherri.sera@enron.com','kevinscott@onlinemailbox.net', 
                    'eharris@insightpartners.com','joannie.williamson@enron.com',
                    'jeffrey.shankman@enron.com']

linked_members_stage_2 = ['enron_update@concureworkplace.com', 'l..wells@enron.com,'
                          ,'svarga@kudlow.com', 'nshaw@usenergyservices.com', 
                          'j..kean@enron.com', 'john@pgsenergy.com', 
                          'hema@izhuta.com','bpaddock@ghcf.org', 'news@real-net.net', 
                          '_corolla@response.etracks.com']

linked_members_stage_3 = ['svarga@kudlow.com', 'mailings@cnn.com', 'lynda.l.phinney@williams.com', 'enron_update@concureworkplace.com','jharwood@mindspring.com']

In [4]:
def reset_stages(imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files):
    del imp_emails_stage1[:]
    del imp_emails_stage2[:]
    del imp_emails_stage3[:]
    del unread_files[:]


In [5]:
def apply_action_for_stages(email_item, imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, action):
    
    email_from = email_item['from']
    email_recipients = []
    email_to = email_item['to']
    if email_to:
        email_recipients = email_recipients + email_to.split()
    email_cc = email_item['cc']
    if email_cc:
        email_recipients = email_recipients + email_cc.split()
    email_bcc = email_item['bcc']    
    if email_bcc:
        email_recipients = email_recipients + email_bcc.split()
        
    email_date = email_item['date']
    parsed_date = datetime.datetime.strptime(email_date[:-6], "%a, %d %b %Y %H:%M:%S %z")
    parsed_date = parsed_date.date()    
    
    # check dates to figure out stage number
    if parsed_date > start_tracking_date and parsed_date <= resignation_date:                                
        # invoke action on corresponding stage
        action(email_from, email_recipients, imp_emails_stage1, linked_members_stage_1)
    elif parsed_date > resignation_date and parsed_date < bankruptcy_date:                                
        action(email_from, email_recipients, imp_emails_stage2, linked_members_stage_2)
    elif parsed_date > bankruptcy_date:                                    
        action(email_from, email_recipients, imp_emails_stage3, linked_members_stage_3)

In [6]:
def readRecursively(dirname, imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action):
    for item in glob.glob(dirname + '/*'):
        if os.path.isfile(item):
            try:
                with open(item) as email_file:                
                    email_item = Parser().parsestr(email_file.read())
                    if (email_item):
                        apply_action_for_stages(email_item, imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, action)
                            
            except UnicodeDecodeError:
                unread_files.append(item)                                
        else:
            readRecursively(item, imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action)


In [7]:
def action_word_bags(email_from, email_to, email_body, imp_email_stage_x, linked_members_stage_x):
    email_participants = email_from + email_to    
    participant_is_member_list = [p for p in email_participants if p in linked_members_stage_x]    
    if participant_is_member_list:
        imp_email_stage_x.extend(email_body.split())

In [8]:
def action_simple_append(email_from, email_to, email_body, imp_email_stage_x, linked_members_stage_x = None):
    imp_email_stage_x.append(1)

In [9]:
def action_identify_people(email_from, email_to, email_body, imp_email_stage_x, linked_members_stage_x = None):
    # check if sent FROM one of the imp_people
    if email_from in imp_people:
        # track people in sent TO list
        imp_email_stage_x.extend([e for e in email_to]) 
    else:
        # check if recipient is one of imp_people
        imp_in_to_list = [e for e in email_to if e in imp_people]                                
        if imp_in_to_list:
            imp_email_stage_x.append(email_from)
            # imp_email_stage_x.extend(imp_in_to_list)

## Find out the other people linked to the 'trio' by looking at Kenneth Lay's mailbox.

> ### First let's look at the email counts where the 'trio' was involved across the stages

In [10]:
# Count emails during stages for Kenneth Lay's mailbox
reset_stages(imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files)
readRecursively(relative_path + '/lay-k', imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action_simple_append)
print('No of emails:', len(imp_emails_stage1), len(imp_emails_stage2), len(imp_emails_stage3))

No of emails: 0 0 0


> ### Next, identify email ids that appear the most with the 'trio'

In [11]:
# Identify people (email_ids) during stages for Kenneth Lay's mailbox
reset_stages(imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files)
readRecursively(relative_path + '/lay-k', imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action_identify_people)
print('Email Ids:', len(set(imp_emails_stage1)), len(set(imp_emails_stage2)), len(set(imp_emails_stage3)))

email_occurrence_1 = Counter(imp_emails_stage1)
email_occurrence_2 = Counter(imp_emails_stage2)
email_occurrence_3 = Counter(imp_emails_stage3)

s1_top = sorted(email_occurrence_1.items(), key=lambda i:i[1], reverse=True)[:5]
s2_top = sorted(email_occurrence_2.items(), key=lambda i:i[1], reverse=True)[:5]
s3_top = sorted(email_occurrence_3.items(), key=lambda i:i[1], reverse=True)[:5]

print('Top 5 people in stage 1: ')
for i in s1_top:
    print(i)

print('Top 5 people in stage 2: ')
for i in s2_top:
    print(i)

print('Top 5 people in stage 3: ')
for i in s3_top:
    print(i)


Email Ids: 0 0 0
Top 5 people in stage 1: 
Top 5 people in stage 2: 
Top 5 people in stage 3: 


> ### Finally, let's see what these guys have been up to

In [12]:
# Action Function for word count



## Now let's do the same thing on Jeff Skillings mailbox

In [13]:
# Count emails during stages for Jeff Skilling's mailbox
reset_stages(imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files)
readRecursively(relative_path + '/skilling-j', imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action_simple_append)
print('No of emails', len(imp_emails_stage1), len(imp_emails_stage2), len(imp_emails_stage3))

No of emails 0 0 0


In [14]:
# Identify people (email_ids) during stages for Jeff Skilling's mailbox
reset_stages(imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files)
readRecursively(relative_path + '/skilling-j', imp_emails_stage1, imp_emails_stage2, imp_emails_stage3, unread_files, action_identify_people)
print('Email Ids', len(set(imp_emails_stage1)), len(set(imp_emails_stage2)), len(set(imp_emails_stage3)))

email_occurrence_1 = Counter(imp_emails_stage1)
email_occurrence_2 = Counter(imp_emails_stage2)
email_occurrence_3 = Counter(imp_emails_stage3)

s1_top = sorted(email_occurrence_1.items(), key=lambda i:i[1], reverse=True)[:5]
s2_top = sorted(email_occurrence_2.items(), key=lambda i:i[1], reverse=True)[:5]
# No people in stage 3 because Jeff Skillings resigned

print('Top 5 people in stage 1: ')
for i in s1_top:
    print(i)

print('Top 5 people in stage 2: ')
for i in s2_top:
    print(i)


Email Ids 0 0 0
Top 5 people in stage 1: 
Top 5 people in stage 2: 


In [15]:
item = relative_path + '/lay-k/inbox/5.'
with open(item) as email_file:                
    email_item = Parser().parsestr(email_file.read())
    if (email_item):
        email_body = email_item.get_payload()
        email_from = email_item['from']
        email_to = email_item['to']
        if email_to:
            email_to = email_to.split()
        email_cc = email_item['cc']    
        if email_cc:
            email_cc = email_cc.split()
        email_bcc = email_item['bcc']    
        if email_bcc:
            email_bcc = email_bcc.split()   
            
    print('from', email_from)        
    print('to', email_to)
    print('cc', email_cc)
    print('bcc', email_bcc)  
    print('combined', email_to + email_cc  + email_bcc)
    
    print(type(email_body))

from a..davis@enron.com
to ['kenneth.lay@enron.com']
cc ['cindy.olson@enron.com']
bcc ['cindy.olson@enron.com']
combined ['kenneth.lay@enron.com', 'cindy.olson@enron.com', 'cindy.olson@enron.com']
<class 'str'>


a = []
b = ['a','b']
c = ['a','b','c']
a = a+b
a

a = a+c
a