In [1]:
import pandas as pd
import numpy as np

## Read the pre-processed data

In [2]:
data = pd.read_csv("processed_enron.csv")

In [3]:
data.head(5)

Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,frozenset({'phillip.allen@enron.com'}),frozenset({'tim.belden@enron.com'}),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,frozenset({'phillip.allen@enron.com'}),frozenset({'john.lavorato@enron.com'}),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,frozenset({'phillip.allen@enron.com'}),frozenset({'leah.arsdall@enron.com'}),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,frozenset({'phillip.allen@enron.com'}),frozenset({'randall.gay@enron.com'}),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,frozenset({'phillip.allen@enron.com'}),frozenset({'greg.piper@enron.com'}),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


## Clean up the email ids

The email ids is what we are primarily interested in but they need to be cleaned up to remove the "frozenset" 

In [6]:
all_ids = np.array(data['From'])
clean_ids = [x.split("{")[1].split("}")[0] for x in all_ids]
data['From'] = clean_ids

In [7]:
data.head(5)

Unnamed: 0,Message-ID,Date,From,To,Subject,X-From,X-To,X-cc,X-bcc,X-Folder,X-Origin,X-FileName,content,user
0,<18782981.1075855378110.JavaMail.evans@thyme>,2001-05-14 23:39:00,'phillip.allen@enron.com',frozenset({'tim.belden@enron.com'}),,Phillip K Allen,Tim Belden <Tim Belden/Enron@EnronXGate>,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Here is our forecast\n\n,allen-p
1,<15464986.1075855378456.JavaMail.evans@thyme>,2001-05-04 20:51:00,'phillip.allen@enron.com',frozenset({'john.lavorato@enron.com'}),Re:,Phillip K Allen,John J Lavorato <John J Lavorato/ENRON@enronXg...,,,"\Phillip_Allen_Jan2002_1\Allen, Phillip K.\'Se...",Allen-P,pallen (Non-Privileged).pst,Traveling to have a business meeting takes the...,allen-p
2,<24216240.1075855687451.JavaMail.evans@thyme>,2000-10-18 10:00:00,'phillip.allen@enron.com',frozenset({'leah.arsdall@enron.com'}),Re: test,Phillip K Allen,Leah Van Arsdall,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,test successful. way to go!!!,allen-p
3,<13505866.1075863688222.JavaMail.evans@thyme>,2000-10-23 13:13:00,'phillip.allen@enron.com',frozenset({'randall.gay@enron.com'}),,Phillip K Allen,Randall L Gay,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,"Randy,\n\n Can you send me a schedule of the s...",allen-p
4,<30922949.1075863688243.JavaMail.evans@thyme>,2000-08-31 12:07:00,'phillip.allen@enron.com',frozenset({'greg.piper@enron.com'}),Re: Hello,Phillip K Allen,Greg Piper,,,\Phillip_Allen_Dec2000\Notes Folders\'sent mail,Allen-P,pallen.nsf,Let's shoot for Tuesday at 11:45.,allen-p


## Searching for words associated with suspicious behavior

List of all the words that word2vec threw at us which relate to suspicious behavior. Let's scan through the corpus to find these words and their users

In [8]:
bad_terms = ['fraudulent','scam','embezzlement', 'forgery', 
 'swindle', 'fraudsters','bribery',  'theft', 'defrauded', 'corruption','suspicous','suspect',
 'amiss','fishy', 'alarmed', 'unexplained', 'strange', 'mysterious', 'untoward','police', 'policeman', 'constable', 
 'patrol','detective', 'arrested', 'apprehended', 'trooper', 'arrest', 'arresting','cop']

In [9]:
from collections import defaultdict

suspicious_peeps = defaultdict(int)

In [10]:
for email, c in zip(list(data.From), list(data.content)):
    if any(word in c for word in bad_terms):
        suspicious_peeps[email] += 1
        

## Find top 15 emails ids asssociated with these words

In [12]:
sorted(suspicious_peeps.items(), key=lambda x:x[1])[-15:]

[("'gerald.nemec@enron.com'", 472),
 ("'drew.fossum@enron.com'", 496),
 ("'sally.beck@enron.com'", 590),
 ("'chris.germany@enron.com'", 597),
 ("'veronica.espinoza@enron.com'", 609),
 ("'mark.taylor@enron.com'", 656),
 ("'enron.announcements@enron.com'", 836),
 ("'richard.sanders@enron.com'", 906),
 ("'steven.kean@enron.com'", 992),
 ("'exchangeinfo@nymex.com'", 1135),
 ("'jeff.dasovich@enron.com'", 1341),
 ("'tana.jones@enron.com'", 1472),
 ("'sara.shackleton@enron.com'", 1643),
 ("'vince.kaminski@enron.com'", 1924),
 ("'kay.mann@enron.com'", 3926)]