In [1]:
import os, inspect
import re
from itertools import chain, compress
import pandas as pd
import numpy as np
import datetime as dt
from time import time

In [2]:
# Project paths are resolved from the kernel's working directory.
# The frame-based lookup (inspect.getabsfile(inspect.currentframe())) is not
# reliable inside a notebook: the "file" of a cell is a kernel-generated name,
# and on recent ipykernel releases it lives in a temporary directory, so the
# derived ROOT_DIR would not point at the project.
# NOTE(review): assumes the kernel is started in the notebook's own directory
# (Jupyter's default) - confirm for other launch setups.
CURR_DIR = os.getcwd()
ROOT_DIR = os.path.dirname(CURR_DIR)

# Raw event-history CSV that the loading cell reads.
ffname = os.path.join(ROOT_DIR, "enron", "data", "raw",
                      "enron-event-history-all.csv")

In [3]:
def fast_flatten(in_list):
    """Flatten one level of nesting.

    Parameters
    ----------
    in_list : iterable of iterables
        Outer container (list, generator, ...) whose items are themselves
        iterable.

    Returns
    -------
    list
        All inner items in their original order.
    """
    flattened = []
    for inner in in_list:
        # extend() consumes each inner iterable without building temporaries
        flattened.extend(inner)
    return flattened

### Set params

In [4]:
# Cleaning parameters.
# NOTE(review): neither list is referenced by the other cells visible in this
# notebook (the quote stripping in `func` hard-codes '"') - confirm whether
# they are still needed.
chars_to_drop = ['"']          # characters to strip
values_to_drop = ["", "."]     # placeholder values to drop

In [5]:
def _epoch_ms_to_datetime(raw_ms):
    """Turn a millisecond epoch value (as read from the CSV) into a datetime.

    ``datetime.fromtimestamp`` is called without ``tz``, so the result is a
    naive datetime in the local timezone of the machine running the notebook.
    """
    return dt.datetime.fromtimestamp(int(raw_ms) / 1000.0)


# The file is read without a header row: all six fields are named, only the
# three used by the rest of the notebook are kept, and field 0 is converted
# while parsing.
df = pd.read_csv(
    ffname,
    header=None,
    converters={0: _epoch_ms_to_datetime},
    names=["datetime", "id", "sender", "recipients", "topic", "mode"],
    usecols=["datetime", "sender", "recipients"],
)
df.head()

Unnamed: 0,datetime,sender,recipients
0,1998-05-27 17:31:00,Christopher Behney,Toni P Schulenburg|mary hain
1,1998-10-30 17:43:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com
2,1998-10-30 17:56:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com
3,1998-10-30 18:02:00,mark legal taylor,shari stack
4,1998-10-30 19:06:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com


In [6]:
# Split the timestamp into separate calendar-date and time-of-day columns.
df["date"] = df["datetime"].dt.date
df["time"] = df["datetime"].dt.time

# Structural summary of the frame.
# "\n" is used instead of os.linesep: print() writes through a text-mode
# stream that already translates "\n" to the platform line ending, so
# os.linesep would come out as "\r\r\n" on Windows.
# (The earlier mid-cell `df.head()` was removed: only the last expression of
# a cell is displayed, so it had no effect.)
print("-"*30 + " columns " + "-"*30 + "\n")
print(repr(df.columns))
print("-"*30 + " shape" + "-"*30 + "\n")
print(df.shape)
print("-"*30 + " data types " + "-"*30 + "\n")
print(df.dtypes)
print("-"*30 + " top 5 rows " + "-"*30 + "\n")
df.head()

------------------------------ columns ------------------------------

Index(['datetime', 'sender', 'recipients', 'date', 'time'], dtype='object')
------------------------------ shape------------------------------

(205731, 5)
------------------------------ data types ------------------------------

datetime      datetime64[ns]
sender                object
recipients            object
date                  object
time                  object
dtype: object
------------------------------ top 5 rows ------------------------------



Unnamed: 0,datetime,sender,recipients,date,time
0,1998-05-27 17:31:00,Christopher Behney,Toni P Schulenburg|mary hain,1998-05-27,17:31:00
1,1998-10-30 17:43:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com,1998-10-30,17:43:00
2,1998-10-30 17:56:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com,1998-10-30,17:56:00
3,1998-10-30 18:02:00,mark legal taylor,shari stack,1998-10-30,18:02:00
4,1998-10-30 19:06:00,mark legal taylor,Marc.R.Cutler@BankAmerica.com,1998-10-30,19:06:00


In [7]:
# count the missing values in every column
df.isna().sum()

datetime       0
sender        32
recipients    38
date           0
time           0
dtype: int64

In [8]:
# Remove, in place, every row where 'sender' or 'recipients' is missing.
# dropna's defaults (axis=0, how="any") already mean "drop a row if any of the
# listed columns is null", so only the column subset is spelled out.
df.dropna(subset=["sender", "recipients"], inplace=True)

In [9]:
# lower-case every 'sender'/'recipients' entry and strip double-quote characters
def func(x):
    """Return ``x`` lower-cased with every double-quote character removed."""
    return x.lower().replace('"', "")
# Apply `func` to every cell of the two name columns, one column at a time.
# Series.map is used because DataFrame.applymap is deprecated in recent pandas
# releases (it emits a FutureWarning); mapping each column gives the same
# element-wise result on both old and new versions.
df[["sender", "recipients"]] = df[["sender", "recipients"]].apply(
    lambda column: column.map(func)
)
df.head()

Unnamed: 0,datetime,sender,recipients,date,time
0,1998-05-27 17:31:00,christopher behney,toni p schulenburg|mary hain,1998-05-27,17:31:00
1,1998-10-30 17:43:00,mark legal taylor,marc.r.cutler@bankamerica.com,1998-10-30,17:43:00
2,1998-10-30 17:56:00,mark legal taylor,marc.r.cutler@bankamerica.com,1998-10-30,17:56:00
3,1998-10-30 18:02:00,mark legal taylor,shari stack,1998-10-30,18:02:00
4,1998-10-30 19:06:00,mark legal taylor,marc.r.cutler@bankamerica.com,1998-10-30,19:06:00


In [10]:
# split each recipients string on "|" so every row holds a list of names
df["recipients"] = df["recipients"].str.split("|")
df.head()

Unnamed: 0,datetime,sender,recipients,date,time
0,1998-05-27 17:31:00,christopher behney,"[toni p schulenburg, mary hain]",1998-05-27,17:31:00
1,1998-10-30 17:43:00,mark legal taylor,[marc.r.cutler@bankamerica.com],1998-10-30,17:43:00
2,1998-10-30 17:56:00,mark legal taylor,[marc.r.cutler@bankamerica.com],1998-10-30,17:56:00
3,1998-10-30 18:02:00,mark legal taylor,[shari stack],1998-10-30,18:02:00
4,1998-10-30 19:06:00,mark legal taylor,[marc.r.cutler@bankamerica.com],1998-10-30,19:06:00


### Create recipients DataFrame

In [11]:
# Column layout of the recipients table assembled by the following cells.
column_names = ["datetime", "recipient"]

# One small frame per row of `df`: the row's timestamp repeated once per
# recipient, next to that row's recipient list.
# Only the two needed columns are walked with zip(); iterrows() would build a
# full Series for every row, and the former `df.loc[0:, :]` slice selected the
# whole frame anyway. The empty `df_recipients` placeholder was dropped because
# a later cell assigns `df_recipients` from scratch.
frames = [
    pd.DataFrame(
        data={"datetime": [sent_at] * len(names), "recipient": names})
    for sent_at, names in zip(df["datetime"], df["recipients"])
]

In [12]:
# create dictionary from column names
# A comprehension gives every key its own empty list; dict.fromkeys(keys, [])
# would bind ONE shared list object to all keys, so appending through one key
# would show up under every other key as well.
df_dict_ = {name: [] for name in column_names}

In [13]:
# Fill each output column by concatenating that column from every per-row
# frame. The inner generator hands the pieces to fast_flatten lazily, so no
# intermediate list of columns is built.
df_dict_.update(
    (name, fast_flatten(piece[name] for piece in frames))
    for name in column_names
)

In [14]:
# Assemble the flattened columns into one table, ordered as in column_names.
df_recipients = pd.DataFrame(df_dict_).loc[:, column_names]
df_recipients.head()

Unnamed: 0,datetime,recipient
0,1998-05-27 17:31:00,toni p schulenburg
1,1998-05-27 17:31:00,mary hain
2,1998-10-30 17:43:00,marc.r.cutler@bankamerica.com
3,1998-10-30 17:56:00,marc.r.cutler@bankamerica.com
4,1998-10-30 18:02:00,shari stack


### Create senders data frame

In [15]:
# Sender table: the timestamp and sender columns of every row in `df`.
df_senders = df.loc[:, ["datetime", "sender"]]
df_senders.head()

Unnamed: 0,datetime,sender
0,1998-05-27 17:31:00,christopher behney
1,1998-10-30 17:43:00,mark legal taylor
2,1998-10-30 17:56:00,mark legal taylor
3,1998-10-30 18:02:00,mark legal taylor
4,1998-10-30 19:06:00,mark legal taylor


### Save recipients/senders data frames

In [16]:
# Write the recipients table to <ROOT_DIR>/enron/data/ext/enron-recipients.csv.
ffname = os.path.join(ROOT_DIR, "enron", "data", "ext",
                      "enron-recipients.csv")
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs(os.path.dirname(ffname), exist_ok=True)
df_recipients.to_csv(ffname, index=False)

In [17]:
# Write the senders table to <ROOT_DIR>/enron/data/ext/enron-senders.csv.
ffname = os.path.join(ROOT_DIR, "enron", "data", "ext",
                      "enron-senders.csv")
# Create the output folder first so to_csv does not fail when it is missing.
os.makedirs(os.path.dirname(ffname), exist_ok=True)
df_senders.to_csv(ffname, index=False)