In [14]:
import zipfile
import pandas as pd

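Download the Enron email dataset from Kaggle into the `./data` folder, then shuffle it, split it into train / test / holdout sets (80 / 10 / 10), draw a small sample from each split, and save everything as CSV files.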

In [2]:
# Notebook parameters.
# Kaggle dataset slug passed to the kaggle CLI download command.
dataset_name = "wcukierski/enron-email-dataset"
# Number of rows drawn for each "*_small" sample of a split.
n_small = 10_000

In [3]:
# Download the Kaggle dataset named by `dataset_name` into ./data using the
# kaggle CLI; `{dataset_name}` is interpolated from the Python variable.
# NOTE(review): presumably needs the kaggle CLI installed and API credentials
# configured — confirm for your environment.
# The CLI skips the download when a more recently modified local copy already
# exists (pass --force to re-download).
!kaggle datasets download -d {dataset_name} -p ./data

enron-email-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
# Extract every member of the downloaded archive into the data folder.
archive_path = "./data/enron-email-dataset.zip"
with zipfile.ZipFile(archive_path, mode="r") as archive:
    archive.extractall(path="./data/")

In [7]:
# Load the extracted email CSV into a DataFrame and report its shape.
emails_csv_path = "./data/emails.csv"
df = pd.read_csv(emails_csv_path)
print(df.shape)

(517401, 2)


In [8]:
# Shuffle the full frame, split it 80/10/10 into train/test/holdout, draw a
# small sample of `n_small` rows from each split, and save all six frames.
#
# Every random draw is seeded so that Restart & Run All reproduces exactly the
# same splits and CSV files. Each step gets its own seed so the draws are not
# all derived from the same permutation.
split_seed = 42

# shuffle data
# NOTE: this rebinds `df`; re-running this cell without re-reading the CSV
# shuffles the already-shuffled frame and therefore yields a different split.
df = df.sample(frac=1, random_state=split_seed).reset_index(drop=True)

# split into train, test and holdout 80, 10, 10
df_train = df.sample(frac=0.8, random_state=split_seed + 1)
df_rest = df.drop(df_train.index)  # the remaining 20%
df_holdout = df_rest.sample(frac=0.5, random_state=split_seed + 2)
df_test = df_rest.drop(df_holdout.index)

# small samples of each split (raises ValueError if a split has < n_small rows)
df_train_small = df_train.sample(n_small, random_state=split_seed + 3)
df_test_small = df_test.sample(n_small, random_state=split_seed + 4)
df_holdout_small = df_holdout.sample(n_small, random_state=split_seed + 5)

# sanity checks: the three splits must partition the shuffled frame
assert len(df_train) + len(df_test) + len(df_holdout) == len(df)
assert len(df_train.index.union(df_test.index).union(df_holdout.index)) == len(df)

# Keyed by the suffix used in both the printed label and the output file name.
splits = {
    "train": df_train,
    "train_small": df_train_small,
    "test": df_test,
    "test_small": df_test_small,
    "holdout": df_holdout,
    "holdout_small": df_holdout_small,
}

# print shapes
for split_name, split_frame in splits.items():
    print(f"df_{split_name}: {split_frame.shape}")

# save data
for split_name, split_frame in splits.items():
    split_frame.to_csv(f"./data/emails_{split_name}.csv", index=False)

df_train: (413921, 2)
df_train_sample: (41392, 2)
df_test: (51740, 2)
df_holdout: (51740, 2)


In [13]:
# Print a few randomly chosen training emails for a quick manual inspection.
df_sample = df_train_small.sample(5)
separator = "=" * 100
for file_name, message_text in zip(df_sample["file"], df_sample["message"]):
    print(separator)
    print(f"file: {file_name}")
    print(f"message:\n{message_text}")
    print(separator)

file: gay-r/all_documents/407.
message:
Message-ID: <23579752.1075855738305.JavaMail.evans@thyme>
Date: Wed, 19 Jan 2000 05:17:00 -0800 (PST)
From: randall.gay@enron.com
To: heather.choate@enron.com
Subject: 
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
X-From: Randall L Gay
X-To: Heather Choate
X-cc: 
X-bcc: 
X-Folder: \Randall_Gay_Dec2000\Notes Folders\All documents
X-Origin: Gay-R
X-FileName: rgay.nsf

 

Thanks Heather
file: nemec-g/sent_items/552.
message:
Message-ID: <5652121.1075858558404.JavaMail.evans@thyme>
Date: Mon, 27 Aug 2001 07:01:15 -0700 (PDT)
From: gerald.nemec@enron.com
To: mary.ogden@enron.com
Subject: FW: Mowing Letter
Cc: rakhi.israni@enron.com
Mime-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Transfer-Encoding: 7bit
Bcc: rakhi.israni@enron.com
X-From: Nemec, Gerald </O=ENRON/OU=NA/CN=RECIPIENTS/CN=GNEMEC>
X-To: Ogden, Mary </O=ENRON/OU=NA/CN=RECIPIENTS/CN=Mogden>
X-cc: Israni, Rakhi </O=ENRON/OU=N