In [1]:
import requests
import tarfile
import os

url = "https://bailando.berkeley.edu/enron/enron_with_categories.tar.gz"

destination_dir = "/content/data"

if not os.path.exists(destination_dir):
  os.makedirs(destination_dir)

file_name = os.path.join(destination_dir, os.path.basename(url))

response = requests.get(url, stream = True)
if response.status_code == 200:
  with open(file_name, "wb") as f:
    for chunk in response.iter_content(chunk_size = 1024):
      if chunk:
        f.write(chunk)

with tarfile.open(file_name, "r:gz") as tar:
  tar.extractall(destination_dir)

os.remove(file_name)

In [2]:
import pandas as pd

df = pd.DataFrame({"Label": [], "Email": []})
directory_path = "/content/data/enron_with_categories"

for folder_name in range(1, 7):
  folder_path = os.path.join(directory_path, str(folder_name))
  for filename in os.listdir(folder_path):
    if filename.endswith(".txt"):
      txt_file_path = os.path.join(folder_path, filename)
      with open(txt_file_path, "r") as txt_file:
        txt_content = txt_file.read()
      new_row = pd.DataFrame({"Label": [folder_name], "Email": [txt_content]})
      df = pd.concat([df, new_row], ignore_index = True)
df["Label"] = df["Label"].astype(int)
df["Label"] = df["Label"] - 1

In [3]:
df

Unnamed: 0,Label,Email
0,0,Message-ID: <3282112.1075846173955.JavaMail.ev...
1,0,Message-ID: <25005816.1075847581140.JavaMail.e...
2,0,Message-ID: <10288433.1075843393490.JavaMail.e...
3,0,Message-ID: <4131316.1075840896739.JavaMail.ev...
4,0,Message-ID: <23858201.1075846164772.JavaMail.e...
...,...,...
1658,5,Message-ID: <25861174.1075863426951.JavaMail.e...
1659,5,Message-ID: <15325419.1075849875791.JavaMail.e...
1660,5,Message-ID: <5117287.1075847638493.JavaMail.ev...
1661,5,Message-ID: <18786165.1075854496769.JavaMail.e...


In [4]:
df.Label.value_counts()

0    834
3    476
5    143
2    100
4     74
1     36
Name: Label, dtype: int64

In [5]:
from email.parser import Parser
for i in df["Email"]:
  email = Parser().parsestr(i)
  new_email = email["subject"] + email.get_payload()
  df.loc[df["Email"] == i, "Email"] = new_email

In [6]:
df

Unnamed: 0,Label,Email
0,0,Re: testimonyHere it is.\n\n\n\n\n\n\n\nPeggy ...
1,0,"Re: Tariffs ApprovedCongrats Ray. ... Now, ge..."
2,0,WPTF Friday Crazy About U BurritoTHE FRIDAY BU...
3,0,FW: Bingaman Draft On Transparency -- Amendmen...
4,0,Re: executivesJeff Shankman (COO of Enron Glob...
...,...,...
1658,5,"RE: the summary report, I am sorry.Thanks\n\nV..."
1659,5,FW: Draft of Organizational AnnouncementThe ot...
1660,5,CONFIDENTIAL - DO NOT DISTRIBUTECONFIDENTIAL -...
1661,5,"CONFIDENTIAL - DO NOT DISTRIBUTESteve, I only ..."


In [7]:
import re
def remove_quoted(txt):
  arr1 = txt.split("\n")
  for ele in arr1:
    if "<" in ele or "<<" in ele or "<<<" in ele or ">" in ele or ">>" in ele or ">>>" in ele:
      idx = arr1.index(ele)
      arr1 = arr1[:idx]
      break
  text = (" ").join(arr1)

  arr1 = text.split("\t")
  text = (" ").join(arr1)

  original_message_index = text.find("Original Message")
  forwarded_by_index = text.find("Forwarded by")

  if original_message_index != -1 and forwarded_by_index != -1:
    if original_message_index < forwarded_by_index:
      return text[:original_message_index]
    else:
      return text[:forwarded_by_index]
  elif original_message_index != -1:
    return text[:original_message_index]
  elif forwarded_by_index != -1:
    return text[:forwarded_by_index]

  return text

df["Email"] = df["Email"].apply(remove_quoted)

In [8]:
df

Unnamed: 0,Label,Email
0,0,Re: testimonyHere it is. Peggy Mahoney@...
1,0,"Re: Tariffs ApprovedCongrats Ray. ... Now, ge..."
2,0,WPTF Friday Crazy About U BurritoTHE FRIDAY BU...
3,0,FW: Bingaman Draft On Transparency -- Amendmen...
4,0,Re: executivesJeff Shankman (COO of Enron Glob...
...,...,...
1658,5,"RE: the summary report, I am sorry.Thanks Vin..."
1659,5,FW: Draft of Organizational AnnouncementThe ot...
1660,5,CONFIDENTIAL - DO NOT DISTRIBUTECONFIDENTIAL -...
1661,5,"CONFIDENTIAL - DO NOT DISTRIBUTESteve, I only ..."


In [9]:
import string
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [10]:
def remove_whitespaces(text):
  cleaned_string = text.strip()
  cleaned_string = re.sub(r"\s+", " ", cleaned_string)
  return cleaned_string

def convert_to_lowercase(text):
  return text.lower()

def remove_numbers(text):
  return re.sub(r'\d+', '', text)

def remove_punctuations(text):
  return text.translate(str.maketrans("", "", string.punctuation))

def remove_url_html(text):
  text = re.sub(r"http\S+", "", text)
  text = BeautifulSoup(text, "html.parser").get_text()
  return text

def remove_stopwords(text):
  arr = []
  w = list(stopwords.words("english"))
  tokens = word_tokenize(text)
  for i in range(len(tokens)):
    if tokens[i] not in w:
      arr.append(tokens[i])
  return " ".join(arr)

def lemmatization(text):
  lemmatizer = WordNetLemmatizer()
  tokens = word_tokenize(text)
  for i in range(len(tokens)):
    w = lemmatizer.lemmatize(tokens[i])
    tokens[i] = w
  return " ".join(tokens)

In [11]:
df["Email"] = df["Email"].apply(remove_whitespaces)
df["Email"] = df["Email"].apply(convert_to_lowercase)
df["Email"] = df["Email"].apply(remove_numbers)
df["Email"] = df["Email"].apply(remove_punctuations)
df["Email"] = df["Email"].apply(remove_url_html)
df["Email"] = df["Email"].apply(remove_stopwords)

In [12]:
df

Unnamed: 0,Label,Email
0,0,testimonyhere peggy mahoneyees karen dennecorp...
1,0,tariffs approvedcongrats ray get ferc somethin...
2,0,wptf friday crazy u burritothe friday burrito ...
3,0,fw bingaman draft transparency amendment ideas...
4,0,executivesjeff shankman coo enron global marke...
...,...,...
1658,5,summary report sorrythanks vince
1659,5,fw draft organizational announcementthe memo f...
1660,5,confidential distributeconfidential distribute...
1661,5,confidential distributesteve following suggest...


In [13]:
!pip install transformers
!pip install torchmetrics
import torch
from transformers import BertTokenizer, BertForSequenceClassification

Collecting transformers
  Downloading transformers-4.33.3-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.8 MB/s[0m eta [36m0:00:0

In [14]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [15]:
X = df["Email"]
y = df["Label"]

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [17]:
!pip install nlpaug
import nlpaug.augmenter.word.context_word_embs as aug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [18]:
y_train.value_counts()

0    669
3    377
5    116
2     80
4     57
1     31
Name: Label, dtype: int64

In [19]:
X_train.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
X_train = pd.DataFrame(X_train, index = None)
y_train = pd.DataFrame(y_train, index = None)
train_df = pd.concat([y_train, X_train], axis = 1)
train_df

Unnamed: 0,Label,Email
0,3,advisory council meetingfyi
1,0,pge px credit calculation confidential atty cl...
2,3,fw possible cosponsorshipsall lee asked forwar...
3,0,seen thisyes thanks lot vince
4,0,energy issuesplease see following articles sac...
...,...,...
1325,1,fw patent attorneyfyi dad
1326,3,virginia manufacturers association seminar jun...
1327,5,draft response cpuc oiii taken stab putting dr...
1328,0,fw note liquidity


In [20]:
augmenter = aug.ContextualWordEmbsAug(model_path = "bert-base-uncased", action = "insert")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [21]:
from tqdm.auto import tqdm
from sklearn.utils import shuffle
import numpy as np

In [22]:
def f(train_df, augmenter, repetitions = 1):
  augmented_texts = []
  class1 = train_df[train_df["Label"] == 1].reset_index(drop = True)
  for i in tqdm(np.random.randint(0, len(class1), 638)):
    for _ in range(repetitions):
      augmented_text = augmenter.augment(class1["Email"].iloc[i])
      augmented_texts.append(augmented_text)
  data = {
      "Label": 1,
      "Email": augmented_texts
  }
  aug_df = pd.DataFrame(data)
  d1 = pd.concat([train_df, aug_df], axis = 0).reset_index(drop = True)



  augmented_texts = []
  class2 = train_df[train_df["Label"] == 2].reset_index(drop = True)
  for i in tqdm(np.random.randint(0, len(class2), 589)):
    for _ in range(repetitions):
      augmented_text = augmenter.augment(class2["Email"].iloc[i])
      augmented_texts.append(augmented_text)
  data = {
      "Label": 2,
      "Email": augmented_texts
  }
  aug_df = pd.DataFrame(data)
  d2 = pd.concat([d1, aug_df], axis = 0).reset_index(drop = True)



  augmented_texts = []
  class3 = train_df[train_df["Label"] == 3].reset_index(drop = True)
  for i in tqdm(np.random.randint(0, len(class3), 292)):
    for _ in range(repetitions):
      augmented_text = augmenter.augment(class3["Email"].iloc[i])
      augmented_texts.append(augmented_text)
  data = {
      "Label": 3,
      "Email": augmented_texts
  }
  aug_df = pd.DataFrame(data)
  d3 = pd.concat([d2, aug_df], axis = 0).reset_index(drop = True)



  augmented_texts = []
  class4 = train_df[train_df["Label"] == 4].reset_index(drop = True)
  for i in tqdm(np.random.randint(0, len(class4), 612)):
    for _ in range(repetitions):
      augmented_text = augmenter.augment(class4["Email"].iloc[i])
      augmented_texts.append(augmented_text)
  data = {
      "Label": 4,
      "Email": augmented_texts
  }
  aug_df = pd.DataFrame(data)
  d4 = pd.concat([d3, aug_df], axis = 0).reset_index(drop = True)



  augmented_texts = []
  class5 = train_df[train_df["Label"] == 5].reset_index(drop = True)
  for i in tqdm(np.random.randint(0, len(class5), 553)):
    for _ in range(repetitions):
      augmented_text = augmenter.augment(class5["Email"].iloc[i])
      augmented_texts.append(augmented_text)
  data = {
      "Label": 5,
      "Email": augmented_texts
  }
  aug_df = pd.DataFrame(data)
  d5 = pd.concat([d4, aug_df], axis = 0).reset_index(drop = True)



  return d5

In [23]:
# augmented_train = f(train_df, augmenter)

In [24]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [25]:
# file_name = "/content/gdrive/MyDrive/V-Labs/augmented_train.csv"
# augmented_train.to_csv(file_name, index = False)

In [26]:
augmented_train = pd.read_csv("/content/gdrive/MyDrive/V-Labs/augmented_train.csv")

In [27]:
augmented_train = shuffle(augmented_train, random_state = 0)

In [28]:
augmented_train.Label.value_counts()

4    669
3    669
1    669
2    669
0    669
5    669
Name: Label, dtype: int64

In [29]:
import ast

def extract_string_from_list(email):
  try:
    email_list = ast.literal_eval(email)
    if isinstance(email_list, list) and len(email_list) > 0:
      return email_list[0]
  except (ValueError, SyntaxError, MemoryError):
    pass
  return email

augmented_train["Email"] = augmented_train["Email"].apply(extract_string_from_list)

In [30]:
augmented_train.isna().sum()

Label     0
Email    29
dtype: int64

In [31]:
augmented_train.dropna(axis = 0, inplace = True)

In [32]:
augmented_train["Email"] = augmented_train["Email"].apply(lemmatization)
X_val = X_val.apply(lemmatization)

In [33]:
X_train = augmented_train["Email"]
y_train = augmented_train["Label"]

In [34]:
X_t = X_val
y_t = y_val

In [35]:
X_test, X_val, y_test, y_val = train_test_split(X_t, y_t, test_size = 0.5, random_state = 0, stratify = y_t)

In [36]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, X_val.shape, y_val.shape)

(3985,) (3985,) (166,) (166,) (167,) (167,)


In [37]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
train_encodings = tokenizer(list(X_train), truncation = True, padding = True, max_length = 100, return_tensors = "pt")
test_encodings = tokenizer(list(X_test), truncation = True, padding = True, max_length = 100, return_tensors = "pt")
val_encodings = tokenizer(list(X_val), truncation = True, padding = True, max_length = 100, return_tensors = "pt")

train_labels = torch.from_numpy(y_train.to_numpy()).type(torch.LongTensor)
test_labels = torch.from_numpy(y_test.to_numpy()).type(torch.LongTensor)
val_labels = torch.from_numpy(y_val.to_numpy()).type(torch.LongTensor)

train_encodings = train_encodings.to(device)
test_encodings = test_encodings.to(device)
val_encodings = val_encodings.to(device)

train_labels = train_labels.to(device)
test_labels = test_labels.to(device)
val_labels = val_labels.to(device)

In [38]:
from torch.utils.data import DataLoader, TensorDataset
import random

seed_val = 52
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

train_dataset = TensorDataset(train_encodings.input_ids, train_encodings.attention_mask, train_labels)
test_dataset = TensorDataset(test_encodings.input_ids, test_encodings.attention_mask, test_labels)
val_dataset = TensorDataset(val_encodings.input_ids, val_encodings.attention_mask, val_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True)

In [39]:
# from torch import nn

# class_samples = torch.tensor([834, 36, 100, 476, 74, 143], dtype = torch.float32)
# total_samples = class_samples.sum()
# num_classes = len(class_samples)

# class_weights = total_samples / (num_classes * class_samples)
# class_weights /= class_weights.sum()
# class_weights = torch.tensor(class_weights, dtype = torch.float32)

# loss_fn = nn.CrossEntropyLoss(weight = class_weights).to(device)

In [40]:
from torchmetrics import Accuracy
from torch import nn

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels = 6).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-5)
loss_fn = nn.CrossEntropyLoss().to(device)
accuracy = Accuracy(task = "multiclass", num_classes = 6).to(device)

num_epochs = 3

for epoch in range(num_epochs):
  model.train()
  train_loss = 0
  val_loss = 0
  t = torch.tensor([])
  t = t.to(device)
  a = torch.tensor([])
  a = a.to(device)
  for batch in train_loader:
    input_ids, attention_mask, labels = batch
    a = torch.cat((a, labels), dim = 0)
    outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
    y_logits = outputs.logits
    y_preds = torch.softmax(y_logits, dim = 1).argmax(dim = 1)
    t = torch.cat((t, y_preds), dim = 0)
    loss = loss_fn(y_logits, labels)
    train_loss += loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
  train_accuracy = accuracy(t, a)

  model.eval()
  with torch.inference_mode():
    t2 =  torch.tensor([])
    t2 = t2.to(device)
    a2 = torch.tensor([])
    a2 = a2.to(device)
    val_loss = 0
    for batch in val_loader:
      input_ids, attention_mask, labels = batch
      a2 = torch.cat((a2, labels), dim = 0)
      outputs = model(input_ids, attention_mask = attention_mask)
      val_logits = outputs.logits
      val_preds = torch.softmax(val_logits, dim = 1).argmax(dim = 1)
      loss = loss_fn(val_logits, labels)
      val_loss += loss
      t2 = torch.cat((t2, val_preds), dim = 0)
    val_accuracy = accuracy(t2, a2)

  print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss/len(train_loader):.5f} | Train Accuracy: {train_accuracy :.5f} | Validation Loss: {val_loss/len(test_loader):.5f} | Validation Accuracy: {val_accuracy:.5f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 | Train Loss: 1.13885 | Train Accuracy: 0.57666 | Validation Loss: 0.69215 | Validation Accuracy: 0.78443
Epoch 2/3 | Train Loss: 0.30336 | Train Accuracy: 0.90389 | Validation Loss: 0.42465 | Validation Accuracy: 0.85030
Epoch 3/3 | Train Loss: 0.14130 | Train Accuracy: 0.95659 | Validation Loss: 0.30064 | Validation Accuracy: 0.92814


In [41]:
from torchmetrics import Precision, Recall, F1Score, ConfusionMatrix
from sklearn.metrics import classification_report

model.eval()
with torch.inference_mode():
  t3 =  torch.tensor([])
  t3 = t3.to(device)
  a3 = torch.tensor([])
  a3 = a3.to(device)
  for batch in test_loader:
    input_ids, attention_mask, labels = batch
    a3 = torch.cat((a3, labels), dim = 0)
    outputs = model(input_ids, attention_mask = attention_mask)
    test_logits = outputs.logits
    test_preds = torch.softmax(test_logits, dim = 1).argmax(dim = 1)
    t3 = torch.cat((t3, test_preds), dim = 0)

  num_classes = 6
  accuracy = accuracy(t3, a3)
  precision = Precision(task = "multiclass", average = "macro", num_classes = 6).to(device)
  precision_scores = precision(t3, a3)
  f1score = F1Score(task = "multiclass", num_classes = 6).to(device)
  f1_scores = f1score(t3, a3)
  recall = Recall(task = "multiclass", average = "macro", num_classes = 6).to(device)
  recall_scores = recall(t3, a3)
  confusionmatrix = ConfusionMatrix(task = "multiclass", num_classes = 6).to(device)
  confusion_matrix = confusionmatrix(t3, a3)
  print(f"Accuracy: {accuracy:.5f}\nPrecision: {precision_scores:.5f}\nRecall: {recall_scores:.5f}\nF1 Score: {f1_scores:.5f}\nConfusion Matrix:\n\t{confusion_matrix}")

Accuracy: 0.91566
Precision: 0.93727
Recall: 0.91474
F1 Score: 0.91566
Confusion Matrix:
	tensor([[75,  0,  2,  5,  0,  0],
        [ 0,  3,  0,  0,  0,  0],
        [ 0,  0,  9,  1,  0,  0],
        [ 3,  0,  0, 46,  0,  0],
        [ 1,  0,  0,  0,  8,  0],
        [ 1,  0,  0,  1,  0, 11]], device='cuda:0')


In [42]:
print(classification_report(a3.to("cpu").numpy(), t3.to("cpu").numpy()))

              precision    recall  f1-score   support

         0.0       0.94      0.91      0.93        82
         1.0       1.00      1.00      1.00         3
         2.0       0.82      0.90      0.86        10
         3.0       0.87      0.94      0.90        49
         4.0       1.00      0.89      0.94         9
         5.0       1.00      0.85      0.92        13

    accuracy                           0.92       166
   macro avg       0.94      0.91      0.92       166
weighted avg       0.92      0.92      0.92       166

