# Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -U accelerate
!pip install -q transformers datasets evaluate
%cd /content/drive/MyDrive/Code/Maitrise/sappelli_email_classification

Collecting accelerate
  Downloading accelerate-0.27.0-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that ar

# Preprocessing

In [20]:
from datasets import load_dataset

raw_dataset = load_dataset("csv", data_files="./data/act1_encoded.csv")
raw_dataset['train'][0]

{'label': 3,
 'text': 'It was cute!\tGenia FitzGerald\t10/10/2000 03:23 PM\t\t \t\t To: Marie Heard/Enron Communications@Enron Communications\t\t cc: Tana Jones/HOU/ECT@ECT\t\t Subject: Re: HOTDOG !This is sooo adorable!Thanks for inviting me to share Mexican food with you.  Tana said it was YOUR fault I wasn t invited!!!!!Genia----- Forwarded by Genia FitzGerald/HOU/ECT on 10/10/2000 03:21 PM -----\t\t----- Forwarded by Genia FitzGerald/HOU/ECT on 10/10/2000 02:30 PM ----- - Hotdog PHOTO2.jpg'}

In [21]:
from datasets import DatasetDict

dataset_lenght = len(raw_dataset["train"])
print(f"Dataset length: {dataset_lenght}")

train_test_dataset = raw_dataset["train"].train_test_split(test_size=0.2, shuffle=False, seed=42)
val_test_dataset = train_test_dataset["test"].train_test_split(test_size=0.5, shuffle=False, seed=42)


dataset = DatasetDict({
    "train": train_test_dataset["train"],
    "validation": val_test_dataset["train"],
    "test": val_test_dataset["test"]
})

# prompt: length of train, validation, and test

train_length = len(dataset["train"])
val_length = len(dataset["validation"])
test_length = len(dataset["test"])

print(f"Train length: {train_length}")
print(f"Validation length: {val_length}")
print(f"Test length: {test_length}")

Dataset length: 570
Train length: 456
Validation length: 57
Test length: 57


In [23]:
dataset['test'][0]

{'label': 3, 'text': 'Very sweet.  Thank you very much.'}

In [24]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [7]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [8]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

Map:   0%|          | 0/114 [00:00<?, ? examples/s]

In [9]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation metrics

In [42]:
import numpy as np
from evaluate import load

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric1 = load("accuracy")
    metric2 = load("f1")
    metric3 = load("precision")
    metric4 = load("recall")
    accuracy = metric1.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = metric2.compute(predictions=predictions, references=labels, average="micro")["f1"]
    precision = metric3.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = metric4.compute(predictions=predictions, references=labels, average="micro")["recall"]
    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

# Training

In [36]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
training_args = TrainingArguments(
    output_dir="./results/act1",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.234026,0.535088,0.535088,0.535088,0.535088
2,No log,1.146786,0.596491,0.596491,0.596491,0.596491
3,No log,1.162856,0.614035,0.614035,0.614035,0.614035
4,No log,1.178918,0.605263,0.605263,0.605263,0.605263
5,No log,1.206668,0.578947,0.578947,0.578947,0.578947


TrainOutput(global_step=145, training_loss=0.7692740078630118, metrics={'train_runtime': 294.372, 'train_samples_per_second': 7.745, 'train_steps_per_second': 0.493, 'total_flos': 598054867486080.0, 'train_loss': 0.7692740078630118, 'epoch': 5.0})

In [46]:
# prompt: save model with name act1

model.save_pretrained("./models/act1")


# Loading

In [15]:
# prompt: load local model in /content/drive/MyDrive/Code/Maitrise/sappelli/models/act1/model.safetensors
from transformers import AutoModelForSequenceClassification

# Download model and configuration from huggingface.co and cache.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)
# Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
model = AutoModelForSequenceClassification.from_pretrained("./models/act1/")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Evaluation

In [45]:
trainer.evaluate()

{'eval_loss': 1.1628563404083252,
 'eval_accuracy': 0.6140350877192983,
 'eval_f1': 0.6140350877192983,
 'eval_precision': 0.6140350877192983,
 'eval_recall': 0.6140350877192983,
 'eval_runtime': 6.9819,
 'eval_samples_per_second': 16.328,
 'eval_steps_per_second': 1.146,
 'epoch': 5.0}