# Colab

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!pip install -U accelerate
!pip install -q transformers datasets evaluate
%cd /content/drive/MyDrive/Code/Maitrise/sappelli_email_classification

/content/drive/MyDrive/Code/Maitrise/sappelli_email_classification


# Preprocessing

In [9]:
from datasets import load_dataset

raw_dataset = load_dataset("csv", data_files="./data/numta_encoded.csv")
raw_dataset['train'][0]

Generating train split: 0 examples [00:00, ? examples/s]

{'label': 0,
 'text': 'It was cute!\tGenia FitzGerald\t10/10/2000 03:23 PM\t\t \t\t To: Marie Heard/Enron Communications@Enron Communications\t\t cc: Tana Jones/HOU/ECT@ECT\t\t Subject: Re: HOTDOG !This is sooo adorable!Thanks for inviting me to share Mexican food with you.  Tana said it was YOUR fault I wasn t invited!!!!!Genia----- Forwarded by Genia FitzGerald/HOU/ECT on 10/10/2000 03:21 PM -----\t\t----- Forwarded by Genia FitzGerald/HOU/ECT on 10/10/2000 02:30 PM ----- - Hotdog PHOTO2.jpg'}

In [10]:
from datasets import DatasetDict

dataset_lenght = len(raw_dataset["train"])
print(f"Dataset length: {dataset_lenght}")

train_test_dataset = raw_dataset["train"].train_test_split(test_size=0.2, shuffle=False, seed=42)
val_test_dataset = train_test_dataset["test"].train_test_split(test_size=0.5, shuffle=False, seed=42)


dataset = DatasetDict({
    "train": train_test_dataset["train"],
    "validation": val_test_dataset["train"],
    "test": val_test_dataset["test"]
})

# prompt: length of train, validation, and test

train_length = len(dataset["train"])
val_length = len(dataset["validation"])
test_length = len(dataset["test"])

print(f"Train length: {train_length}")
print(f"Validation length: {val_length}")
print(f"Test length: {test_length}")

Train length: 578
Test length: 145


In [11]:
dataset['test'][0]

{'label': 1,
 'text': 'Allyson,You should receive some formation documents relating to Edgecombe Development.  The name is not spelled correctly (missing an "e"), but that is being corrected at this time.   Bob spoke to me about wanting Enron to send a letter concerning the financing arrangements for the project.  I would like for us to discuss this as well.Kay'}

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [13]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Map:   0%|          | 0/145 [00:00<?, ? examples/s]

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation metrics

In [18]:
import numpy as np
from evaluate import load

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric1 = load("accuracy")
    metric2 = load("f1")
    metric3 = load("precision")
    metric4 = load("recall")
    accuracy = metric1.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = metric2.compute(predictions=predictions, references=labels, average="micro")["f1"]
    precision = metric3.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = metric4.compute(predictions=predictions, references=labels, average="micro")["recall"]
    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Training

In [15]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir="./results/numta",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.725069,0.682759
2,No log,0.49998,0.772414
3,No log,0.532809,0.786207
4,No log,0.581148,0.772414
5,No log,0.638769,0.758621


TrainOutput(global_step=185, training_loss=0.3141700332229202, metrics={'train_runtime': 297.1008, 'train_samples_per_second': 9.727, 'train_steps_per_second': 0.623, 'total_flos': 757488747180048.0, 'train_loss': 0.3141700332229202, 'epoch': 5.0})

In [21]:
model.save_pretrained("./models/numta")

# Loading

In [None]:
# prompt: load local model in /content/drive/MyDrive/Code/Maitrise/sappelli/models/act1/model.safetensors
from transformers import AutoModelForSequenceClassification

# Download model and configuration from huggingface.co and cache.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
# Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
model = AutoModelForSequenceClassification.from_pretrained("./models/numta/")


# Evaluation

In [23]:
trainer.evaluate()

{'eval_loss': 0.6387693285942078,
 'eval_accuracy': 0.7586206896551724,
 'eval_runtime': 5.1036,
 'eval_samples_per_second': 28.411,
 'eval_steps_per_second': 1.959,
 'epoch': 5.0}