In [None]:
%pip install datasets transformers evaluate scikit-learn 'transformers[torch]'

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import argparse
import torch
import torch.nn.functional as F
from sklearn.utils.class_weight import compute_class_weight


In [None]:
def get_special_tokens():
    #[SEP] is already a special token in BERT, these are additional special tokens
    return ['[USER]', '[LINK]', '[EMAIL]', '[IMAGE]', '[QUOTED_TEXT]', '[EMPTY]', '[HTML]']

In [None]:
token = #REDACTED

In [None]:

def train(input_csv: str, category_file: str, checkpoint: str = None):

    with open(category_file, 'r') as f:
        categories = f.readlines()
        num_labels = len(categories)

    dataset = load_dataset('csv', data_files=input_csv, split='train')
    dataset = dataset.train_test_split(test_size=0.2)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    special_tokens = get_special_tokens()
    tokenizer.add_tokens(special_tokens)
    def tokenize_function(examples):
        return tokenizer(examples['text'], padding="max_length", truncation=True)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    train_dataset = tokenized_dataset["train"].shuffle(seed=42)#.select(range(1000))
    eval_dataset = tokenized_dataset["test"].shuffle(seed=42)#.select(range(1000))

    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=num_labels)
    model.resize_token_embeddings(len(tokenizer))
    training_args = TrainingArguments(output_dir="test_trainer",
                                      evaluation_strategy="epoch",
                                      save_strategy="epoch",
                                      per_device_train_batch_size=32,
                                      per_device_eval_batch_size=8,
                                      num_train_epochs=10,
                                      )
    metric = evaluate.load("f1")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        return metric.compute(predictions=predictions, references=labels, average="micro")

    device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
    model.to(device)
    #class_weights_tensor = compute_class_weights(train_dataset)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    # trainer = WeightedTrainer(
    #     model=model,
    #     args=training_args,
    #     train_dataset=train_dataset,
    #     eval_dataset=eval_dataset,
    #     compute_metrics=compute_metrics,
    #     class_weights=class_weights_tensor
    # )

    trainer.train(resume_from_checkpoint=checkpoint)


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%pip install accelerate -U



In [None]:
%pip install transformers==4.30

Collecting transformers==4.30
  Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.19.1
    Uninstalling tokenizers-0.19.1:
      Successfully uninstalled tokenizers-0.19.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.40.0
    Uninstalling transformers-4.40.0:
      Successfully uninstalled transformers-4.40.0
Successfully installed tokenizers-0.13.3 transformers-4.30.0


In [None]:
#login to hugging face
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
train('drive/MyDrive/cleaned_all.csv', 'training_categories')

Generating train split: 0 examples [00:00, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/60572 [00:00<?, ? examples/s]

Map:   0%|          | 0/15144 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,0.2298,0.154303,0.952456
2,0.1224,0.108872,0.967248
3,0.0856,0.1041,0.9687
4,0.0707,0.082483,0.974908
5,0.0613,0.093874,0.976096
6,0.0467,0.081851,0.977945
7,0.0438,0.093493,0.976162
8,0.0333,0.0832,0.978407
9,0.0301,0.092263,0.978539
10,0.0258,0.108456,0.978143


In [None]:
!zip -r /content/training_info.zip /content/test_trainer

zip I/O error: Operation not supported
zip error: Could not create output file (/content/drive/training_info.zip)


In [None]:
!cp /content/training_info.zip /gdrive/MyDrive/

cp: cannot create regular file '/gdrive/MyDrive/': No such file or directory


In [None]:
#download /content/training_info.zip

from google.colab import files
files.download('/content/training_info.zip')

In [1]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_metric = load_metric("f1")
    acc_metric = load_metric("accuracy")
    per_class_f1 = f1_metric.compute(predictions=predictions, references=labels, average=None)['f1']
    return {
        "f1": f1_metric.compute(predictions=predictions, references=labels, average="micro")['f1'],
        "accuracy": acc_metric.compute(predictions=predictions, references=labels)['accuracy'],
        "class_0_f1": per_class_f1[0],
        "class_1_f1": per_class_f1[1],
        "class_2_f1": per_class_f1[2],
        "class_3_f1": per_class_f1[3],
        "class_4_f1": per_class_f1[4],
        "class_5_f1": per_class_f1[5],
        "class_6_f1": per_class_f1[6],
        "class_7_f1": per_class_f1[7],
    }

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
import os
from datasets import load_metric

In [None]:
dataset = load_dataset('csv', data_files='drive/MyDrive/cleaned_all.csv', split='train')
dataset = dataset.train_test_split(test_size=0.05)
eval_dataset = dataset["test"]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
special_tokens = get_special_tokens()
tokenizer.add_tokens(special_tokens)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3786 [00:00<?, ? examples/s]

In [None]:
results = []
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
checkpoints_dir = "test_trainer"

for checkpoint in sorted(os.listdir("test_trainer")):
    if checkpoint.startswith("checkpoint-"):
        model_path = os.path.join(checkpoints_dir, checkpoint)
        print(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        model.to(device)

        trainer = Trainer(
            model=model,
            args=TrainingArguments(output_dir="temp_dir"),  # Temp dir for evaluation
            eval_dataset=tokenized_eval_dataset,
            compute_metrics=compute_metrics
        )

        eval_results = trainer.evaluate()
        print(eval_results)
        results.append({
            "checkpoint": checkpoint,
            "f1": eval_results["eval_f1"],
            "accuracy": eval_results["eval_accuracy"],
            "class_0_f1": eval_results["eval_class_0_f1"],
            "class_1_f1": eval_results["eval_class_1_f1"],
            "class_2_f1": eval_results["eval_class_2_f1"],
            "class_3_f1": eval_results["eval_class_3_f1"],
            "class_4_f1": eval_results["eval_class_4_f1"],
            "class_5_f1": eval_results["eval_class_5_f1"],
            "class_6_f1": eval_results["eval_class_6_f1"],
            "class_7_f1": eval_results["eval_class_7_f1"],
        })

# Convert results to DataFrame and save as CSV
results_df = pd.DataFrame(results)
results_df.to_csv("model_evaluation_results.csv", index=False)

test_trainer/checkpoint-11358


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.04013960808515549, 'eval_f1': 0.9838880084521923, 'eval_accuracy': 0.9838880084521923, 'eval_class_0_f1': 0.9893121403124144, 'eval_class_1_f1': 0.9620817843866171, 'eval_class_2_f1': 0.9686746987951808, 'eval_class_3_f1': 0.9887640449438202, 'eval_class_4_f1': 0.981636060100167, 'eval_class_5_f1': 0.993939393939394, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 1.0, 'eval_runtime': 32.8193, 'eval_samples_per_second': 115.359, 'eval_steps_per_second': 14.443}
test_trainer/checkpoint-13251


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.040262091904878616, 'eval_f1': 0.9833597464342314, 'eval_accuracy': 0.9833597464342314, 'eval_class_0_f1': 0.989849108367627, 'eval_class_1_f1': 0.9585798816568046, 'eval_class_2_f1': 0.9731051344743276, 'eval_class_3_f1': 0.9887640449438202, 'eval_class_4_f1': 0.9750415973377704, 'eval_class_5_f1': 0.994954591321897, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 1.0, 'eval_runtime': 32.7845, 'eval_samples_per_second': 115.482, 'eval_steps_per_second': 14.458}
test_trainer/checkpoint-15144


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.03774925693869591, 'eval_f1': 0.9833597464342314, 'eval_accuracy': 0.9833597464342314, 'eval_class_0_f1': 0.9898546750754045, 'eval_class_1_f1': 0.96, 'eval_class_2_f1': 0.9705882352941176, 'eval_class_3_f1': 0.9832402234636872, 'eval_class_4_f1': 0.9783693843594009, 'eval_class_5_f1': 0.9939516129032258, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 0.9956709956709957, 'eval_runtime': 32.8073, 'eval_samples_per_second': 115.401, 'eval_steps_per_second': 14.448}
test_trainer/checkpoint-17037


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.035522107034921646, 'eval_f1': 0.9852086634970946, 'eval_accuracy': 0.9852086634970946, 'eval_class_0_f1': 0.99039780521262, 'eval_class_1_f1': 0.9621942179392143, 'eval_class_2_f1': 0.9781021897810219, 'eval_class_3_f1': 0.9887640449438202, 'eval_class_4_f1': 0.981636060100167, 'eval_class_5_f1': 0.9959758551307847, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 1.0, 'eval_runtime': 32.822, 'eval_samples_per_second': 115.35, 'eval_steps_per_second': 14.442}
test_trainer/checkpoint-1893


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.12917256355285645, 'eval_f1': 0.9577390385631274, 'eval_accuracy': 0.9577390385631274, 'eval_class_0_f1': 0.9803813208068528, 'eval_class_1_f1': 0.9301310043668122, 'eval_class_2_f1': 0.9135802469135802, 'eval_class_3_f1': 0.9285714285714286, 'eval_class_4_f1': 0.8943089430894309, 'eval_class_5_f1': 0.966966966966967, 'eval_class_6_f1': 0.9818181818181818, 'eval_class_7_f1': 0.9779735682819383, 'eval_runtime': 32.7469, 'eval_samples_per_second': 115.614, 'eval_steps_per_second': 14.475}
test_trainer/checkpoint-18930


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.04044223204255104, 'eval_f1': 0.9849445324881141, 'eval_accuracy': 0.9849445324881141, 'eval_class_0_f1': 0.9901207464324917, 'eval_class_1_f1': 0.9614814814814815, 'eval_class_2_f1': 0.9781021897810219, 'eval_class_3_f1': 0.9944134078212291, 'eval_class_4_f1': 0.981636060100167, 'eval_class_5_f1': 0.9949647532729103, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 1.0, 'eval_runtime': 32.8264, 'eval_samples_per_second': 115.334, 'eval_steps_per_second': 14.44}
test_trainer/checkpoint-3786


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.07865126430988312, 'eval_f1': 0.9751716851558373, 'eval_accuracy': 0.9751716851558373, 'eval_class_0_f1': 0.9865050950151473, 'eval_class_1_f1': 0.9511111111111111, 'eval_class_2_f1': 0.9508196721311475, 'eval_class_3_f1': 0.96, 'eval_class_4_f1': 0.9537953795379539, 'eval_class_5_f1': 0.9847715736040609, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 0.9914529914529915, 'eval_runtime': 32.896, 'eval_samples_per_second': 115.09, 'eval_steps_per_second': 14.409}
test_trainer/checkpoint-5679


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.07093462347984314, 'eval_f1': 0.9762282091917591, 'eval_accuracy': 0.9762282091917591, 'eval_class_0_f1': 0.9873417721518988, 'eval_class_1_f1': 0.9485458612975392, 'eval_class_2_f1': 0.9692671394799054, 'eval_class_3_f1': 0.9473684210526316, 'eval_class_4_f1': 0.954248366013072, 'eval_class_5_f1': 0.98989898989899, 'eval_class_6_f1': 0.9820359281437125, 'eval_class_7_f1': 0.9914529914529915, 'eval_runtime': 32.7568, 'eval_samples_per_second': 115.579, 'eval_steps_per_second': 14.47}
test_trainer/checkpoint-7572


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.05687284842133522, 'eval_f1': 0.9801901743264659, 'eval_accuracy': 0.9801901743264659, 'eval_class_0_f1': 0.9882030178326474, 'eval_class_1_f1': 0.9563286454478165, 'eval_class_2_f1': 0.9611650485436893, 'eval_class_3_f1': 0.9888888888888889, 'eval_class_4_f1': 0.9698996655518394, 'eval_class_5_f1': 0.9888776541961577, 'eval_class_6_f1': 1.0, 'eval_class_7_f1': 0.9957081545064378, 'eval_runtime': 32.7985, 'eval_samples_per_second': 115.432, 'eval_steps_per_second': 14.452}
test_trainer/checkpoint-9465


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.05651785433292389, 'eval_f1': 0.9807184363444268, 'eval_accuracy': 0.9807184363444268, 'eval_class_0_f1': 0.9879584017515053, 'eval_class_1_f1': 0.957683741648107, 'eval_class_2_f1': 0.9705882352941176, 'eval_class_3_f1': 0.9887640449438202, 'eval_class_4_f1': 0.9647058823529412, 'eval_class_5_f1': 0.9909365558912387, 'eval_class_6_f1': 0.993939393939394, 'eval_class_7_f1': 1.0, 'eval_runtime': 32.815, 'eval_samples_per_second': 115.374, 'eval_steps_per_second': 14.445}


In [None]:
print(results_df)

         checkpoint        f1  accuracy  class_0_f1  class_1_f1  class_2_f1  \
0  checkpoint-11358  0.983888  0.983888    0.989312    0.962082    0.968675   
1  checkpoint-13251  0.983360  0.983360    0.989849    0.958580    0.973105   
2  checkpoint-15144  0.983360  0.983360    0.989855    0.960000    0.970588   
3  checkpoint-17037  0.985209  0.985209    0.990398    0.962194    0.978102   
4   checkpoint-1893  0.957739  0.957739    0.980381    0.930131    0.913580   
5  checkpoint-18930  0.984945  0.984945    0.990121    0.961481    0.978102   
6   checkpoint-3786  0.975172  0.975172    0.986505    0.951111    0.950820   
7   checkpoint-5679  0.976228  0.976228    0.987342    0.948546    0.969267   
8   checkpoint-7572  0.980190  0.980190    0.988203    0.956329    0.961165   
9   checkpoint-9465  0.980718  0.980718    0.987958    0.957684    0.970588   

   class_3_f1  class_4_f1  class_5_f1  class_6_f1  class_7_f1  
0    0.988764    0.981636    0.993939    1.000000    1.000000  
1 