In [17]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import numpy as np
import torch
import wandb
import os
from covid_voices.data.corona_dataset import CoronaTweetDataset
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate

SEED = 42

%load_ext autoreload
%autoreload 2


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(SEED)
if device == "cuda":
    torch.cuda.manual_seed(SEED)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Usage examples


In [8]:
def preprocess_tweet(text):
    """Normalize a tweet: lowercase it and spell out hashtag / mention markers.

    Args:
        text: Raw tweet text.

    Returns:
        The lowercased text with every ``#`` replaced by ``hashtag_`` and
        every ``@`` replaced by ``mention_``.
    """
    # Example preprocessing only - extend the substitution table as needed.
    substitutions = (("#", "hashtag_"), ("@", "mention_"))

    cleaned = text.lower()
    for marker, token in substitutions:
        cleaned = cleaned.replace(marker, token)
    return cleaned


# Build the train / validation / test splits, running `preprocess_tweet` on the
# tweet text while loading.
datasets = CoronaTweetDataset.load_datasets(
    preprocessing=preprocess_tweet,
    is_val_split=True,
    val_size=0.2,
    seed=SEED,
)

print(f"Loaded datasets:\n{datasets.keys()}\n")
print(f'See the preprocess_tweet function in action :\n{datasets["train"][0]["text"]}')


# Expose every split under its own name for the cells below.
train_dataset, val_dataset, test_dataset = (
    datasets[split_name] for split_name in ("train", "val", "test")
)

Loaded datasets:
dict_keys(['train', 'val', 'test'])

See the preprocess_tweet function in action :
"consumer voice has compiled a list of creative ideas and best practices for staying connected during the pandemic.  the list includes ways to communicate with loved ones and ideas for staying active and engaged while in isolation."

https://t.co/j9udncqlnn https://t.co/kwrrsjhfkq


## Using Hugging Face libraries

In [12]:
# Identifier of the pretrained checkpoint loaded by the tokenizer and the model.
model_name = "huawei-noah/TinyBERT_General_4L_312D"
# Root directory under which the fine-tuned model and tokenizer are saved.
output_base_dir = "./models/"
# Weights & Biases project that receives the training run.
project_name = "corona-NLP-ensemble"
# Intended training batch size.
batch_size = 128
# NOTE(review): this value is handed to the tokenizer as `max_length`, which
# counts tokens, not characters - confirm 280 is the intended token budget.
max_length = 280 # truncation limit passed to the tokenizer


def make_tokenizer(model_name, max_length=512, tokenizer=None):
    """Build a batch-tokenization function suitable for ``Dataset.map``.

    Args:
        model_name: Checkpoint name used to load the tokenizer when
            ``tokenizer`` is not supplied.
        max_length: Maximum number of tokens kept per example; longer
            inputs are truncated.
        tokenizer: Optional, already-instantiated tokenizer to reuse. When
            ``None`` (the default) one is loaded from ``model_name``.

    Returns:
        A function that takes a batch with a ``"text"`` field and returns the
        tokenizer's output for it.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize(examples):
        # No padding here: examples keep their own length and are expected to
        # be padded per batch at collation time.
        return tokenizer(
            examples["text"],
            padding=False,
            truncation=True,
            max_length=max_length,
        )

    # Hand the closure back to the caller; without this the factory yields None.
    return tokenize


# Tokenization callable bound to `model_name` and the `max_length` set above.
tokenize_function = make_tokenizer(model_name, max_length)


# Apply to all datasets
# NOTE(review): `hf_datasets` is not defined in any visible cell, so this cell
# fails on a fresh kernel - presumably it is a DatasetDict built from the
# splits loaded earlier; restore the cell that creates it.
# NOTE(review): `remove_columns` drops every original column of the train
# split; if the labels live in one of those columns they are removed as well -
# verify that the label column survives for training.
tokenized_datasets = hf_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns=hf_datasets["train"].column_names,
)



Map:   0%|          | 0/32925 [00:00<?, ? examples/s]

Map:   0%|          | 0/8232 [00:00<?, ? examples/s]

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [13]:
# Names of the `evaluate` metrics to report, loaded once so that
# `compute_metrics` can reuse the same objects on every evaluation pass.
metric_names = ["accuracy", "f1", "precision", "recall"]
metrics = list(map(evaluate.load, metric_names))

def compute_metrics(eval_pred):
    """Score predictions with every metric held in the module-level ``metrics``.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict: The result dictionaries of all metrics merged into one.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    macro_averaged = ("f1", "precision", "recall")

    scores = {}
    for scorer in metrics:
        # f1 / precision / recall need an explicit averaging mode for
        # multiclass targets; the other metrics take no extra argument.
        extra_kwargs = {"average": "macro"} if scorer.name in macro_averaged else {}
        scores.update(
            scorer.compute(predictions=predictions, references=labels, **extra_kwargs)
        )
    return scores

In [14]:
def compute_metrics(eval_pred):
    """Return the plain accuracy of the arg-max predictions.

    NOTE(review): this reuses the name ``compute_metrics`` from the previous
    cell, so when both cells run in order this accuracy-only version is the
    one in effect.

    Args:
        eval_pred: A ``(logits, labels)`` pair.

    Returns:
        dict: ``{"accuracy": <fraction of correct predictions as a float>}``.
    """
    raw_scores, gold = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    is_correct = (predicted_ids == gold).astype(float)
    return {"accuracy": is_correct.mean().item()}




In [None]:
# Classification head sized to the number of labels reported by the training split.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=train_dataset.num_labels
)

# `make_tokenizer` keeps its tokenizer local to the factory, so load one at
# cell scope: the Trainer needs it to pad the unpadded examples batch by batch,
# and the saving cell writes it next to the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./test_output",
    eval_strategy="epoch",  # evaluate on the validation split after every epoch
    per_device_train_batch_size=batch_size,
    num_train_epochs=5,
    do_train=True,
    do_eval=True,
    seed=SEED,  # tie the Trainer's own seeding to the notebook-wide seed
)

trainer = Trainer(
    model=model,
    args=training_args,
    # `tokenized_datasets` is the name produced by the tokenization cell.
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:

# Model and tokenizer are written to the same directory so the checkpoint can
# be reloaded from a single path.
save_dir = os.path.join(output_base_dir, model_name)

# Using the run as a context manager closes it even when training or saving
# raises, instead of leaving the W&B run open.
with wandb.init(project=project_name, name=model_name, reinit=True):
    trainer.train()
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
