In [None]:
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
import numpy as np
import torch
import wandb
import os
from covid_voices.data.datasets.corona_dataset import CoronaTweetDataset
from datasets import Dataset, DatasetDict
import numpy as np
import evaluate

%load_ext autoreload
%autoreload 2

SEED = 42

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(SEED)
if device == "cuda":
    torch.cuda.manual_seed(SEED)



## Some Examples of usage


In [3]:
def preprocess_tweet(text):
        """Clean and normalize tweet text"""
        # Example preprocessing - you can expand this
        text = text.lower()
        text = text.replace('#', 'hashtag_')
        text = text.replace('@', 'mention_')
        return text

# Load datasets with preprocessing and split into train, validation, test
datasets = CoronaTweetDataset.load_datasets(preprocessing=preprocess_tweet, 
                                            is_val_split=True,
                                            val_size=0.2,
                                            seed=SEED)

print(f"Loaded datasets:\n{datasets.keys()}\n")
print(f'See the preprocess_tweet function in action :\n{datasets["train"][0]["text"]}')


# split in train, validation, test
train_dataset = datasets["train"]
val_dataset = datasets["val"]
test_dataset = datasets["test"]

Loaded datasets:
dict_keys(['train', 'val', 'test'])

See the preprocess_tweet function in action :
"consumer voice has compiled a list of creative ideas and best practices for staying connected during the pandemic.  the list includes ways to communicate with loved ones and ideas for staying active and engaged while in isolation."

https://t.co/j9udncqlnn https://t.co/kwrrsjhfkq


## Using HF libraries 

In [None]:
hf_datasets = DatasetDict({
    "train": Dataset.from_pandas(train_dataset.df, preserve_index=False),
    "val": Dataset.from_pandas(val_dataset.df, preserve_index=False),
    "test": Dataset.from_pandas(test_dataset.df, preserve_index=False)
})

model_name = "huawei-noah/TinyBERT_General_4L_312D"
output_base_dir = "./models/"
project_name = "corona-NLP-ensemble"
batch_size = 128

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized_datasets = hf_datasets.map(lambda x: tokenizer(x["text"], padding="max_length", truncation=True), batched=True)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=train_dataset.num_labels)

vocab.txt: 0.00B [00:00, ?B/s]

NameError: name 'tokenize_function' is not defined

In [None]:
metric_names = ["accuracy", "f1", "precision", "recall"]
metrics = [evaluate.load(name) for name in metric_names]

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    results = {}
    for metric in metrics:
        # Some metrics (like f1) require average="macro" for multiclass
        if metric.name in ["f1", "precision", "recall"]:
            res = metric.compute(predictions=predictions, references=labels, average="macro")
        else:
            res = metric.compute(predictions=predictions, references=labels)
        results.update(res)
    return results

config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).astype(float).mean().item()
    return {"accuracy": accuracy}


def preprocess(example, tokenizer):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=train_dataset.max_length,
    )


In [33]:
d = split["train"]
d[1]

{'OriginalTweet': 'Corona virus safety tips #6. Take all the necessary preventive measures. Stay safe, stay healthy!!!\r\r\n#staysafe\r\r\n#healthtips\r\r\n#coronavirus\r\r\n#Shopping #eatery #lounge\r\r\n#acesupermarket\r\r\n#aceeatery\r\r\n#acelounge\r\r\n#acefamily\r\r\n#Ibadan #Oyo #Ogbomoso #Ilorin\r\r\n#Osogbo #Ileife #Ijebuode #Abeokuta https://t.co/LfiLC2RuB5',
 'label': 4}

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenized = split.values().map(lambda x: preprocess(x, tokenizer, train_dataset.preprocessing), batched=True)



training_args = TrainingArguments(
    output_dir="./test_output",
    eval_strategy="epoch",
    per_device_train_batch_size=128,
    num_train_epochs=5,
    do_train=True,
    do_eval=True
    )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    )


AttributeError: 'dict_values' object has no attribute 'map'

In [None]:

wandb.init(project=project_name, name=model_name, reinit=True)
trainer.train()
trainer.save_model(os.path.join(output_base_dir, model_name))
tokenizer.save_pretrained(os.path.join(output_base_dir, model_name))
wandb.finish()


In [None]:

def tokenize(batch):
    return tokenizer(
        batch["OriginalTweet"],
        padding="max_length",
        truncation=True,
        max_length=max_length)

tokenized = raw_datasets.map(tokenize, batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask",
"Sentiment"])

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5, id2label=label2id, label2id=label2id)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = (predictions == labels).astype(float).mean().item()
    return {"accuracy": accuracy}

args = TrainingArguments(
output_dir="./results",
eval_strategy="epoch",
per_device_train_batch_size=8,
per_device_eval_batch_size=8,
num_train_epochs=1,
logging_dir="./logs",
logging_steps=10,
save_strategy="no"
)
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["test"],
compute_metrics=compute_metrics
)