# Help BOBAI: Classify an unknown language #2

You know the drill.
Your task is to build the best text classifier for language X that you can, while operating within the constraints of Bobai:

*   The classifier has to be based on mBERT (and cannot use any additional pre-trained language encoder).
*   The classifier has to train in under 8 hours using an L4 GPU as the compute resources of the company are limited.
*   The classifier has to perform inference on any random 500 data samples in under 5 minutes (Bobai will then apply their optimization tricks to bring this time even further down).
*   You can spend as much time as you want trying to decrypt the data
*   The given code is the code you received for the at-home assignment with updated links. You can modify whatever you deem necessarry

## Prerequisites


### HuggingFace configuration

The steps below need to be completed by the team leader:

1. Create a team account on [HuggingFace](https://huggingface.co/) using the Gmail account provided by the IOAI organizers.

2. Go to the [Olimpiada-AI HuggingFace repo](https://huggingface.co/Olimpiada-AI) to access the datasets.

3. In settings, create two Access Tokens, one with read rights, one with write rights, and store those in [Colab Secrets](https://www.youtube.com/watch?v=q87i2LZbbPc) as `hf_read` and `hf_write`, respectively.

In [3]:
from google.colab import userdata

read_access_token = userdata.get('hf_read')
write_access_token = userdata.get('hf_write')

### Dependencies

In [2]:
import importlib
import torch, transformers

if '2.3.0' not in torch.__version__:
  !pip install torch==2.3.0
if transformers.__version__!='4.41.2':
  !pip install transformers==4.41.2

if importlib.util.find_spec('datasets') is None:
  !pip install datasets==2.18.0s
  !pip install evaluate==0.4.2
  !pip install accelerate -U


If you've just installed `accelerate`, execute `Runtime > Restart session and run all` in the Colab UI menu above.

# Data

In [5]:
# load the data

from datasets import load_dataset, Dataset, DatasetDict

classification_dataset = load_dataset('Olimpiada-AI/NLP_problem', token=read_access_token)

# Baseline

In [12]:
# load the pre-trained tokenizer and use it to process the data
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-uncased")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_data = classification_dataset.map(preprocess_function, batched=True)

# Extract the dataset to split
full_train_dataset = tokenized_data['train']

# Convert to Pandas DataFrame for splitting
import pandas as pd
df = full_train_dataset.to_pandas()

# Split the dataset into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'])  # Adjust test_size as needed

# Convert back to Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Create a new DatasetDict
split_tokenized_data = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
# define the evaluation metric

import evaluate
import numpy as np

f1 = evaluate.load("f1")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return f1.compute(predictions=predictions, references=labels, average='macro')

In [16]:
# define the model and the training configuration

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-uncased", num_labels=5
)

training_args = TrainingArguments(
    output_dir="baseline_bobai_2",
    learning_rate=1e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=20,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=5,
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_strategy="checkpoint",
    hub_token=write_access_token,
    hub_private_repo=True,
    hub_model_id='baseline_bobai_2'

)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_tokenized_data["train"],
    eval_dataset=split_tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
print(split_tokenized_data)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', '__index_level_0__'],
        num_rows: 2589
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', '__index_level_0__'],
        num_rows: 648
    })
})


In [19]:
# execute the model training
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,1.558474,0.27864
2,No log,1.445435,0.340132
3,No log,1.408473,0.355641
4,No log,1.41195,0.363624
5,No log,1.290221,0.464859
6,No log,1.258511,0.468592
7,No log,1.204345,0.513798
8,No log,1.158278,0.530088
9,No log,1.158539,0.512119
10,No log,1.127043,0.549813


TrainOutput(global_step=820, training_loss=0.8548285507574314, metrics={'train_runtime': 1941.0752, 'train_samples_per_second': 26.676, 'train_steps_per_second': 0.422, 'total_flos': 3758033176588548.0, 'train_loss': 0.8548285507574314, 'epoch': 20.0})

# Inference

In [20]:
# run the trained model on a dev/test split
data_split = "dev"
eval_out = trainer.predict(tokenized_data[data_split])
predictions = eval_out.predictions.argmax(1)
labels = eval_out.label_ids
dev_f1 = f1.compute(predictions=predictions, references=labels, average='macro')

KeyError: 'dev'

# Testing

In [8]:
# UPDATE THIS CELL ACCORDINGLY

# define a funciton to load your tokenizer and model from a HF path
# the path variables can be strings or lists of strings (for ensemble solutions)
def load_model(path_to_tokenizer, path_to_model, token):
  # Example:
  tokenizer = AutoTokenizer.from_pretrained(path_to_tokenizer, token=token)
  model = AutoModelForSequenceClassification.from_pretrained(path_to_model, token=token)
  model.eval()

  return tokenizer, model

# define a "predict" function that takes the model and a list of input strings
# and returns the outputs as a list of integer classes
def predict(tokenizer, model, input_texts):
  #Example:
  predictions = []
  for input_text in input_texts:

    input_ids = tokenizer(input_text, return_tensors="pt")

    with torch.no_grad():
      logits = model(**input_ids).logits

    predictions.append(logits.argmax().item())

  return predictions

# set variables
path_to_model = "path/to/your/best/model/on/hf" # can be a list instead
path_to_tokenizer = "path/to/your/best/tokenizer/on/hf" # can be a list instead
model_access_token = "access token" # a fine-grained token with read rights for your model repository


In [None]:
# DO NOT CHANGE THIS CELL!!!

tokenizer, model = load_model(path_to_tokenizer, path_to_model, token=model_access_token)

test_data = load_dataset("Olimpiada-AI/NLP_problem_test")['test']['text']

predictions = predict(tokenizer, model, test_data)

with open('test_predictions.txt', 'w') as outfile:
  outfile.write('\n'.join([str(p) for p in predictions]))