In [2]:
!pip install transformers datasets peft
import os
import zipfile
import requests

url = "https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip"
archive_path = "multinli_1.0.zip"

# Re-running the cell reuses an archive that is already on disk instead of downloading it again.
if not os.path.exists(archive_path):
    # Without a timeout a stalled connection would hang the cell indefinitely.
    r = requests.get(url, timeout=60)
    # Fail here on an HTTP error instead of saving an error page and failing later in ZipFile.
    r.raise_for_status()
    with open(archive_path, "wb") as f:
        f.write(r.content)

# Unpack into the current working directory.
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    zip_ref.extractall()

# Last expression of the cell: display what the archive unpacked.
os.listdir("multinli_1.0")

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.11.1-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━

['multinli_1.0_train.txt',
 'Icon\r',
 'multinli_1.0_train.jsonl',
 'multinli_1.0_dev_matched.jsonl',
 'README.txt',
 'multinli_1.0_dev_mismatched.txt',
 'multinli_1.0_dev_mismatched.jsonl',
 'paper.pdf',
 '.DS_Store',
 'multinli_1.0_dev_matched.txt']

In [3]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict, Dataset
import json
import pandas as pd

def load_multinli_data(split):
    """Read one MultiNLI split from its JSON Lines file.

    Args:
        split: ``'train'`` (reads ``multinli_1.0_train.jsonl``) or
            ``'validation'`` (reads ``multinli_1.0_dev_matched.jsonl``).
            Paths are relative to the current working directory.

    Returns:
        A list with one decoded JSON object per non-blank line, in file order.

    Raises:
        ValueError: If ``split`` is neither ``'train'`` nor ``'validation'``.
    """
    if split == 'train':
        file_path = 'multinli_1.0/multinli_1.0_train.jsonl'
    elif split == 'validation':
        file_path = 'multinli_1.0/multinli_1.0_dev_matched.jsonl'
    else:
        raise ValueError("Invalid split. Use 'train' or 'validation'")

    # JSON text is UTF-8 by specification; naming the encoding keeps decoding
    # independent of the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a trailing newline at end of file) are skipped
        # because json.loads would raise on an empty document.
        return [json.loads(line) for line in f if line.strip()]

def _prepare_multinli_split(records):
    """Convert raw MultiNLI records into a DataFrame with integer class labels."""
    label_mapping = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
    frame = pd.DataFrame(records).rename(
        columns={"premise": "sentence1", "hypothesis": "sentence2", "gold_label": "label"}
    )
    # Keep only rows whose gold label is one of the three known classes.
    frame = frame[frame['label'].isin(list(label_mapping))]
    # .assign builds a new frame, so the label column is never written into a
    # filtered slice; resetting the index discards the gaps left by the filter.
    return frame.assign(label=frame['label'].map(label_mapping)).reset_index(drop=True)


def create_dataset_dict():
    """Build the train/validation ``DatasetDict`` used for fine-tuning.

    Both splits are loaded with ``load_multinli_data`` and pass through the
    same preparation: column renaming, dropping rows without a valid gold
    label, and mapping the label strings to 0 (entailment), 1 (neutral),
    2 (contradiction).

    Returns:
        DatasetDict with the keys ``'train'`` and ``'validation'``.
    """
    train_df = _prepare_multinli_split(load_multinli_data('train'))
    validation_df = _prepare_multinli_split(load_multinli_data('validation'))
    # preserve_index=False keeps the pandas index from becoming an extra
    # dataset column in either split.
    return DatasetDict({
        'train': Dataset.from_pandas(train_df, preserve_index=False),
        'validation': Dataset.from_pandas(validation_df, preserve_index=False)
    })

# Build the label-encoded train/validation splits from the extracted corpus files.
dataset = create_dataset_dict()

# A single checkpoint name feeds both the tokenizer and the classifier.
model_name = 'roberta-large'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,  # one output per class in the label mapping (0, 1, 2)
)

def tokenize_function(examples):
    """Encode a batch of sentence pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with ``'sentence1'`` and ``'sentence2'`` entries;
            they are passed to the tokenizer as the first and second text.

    Returns:
        Whatever the tokenizer returns for the pair, requested with
        ``longest_first`` truncation and padding to a fixed length of 128.
    """
    first_texts = examples['sentence1']
    second_texts = examples['sentence2']
    encoding_options = dict(
        truncation='longest_first',
        padding='max_length',
        max_length=128,
    )
    return tokenizer(first_texts, second_texts, **encoding_options)

# Tokenize every split in batches.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Keep only the tensors handed to the model. The columns to drop are computed
# per split and passed as a list, so a column that exists in just one split can
# neither raise (missing in the other split) nor be left behind.
columns_to_keep = ['input_ids', 'attention_mask', 'label']
tokenized_dataset = DatasetDict({
    split_name: split.remove_columns(
        [column for column in split.column_names if column not in columns_to_keep]
    )
    for split_name, split in tokenized_dataset.items()
})
tokenized_dataset.set_format("torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [13]:
from datasets import load_metric
# The metric is loaded from a script, so `datasets` asks for explicit consent:
# per the warning this cell printed, `trust_remote_code=True` silences the
# notice and becomes mandatory in the next major release.
accuracy_metric = load_metric("accuracy", trust_remote_code=True)

def compute_metrics(eval_pred):
    """Score predictions with the module-level ``accuracy_metric``.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        The result of ``accuracy_metric.compute`` for those predictions and
        reference labels.
    """
    scores, references = eval_pred
    predicted_classes = scores.argmax(-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

training_args = TrainingArguments(
    # Where checkpoints and logs are written; no external experiment tracker.
    output_dir='./results',
    report_to="none",
    # Schedule: three epochs, evaluating at the end of each one.
    num_train_epochs=3,
    evaluation_strategy='epoch',
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batching: 12 examples per device, gradients accumulated over 2 steps.
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    gradient_accumulation_steps=2,
    # Throughput: mixed precision and parallel data loading.
    fp16=True,
    dataloader_num_workers=8,
)

# Fine-tune on a random 10% subsample of the training split. The shuffle is
# seeded from the training arguments so every run draws the same subset and
# the reported metrics are comparable between runs.
train_subset_size = int(0.1 * len(tokenized_dataset['train']))
train_subset = (
    tokenized_dataset['train']
    .shuffle(seed=training_args.seed)
    .select(range(train_subset_size))
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics
)

trainer.train()
# Final evaluation pass over the whole validation split.
results = trainer.evaluate()
print(results)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  self.pid = os.fork()


Epoch,Training Loss,Validation Loss,Accuracy
0,0.4058,0.354279,0.874885
2,0.137,0.543746,0.881915


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


{'eval_loss': 0.5437459349632263, 'eval_accuracy': 0.8819154355578197, 'eval_runtime': 68.7286, 'eval_samples_per_second': 142.808, 'eval_steps_per_second': 11.902, 'epoch': 2.999083409715857}


In [14]:
# Report the headline metric from the final evaluation results.
final_accuracy = results['eval_accuracy']
print(f"Accuracy: {final_accuracy}")

Accuracy: 0.8819154355578197
