In [1]:
pip install transformers datasets evaluate scikit-learn accelerate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->a

In [2]:
pip install torch torchvision torchaudio



In [3]:
pip install sentence-transformers



In [4]:
!pip install --upgrade transformers



In [5]:
!pip install --upgrade accelerate datasets evaluate

Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.10.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.7/374.7 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.9.0
    Uninstalling accelerate-1.9.0:
      Successfully uninstalled accelerate-1.9.0
Successfully installed accelerate-1.10.0


In [6]:
!pip install --upgrade transformers accelerate datasets evaluate



In [7]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [12]:
import numpy as np
from sklearn.metrics import f1_score
import evaluate
import pandas as pd

In [13]:
df = pd.read_csv('/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
df.shape

(50000, 2)

In [15]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [16]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [17]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [18]:
from datasets import DatasetDict, Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [19]:
small_train = dataset["train"].shuffle(seed=42).select(range(5000))
small_test = dataset["test"].shuffle(seed=42).select(range(1000))

In [20]:
models_to_try = [
    "bert-base-uncased",
    "roberta-base",
    "distilbert-base-uncased",
    "google/electra-base-discriminator",
    "albert-base-v2"
]

In [21]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {"f1": f1_score(labels, preds, average="weighted")}

In [22]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

def train_model(model_name, train_data, test_data):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_fn(batch):
        return tokenizer(batch["review"], truncation=True, padding="max_length", max_length=256)

    tokenized_train = train_data.map(tokenize_fn, batched=True)
    tokenized_test = test_data.map(tokenize_fn, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    training_args = TrainingArguments(
        output_dir=f"./results_{model_name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_dir=f"./logs_{model_name}",
        report_to = None
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_test,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer.evaluate()

In [23]:
results = {}
for model_name in models_to_try:
    print(f"Training {model_name}...")
    results[model_name] = train_model(model_name, small_train, small_test)

print(results)

Training bert-base-uncased...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.299668,0.878911
2,0.278000,0.414809,0.873533


Training roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.304571,0.887844
2,0.283200,0.359646,0.901792


Training distilbert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.320454,0.863183
2,0.311500,0.364532,0.876727


Training google/electra-base-discriminator...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1
1,No log,0.219351,0.919022
2,0.252400,0.272359,0.921937


Training albert-base-v2...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.28447,0.886987
2,0.297500,0.392629,0.88962


{'bert-base-uncased': {'eval_loss': 0.4148090183734894, 'eval_f1': 0.8735333333333333, 'eval_runtime': 13.078, 'eval_samples_per_second': 76.464, 'eval_steps_per_second': 4.817, 'epoch': 2.0}, 'roberta-base': {'eval_loss': 0.3596455454826355, 'eval_f1': 0.9017921184680467, 'eval_runtime': 12.8265, 'eval_samples_per_second': 77.964, 'eval_steps_per_second': 4.912, 'epoch': 2.0}, 'distilbert-base-uncased': {'eval_loss': 0.3645324409008026, 'eval_f1': 0.8767268206981138, 'eval_runtime': 6.695, 'eval_samples_per_second': 149.364, 'eval_steps_per_second': 9.41, 'epoch': 2.0}, 'google/electra-base-discriminator': {'eval_loss': 0.2723591923713684, 'eval_f1': 0.9219374436090224, 'eval_runtime': 14.0417, 'eval_samples_per_second': 71.216, 'eval_steps_per_second': 4.487, 'epoch': 2.0}, 'albert-base-v2': {'eval_loss': 0.3926286995410919, 'eval_f1': 0.8896201691088583, 'eval_runtime': 15.1914, 'eval_samples_per_second': 65.827, 'eval_steps_per_second': 4.147, 'epoch': 2.0}}


In [26]:
best_model_name = max(results, key=lambda x: results[x]["eval_f1"])
train_model(best_model_name, dataset["train"], dataset["test"])

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.1887,0.19878,0.928108


Epoch,Training Loss,Validation Loss,F1
1,0.1887,0.19878,0.928108
2,0.1184,0.203068,0.945197


{'eval_loss': 0.20306773483753204,
 'eval_f1': 0.9451970578980976,
 'eval_runtime': 141.139,
 'eval_samples_per_second': 70.852,
 'eval_steps_per_second': 4.428,
 'epoch': 2.0}

In [27]:
import random

tokenizer = AutoTokenizer.from_pretrained(best_model_name)
model = AutoModelForSequenceClassification.from_pretrained(best_model_name)

samples = random.sample(range(len(dataset["test"])), 10)
texts = [dataset["test"][i]["review"] for i in samples]

tokenized = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
outputs = model(**tokenized)
preds = outputs.logits.argmax(dim=-1)

for t, p in zip(texts, preds):
    print(f"Review: {t}\nPredicted: {'Positive' if p == 1 else 'Negative'}\n")


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Review: This movie fully deserves to be one of the top Hindi comedies ever made . Rajkumar Santoshi is mostly known for his gritty hard-hitting social dramas , but this is easily the most effortless movie he has made .<br /><br />The plot revolves around two small-town buffoons Amar (Aamir Khan) and Prem (Salman Khan) . They want to get rich quick and so move to the big city . They hatch the same plan separately - to woo a rich heiress Raveena (Raveena Tandon) who is the daughter of a rich businessman Ramgopal Bajaj (Paresh Rawal) . Thus the one who marries Raveena gets his hands on all her wealth . but when they get to know each other's plan , there is an intense tussle of one-oneupmanship over who marries Raveena . Hilarious gags and situations ensue as the battle rages on .<br /><br />At the same time Ramgopal Bajaj's identical twin brother Shyamgopal Bajaj wants to eliminate his brother and niece and usurp the family fortune . Add to this an assortment of funny characters including