In [1]:
import gzip
import shutil
import time

import pandas as pd
import numpy as np
import requests
import torch
import torch.nn.functional as F
import torchtext

import transformers
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertTokenizer

### General Settings

In [2]:
# --- Reproducibility ---
# Seed PyTorch's RNG and ask cuDNN for deterministic algorithms.
RANDOM_SEED = 11
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True

# --- Run configuration ---
# NOTE(review): the training cell in this notebook sets `num_train_epochs`
# directly and does not read NUM_EPOCHS.
NUM_EPOCHS = 15

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

### Download Dataset

In [3]:
url = "https://github.com/rasbt/python-machine-learning-book-3rd-edition/raw/master/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]          # "movie_data.csv.gz"
csv_filename = filename[: -len(".gz")]  # "movie_data.csv"

# Perform the request *before* opening the output file and fail loudly on an
# HTTP error, so a failed download never leaves an error page (or an empty
# file) on disk under the archive's name.
response = requests.get(url, timeout=60)
response.raise_for_status()

with open(filename, "wb") as f_out:
    f_out.write(response.content)

# Decompress the archive next to it; later cells read the plain CSV.
with gzip.open(filename, "rb") as f_in, open(csv_filename, "wb") as f_out:
    shutil.copyfileobj(f_in, f_out)

In [4]:
# Load the decompressed reviews into a DataFrame and preview the first rows.
df = pd.read_csv("movie_data.csv")
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [5]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(50000, 2)

### Splitting the Dataset

In [6]:
# Row boundaries of the sequential train / validation / test split.
# Rows are taken in file order; no shuffling happens here.
TRAIN_END = 35000
VAL_END = 40000

train_df = df.iloc[:TRAIN_END]
val_df = df.iloc[TRAIN_END:VAL_END]
test_df = df.iloc[VAL_END:]

# Each split is exposed as a pair of NumPy arrays: raw review text and label.
train_texts, train_labels = train_df["review"].values, train_df["sentiment"].values
val_texts, val_labels = val_df["review"].values, val_df["sentiment"].values
test_texts, test_labels = test_df["review"].values, test_df["sentiment"].values

### Tokenization

In [7]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer published with the 'distilbert-base-uncased'
# checkpoint (the same checkpoint name the model cell loads).
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# Tokenizer options shared by all three splits: truncate over-long reviews
# and pad the sequences within each call to a common length.
TOKENIZER_KWARGS = {"truncation": True, "padding": True}

# The tokenizer is given plain Python lists, so the NumPy arrays are
# converted first.
train_encodings = tokenizer(list(train_texts), **TOKENIZER_KWARGS)
val_encodings = tokenizer(list(val_texts), **TOKENIZER_KWARGS)
test_encodings = tokenizer(list(test_texts), **TOKENIZER_KWARGS)

In [9]:
# Inspect the encoding produced for the first training review.
train_encodings[0]

Encoding(num_tokens=512, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

### Dataset Classes and Loaders

In [10]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name (e.g. ``"input_ids"``) to an
            indexable collection holding one entry per example.
        labels: Indexable collection with one label per example; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict contains every field of ``encodings`` plus a ``"labels"``
        entry holding the example's label.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an IMDbDataset.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [11]:
# Single source of truth for the loader batch size.
BATCH_SIZE = 16

# Only the training loader is shuffled. Validation and test data are read in
# a fixed order so that evaluation runs are repeatable; shuffling them adds
# randomness without any benefit.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### Load Model

In [12]:
# Pretrained DistilBERT encoder with a freshly initialised classification head.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.train()

# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of `torch.optim.AdamW`.
# NOTE(review): this optimizer is not passed to the `Trainer` created in the
# training cell, so it does not take part in the training run shown there.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Train Model

In [13]:
from transformers import Trainer, TrainingArguments

# Checkpoints and logs go to paths relative to the working directory. The
# previous absolute "/contents/..." paths pointed outside Colab's "/content"
# tree and required write access to the filesystem root.
training_arguments = TrainingArguments(
    output_dir="./results",          # checkpoints
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.05,
    logging_dir="./logs",            # logging output
    logging_steps=10,                # report the training loss every 10 steps
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Kept as the last expression so the cell displays the returned TrainOutput.
trainer.train()

Step,Training Loss
10,0.7021
20,0.695
30,0.6929
40,0.69
50,0.678
60,0.6804
70,0.6526
80,0.6159
90,0.5599
100,0.5


TrainOutput(global_step=6564, training_loss=0.1594632496082012, metrics={'train_runtime': 5096.5496, 'train_samples_per_second': 20.602, 'train_steps_per_second': 1.288, 'total_flos': 1.390907685888e+16, 'train_loss': 0.1594632496082012, 'epoch': 3.0})

### Evaluation

In [14]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``.

    The function does not change the model's train/eval mode; callers that
    want dropout disabled should call ``model.eval()`` first.

    Args:
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes the class scores under ``['logits']``.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A 0-dim float tensor holding the accuracy in percent (0-100).

    Raises:
        ValueError: If ``data_loader`` yields no examples, since accuracy is
            undefined in that case.
    """
    # Accumulate on `device` as a tensor so no host sync is needed per batch,
    # and so the final `.float()` also works when the loader is empty.
    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Labels are deliberately not passed to the model: only the
            # logits are needed, so computing a loss would be wasted work.
            outputs = model(input_ids, attention_mask=attention_mask)
            predicted_labels = outputs['logits'].argmax(dim=1)

            num_examples += labels.size(0)
            correct_pred += (predicted_labels == labels).sum()

    if num_examples == 0:
        raise ValueError("compute_accuracy: data_loader yielded no examples")

    return correct_pred.float() / num_examples * 100

In [15]:
# Put the model on the evaluation device and switch it to inference mode
# before scoring the held-out test split.
model.to(DEVICE)
model.eval()

test_accuracy = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_accuracy:.2f}%')

Test accuracy: 93.64%


In [16]:
# Colab-only: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
