## Sentiment Analysis with a Transformer Model & Fine-Tuning

In [3]:
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification
)
from datasets import load_dataset
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# --- config ---
# All tunable settings live in this cell so a run can be adjusted in one place.

MODEL_NAME = 'distilbert-base-uncased'  # checkpoint name handed to AutoTokenizer.from_pretrained
BATCH_SIZE = 8                          # batch size for both DataLoaders
MAX_LEN = 256                           # fixed pad/truncate length applied in tokenize_fn
LR = 5e-5                               # not used in the cells shown here; presumably the optimizer's learning rate
EPOCHS = 1                              # not used in the cells shown here; presumably the number of training passes
SEED = 42                               # seed for torch's global random number generator
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed torch's global RNG so that random draws (e.g. the shuffled order of the
# training DataLoader) are repeatable after a kernel restart and full re-run.
torch.manual_seed(SEED)

In [8]:
# --- load dataset ---
# Load the 'imdb' dataset via `datasets.load_dataset`. No split is requested
# here; later cells select the 'train' and 'test' entries by name.

dataset = load_dataset(path='imdb')


In [9]:
# --- tokenizer ---
# Build the tokenizer that belongs to the checkpoint named by MODEL_NAME.

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)

In [10]:
# --- Tokenization Function ---

def tokenize_fn(batch):
    """Run the module-level ``tokenizer`` over ``batch['text']``.

    Every sequence is padded to exactly ``MAX_LEN`` tokens and anything
    longer is truncated, so all encoded examples share one length.

    Args:
        batch: Mapping with a ``'text'`` entry holding the raw input.
            Presumably a list of strings, since this function is applied
            with ``batched = True`` — confirm against the caller.

    Returns:
        Whatever ``tokenizer`` returns for that input.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': MAX_LEN,
    }
    return tokenizer(batch['text'], **encode_options)

In [11]:
# --- Tokenize dataset ---
# Apply tokenize_fn to the dataset in batches and drop the raw 'text' column
# from the result once it has been encoded.
# NOTE(review): the recorded progress output shows a 50000-example pass next to
# the two 25000-example passes — presumably an extra split that the loaders
# below never read; confirm whether it needs to be tokenized at all.

tokenized_dataset = dataset.map(
    function=tokenize_fn,
    batched=True,
    remove_columns=['text'],
)

Map: 100%|██████████| 25000/25000 [20:23<00:00, 20.44 examples/s] 
Map: 100%|██████████| 25000/25000 [00:41<00:00, 596.46 examples/s]
Map: 100%|██████████| 50000/50000 [01:27<00:00, 573.95 examples/s]


In [12]:
# --- Torch Format ---
# Switch tokenized_dataset (in place) to the 'torch' format, restricted to the
# three listed columns.

tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [18]:
# --- DataLoaders ---
# Wrap the 'train' and 'test' splits in batch iterators of BATCH_SIZE examples.

# Training batches are drawn in shuffled order.
train_loader = DataLoader(tokenized_dataset['train'], batch_size=BATCH_SIZE, shuffle=True)

# Test batches keep the dataset's own order (shuffle=False is the DataLoader default).
test_loader = DataLoader(tokenized_dataset['test'], batch_size=BATCH_SIZE, shuffle=False)