### Sentiment Classification with Transformers

#### 0. How does the tiktoken library work?

[tiktoken library for part II](#scrollTo=iU7qA3wgxKmS&line=1&uniqifier=1)

#### 1. Download and Preprocess the IMDB Dataset

In [None]:
import os
import tarfile
import urllib.request

# Download the dataset
url = 'https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
dataset_path = 'aclImdb_v1.tar.gz'
if not os.path.exists(dataset_path):
    # Download under a temporary name and rename only after the transfer has
    # finished. Otherwise an interrupted download leaves a truncated archive
    # that the existence check above would treat as complete on the next run.
    partial_path = dataset_path + '.part'
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, dataset_path)

# Extract the dataset
if not os.path.exists('aclImdb'):
    with tarfile.open(dataset_path, 'r:gz') as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. On Python versions that support extraction filters,
        # consider passing filter='data' -- confirm against the course runtime.
        tar.extractall()

In [None]:
import glob

def load_data_from_directory(directory):
    """Load review texts and binary sentiment labels from one dataset split.

    Reads every ``*.txt`` file found in ``<directory>/neg`` and
    ``<directory>/pos``.

    Args:
        directory: Path of a split folder that contains ``neg`` and ``pos``
            sub-folders.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts[i]`` is
        the full content of one file and ``labels[i]`` is 0 for ``neg`` and 1
        for ``pos``. All ``neg`` entries come first; within each class the
        entries are ordered by file path.
    """
    texts = []
    labels = []
    for label, label_type in enumerate(['neg', 'pos']):
        dir_name = os.path.join(directory, label_type)
        # glob.glob() returns paths in arbitrary, filesystem-dependent order.
        # Sorting makes the example order deterministic, so a later seeded
        # shuffle or split produces the same result on every machine.
        for file_path in sorted(glob.glob(os.path.join(dir_name, '*.txt'))):
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label)
    return texts, labels

# Load the raw train and test splits from the extracted 'aclImdb' folder.
# Only the test labels lack the `_imdb` suffix: `test_labels` is the name used
# further down when the test dataset is built.
train_texts_imdb, train_labels_imdb = load_data_from_directory('aclImdb/train')
test_texts_imdb, test_labels = load_data_from_directory('aclImdb/test')

#### Tokenize the data and Prepare training Dataset

In [None]:
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
nltk.download('punkt')

MAX_SEQ_LEN = 128

# Tokenization Part
# TODO (assignment): replace each `...` placeholder below with the expression
# that turns one raw review (`text`) into its token ids. As written, the lists
# are filled with the literal Ellipsis object.
# Each element is later passed to torch.tensor(..., dtype=torch.long), so it
# must be a sequence of ints -- presumably truncated/padded to MAX_SEQ_LEN so
# that examples can be batched; confirm against the assignment text.
train_texts = [... for text in train_texts_imdb]
test_texts = [... for text in test_texts_imdb]

In [None]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch

# Hold out 10% of the training examples for validation; the fixed random_state
# keeps the split repeatable.
# NOTE: `train_texts` is rebound to the reduced training subset here, so this
# cell cannot be re-run on its own -- re-run the tokenization cell first.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts, train_labels_imdb, test_size=0.1, random_state=1234)

class TextDataset(Dataset):
    """Dataset that pairs token-id sequences with their labels.

    Args:
        texts: Indexable collection in which each item is a sequence of
            integer token ids.
        labels: Indexable collection of integer labels; ``labels[i]`` belongs
            to ``texts[i]``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels):
        # Fail early: __len__ only looks at `texts`, so a length mismatch
        # would otherwise surface as an IndexError in the middle of an epoch
        # (too few labels) or silently ignore trailing labels (too many).
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return example ``idx`` as a ``(token_ids, label)`` pair of long tensors."""
        token_ids = torch.tensor(self.texts[idx], dtype=torch.long)
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return token_ids, label

# Number of examples per mini-batch, shared by all three loaders
BATCH_SIZE = 64

# Wrap each split in a TextDataset
train_dataset = TextDataset(texts=train_texts, labels=train_labels)
valid_dataset = TextDataset(texts=valid_texts, labels=valid_labels)
test_dataset = TextDataset(texts=test_texts, labels=test_labels)

# Only the training loader shuffles; validation and test keep a fixed order
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Model class
class TransformerClassifier(nn.Module):
    # TODO (assignment): implement __init__ and forward. The constructor must
    # accept whatever arguments are passed where `model` is created below --
    # presumably the hyperparameters listed after this class; confirm.
    ...

# TODO (assignment): replace every `...` below with a concrete value.
# Judging by the names, these are the embedding-table size, embedding width,
# number of attention heads, feed-forward width, number of encoder layers,
# number of output classes and dropout probability -- confirm against the
# model you implement.
vocab_size = ...
embed_dim = ...
n_heads = ...
hidden_dim = ...
n_layers = ...
output_dim = ...
dropout = ...

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# TODO (assignment): pass the constructor arguments in place of `...`
model = TransformerClassifier(...).to(device)

In [None]:
# TODO (assignment): set the number of training epochs
N_EPOCHS = ...
# Train Model and Provide train and validation loss/accuracy
# NOTE: the test cell that follows relies on names this cell has to define:
#   - `criterion`, the loss passed to `evaluate`
#   - `evaluate(model, loader, criterion)`, unpacked there as (loss, accuracy),
#     with the accuracy multiplied by 100 for printing
#   - a state dict saved to 'transformer_model.pt', which is loaded there
...

In [None]:
# Load the saved model weights and evaluate them on the test set
# map_location=device lets a checkpoint that was written on a GPU machine be
# restored on a CPU-only one (and vice versa) instead of failing on load.
model.load_state_dict(torch.load('transformer_model.pt', map_location=device))
test_loss, test_acc = evaluate(model, test_loader, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

### PHASE II: Use GPT2 Tokenizer

In [None]:
# install tiktoken library
pip install tiktoken



In [None]:
import tiktoken

In [None]:
# Initialize the tokenizer with the "gpt2" encoding
# `tokenizer` is reused by every later cell in this phase (vocabulary size,
# encode/decode demos and the padding helpers).
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Get the vocabulary size
# NOTE: this rebinds `vocab_size`, the same name Phase I uses for its model
# hyperparameter, so Phase I cells re-run after this point see the new value.
vocab_size = tokenizer.n_vocab
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 50257


In [None]:
# Encode Text and Visualize Tokens
# The last word of the sample is not English; the recorded output shows it
# being split into several sub-word pieces rather than mapped to a single id.
text = """Deep learning is the subset of machine learning methods based on neural networks with representation learning. The adjective "deep" refers to the use of multiple layers in the network. Methods used can be either supervised, semi-supervised, unsupervised or hayalettiklerimiz."""
# encode the text into token IDs (a list of ints, one per token)
token_ids = tokenizer.encode(text)
print(text)
print(token_ids)

Deep learning is the subset of machine learning methods based on neural networks with representation learning. The adjective "deep" refers to the use of multiple layers in the network. Methods used can be either supervised, semi-supervised, unsupervised or hayalettiklerimiz.
[29744, 4673, 318, 262, 24637, 286, 4572, 4673, 5050, 1912, 319, 17019, 7686, 351, 10552, 4673, 13, 383, 43441, 366, 22089, 1, 10229, 284, 262, 779, 286, 3294, 11685, 287, 262, 3127, 13, 25458, 973, 460, 307, 2035, 28679, 11, 10663, 12, 16668, 16149, 11, 555, 16668, 16149, 393, 27678, 282, 3087, 1134, 1754, 320, 528, 13]


In [None]:
# Decode every id on its own so that the text of each individual token
# (including its leading space, if any) can be inspected
tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
print(f"All tokens: {tokens}")
print(f"Last 15 tokens: {tokens[-15:]}")

All tokens: ['Deep', ' learning', ' is', ' the', ' subset', ' of', ' machine', ' learning', ' methods', ' based', ' on', ' neural', ' networks', ' with', ' representation', ' learning', '.', ' The', ' adjective', ' "', 'deep', '"', ' refers', ' to', ' the', ' use', ' of', ' multiple', ' layers', ' in', ' the', ' network', '.', ' Methods', ' used', ' can', ' be', ' either', ' supervised', ',', ' semi', '-', 'super', 'vised', ',', ' un', 'super', 'vised', ' or', ' hay', 'al', 'ett', 'ik', 'ler', 'im', 'iz', '.']
Last 15 tokens: ['super', 'vised', ',', ' un', 'super', 'vised', ' or', ' hay', 'al', 'ett', 'ik', 'ler', 'im', 'iz', '.']


##### Working with a PADding Token
- In your assignment you will use your own versions of the pad-aware `encode` and `decode` helpers shown below.

In [None]:
# Truncating and Padding Sequences
# NOTE: this demo value replaces the MAX_SEQ_LEN = 128 set in Phase I; set it
# back before preparing the real training data.
MAX_SEQ_LEN = 10
PAD_TOKEN = "<PAD>"
# Derive both values directly from the tokenizer instead of from the mutable
# `vocab_size` variable. The previous `vocab_size = vocab_size + 1` grew by one
# on every re-run of this cell, so the PAD id drifted away from the first id
# after the tokenizer's own vocabulary.
PAD_TOKEN_ID = tokenizer.n_vocab
vocab_size = tokenizer.n_vocab + 1
# Example text longer than MAX_SEQ_LEN
long_text = "Deep learning methods used can be either supervised, semi-supervised or unsupervised."
short_text = "Learning unsupervised learning methods is great."

# Encode and truncate the text to MAX_SEQ_LEN
def truncate_or_pad_token_ids(tokenids: list, max_seq_len, PAD_TOKEN_ID):
    """Return a copy of ``tokenids`` cut or padded to ``max_seq_len`` entries.

    Longer inputs keep only their first ``max_seq_len`` ids. Shorter inputs
    are filled up on the right with ``PAD_TOKEN_ID``. The input list itself is
    left untouched.
    """
    clipped = tokenids[:max_seq_len]
    shortfall = max_seq_len - len(clipped)
    if shortfall <= 0:
        # Already at the target length: nothing to pad
        return clipped
    return clipped + [PAD_TOKEN_ID] * shortfall

def encode_with_pad_token(tokenizer, text, max_seq_len, PAD_TOKEN_ID):
    """Encode ``text`` with ``tokenizer`` and fit the ids to ``max_seq_len``.

    The ids returned by ``tokenizer.encode`` are truncated, or padded on the
    right with ``PAD_TOKEN_ID``, so the result has exactly ``max_seq_len``
    entries.
    """
    return truncate_or_pad_token_ids(tokenizer.encode(text), max_seq_len, PAD_TOKEN_ID)

def decode_with_pad_token(tokenizer, tokenids, PAD_TOKEN_ID, PAD_TOKEN):
    """Turn token ids into a list of token strings, one string per id.

    Ids equal to ``PAD_TOKEN_ID`` are not passed to the tokenizer; they are
    rendered as ``PAD_TOKEN`` instead. Every other id is decoded on its own
    through ``tokenizer.decode``.
    """
    pieces = []
    for token_id in tokenids:
        if token_id != PAD_TOKEN_ID:
            pieces.append(tokenizer.decode([token_id]))
        else:
            pieces.append(PAD_TOKEN)
    return pieces

# Run the same encode -> decode round trip on both samples: the long text gets
# truncated to MAX_SEQ_LEN ids, the short text gets padded up to it.
for text_label, ids_label, tokens_label, sample in (
    ("Long Text:", "Truncated Token IDs:", "Truncated Tokens:", long_text),
    ("Short Text:", "Padded Token IDs:", "Padded Tokens:", short_text),
):
    token_ids = encode_with_pad_token(tokenizer, sample, MAX_SEQ_LEN, PAD_TOKEN_ID)
    tokens = decode_with_pad_token(tokenizer, token_ids, PAD_TOKEN_ID, PAD_TOKEN)
    print(text_label, sample)
    print(ids_label, token_ids)
    print(tokens_label, tokens)

Long Text: Deep learning methods used can be either supervised, semi-supervised or unsupervised.
Truncated Token IDs: [29744, 4673, 5050, 973, 460, 307, 2035, 28679, 11, 10663]
Truncated Tokens: ['Deep', ' learning', ' methods', ' used', ' can', ' be', ' either', ' supervised', ',', ' semi']
Short Text: Learning unsupervised learning methods is great.
Padded Token IDs: [41730, 555, 16668, 16149, 4673, 5050, 318, 1049, 13, 50257]
Padded Tokens: ['Learning', ' un', 'super', 'vised', ' learning', ' methods', ' is', ' great', '.', '<PAD>']


##### DATA PREPARATION with GPT2 TOKENIZER

In [None]:
# Tokenize the data and Prepare training Dataset
# TODO (assignment): encode the raw reviews with the GPT-2 tokenizer and the
# pad-aware helpers, then rebuild the datasets and loaders as in Phase I.
# NOTE: MAX_SEQ_LEN was set to 10 by the padding demo cell; choose the value
# you actually want for training before encoding.
...

# NO NEED for handling UNKNOWNS
# (presumably because the GPT-2 encoding can represent any input string, as
# the sub-word split of the non-English word in the demo suggests -- confirm)


In [None]:
# Model Training and Evaluation part (similar to phase I)