1:00:08 / 2:40:20

0 Import important libraries <br>
1 Prepare your data <br>
    1.1 Loading datasets <br>
    1.2 Tokenize data <br>
    1.3 Split data <br>
    1.4 Creating vocabulary <br>
    1.5 Numericalizing data <br>
    1.6 Converting data into tensors <br>
    1.7 Creating dataloaders <br>
2. Build a model <br>
3. Pick a loss function and optimizer <br>
4. Training and evaluation loop functions <br>
5. Start training loop <br>
6. Visualize your model <br>
7. Testing model with new sentence (sentiment analysis)

<h2>0 Import important libraries</h2>

In [4]:
import collections
import datasets #!pip install datasets
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext.data import get_tokenizer
import tqdm



<h2>1 Prepare our data</h2>

 <h3>1.1 Loading datasets</h3>

In [7]:
# Load the IMDB dataset; requesting the splits as a list returns them in the same order.
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

Using the latest cached version of the dataset since imdb couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at C:\Users\Abdul Zoha\.cache\huggingface\datasets\imdb\plain_text\0.0.0\e6281661ce1c48d982bc483cf8a173c1bbeb5d31 (last modified on Tue Jan 14 10:03:38 2025).


In [8]:
# Display the training split's summary (feature names and row count).
train_data

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

<h3>1.2 Tokenize data</h3>

In [10]:
# Tokenizer obtained from torchtext's `get_tokenizer` using its 'basic_english' option;
# it is passed to `tokenize_example` below.
tokenizer = get_tokenizer('basic_english')

In [11]:
def tokenize_example(example, tokenizer, max_length):
    """Tokenize one example's text and truncate it to ``max_length`` tokens.

    Args:
        example: Mapping with a ``'text'`` entry holding the raw string.
        tokenizer: Callable that turns a string into a sequence of tokens.
        max_length: Maximum number of tokens to keep (extra tokens are cut
            from the end).

    Returns:
        dict with ``'tokens'`` (the truncated token sequence) and ``'length'``
        (the number of tokens kept).
    """
    truncated = tokenizer(example['text'])[:max_length]
    return {'tokens': truncated, 'length': len(truncated)}

In [12]:
# Tokenize the train and test splits with the same tokenizer and the same
# truncation length; each example gains 'tokens' and 'length' entries.
max_length = 256
train_data, test_data = (
    split.map(
        tokenize_example,
        fn_kwargs={'tokenizer': tokenizer, 'max_length': max_length},
    )
    for split in (train_data, test_data)
)

<h3>1.3 Creating validation data</h3>

In [14]:
# Hold out 25% of the training split as validation data.
test_size = 0.25
# A fixed seed keeps the train/validation partition identical on every run, so
# the vocabulary and all downstream results are reproducible after a kernel
# restart (without it the split is re-randomized each time).
train_valid_data = train_data.train_test_split(test_size=test_size, seed=1234)
train_data = train_valid_data['train']
valid_data = train_valid_data['test']

<h3>1.4 Creating vocabulary</h3>

In [16]:
from torchtext.vocab import build_vocab_from_iterator as bi



In [17]:
# Build the vocabulary from the training tokens only. `min_freq` is forwarded
# to torchtext's builder as the frequency threshold, and the special tokens
# "<unk>" and "<pad>" are registered alongside the corpus tokens.
min_freq = 5
special_tokens = ["<unk>", "<pad>"]
vocab = bi(train_data["tokens"], min_freq=min_freq, specials=special_tokens)

In [18]:
# Look up the integer ids the vocabulary assigned to the two special tokens.
unk_index, pad_index = vocab["<unk>"], vocab["<pad>"]

In [19]:
# Use the "<unk>" id as the vocab's default index; per torchtext, this is the
# value returned when a token that is not in the vocabulary is looked up.
vocab.set_default_index(unk_index)

In [20]:
def numericalize_example(example, vocab):
    """Convert one example's tokens into vocabulary ids.

    Args:
        example: Mapping with a ``"tokens"`` entry (sequence of tokens).
        vocab: Object exposing ``lookup_indices(tokens)``.

    Returns:
        dict with a single ``"ids"`` entry holding whatever
        ``vocab.lookup_indices`` returned for the tokens.
    """
    return {"ids": vocab.lookup_indices(example["tokens"])}

In [21]:
# Convert every split's tokens to vocabulary ids with the same vocab; each
# example gains an "ids" entry.
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
valid_data = valid_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
# Bind the result to `test_data` only, so no additional global alias of the
# test split is left behind in the notebook namespace.
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

Map:   0%|          | 0/18750 [00:00<?, ? examples/s]

Map:   0%|          | 0/6250 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

<h3>1.6 Converting into tensors</h3>

In [23]:
# Expose only the "ids", "label" and "length" columns of each split, formatted
# as torch tensors.
train_data, valid_data, test_data = (
    split.with_format(type="torch", columns=["ids", "label", "length"])
    for split in (train_data, valid_data, test_data)
)

In [24]:
# Peek at the first training example to confirm the selected columns come back as tensors.
train_data[0]

{'label': tensor(1),
 'length': tensor(256),
 'ids': tensor([   92,     9,    27,    87,    76,   345,     3, 16674,    17,  4339,
             6,    10,   404,     2,   122, 20105,   485,     8,    35,   434,
            28,     2, 14992,  3093,     3,    22,     2,   202,    72,  2809,
            97,     5, 20105, 15892,     3,  2328,   293,    50,   166,     4,
            74,  1347,  1806,    13,     0,     3,    60,    12,  1033,    61,
           983,   671,    21,    44,  4709,    20,   121,     4,    12,    72,
          1620,     0,     6,    48,   565, 19992,     6,  6894, 19992,     5,
           237, 15452,     3,    66,   165,     0,   404,    51,     2,  1447,
            13,  4775,     6,  4174,     2,  3177,     7,     2,    18,  3093,
             4, 19992,   404,    10,  1779,    13,  1285,     7,  9275,  6835,
             6, 12005,  2478,     3,  9328,  2428,    44,  1252,     0,     8,
          2046,    41, 13908,     4,  2046,    41,     0,  1407,     4,  2046,


<h3>1.7 Creating DataLoader</h3>

In [26]:
def get_collate_fn(pad_index):
    """Build a collate function that pads ``"ids"`` with ``pad_index``.

    Args:
        pad_index: Value used to fill the padded positions of shorter
            ``"ids"`` sequences.

    Returns:
        A function taking a list of examples (each a mapping with tensor
        entries ``"ids"``, ``"length"`` and ``"label"``) and returning a dict
        with the same three keys, each holding one batched tensor.
    """
    def collate_fn(batch):
        # Pad variable-length id sequences into one batch-first tensor.
        padded_ids = nn.utils.rnn.pad_sequence(
            [example["ids"] for example in batch],
            batch_first=True,
            padding_value=pad_index,
        )
        # Lengths and labels are stacked along a new leading dimension.
        lengths = torch.stack([example["length"] for example in batch])
        labels = torch.stack([example["label"] for example in batch])
        return {"ids": padded_ids, "length": lengths, "label": labels}

    return collate_fn

In [27]:
def get_data_loader(dataset, batch_size, pad_index, shuffle=False):
    """Wrap ``dataset`` in a ``DataLoader`` that pads each batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of examples per batch.
        pad_index: Padding value handed to ``get_collate_fn``.
        shuffle: Forwarded to the ``DataLoader``'s ``shuffle`` argument
            (defaults to ``False``).

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return torch.utils.data.DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=get_collate_fn(pad_index),
        shuffle=shuffle,
    )

In [28]:
# One loader per split, all with the same batch size and padding value.
# Only the training loader shuffles; the other two keep the default
# shuffle=False.
batch_size = 512
train_data_loader = get_data_loader(
    dataset=train_data, batch_size=batch_size, pad_index=pad_index, shuffle=True
)
valid_data_loader = get_data_loader(
    dataset=valid_data, batch_size=batch_size, pad_index=pad_index
)
test_data_loader = get_data_loader(
    dataset=test_data, batch_size=batch_size, pad_index=pad_index
)

<h2>2 Build LSTM model</h2>