In [1]:
!pip install torchdata
!pip install torchtext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchdata
  Downloading torchdata-0.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting urllib3>=1.25
  Downloading urllib3-1.26.14-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker>=2.0.0
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: urllib3, portalocker, torchdata
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
Successfully installed portalocker-2.7.0 torchdata-0.5.1 urllib3-1.26.14
Looking in indexes: https://pypi.org/simple, https://us-pytho

In [3]:
import torch, torchdata, torchtext
from torch import nn

import time

# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Reproducibility: seed PyTorch's RNG and set the cuDNN `deterministic` flag.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


In [4]:
torch.__version__

'1.13.1+cu116'

In [5]:
torchtext.__version__

'0.14.1'

In [6]:
torchdata.__version__

'0.5.1'

In [7]:
# torch.cuda.get_device_name(0)

## 1. Loading the Dataset

In [8]:
# Optional proxy settings for the "puffer" environment (presumably a machine that
# reaches the internet through this HTTP proxy - confirm before enabling).
# import os
# os.environ['http_proxy']  = 'http://192.41.170.23:3128'
# os.environ['https_proxy'] = 'http://192.41.170.23:3128'

from torchtext.datasets import AG_NEWS
# AG_NEWS() is unpacked into the train and test splits; each is a streaming datapipe
# that yields (label, text) samples when iterated.
train, test = AG_NEWS()

In [9]:
train  # a new object by torchdata.....streaming data (yield ....)

ShardingFilterIterDataPipe

## 2. EDA - Exploratory Data Analysis

In [10]:
next(iter(train))  # the split is a stream, so pull only its first (label, text) sample
# Label ids: 1 = "World", 2 = "Sports", 3 = "Business", 4 = "Sci/Tech"

(3,
 "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [11]:
from itertools import islice

# Stream forward to the sample at index 100 instead of materializing every
# training sample in a list just to read one element.
# Label ids: 1 = "World", 2 = "Sports", 3 = "Business", 4 = "Sci/Tech"
next(islice(iter(train), 100, 101))



(4,
 'Comets, Asteroids and Planets around a Nearby Star (SPACE.com) SPACE.com - A nearby star thought to harbor comets and asteroids now appears to be home to planets, too. The presumed worlds are smaller than Jupiter and could be as tiny as Pluto, new observations suggest.')

In [12]:
# Distinct label ids that occur in the training stream.
{label for label, _text in train}

{1, 2, 3, 4}

In [13]:
# The datapipe is a stream, so count its samples by walking it once.
train_size = sum(1 for _sample in train)
train_size

120000

In [14]:
train

ShardingFilterIterDataPipe

In [15]:
# Split the full training stream 70% / 20% / 10%. The three results are unpacked in
# the order the weight keys are listed: the 70% share is bound to `too_much`, the
# 20% share becomes the new (smaller) `train`, and the 10% share becomes `valid`.
# NOTE: this cell rebinds `train`, and `train_size` is rebound in a later cell, so
# re-running it without reloading the dataset splits the already-reduced stream.
too_much, train, valid = train.random_split(total_length=train_size, 
                                            weights = {"too_much": 0.7, 
                                                       "smaller_train": 0.2,
                                                       "valid": 0.1},
                                            seed = SEED)

In [16]:
# Count each split by streaming through it once, one split at a time.
train_size = sum(1 for _sample in train)
val_size   = sum(1 for _sample in valid)
test_size  = sum(1 for _sample in test)

In [17]:
train_size, val_size, test_size

(24000, 12000, 7600)

## 3. Preprocessing

In [18]:
## 3.1 Tokenizing

from torchtext.data.utils import get_tokenizer

# Build a spaCy-backed tokenizer using the 'en_core_web_sm' English model.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

# Sanity check: tokenize one sentence and display the resulting list of tokens.
tokens    = tokenizer("Chaky likes deep learning very much and wants his student to be number 1 in Asia....")
tokens



['Chaky',
 'likes',
 'deep',
 'learning',
 'very',
 'much',
 'and',
 'wants',
 'his',
 'student',
 'to',
 'be',
 'number',
 '1',
 'in',
 'Asia',
 '....']

In [19]:
next(iter(train))

(3,
 'Safety Net (Forbes.com) Forbes.com - After earning a PH.D. in Sociology, Danny Bazil Riley started to work as the general manager at a commercial real estate firm at an annual base salary of  #36;70,000. Soon after, a financial planner stopped by his desk to drop off brochures about insurance benefits available through his employer. But, at 32, "buying insurance was the furthest thing from my mind," says Riley.')

In [20]:
## 3.2 Numericalization

from torchtext.vocab import build_vocab_from_iterator

def yield_tokens(data_iter):
    """Lazily yield the token list of every sample's text.

    `data_iter` is an iterable of (label, text) pairs, e.g. `train`; the label is
    ignored and each text is run through the notebook-level `tokenizer`.
    """
    yield from (tokenizer(raw_text) for _label, raw_text in data_iter)
        
# Build the vocabulary from the tokens of the training split only. The four special
# tokens end up with ids 0-3 (<unk>=0, <pad>=1, <bos>=2, <eos>=3), as the lookups
# in the following cells show.
vocab = build_vocab_from_iterator(yield_tokens(train), specials=['<unk>', '<pad>',
                                                                 '<bos>', '<eos>'])

In [21]:
vocab.set_default_index(vocab["<unk>"]) # tokens that are not in the vocabulary are looked up as the <unk> id

In [22]:
vocab(['Chaky', 'wants', 'his', 'student', 'to', 'be', 'number', '1', '.'])

[0, 944, 38, 3956, 8, 43, 498, 109, 6]

In [23]:
id2word = vocab.get_itos()

In [24]:
id2word[0]

'<unk>'

In [25]:
vocab(['<pad>', '<bos>', '<eos>'])

[1, 2, 3]

In [26]:
len(vocab)  # 52k unique words.....

52686

## 4. FastText Embedding

In [27]:
from torchtext.vocab import FastText
fast_vectors = FastText(language='simple')

.vector_cache/wiki.simple.vec: 293MB [00:57, 5.08MB/s]                           
100%|██████████| 111051/111051 [00:14<00:00, 7879.67it/s]


In [28]:
# Fetch a FastText vector for every vocabulary token in vocab-id order, so that
# row i of `fast_embedding` belongs to token id i; the tensor is moved to `device`.
fast_embedding = fast_vectors.get_vecs_by_tokens(vocab.get_itos()).to(device)

In [29]:
fast_embedding.shape # (vocab size, 300) == (52k, 300)

torch.Size([52686, 300])

In [31]:
# Peek at the FastText embedding stored for token id 100.
fast_embedding[100][:10] # only the first 10 of the vector's 300 components are shown

tensor([-0.0935,  0.0915,  0.2640,  0.0387,  0.0843,  0.3809, -0.1776,  0.1745,
        -0.0362, -0.0278])

## 5. Preparing Dataloader

In [32]:
def text_pipeline(x):
    """Tokenize raw text and map every token to its vocabulary id."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Shift a 1-based label to a 0-based class index (1, 2, 3, 4 -> 0, 1, 2, 3)."""
    return int(x) - 1

In [33]:
'''
why padding????

in the same batch, e.g., batch size = 2

"chaky eat sushi", ==> "chaky", "eat", "sushi" ==> 0, 22, 11, 1, 1
"chaky sleep" ==> "chaky", "sleep" ==> 0, 99, 1, 1, 1

'''

'\nwhy padding????\n\nin the same batch, e.g., batch size = 2\n\n"chaky eat sushi", ==> "chaky", "eat", "sushi" ==> 0, 22, 11, 1, 1\n"chaky sleep" ==> "chaky", "sleep" ==> 0, 99, 1, 1, 1\n\n'

In [34]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence # making each batch same length

pad_ix = vocab['<pad>']

# Passed to DataLoader as `collate_fn`: turns a list of raw samples into batch tensors.
def collate_batch(batch):
    """Convert a list of (label, text) samples into three int64 tensors.

    Returns:
        labels:  [batch size], each label mapped through `label_pipeline`.
        texts:   [batch size, longest sequence in this batch], token ids from
                 `text_pipeline`, padded with `pad_ix` up to the longest sample.
        lengths: [batch size], the unpadded token count of every sample, so the
                 padding can be ignored later.
    """
    labels = [label_pipeline(raw_label) for raw_label, _ in batch]
    token_id_seqs = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.int64)
        for _, raw_text in batch
    ]
    seq_lengths = [ids.size(0) for ids in token_id_seqs]

    label_tensor = torch.tensor(labels, dtype=torch.int64)
    padded_texts = pad_sequence(token_id_seqs, padding_value=pad_ix, batch_first=True)
    length_tensor = torch.tensor(seq_lengths, dtype=torch.int64)
    return label_tensor, padded_texts, length_tensor

In [35]:
# Number of samples per mini-batch, shared by all three loaders.
batch_size = 64

# Each loader batches its split through `collate_batch`, which pads every batch to
# the length of its own longest sample.
train_loader = DataLoader(train, batch_size = batch_size, 
                          shuffle=True, collate_fn=collate_batch)

# NOTE(review): shuffle=True is also set for the validation and test loaders below;
# shuffling is normally only needed for training - confirm this is intentional.
val_loader   = DataLoader(valid, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

test_loader  = DataLoader(test, batch_size = batch_size,
                          shuffle=True, collate_fn=collate_batch)

In [36]:
# for label, text, length in train_loader:
#     break

# label: [batch size, ]
# text : [batch size, longest length of this batch] ==> [batch size, seq len] ==> [b, l]
# length:[batch size, ]

# label, text, length  #why we need length --> we can later ignore padding....

## 6. Designing the Model

In [37]:
class LSTM(nn.Module):
    """Sequence classifier: embedding lookup -> multi-layer LSTM -> linear head.

    The padded batch is packed with the true sequence lengths before it reaches
    the LSTM, so the final hidden state of every sample comes from its last real
    token rather than from padding.

    Args:
        input_dim: Vocabulary size (number of rows in the embedding table).
        emb_dim: Embedding width (300 when paired with the FastText vectors).
        hid_dim: Hidden size of each LSTM direction.
        output_dim: Number of classes scored by the linear head.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM reads the sequence in both directions.
        dropout: Dropout probability applied between stacked LSTM layers.
        padding_idx: Vocabulary id of the padding token. When omitted, the
            notebook-level ``pad_ix`` is used.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, output_dim, num_layers, bidirectional,
                 dropout, padding_idx=None):
        # nn.Module must be initialized before any sub-module is assigned to self.
        super().__init__()

        if padding_idx is None:
            padding_idx = pad_ix

        # The embedding row at padding_idx is excluded from gradient updates.
        self.embedding_layer = nn.Embedding(input_dim, emb_dim, padding_idx=padding_idx)
        self.lstm            = nn.LSTM(emb_dim,
                                       hid_dim,
                                       num_layers = num_layers,
                                       bidirectional = bidirectional,
                                       dropout = dropout,  # dropout is applied between layers
                                       batch_first=True)

        # The head consumes one final hidden state per direction.
        num_directions = 2 if bidirectional else 1
        self.fc              = nn.Linear(hid_dim * num_directions, output_dim)

    def forward(self, x, lengths):
        """Score every sequence in the batch.

        Args:
            x: Token ids, [batch size, seq len], padded to the longest sample.
            lengths: Unpadded length of every sample, [batch size].

        Returns:
            Unnormalized class scores, [batch size, output_dim].
        """
        embedded_x = self.embedding_layer(x)
        # embedded_x: [batch size, seq len, emb dim]

        # Pack so the LSTM skips padded positions. The lengths must live on the CPU,
        # and enforce_sorted=False allows samples in any length order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded_x, lengths.to('cpu'),
                                                            enforce_sorted=False,
                                                            batch_first=True)

        # Only the final hidden state h is needed; the per-step outputs and the cell
        # state are discarded. (nn.utils.rnn.pad_packed_sequence can unpack the
        # per-step outputs if they are ever required.)
        _, (h, _) = self.lstm(packed_embedded)
        # h: [num_layers * num_directions, batch size, hid dim]

        if self.lstm.bidirectional:
            # For the top layer, h[-2] is the forward direction and h[-1] the backward
            # direction; concatenate them as (forward, backward).
            last_hidden_state = torch.cat((h[-2, :, :], h[-1, :, :]), dim=1)
        else:
            last_hidden_state = h[-1, :, :]
        # last_hidden_state: [batch size, hid dim * num_directions]

        return self.fc(last_hidden_state)  # [batch size, output_dim]