Resources:  
https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel  
https://machinelearningmastery.com/pytorch-tutorial-develop-deep-learning-models/  
https://jovian.ml/aakanksha-ns/lstm-multiclass-text-classification  
https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df  
https://towardsdatascience.com/lstm-text-classification-using-pytorch-2c6c657f8fc0  

In [2]:
import torch
import torch.nn as nn

# Basic LSTM example

In [3]:
# input: a batch of arbitrary (random) token ids

In [4]:
# Toy batch: 2 sequences of 8 token ids each. Every id is below 100 so it is a
# valid row index for the embedding table declared further down.
x = torch.tensor([
    [1, 2, 12, 34, 56, 78, 90, 8],
    [3, 24, 6, 99, 12, 56, 21, 22]
])

In [5]:
x.shape

torch.Size([2, 8])

In [6]:
# Hyper-parameters of the toy example.
# Length of the vector each token id is embedded into (also the LSTM input size).
EMBEDDING_DIM = 4
# Number of rows in the embedding table, i.e. valid token ids are 0..99.
VOCAB_SIZE = 100
LSTM_HIDDEN_SIZE = 3 # arbitrary number; length of the LSTM hidden/cell state vectors

### Declare layers

In [7]:
# padding_idx=0 reserves token id 0 for padding: its embedding vector is
# initialised to zeros and is not updated by gradients during training.
embedding_layer = nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM, padding_idx=0)

In [8]:
embedding_layer(x).shape

torch.Size([2, 8, 4])

In [9]:
# Single-layer LSTM over the embedded tokens; batch_first=True means inputs and
# outputs are laid out as (batch, sequence, feature).
lstm_model = nn.LSTM(input_size=EMBEDDING_DIM, hidden_size=LSTM_HIDDEN_SIZE, num_layers=1, batch_first=True)

In [10]:
# Embed the token ids, then run the embeddings through the LSTM.
out1 = embedding_layer(x)

# out - LSTM output for every time step; ht, ct - hidden and cell states after the last time step
out, (ht, ct) = lstm_model(out1)

In [11]:
ht

tensor([[[-0.2453, -0.2346, -0.1035],
         [-0.2944, -0.1233, -0.0452]]], grad_fn=<StackBackward>)

In [12]:
ht.shape

torch.Size([1, 2, 3])

### Use nn.Sequential

In [13]:
# Same two layers chained together: the embedding output is fed straight into
# the LSTM, so calling `model` returns whatever the LSTM (the last layer) returns.
model = nn.Sequential(
    nn.Embedding(VOCAB_SIZE, EMBEDDING_DIM, padding_idx=0),
    nn.LSTM(input_size=EMBEDDING_DIM, 
            hidden_size=LSTM_HIDDEN_SIZE, 
            num_layers=1, 
            batch_first=True)
)

In [14]:
out, (ht, ct) = model(x)

In [15]:
out.shape

torch.Size([2, 8, 3])

In [16]:
# out - all hidden states, last == ht
out[0][-1]

tensor([-0.2267, -0.3558, -0.1870], grad_fn=<SelectBackward>)

In [17]:
ht

tensor([[[-0.2267, -0.3558, -0.1870],
         [-0.2007, -0.2303, -0.2910]]], grad_fn=<StackBackward>)

# Data

In [18]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
from collections import Counter
from sklearn.model_selection import train_test_split

In [19]:
path = "../data/raw/lenta_10k_3_classes.csv"

In [20]:
df = pd.read_csv(path)

In [21]:
df

Unnamed: 0.1,Unnamed: 0,text,topic
0,630,С 1 января 2000 года все телеканалы будут опла...,Экономика
1,635,Германский автопромышленный концерн Volkswagen...,Экономика
2,636,"Нераспределенная прибыль ОАО ""Тюменнефтегаз"", ...",Экономика
3,660,Две крупнейших телекоммуникационных компании С...,Экономика
4,661,"ОАО ""ГАЗ"" и Нижегородский банк Сбербанка Росси...",Экономика
...,...,...,...
9995,41649,На прошедшем в Лос-Анджелесе вручении American...,Культура
9996,41652,Журнал Foreign Policy и консалтинговая компани...,Экономика
9997,41656,По данным аргентинской телекомпании Todo Notic...,Экономика
9998,41659,Согласно данным январского исследования Blue C...,Экономика


In [22]:
# Keep only the relevant columns. `.copy()` makes `dataset` an independent
# frame, so the column assignments in the following cells modify `dataset`
# itself instead of a slice of `df` (which triggers SettingWithCopyWarning).
dataset = df[['text', 'topic']].copy()

In [23]:
# Add a column holding each article's length in whitespace-separated words
dataset['text_length'] = dataset['text'].apply(lambda x: len(x.split()))

In [24]:
# Lower-case every article in place, so differently cased spellings of a word
# end up as the same string
dataset['text'] = dataset['text'].apply(lambda x: x.lower())

In [25]:
dataset.head()

Unnamed: 0,text,topic,text_length
0,с 1 января 2000 года все телеканалы будут опла...,Экономика,79
1,германский автопромышленный концерн volkswagen...,Экономика,135
2,"нераспределенная прибыль оао ""тюменнефтегаз"", ...",Экономика,87
3,две крупнейших телекоммуникационных компании с...,Экономика,144
4,"оао ""газ"" и нижегородский банк сбербанка росси...",Экономика,269


In [26]:
# mean text length
np.mean(dataset['text_length'])

146.8561

In [27]:
# Count how often every token occurs in the corpus.
# Iterating the column directly yields the same texts in the same order as
# `iterrows()` but without building a Series object for every row.
counts = Counter()
for text in dataset['text']:
    counts.update(word_tokenize(text))

In [28]:
counts.most_common(10)

[(',', 114983),
 ('.', 77556),
 ('в', 67630),
 ('``', 38845),
 ("''", 38774),
 ('и', 32237),
 ('на', 30267),
 ('-', 15669),
 ('с', 15274),
 ('по', 15087)]

In [29]:
# Build the vocabulary: id 0 is the empty string (used as padding), id 1 is the
# unknown-word marker, and every counted token follows in first-seen order.
words = ["", "<UNK>"] + list(counts)
vocab2index = {word: index for index, word in enumerate(words)}

In [30]:
words[:10]

['', '<UNK>', 'с', '1', 'января', '2000', 'года', 'все', 'телеканалы', 'будут']

In [31]:
# vocabulary size - too many features
len(words)

128323

In [32]:
vocab2index['а']

320

In [33]:
def encode_sentence(text, vocab2index, N=150):
    """
    Numericalize ``text`` and pad or truncate it to exactly ``N`` ids.

    Args:
        text: raw string; it is split into tokens with ``word_tokenize``.
        vocab2index: mapping token -> integer id; must contain ``"<UNK>"``,
            whose id replaces every token missing from the mapping.
        N: length of the returned id array.

    Returns:
        ``(encoded, length)`` where ``encoded`` is an integer array of length
        ``N`` holding the ids of the first ``N`` tokens followed by zeros, and
        ``length`` is the number of real (non-padding) ids, ``min(N, n_tokens)``.
    """
    tokens = word_tokenize(text)
    padded = np.zeros(N, dtype=int)
    token_ids = [
        vocab2index.get(token, vocab2index["<UNK>"])
        for token in tokens
    ]
    # Everything past `length` keeps its initial 0, i.e. the padding id.
    length = min(N, len(token_ids))
    padded[:length] = token_ids[:length]
    return padded, length

In [34]:
# Encode every article as an (ids, length) pair. The pair is ragged (a length-N
# array next to a scalar), so the object dtype is requested explicitly:
# NumPy >= 1.24 raises ValueError instead of inferring an object array.
dataset['encoded'] = dataset['text'].apply(
    lambda text: np.array(encode_sentence(text, vocab2index), dtype=object)
)
dataset.head()

Unnamed: 0,text,topic,text_length,encoded
0,с 1 января 2000 года все телеканалы будут опла...,Экономика,79,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,германский автопромышленный концерн volkswagen...,Экономика,135,"[[69, 70, 71, 72, 73, 74, 75, 76, 77, 17, 18, ..."
2,"нераспределенная прибыль оао ""тюменнефтегаз"", ...",Экономика,87,"[[179, 180, 181, 182, 183, 184, 25, 185, 186, ..."
3,две крупнейших телекоммуникационных компании с...,Экономика,144,"[[233, 234, 235, 145, 236, 237, 238, 129, 239,..."
4,"оао ""газ"" и нижегородский банк сбербанка росси...",Экономика,269,"[[181, 182, 339, 184, 27, 340, 341, 342, 343, ..."


In [35]:
#check how balanced the dataset is
Counter(dataset['topic'])

Counter({'Экономика': 4902, 'Спорт': 2170, 'Культура': 2928})

In [36]:
# Model inputs: one (token ids, length) pair per article.
X = list(dataset['encoded'])
# The training code calls `y.long()` and `F.cross_entropy`, which need integer
# class ids, so map each topic string to a stable id (alphabetical order).
topic2index = {topic: index for index, topic in enumerate(sorted(dataset['topic'].unique()))}
y = [topic2index[topic] for topic in dataset['topic']]
# Fixed seed so the split (and every result derived from it) is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Pytorch Dataset

In [38]:
from torch.utils.data import Dataset, DataLoader

In [39]:
class ReviewsDataset(Dataset):
    """Map-style dataset over pre-encoded texts and their labels.

    ``X[i]`` must be indexable as ``(token_ids, length)`` with ``token_ids`` a
    NumPy array; ``Y[i]`` is the label of sample ``i``.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # One label per sample, so the label count is the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(int32 token tensor, label, length)`` for sample ``idx``."""
        token_ids = self.X[idx][0]
        token_tensor = torch.from_numpy(token_ids.astype(np.int32))
        label = self.y[idx]
        length = self.X[idx][1]
        return token_tensor, label, length

In [40]:
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_valid, y_valid)

In [41]:
def train_model(model, epochs=10, lr=0.001, train_loader=None, valid_loader=None):
    """Train ``model`` with Adam and cross-entropy loss, validating after every epoch.

    Args:
        model: module called as ``model(x, l)`` with a batch of token ids ``x``
            and the matching sequence lengths ``l``; must return class logits.
        epochs: number of passes over the training batches.
        lr: learning rate handed to ``torch.optim.Adam``.
        train_loader: iterable yielding ``(x, y, l)`` training batches. When
            omitted, the notebook-level ``train_dl`` is used.
        valid_loader: iterable yielding ``(x, y, l)`` validation batches. When
            omitted, the notebook-level ``val_dl`` is used.

    Returns:
        None. Average train loss and the validation metrics are printed for
        every epoch index ``i`` with ``i % 5 == 1``.
    """
    # Fall back to the notebook globals so existing `train_model(model, ...)`
    # calls keep working; passing loaders explicitly removes the hidden state.
    if train_loader is None:
        train_loader = train_dl
    if valid_loader is None:
        valid_loader = val_dl

    # Only optimise parameters that are not frozen.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        # Re-enter training mode at the start of every epoch.
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_loader:
            x = x.long()
            y = y.long()
            optimizer.zero_grad()
            y_pred = model(x, l)
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            # Weight the batch-mean loss by batch size to get a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, valid_loader)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics(model, valid_dl):
    """Evaluate ``model`` on ``valid_dl`` and return averaged metrics.

    Args:
        model: module called as ``model(x, l)``; must return class logits.
        valid_dl: iterable yielding ``(x, y, l)`` batches.

    Returns:
        ``(loss, accuracy, rmse)``: the per-sample average cross-entropy loss,
        the fraction of correct predictions (a 0-dim tensor), and the
        batch-size-weighted average RMSE between predicted and true class ids.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    # Evaluation never calls backward(), so skip autograd bookkeeping entirely;
    # this avoids building a graph for every validation batch.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            # Index of the largest logit is the predicted class.
            pred = torch.max(y_hat, 1)[1]
            correct += (pred == y).float().sum()
            total += y.shape[0]
            # Weight batch-mean values by batch size for a per-sample average.
            sum_loss += loss.item()*y.shape[0]
            sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total