In [129]:
import pandas as pd

df=pd.read_csv("Amazon-Deutsch-Dataset.csv")
df = df[["content", "rating"]]
df.rating= df.rating.str[0]
df = df.dropna()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('spacy', language='de_core_news_sm')

# create iterator from tokenized df
def df_iterator_content(df):
    for _, row in df.iterrows():
        yield tokenizer(row['content'])

vocab = build_vocab_from_iterator(df_iterator_content(df), specials=["<unk>"], min_freq=10)
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
print(vocab_size)

2399


In [130]:
import torch
from torch.utils.data import Dataset

class AmazonDataset(Dataset):
    def __init__(self, df, word_count=500, vocab_size=10000):
        self.df = df
        self.word_count = word_count
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        x= self.df.iloc[idx]["content"]
        y= self.df.iloc[idx]["rating"]
        y = int(y) - 1
        x = vocab(tokenizer(x))
        if len(x) > self.word_count:
            x=x[:self.word_count]
        else:
            x.extend([0]*(self.word_count-len(x)))
        x = torch.tensor(x)
        return x, y

amazon_dataset = AmazonDataset(df, word_count=100, vocab_size=vocab_size)
x,y=amazon_dataset[0]
print(x.shape)
print(y)

torch.Size([100])
4


In [131]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

class MyLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, out_size, word_count=50):
        super(MyLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim=embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=0.2)

        # The linear layer that maps from hidden state space to output space
        self.hidden2output = nn.Linear(hidden_dim*word_count, out_size)

    def forward(self, xb):
        #print("xb shape", xb.shape)
        embeds = self.word_embeddings(xb)
        #print("embeds shape", embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        #print("lstm_out shape", lstm_out.shape)
        # lstm_out_view = lstm_out[:, -1, :]   # works but looses information
        lstm_out_view = lstm_out.reshape(xb.shape[0], -1   )
        #print("lstm_out_view shape", lstm_out_view.shape)
        hidden_space = self.hidden2output(lstm_out_view)
        #print("hidden_space shape", hidden_space.shape)
        output = F.log_softmax(hidden_space, dim=1)
        #print("output shape", output.shape)
        return output


In [132]:
# create model
vocab_size = len(vocab)
embed_dim = 32
num_class = 5
hidden_dim = 32
word_count = 200

model= MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count)
dataset = AmazonDataset(df, word_count=word_count, vocab_size=vocab_size)
loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)

# check if model works
xb, yb = next(iter(loader))

print("yb", yb)
print("xb", xb.shape)
model(xb)

yb tensor([2, 3, 3, 3, 0])
xb torch.Size([5, 200])




tensor([[-1.6160, -1.6460, -1.4634, -1.5387, -1.8178],
        [-1.6355, -1.6236, -1.5338, -1.7346, -1.5335],
        [-1.6314, -1.6048, -1.5299, -1.5347, -1.7641],
        [-1.6264, -1.6392, -1.5456, -1.5011, -1.7537],
        [-1.6041, -1.5927, -1.5062, -1.5007, -1.8915]],
       grad_fn=<LogSoftmaxBackward0>)

In [133]:
import time

def evaluate(model, dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [134]:
# create train and valid dataset
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

# import torch DataLoader
from torch.utils.data import DataLoader

# Hyperparameters
EPOCHS = 30 # epoch
LR = 0.01  # learning rate
BATCH_SIZE = 64 # batch size for training

model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10.0, gamma=0.7)   # every 10 epochs, LR is multiplied by 0.7




In [None]:
total_accu = None
train_accus=[]
valid_accus=[]

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    
    model.train()
    total_acc, total_count = 0, 0

    for idx, (text, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = loss_func(predicted_label, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

    accu_train = evaluate(model, train_dataloader)
    accu_valid = evaluate(model, valid_dataloader)
    train_accus.append(accu_train)
    valid_accus.append(accu_valid)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | train accuracy {:8.3f} | valid accuracy {:8.3f} | lr: {:1.2f}'.format(
                                epoch,
                                time.time() - epoch_start_time,
                                accu_train, 
                                accu_valid, 
                                scheduler.get_last_lr()[0]))

    scheduler.step() # learning rate scheduler after each epoch

import matplotlib.pyplot as plt
plt.plot(train_accus, label='train_accu')
plt.plot(valid_accus, label='valid_accu')
plt.legend()
plt.show()

-----------------------------------------------------------
| end of epoch   1 | time: 18.75s | train accuracy    0.556 | valid accuracy    0.400 | lr: 0.01
-----------------------------------------------------------
| end of epoch   2 | time: 16.44s | train accuracy    0.696 | valid accuracy    0.455 | lr: 0.01
-----------------------------------------------------------
| end of epoch   3 | time: 16.80s | train accuracy    0.797 | valid accuracy    0.444 | lr: 0.01
-----------------------------------------------------------
| end of epoch   4 | time: 16.96s | train accuracy    0.871 | valid accuracy    0.456 | lr: 0.01
-----------------------------------------------------------
| end of epoch   5 | time: 20.06s | train accuracy    0.902 | valid accuracy    0.438 | lr: 0.01


In [None]:
# how much valid accuracy do we get in a new untrained model?
new_model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class)
evaluate(new_model, valid_dataloader)
