In [2]:
import pandas as pd

df=pd.read_csv("Amazon-Deutsch-Dataset.csv")
df = df[["content", "rating"]]
df.rating= df.rating.str[0]
df = df.dropna()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('spacy', language='de_core_news_sm')

# create iterator from tokenized df
def df_iterator_content(df):
    for _, row in df.iterrows():
        yield tokenizer(row['content'])

vocab = build_vocab_from_iterator(df_iterator_content(df), specials=["<unk>"], min_freq=5)
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
print(vocab_size)

4163


In [3]:
import torch
from torch.utils.data import Dataset

class AmazonDataset(Dataset):
    def __init__(self, df, word_count=500, vocab_size=10000):
        self.df = df
        self.word_count = word_count
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        x= self.df.iloc[idx]["content"]
        y= self.df.iloc[idx]["rating"]
        y = int(y) - 1
        x = vocab(tokenizer(x))
        if len(x) > self.word_count:
            x=x[:self.word_count]
        else:
            x.extend([0]*(self.word_count-len(x)))
        x = torch.nn.functional.one_hot(torch.tensor(x), num_classes=self.vocab_size)
        return x, y

amazon_dataset = AmazonDataset(df, word_count=500, vocab_size=vocab_size)
x,y=amazon_dataset[0]
print(x.shape)
print(y)

# which input shape is expected by the LSTM
# (seq_len, batch, input_size)
        

torch.Size([500, 4163])
4


In [10]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pack_sequence

torch.manual_seed(1)

class MyLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(MyLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim=embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to output space
        self.hidden2output = nn.Linear(hidden_dim, tagset_size)

    def forward(self, xb):
        print("xb shape", xb.shape)
        embeds = self.word_embeddings(xb)
        print("embeds shape", embeds.shape)
        embeds_view = embeds.view(xb.shape[0], xb.shape[1], -1)

        print("embeds_view shape as input for lstm", embeds_view.shape)
        lstm_out, _ = self.lstm(embeds_view)
        hidden_space = self.hidden2output(lstm_out.view(xb.shape[0], -1))
        output = F.log_softmax(hidden_space, dim=1)
        return output


In [11]:
# create model
vocab_size = len(vocab)
embed_dim = 32
num_class = 2
hidden_dim = 32

model= MyLSTM(embed_dim, hidden_dim, vocab_size, num_class)
dataset = AmazonDataset(df, word_count=500, vocab_size=vocab_size)
loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True)

# check if model works
xb, yb = next(iter(loader))

print("yb", yb)
print("xb", xb.shape)
model(xb)

yb tensor([0, 2])
xb torch.Size([2, 500, 4163])
xb shape torch.Size([2, 500, 4163])
embeds shape torch.Size([2, 500, 4163, 32])
embeds_view shape as input for lstm torch.Size([2, 500, 133216])


RuntimeError: input.size(-1) must be equal to input_size. Expected 32, got 133216

In [6]:
import time

def evaluate(model, dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [11]:
# create train and valid dataset
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

# import torch DataLoader
from torch.utils.data import DataLoader

# Hyperparameters
EPOCHS = 30 # epoch
LR = 5  # learning rate
BATCH_SIZE = 2 # batch size for training

model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 3.0, gamma=0.7)   # every 3 epochs, LR is multiplied by 0.7




In [12]:
total_accu = None
train_accus=[]
valid_accus=[]

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    
    model.train()
    total_acc, total_count = 0, 0

    for idx, (text, label) in enumerate(train_dataloader):
        print(text.shape)
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = loss_func(predicted_label, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

    accu_train = evaluate(model, train_dataloader)
    accu_valid = evaluate(model, valid_dataloader)
    train_accus.append(accu_train)
    valid_accus.append(accu_valid)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | train accuracy {:8.3f} | valid accuracy {:8.3f} | lr: {:1.2f}'.format(
                                epoch,
                                time.time() - epoch_start_time,
                                accu_train, 
                                accu_valid, 
                                scheduler.get_last_lr()[0]))

    scheduler.step() # learning rate scheduler after each epoch



import matplotlib.pyplot as plt
plt.plot(train_accus, label='train_accu')
plt.plot(valid_accus, label='valid_accu')
plt.legend()
plt.show()

torch.Size([2, 500, 22392])
xb shape torch.Size([2, 500, 22392])


RuntimeError: input.size(-1) must be equal to input_size. Expected 32, got 1433088

In [10]:
# convert byte two gigabyte
def convert_bytes(bytes):
    return bytes / 1024 / 1024 / 1024

convert_bytes(14330880000)


13.346672058105469