In [1]:
import pandas as pd

# Read the reviews and keep only the text and rating columns; then reduce each
# rating string to its first character and drop rows with missing values.
df = pd.read_csv("Amazon-Deutsch-Dataset.csv")[["content", "rating"]]
df = df.assign(rating=df["rating"].str[0]).dropna()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Tokenize with torchtext's spaCy backend and the 'de_core_news_sm' pipeline.
tokenizer = get_tokenizer(tokenizer='spacy', language='de_core_news_sm')

def df_iterator_content(df):
    """Yield the token list of every text in the ``content`` column of ``df``.

    Args:
        df: DataFrame with a ``content`` column holding the texts to tokenize.

    Yields:
        ``tokenizer(text)`` for each entry of ``df['content']``, in row order.
    """
    # Iterate the column directly: DataFrame.iterrows() would build a new
    # Series object for every row just to read a single field from it.
    for text in df['content']:
        yield tokenizer(text)

# Build the vocabulary from the tokenized texts. "<unk>" is the only special
# token and is registered as the default index, i.e. the index returned when
# a token that is not in the vocabulary is looked up.
vocab = build_vocab_from_iterator(
    df_iterator_content(df),
    specials=["<unk>"],
)
vocab_size = len(vocab)
vocab.set_default_index(vocab["<unk>"])

def text_pipeline(x):
    """Tokenize the raw text ``x`` and look its tokens up in ``vocab``."""
    tokens = tokenizer(x)
    return vocab(tokens)

def label_pipeline(x):
    """Convert a 1-based rating (e.g. ``"5"``) into a 0-based integer class index."""
    return int(x) - 1

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

def collate_batch(batch, word_count=500, pad_index=0):
    """Turn ``(text, label)`` samples into fixed-length index tensors.

    Args:
        batch: Iterable of ``(text, label)`` pairs; ``text`` is passed through
            ``text_pipeline`` and ``label`` through ``label_pipeline``.
        word_count: Number of token indices kept per text. Longer texts are
            truncated, shorter ones are padded on the right.
        pad_index: Index used for padding positions (default 0).
            NOTE(review): in the sample output an out-of-vocabulary token also
            maps to 0, so padding and unknown words are indistinguishable
            unless a dedicated padding index is passed here.

    Returns:
        Tuple ``(texts, labels)`` of int64 tensors. For a non-empty batch
        ``texts`` has shape ``(len(batch), word_count)`` and ``labels`` has
        shape ``(len(batch),)``.
    """
    label_list, text_list = [], []
    for text, label in batch:
        label_list.append(label_pipeline(label))
        # Slicing copies, so the list returned by text_pipeline is not mutated.
        words = list(text_pipeline(text))[:word_count]
        words += [pad_index] * (word_count - len(words))
        text_list.append(words)
    # Explicit dtype: an empty batch would otherwise produce float tensors.
    texts = torch.tensor(text_list, dtype=torch.long)
    labels = torch.tensor(label_list, dtype=torch.long)
    return texts, labels

# Smoke test: collate two sample reviews with 10 token indices per review.
collate_batch(
    [
        ("Das neue IPhone ist wirklich toll!!", "5"),
        ("FritzBox 7830 ist schon ganz nett, aber geht besser", "3"),
    ],
    word_count=10,
)

(tensor([[  28,  212,  472,    6,  122,  656,   18,   18,    0,    0],
         [ 778,    0,    6,   61,  145, 2043,    2,   26,  146,  129]]),
 tensor([4, 2]))

In [None]:
def idx_to_one_hot(batch, num_classes):
    """One-hot encode an index tensor along a new trailing dimension.

    Args:
        batch: int64 tensor of indices in ``[0, num_classes)``; any shape.
        num_classes: Size of the added one-hot dimension.

    Returns:
        Tensor of shape ``(*batch.shape, num_classes)`` in the default
        floating-point dtype, located on the same device as ``batch``, holding
        1 at each given index and 0 elsewhere.
    """
    # Allocate on batch's device: scatter_ needs the destination and the index
    # tensor on the same device, and torch.zeros defaults to the CPU.
    one_hot = torch.zeros(*batch.shape, num_classes, device=batch.device)
    # Scatter along the last dimension so inputs of any rank are supported.
    return one_hot.scatter_(-1, batch.unsqueeze(-1), 1)

# Try the one-hot encoder on a small sample batch (10 token indices per review).
xb, yb = collate_batch(
    [
        ("Das neue IPhone ist wirklich toll!!", "5"),
        ("FritzBox 7830 ist schon ganz nett, aber geht besser", "3"),
    ],
    word_count=10,
)
idx_to_one_hot(xb, vocab_size)

def collate_batch_with_one_hot(batch, word_count=500, vocab_size=None):
    """Collate ``(text, label)`` samples and one-hot encode the token indices.

    Args:
        batch: Iterable of ``(text, label)`` pairs, as accepted by
            ``collate_batch``.
        word_count: Number of token indices kept per text.
        vocab_size: Width of the one-hot dimension. When ``None`` (default)
            the size of the module-level ``vocab`` is used, so that every
            index ``collate_batch`` can produce is encodable even when this
            function is passed to a DataLoader without extra arguments.

    Returns:
        Tuple ``(one_hot_texts, labels)``; ``one_hot_texts`` has shape
        ``(len(batch), word_count, vocab_size)``.
    """
    if vocab_size is None:
        # A fixed default smaller than the vocabulary would make scatter_
        # fail on any token index >= that default.
        vocab_size = len(vocab)
    xb, yb = collate_batch(batch, word_count)
    return idx_to_one_hot(xb, vocab_size), yb

In [None]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

# Seed PyTorch's global random number generator with a fixed value.
torch.manual_seed(1)


class MyLSTM(nn.Module):
    """Sequence classifier: embedding -> LSTM -> linear layer -> log-softmax.

    Args:
        embedding_dim: Size of each token embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        tagset_size: Number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim. batch_first=True: (batch, seq, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, xb):
        """Return per-sequence class log-probabilities.

        Args:
            xb: Either an integer tensor of token indices with shape
                ``(batch, seq_len)``, or a one-hot encoded tensor with shape
                ``(batch, seq_len, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, tagset_size)`` with log-probabilities.
        """
        if xb.dim() == 3:
            # One-hot input: multiplying by the embedding table selects the
            # same rows an index lookup would (nn.Embedding itself only
            # accepts integer indices).
            weight = self.word_embeddings.weight
            embeds = xb.to(weight.dtype) @ weight
        else:
            embeds = self.word_embeddings(xb)
        # h_n has shape (num_layers, batch, hidden_dim); its last entry is the
        # final hidden state of each sequence, used as the sequence summary.
        _, (h_n, _) = self.lstm(embeds)
        tag_space = self.hidden2tag(h_n[-1])
        return F.log_softmax(tag_space, dim=1)


In [None]:
# create model
vocab_size = len(vocab)
embed_dim = 32
# label_pipeline turns a rating r into class index r - 1, and the sample batch
# already yields label 4, so two outputs are not enough. Assuming star ratings
# 1..5 (TODO confirm against the dataset), the classifier needs 5 classes.
num_class = 5
hidden_dim = 32

model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class)

# Build a sample one-hot batch and inspect its labels and input shape
# (the model itself is not called here).
xb, yb = collate_batch_with_one_hot(
    [
        ("Das neue IPhone ist wirklich toll!!", "5"),
        ("FritzBox 7830 ist schon ganz nett, aber geht besser", "3"),
    ],
    vocab_size=vocab_size,
)

print("yb", yb)
print("xb", xb.shape)

yb tensor([4, 2])
xb torch.Size([2, 500, 22392])


In [None]:
import time

def evaluate(model, dataloader):
    """Compute the classification accuracy of ``model`` over ``dataloader``.

    Args:
        model: Module called as ``model(text)`` that returns one score per
            class for every sample (class dimension 1).
        dataloader: Iterable of ``(text, label)`` tensor batches.

    Returns:
        Fraction of correctly predicted samples as a float; ``0.0`` when the
        dataloader yields no samples.

    Note:
        Switches the model to eval mode and does not switch it back.
    """
    model.eval()
    # Batches are moved to wherever the model's parameters live, if it has any.
    first_param = next(model.parameters(), None)
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for text, label in dataloader:
            if first_param is not None:
                text = text.to(first_param.device)
                label = label.to(first_param.device)
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    # Guard against an empty dataloader instead of raising ZeroDivisionError.
    return total_acc / total_count if total_count else 0.0

from torch.utils.data.dataset import random_split
# Randomly split the (content, rating) rows: 95% for training, the rest for
# validation.
train_dataset = df[['content', 'rating']].values

num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
)

In [None]:
# import torch DataLoader
from torch.utils.data import DataLoader

# Hyperparameters
EPOCHS = 30 # number of passes over the training split
LR = 5  # initial SGD learning rate (decayed by the scheduler); NOTE(review): unusually large - confirm it is intended
BATCH_SIZE = 64 # samples per batch for both the training and validation loaders

model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class).to(device)

# Batches are collated as integer token indices: that is the input type
# nn.Embedding expects, and it avoids materialising a dense
# (batch, word_count, vocab_size) one-hot tensor for every batch.
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)

valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              collate_fn=collate_batch)

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# every 3 epochs, LR is multiplied by 0.7 (step_size is an epoch count, so an int)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)
total_accu = None

# Per-epoch accuracies, filled by the training loop and plotted afterwards.
train_accus = []
valid_accus = []


# Train for EPOCHS epochs; after each epoch record the accuracy on the
# training and validation loaders and advance the learning-rate schedule.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    model.train()

    for text, label in train_dataloader:
        # The model was moved to `device`; its inputs must live there too.
        text, label = text.to(device), label.to(device)
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = loss_func(predicted_label, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

    accu_train = evaluate(model, train_dataloader)
    accu_valid = evaluate(model, valid_dataloader)
    train_accus.append(accu_train)
    valid_accus.append(accu_valid)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | train accuracy {:8.3f} | valid accuracy {:8.3f} | lr: {:1.2f}'.format(
                                epoch,
                                time.time() - epoch_start_time,
                                accu_train,
                                accu_valid,
                                scheduler.get_last_lr()[0]))

    scheduler.step() # learning rate scheduler after each epoch



import matplotlib.pyplot as plt
# Plot the per-epoch training and validation accuracy curves in one figure.
for accuracies, curve_label in ((train_accus, 'train_accu'), (valid_accus, 'valid_accu')):
    plt.plot(accuracies, label=curve_label)
plt.legend()
plt.show()

: 

: 