In [55]:
kaggle=False
kaggle_path='/kaggle/input/amazon-deutsch-review-dataset/Amazon-Deutsch-Dataset.csv'
local_path='Amazon-Deutsch-Dataset.csv'

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torchtext
import pandas as pd
import time
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

In [56]:

df=pd.read_csv(kaggle_path if kaggle else local_path)
df = df[["content", "rating"]]
df.rating= df.rating.str[0]
df = df.dropna()

df.rating = df.rating.astype(int)
df.rating = df.rating.apply(lambda x: 1 if x>3 else 0)

df.sample(5)

Unnamed: 0,content,rating
2934,Bei mir waren nach jedem Bildschirm-Timeout so...,0
1511,"Das nötige Setup für Klingeltöne, Lautsprecher...",0
292,Die guten Rezensionen erweckten hohe Erwartung...,0
901,....was will man für 8€ erwarten? Meine Sony i...,0
1017,Sehr enttäuschend. Haben nach knapp 2 Wochen s...,0


In [57]:
df.rating.value_counts()

0    2144
1    1265
Name: rating, dtype: int64

In [58]:


# Tokenizer-Funktion (einfaches Beispiel)
def tokenizer(text):
    tokens = text.split()
    return tokens

# Vokabular erstellen
def build_vocab(texts, vocab_size):
    word_counts = {}
    for text in texts:
        tokens = tokenizer(text)
        for token in tokens:
            if token in word_counts:
                word_counts[token] += 1
            else:
                word_counts[token] = 1

    sorted_vocab = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    sorted_vocab = sorted_vocab[:vocab_size-1]
    word_to_idx = {word: idx+1 for idx, (word, _) in enumerate(sorted_vocab)}
    word_to_idx['<unk>'] = 0
    return word_to_idx

# Texte in Sequenzen von Wortindizes umwandeln
def text_to_indices(text, word_to_idx):
    tokens = tokenizer(text)
    indices = [word_to_idx[token] if token in word_to_idx else 0 for token in tokens]
    return indices

# Hyperparameter
vocab_size = 10000
word_count = 200
embedding_dim = 128
hidden_dim = 256
num_layers = 2
num_classes = 2
batch_size = 32

num_epochs = 2

# Aufteilung in Trainings- und Testdaten
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Erstellung des Vokabulars
texts = train_df['content'].tolist()
word_to_idx = build_vocab(texts, vocab_size)

In [59]:
texts[0]

'Mein altes Handy gab den Geist auf - wie das oft so der Fall ist, vor dem verlängerten Wochenende. Also musste ich noch schnell los und mir ein neues besorgen (daher stammt es auch nicht von Amazon). Das klappte - aber dann trat das nächste Problem auf ... ich hatte vorher (im Galaxy S4 Mini) eine MikroSIM und brauchte jetzt eine NanoSIM. Also musste ich eine halbe Stunde vor Landeschluss noch in den T-Shop, um mir eine neue SIM-Karte zu besorgen.\n\nAlso - Merkposten Nr. 1 (wenn man von älteren Handys kommt) - neue NanoSIM besorgen!\n\nIch bin nicht (mehr) der Ansicht, dass ich immer das allerneuste Handy haben muss. Ein Gerät, das zur "vorletzten" Generation gehört (die neuen Galaxy S8er Handys sind ja raus) reicht für mich vollkommen aus (und ist ein Riesenfortschritt zum S4 Mini).\n\nWas gefällt mir?\n- das sehr schöne Display. Gute Größe (für meinen Geschmack) - bietet viel Platz, gute Auflösung, schöne Farben - und ist jedenfalls für mich mit einer Handy bedienbar (habe allerdin

In [60]:
word_to_idx["Handy"]

47

In [61]:
text_to_indices(texts[0], word_to_idx)[:10]

[307, 523, 47, 483, 14, 412, 18, 33, 46, 3]

In [62]:
class AmazonDataset(Dataset):
    def __init__(self, df, word_count=500, vocab_size=10000):
        self.df = df
        self.word_count = word_count
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        x= self.df.iloc[idx]["content"]
        y= self.df.iloc[idx]["rating"]
        y = int(y)
        x = text_to_indices(x, word_to_idx)
        # we need this because we need to have a fixed size input
        if len(x) > self.word_count:
            x=x[:self.word_count]
        else:
        # pad with zeros, in case the text is shorter than word_count
            x.extend([0]*(self.word_count-len(x)))
        x = torch.tensor(x)
        y= torch.tensor(y, dtype=torch.long)
        return x, y
    
amazon_dataset = AmazonDataset(df, word_count=50, vocab_size=vocab_size)
x,y=amazon_dataset[0]
print(x)
print(y)


tensor([  21,   69,   16,  179,   10,   22,  122, 4016,   67, 1480,   95,  122,
        1466,   32,    9, 6146,    0,    4,   27,  163,    0,  212,   21,  170,
           8, 1592,   55,  285,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0])
tensor(1)


In [63]:
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

def indices_to_text(indices, idx_to_word):
    tokens = [idx_to_word[idx.item()] for idx in indices]
    text = " ".join(tokens)
    return text

indices_to_text(x, idx_to_word)


'Ich bin sehr zufrieden mit dem iPhone 11. Der Wechsel vom iPhone 6s war ein riesiger <unk> der sich allerdings <unk> hat. Ich würde es jederzeit wieder kaufen. <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>'

In [64]:
# Padding-Funktion für Batch
def custom_collate_fn(batch):
    inputs, labels = zip(*batch)  # this means: separate inputs and labels from the batch
    inputs = [torch.tensor(text, dtype=torch.long) for text in inputs]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0) 
    labels = torch.tensor(labels, dtype=torch.long)
    return inputs, labels

In [65]:
# test custom_collate_fn
batch_size = 32
train_loader = DataLoader(amazon_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
inputs, labels = next(iter(train_loader))
print(inputs.shape)
print(labels.shape)

print(indices_to_text(inputs[2], idx_to_word))
print(labels[2])

torch.Size([32, 50])
torch.Size([32])
<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>
tensor(1)


  inputs = [torch.tensor(text, dtype=torch.long) for text in inputs]


In [66]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, word_count):
        super(TransformerClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.transformer_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=4,  # Number of attention heads
            dim_feedforward=hidden_dim,
        )
        self.transformer = nn.TransformerEncoder(self.transformer_layer, num_layers=num_layers)
        self.fc = nn.Linear(word_count*embedding_dim, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.4)
        self.batch_norm = nn.BatchNorm1d(word_count*embedding_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        #print("embedded shape: ", embedded.shape)
        embedded = embedded.permute(1, 0, 2)  # Shape: (word_count, batch_size, embedding_dim)
        #print("embedded 2 shape: ", embedded.shape)

        transformer_output = self.transformer(embedded)
        #print("transformer_output shape: ", transformer_output.shape)
        transformer_output = transformer_output.permute(1, 0, 2)  # Back to (batch_size, word_count, embedding_dim)
        #print("transformer_output 2 shape: ", transformer_output.shape)
        transformer_output = transformer_output.contiguous().view(transformer_output.size(0), -1)
        out=self.batch_norm(transformer_output)
        out=self.dropout(out)
        #print("transformer_output 3 shape: ", transformer_output.shape)
        out = self.fc(transformer_output)
        #print("out shape: ", out.shape)
        out = self.relu(out)
        #print("final out  shape: ", out.shape)
        return out
    
    
print("vocab_size: ", vocab_size)
print("embedding_dim: ", embedding_dim)
print("hidden_dim: ", hidden_dim)
print("num_layers: ", num_layers)
print("num_classes: ", num_classes)
print("word_count: ", word_count)
print("batch_size: ", batch_size)

# test model
train_dataset = AmazonDataset(train_df, word_count, vocab_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)
x, y = next(iter(train_loader))
print(x.shape)
print(y.shape)

model = TransformerClassifier(vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, word_count)
out = model(x)
print(out.shape)


vocab_size:  10000
embedding_dim:  128
hidden_dim:  256
num_layers:  2
num_classes:  2
word_count:  200
batch_size:  32
torch.Size([32, 200])
torch.Size([32])
torch.Size([32, 2])


  inputs = [torch.tensor(text, dtype=torch.long) for text in inputs]


In [67]:
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

if kaggle==False:
    device = torch.device("cpu")


def calcAccuracy(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total
    return accuracy

In [68]:
learning_rate = 0.0001
num_epochs = 100

# Daten laden und Dataloader erstellen
train_dataset = AmazonDataset(train_df, word_count, vocab_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)

test_dataset = AmazonDataset(test_df, word_count, vocab_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)


# Modell initialisieren
model = TransformerClassifier(vocab_size, embedding_dim, hidden_dim, num_layers, num_classes, word_count)
model=model.to(device)

# Optimizer und Loss-Funktion
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Trainingsschleife
for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        #print("outputs.shape", outputs)
        #print("labels.shape", labels)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss}")
    if (epoch+1) % 10 == 0:
        print("Train Accuracy: ", calcAccuracy(model, train_loader))
        print("Test Accuracy: ", calcAccuracy(model, test_loader))
        

# Evaluierung auf Testdaten
model.eval()


print("Accuracy: ", calcAccuracy(model, test_loader))

  0%|          | 0/100 [00:00<?, ?it/s]

  inputs = [torch.tensor(text, dtype=torch.long) for text in inputs]


Epoch 1/100, Loss: 61.401256024837494


KeyboardInterrupt: 

In [None]:
text = ["Das Handy ist sehr gut. Wirklich empfehlenswert!", "Das Handy ist sehr schlecht. Nicht empfehlenswert"]

text = [text_to_indices(t, word_to_idx) for t in text]

def padd_text_batch(batch):
    for t in batch:
        if len(t) < word_count:
            t.extend([0]*(word_count-len(t)))
    return torch.tensor(batch, dtype=torch.long)

text = padd_text_batch(text)
device = torch.device("cpu")
text = text.to(device)
# padd with zeros
print(text.shape)
model=model.to(device)
model.eval()
with torch.no_grad():
    outputs = model(text)
    print("outputs", outputs)
    _, predicted = torch.max(outputs.data, 1)
    print("predicted", predicted)

torch.Size([2, 200])
outputs tensor([[0.0000, 0.1023, 1.2175, 0.3537, 0.0000],
        [0.0000, 0.1700, 1.0861, 0.3672, 0.0000]])
predicted tensor([2, 2])
