In [1]:
!git clone https://github.com/berndheidemann/text_classification_first_trys

fatal: destination path 'text_classification_first_trys' already exists and is not an empty directory.


In [2]:
!python -m spacy download de

[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'de' are deprecated. Please use the
full pipeline package name 'de_core_news_sm' instead.[0m
Collecting de-core-news-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.3.0/de_core_news_sm-3.3.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.3.0
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
import pandas as pd

df=pd.read_csv("/kaggle/working/text_classification_first_trys/amazon_review_analysis/Amazon-Deutsch-Dataset.csv")
df = df[["content", "rating"]]
df.rating= df.rating.str[0]
df = df.dropna()
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
tokenizer = get_tokenizer('spacy', language='de_core_news_sm')

# create iterator from tokenized df
def df_iterator_content(df):
    for _, row in df.iterrows():
        yield tokenizer(row['content'])

vocab = build_vocab_from_iterator(df_iterator_content(df), specials=["<unk>"], min_freq=10)
vocab.set_default_index(vocab["<unk>"])
vocab_size = len(vocab)
print(vocab_size)

2399


In [5]:
import torch
from torch.utils.data import Dataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class AmazonDataset(Dataset):
    def __init__(self, df, word_count=500, vocab_size=10000):
        self.df = df
        self.word_count = word_count
        self.vocab_size = vocab_size
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        x= self.df.iloc[idx]["content"]
        y= self.df.iloc[idx]["rating"]
        y = int(y) - 1
        x = vocab(tokenizer(x))
        if len(x) > self.word_count:
            x=x[:self.word_count]
        else:
            x.extend([0]*(self.word_count-len(x)))
        x = torch.tensor(x)
        return x.to(device), torch.tensor(y).to(device)

amazon_dataset = AmazonDataset(df, word_count=100, vocab_size=vocab_size)
x,y=amazon_dataset[0]
print(x.shape)
print(y)

torch.Size([100])
tensor(4, device='cuda:0')


In [6]:
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

torch.manual_seed(1)

class MyLSTM(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, out_size, word_count=50):
        super(MyLSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim=embedding_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, dropout=0.3, num_layers=2)

        # The linear layer that maps from hidden state space to output space
        self.hidden2output = nn.Linear(hidden_dim*word_count, out_size)

    def forward(self, xb):
        #print("xb shape", xb.shape)
        embeds = self.word_embeddings(xb)
        #print("embeds shape", embeds.shape)
        lstm_out, _ = self.lstm(embeds)
        #print("lstm_out shape", lstm_out.shape)
        # lstm_out_view = lstm_out[:, -1, :]   # works but looses information
        lstm_out_view = lstm_out.reshape(xb.shape[0], -1   )
        #print("lstm_out_view shape", lstm_out_view.shape)
        hidden_space = self.hidden2output(lstm_out_view)
        #print("hidden_space shape", hidden_space.shape)
        output = F.log_softmax(hidden_space, dim=1)
        #print("output shape", output.shape)
        return output


In [24]:
# Hyperparameter
embed_dim = 24
num_class = 5
hidden_dim = 64
word_count = 50
EPOCHS = 30 # epoch
LR = 0.03  # learning rate
lr_step_size=5
lr_gamma=0.5
weight_decay=1e-3
BATCH_SIZE = 64 # batch size for training


# check if model works
model= MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count).to(device)
dataset = AmazonDataset(df, word_count=word_count, vocab_size=vocab_size)
loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)

xb, yb = next(iter(loader))
print("yb", yb)
print("xb", xb.shape)
model(xb)

yb tensor([4, 0, 0, 0, 4], device='cuda:0')
xb torch.Size([5, 50])


tensor([[-1.6156, -1.5973, -1.6099, -1.6844, -1.5451],
        [-1.6102, -1.6225, -1.6050, -1.6558, -1.5563],
        [-1.6053, -1.6091, -1.6091, -1.6617, -1.5644],
        [-1.6192, -1.6056, -1.5936, -1.6645, -1.5669],
        [-1.6277, -1.6086, -1.5925, -1.6801, -1.5433]], device='cuda:0',
       grad_fn=<LogSoftmaxBackward0>)

In [25]:
import time

def evaluate(model, dataloader):
    model.eval()
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [26]:
# create train and valid dataset
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [int(len(dataset)*0.8), len(dataset)-int(len(dataset)*0.8)])

# import torch DataLoader
from torch.utils.data import DataLoader


model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class, word_count=word_count).to(device)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_step_size, gamma=lr_gamma)   # every 10 epochs, LR is multiplied by 0.7




In [27]:
total_accu = None
train_accus=[]
valid_accus=[]

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    
    model.train()
    total_acc, total_count = 0, 0

    for idx, (text, label) in enumerate(train_dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = loss_func(predicted_label, label)
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

    accu_train = evaluate(model, train_dataloader)
    accu_valid = evaluate(model, valid_dataloader)
    train_accus.append(accu_train)
    valid_accus.append(accu_valid)
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | train accuracy {:8.3f} | valid accuracy {:8.3f} | lr: {:1.3f}'.format(
                                epoch,
                                time.time() - epoch_start_time,
                                accu_train, 
                                accu_valid, 
                                scheduler.get_last_lr()[0]))

    scheduler.step() # learning rate scheduler after each epoch

import matplotlib.pyplot as plt
plt.plot(train_accus, label='train_accu')
plt.plot(valid_accus, label='valid_accu')
plt.legend()
plt.show()

-----------------------------------------------------------
| end of epoch   1 | time:  3.91s | train accuracy    0.468 | valid accuracy    0.443 | lr: 0.030
-----------------------------------------------------------
| end of epoch   2 | time:  3.83s | train accuracy    0.547 | valid accuracy    0.501 | lr: 0.030
-----------------------------------------------------------
| end of epoch   3 | time:  3.57s | train accuracy    0.560 | valid accuracy    0.525 | lr: 0.030
-----------------------------------------------------------
| end of epoch   4 | time:  3.77s | train accuracy    0.587 | valid accuracy    0.525 | lr: 0.030
-----------------------------------------------------------
| end of epoch   5 | time:  3.57s | train accuracy    0.632 | valid accuracy    0.540 | lr: 0.030
-----------------------------------------------------------
| end of epoch   6 | time:  3.57s | train accuracy    0.678 | valid accuracy    0.522 | lr: 0.015
----------------------------------------------------

KeyboardInterrupt: 

In [45]:
# how much valid accuracy do we get in a new untrained model?
new_model = MyLSTM(embed_dim, hidden_dim, vocab_size, num_class)
evaluate(new_model, valid_dataloader)


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)