In [9]:
%reload_ext autoreload
%autoreload 2

%xmode Verbose
# verbose stack trace on crash

Exception reporting mode: Verbose


In [3]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchtext import data, datasets
import torchtext

import tqdm
import random

from TwitterPipeline import TwitterPipeline

## some constants

In [4]:
# Global configuration for the experiment.
SEED = 762  # shared seed for every random number generator seeded below
# Alternative input file (presumably a small sample for quick runs):
# IN_FILE = 'germeval2018.try.txt'
IN_FILE = 'germeval2018.training.txt'
IN_FILE_TEST = 'germeval2018.test.txt'
BATCH_SIZE = 16

# Seed every RNG the notebook relies on so that "Restart & Run All" is
# reproducible: Python's `random` (its state is handed to the dataset split)
# as well as torch on CPU and on CUDA.
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## define torchtext.Field instances

In [5]:
# One torchtext Field per column handed to the pre-processing pipeline.
# HINT: don't specify a tokenizer here
f_text = data.Field(use_vocab=True, sequential=True)
# The POS field does not use a vocabulary (use_vocab=False), so its padding
# and unknown markers are given directly as integers.
f_pos_tag = data.Field(
    use_vocab=False,
    sequential=True,
    unk_token=0,
    pad_token=1,
)
f_lemma = data.Field(use_vocab=True, sequential=True)
# Labels are turned into float tensors.
# NOTE(review): `tensor_type` belongs to the old torchtext API; later releases
# presumably expect `dtype` instead — confirm before upgrading torchtext.
f_label = data.LabelField(tensor_type=torch.FloatTensor)

# Map each attribute name to its Field.
fields = [
    ('text', f_text),
    ('pos', f_pos_tag),
    ('lemma', f_lemma),
    ('label', f_label),
]

## Spacy

In [11]:
# Build the spaCy pre-processing pipeline.
# HINT: a simple one - maybe even without setting the model to use - is easier
pipe = TwitterPipeline()

# Pre-process the training file; only the first element of the result is
# kept and used as the examples of the full dataset.
full_examples = pipe.process_data(IN_FILE, fields)[0]
full_ds = data.Dataset(examples=full_examples, fields=fields)

## Splitting of data

In [12]:
# Split the full dataset 80/20 into a training and a validation part with
# torchtext (stratified split).
# `random.seed()` returns None, so it must not be passed as `random_state`
# itself: seed the global RNG first, then hand over its actual state.
random.seed(SEED)
trn_ds, val_ds = full_ds.split(
    split_ratio=[0.8, 0.2], stratified=True, random_state=random.getstate())

# The test file goes through the same pipeline and uses the same fields.
test_examples = pipe.process_data(
    IN_FILE_TEST, fields)[0]
tst_ds = data.Dataset(test_examples, fields)

In [13]:
# Sanity check: number of examples in each of the three datasets.
print(f'train len {len(trn_ds.examples)}')
print(f'val len {len(val_ds.examples)}')
print(f'test len {len(tst_ds.examples)}')

train len 16
val len 4
test len 3398


## Vocabulary

In [20]:
# Optional pre-trained embedding vectors (currently disabled):
# vec = torchtext.vocab.Vectors('embed_tweets_de_100D_fasttext',
#                              cache='/Users/michel/Downloads/')

# Build the vocabularies.
# Validation + test data should by no means influence the model, so build the vocab just on trn.
# Variant with pre-trained vectors: f_text.build_vocab(trn_ds, vectors=vec)
f_text.build_vocab(trn_ds, max_size=20000)
f_lemma.build_vocab(trn_ds)
f_label.build_vocab(trn_ds)


In [21]:
# Sanity check: size of each vocabulary built from the training split.
print(f'text vocab size {len(f_text.vocab)}')
print(f'lemma vocab size {len(f_lemma.vocab)}')
print(f'label vocab size {len(f_label.vocab)}')

text vocab size 247
lemma vocab size 227
label vocab size 2


## Iterator for Training loop

In [22]:
# Create the iterators for the training loop: every dataset is cut into
# mini-batches of BATCH_SIZE examples, using the text length as sort key.
# NOTE(review): device=-1 presumably selects the CPU in this torchtext
# version — confirm.
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(
    (trn_ds, val_ds, tst_ds),
    batch_size=BATCH_SIZE,
    device=-1,
    sort_key=lambda example: len(example.text),
    sort_within_batch=False,
    repeat=False,
)



## The model

In [23]:
class SimpleRNN(nn.Module):
    """Embedding layer -> plain RNN -> linear layer with a single output.

    Args:
        vocab_dim: number of entries in the embedding table.
        emb_dim: size of each embedding vector.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_dim, emb_dim=100, hidden_dim=200):
        super().__init__()

        # Layers are created in the order in which forward() applies them.
        self.embedding = nn.Embedding(num_embeddings=vocab_dim,
                                      embedding_dim=emb_dim)
        self.rnn = nn.RNN(input_size=emb_dim, hidden_size=hidden_dim)
        # a single output value per sequence
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        """Map a batch of token ids to one raw output value per sequence.

        Args:
            x: Tensor[sentence len, batch size] of token indices
               (pytorch works on indices internally, not on 1-hot vectors).

        Returns:
            Tensor[batch size, 1] produced by the linear layer.
        """
        # Tensor[sentence len, batch size, emb dim]
        token_vectors = self.embedding(x)

        # The per-step outputs (Tensor[sentence len, batch size, hidden dim])
        # are not needed; only the final hidden state
        # (Tensor[1, batch size, hidden dim]) is used.
        _, last_hidden = self.rnn(token_vectors)

        # drop the leading dimension of size 1 before the linear layer
        sentence_state = last_hidden.squeeze(0)
        return self.fc(sentence_state)



## Evaluation metric

In [24]:
def binary_accuracy(preds, y):
    """
    Return the accuracy of one batch as the ratio correct / all.

    Args:
        preds: raw model outputs (logits); the sigmoid is applied here.
        y: target labels, expected to hold 0/1 values in the same shape as
           ``preds``.

    Returns:
        Scalar tensor with the fraction of predictions that equal ``y``.
    """

    # torch.sigmoid replaces the deprecated F.sigmoid; rounding the resulting
    # probability to the closest integer gives the predicted class
    rounded_preds = torch.round(torch.sigmoid(preds))
    # convert the element-wise matches into floats for the division
    pred_is_correct = (rounded_preds == y).float()
    acc = pred_is_correct.sum() / len(pred_is_correct)
    return acc

## Training pass

In [25]:
def train(model, iterator, optimizer, criterion, metric):
    """Run one training pass over ``iterator``.

    Args:
        model: network that is called with ``batch.text``.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: optimisation interface; updates the parameters per batch.
        criterion: loss function, i.e. the optimisation criterion.
        metric: ``metric(prediction, label)``, used for monitoring only.

    Returns:
        Tuple ``(loss, metric)``, each averaged over the number of batches.
    """
    loss_total = 0
    metric_total = 0

    # switch on regularisers to prevent overfitting
    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        # prediction (y hat) for the current batch
        prediction = model(batch.text).squeeze(1)
        batch_loss = criterion(prediction, batch.label)
        batch_metric = metric(prediction, batch.label)

        batch_loss.backward()
        optimizer.step()  # one training step

        # .item() yields the scalar (native Python) value of a tensor
        loss_total += batch_loss.item()
        metric_total += batch_metric.item()

    n_batches = len(iterator)
    return loss_total / n_batches, metric_total / n_batches


## Evaluation (on validation data)

In [26]:
def evaluate(model, iterator, criterion, metric):
    """Score ``model`` on every batch of ``iterator`` without training it.

    Args:
        model: network that is called with ``batch.text``.
        iterator: sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: loss function.
        metric: ``metric(prediction, label)``, used for monitoring.

    Returns:
        Tuple ``(loss, metric)``, each averaged over the number of batches.
    """
    loss_total = 0
    metric_total = 0

    # switch off regularisers, since the best possible values are wanted here
    model.eval()

    # no gradient tracking while evaluating
    with torch.no_grad():
        for batch in iterator:
            prediction = model(batch.text).squeeze(1)
            batch_loss = criterion(prediction, batch.label)
            batch_metric = metric(prediction, batch.label)

            loss_total += batch_loss.item()
            metric_total += batch_metric.item()

    n_batches = len(iterator)
    return loss_total / n_batches, metric_total / n_batches


In [27]:
# Hyper-parameters
EMB_SIZE = 100   # embedding dimension passed to the model
HID_SIZE = 200   # hidden dimension passed to the model
NUM_LIN = 3      # NOTE(review): not referenced in the visible cells — confirm it is still needed
NUM_EPOCH = 5    # number of training epochs

# RNN variant SETUP
model = SimpleRNN(vocab_dim=len(f_text.vocab),
                  emb_dim=EMB_SIZE,
                  hidden_dim=HID_SIZE)
# SGD = stochastic gradient descent (somewhat dated, but not bad at all)
optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()

## Training

In [28]:
# Train for NUM_EPOCH epochs and evaluate on the validation iterator after
# every epoch.
for epoch in range(NUM_EPOCH):
    train_loss, train_acc = train(
        model=model, iterator=trn_iter, optimizer=optimizer,
        criterion=criterion, metric=binary_accuracy)
    valid_loss, valid_acc = evaluate(
        model=model, iterator=val_iter,
        criterion=criterion, metric=binary_accuracy)

    print(f'EPOCH: {epoch:02} - TRN_LOSS: {train_loss:.3f} - TRN_ACC: {train_acc*100:.2f}% - VAL_LOSS: {valid_loss:.3f} - VAL_ACC: {valid_acc*100:.2f}%')

# Final evaluation on the test iterator, run once after the last epoch.
test_loss, test_acc = evaluate(
    model=model, iterator=tst_iter,
    criterion=criterion, metric=binary_accuracy)
print(f'TEST_LOSS: {test_loss:.3f}, TEST_ACC: {test_acc*100:.2f}%')


  return Variable(arr, volatile=not train)


EPOCH: 00 - TRN_LOSS: 0.668 - TRN_ACC: 62.50% - VAL_LOSS: 0.687 - VAL_ACC: 50.00%
EPOCH: 01 - TRN_LOSS: 0.667 - TRN_ACC: 62.50% - VAL_LOSS: 0.686 - VAL_ACC: 50.00%
EPOCH: 02 - TRN_LOSS: 0.666 - TRN_ACC: 62.50% - VAL_LOSS: 0.686 - VAL_ACC: 50.00%
EPOCH: 03 - TRN_LOSS: 0.665 - TRN_ACC: 62.50% - VAL_LOSS: 0.685 - VAL_ACC: 50.00%
EPOCH: 04 - TRN_LOSS: 0.664 - TRN_ACC: 62.50% - VAL_LOSS: 0.684 - VAL_ACC: 50.00%
TEST_LOSS: 0.692, TEST_ACC: 50.54%
