In [13]:
import numpy as np
import torch
from torch import nn, optim
from dataset import get_twit_company_dataloaders, get_twit_sentiment_dataloaders, get_twit_company_sentiment_dataloaders
from model import LSTMTwitClassifier
import torch.nn.functional as F

# text, label = next(iter(dataloader_train))

In [14]:
# --- Run configuration -------------------------------------------------------
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs end-to-end on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
use_wandb = True

# Seed the NumPy and PyTorch global generators before any data or model is
# created, so code drawing from those generators starts from a known state.
seed = 0
np.random.seed(seed)
torch.manual_seed(seed)

lr = 0.001
embedding_size = 100
hidden_size = 100
epochs_cnt = 100
embeddings = "random"
lstm_layers = 1
dropout = 0.5
task = "text2company" # "text2sentiment"  #
use_company_info = True
preprocessing = "tutorial"
use_stop_words = True

# --- Data --------------------------------------------------------------------
# Pick the dataloader factory for the task: the company task has its own
# loader; the sentiment task uses the company-aware loader only when
# `use_company_info` is set.
if task == "text2company":
    get_dataloaders = get_twit_company_dataloaders
elif use_company_info:
    get_dataloaders = get_twit_company_sentiment_dataloaders
else:
    get_dataloaders = get_twit_sentiment_dataloaders

dataset_train, dataloader_train, dataset_test, dataloader_test = get_dataloaders(embedding_dim=embedding_size,
                                                                                 embedding=embeddings,
                                                                                 preprocessing=preprocessing,
                                                                                 use_stop_words=use_stop_words)

# --- Model -------------------------------------------------------------------
# NOTE(review): the leading positional 4 is presumably the number of output
# classes -- confirm against LSTMTwitClassifier in model.py.
model = LSTMTwitClassifier(4, embedding_dim=embedding_size, hidden_dim=hidden_size, dropout=dropout,
                           lstm_layers=lstm_layers,
                           additional_one_hot_arg=use_company_info and task == "text2sentiment")
model = model.to(device)

# --- Experiment tracking -----------------------------------------------------
if use_wandb:
    # Imported lazily so the notebook does not require wandb when tracking is off.
    import wandb

    wandb.init(project=task + '_twit_classification', entity='ars860')

    # Record every hyper-parameter of this run alongside its metrics.
    config = wandb.config
    config.loss = "BCE"
    config.optimizer = "Adam"
    config.learning_rate = lr
    config.hidden_size = hidden_size
    config.embedding_size = embedding_size
    config.embeddings = embeddings
    config.epochs = epochs_cnt
    config.dropout = dropout
    config.lstm_layers = lstm_layers
    config.stem = "snowballstemmer"
    config.preprocessing = preprocessing
    config.use_stop_words = use_stop_words
    config.seed = seed

    # `use_company_info` only influences the sentiment task, so it is only
    # recorded for that task.
    if task == "text2sentiment":
        config.use_company_info = use_company_info

    wandb.watch(model)

# --- Optimisation ------------------------------------------------------------
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


def loss_on_test():
    """Evaluate the global ``model`` on ``dataloader_test``.

    Every item yielded by ``dataloader_test`` is unpacked as
    ``(*inputs, target)``. The model output is passed through a softmax over
    dimension 0 and scored with binary cross-entropy against the flattened
    target; an item counts as correct when the arg-max of the prediction
    equals the arg-max of the target.

    The model is switched to evaluation mode for the duration of the call and
    is put back into whatever mode it was in beforehand, even if evaluation
    raises.

    Returns:
        dict: ``{"test_loss": <mean loss>, "test_accuracy": <correct / items>}``.
        Both values are NaN when ``dataloader_test`` is empty. The same dict
        is logged to wandb when ``use_wandb`` is true.
    """
    # NOTE(review): accuracy is divided by the number of items the loader
    # yields, which presumes each item is a single sample -- confirm the
    # loader's batch size.
    n_items = len(dataloader_test)
    losses = np.zeros(n_items)
    correct = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for i, (*args, target) in enumerate(dataloader_test):
                args = [arg.to(device) for arg in args]
                target = target.to(device)

                prediction = F.softmax(model(*args), dim=0)

                # .item() yields a plain Python float, matching how the
                # training loop stores its losses.
                losses[i] = F.binary_cross_entropy(prediction, target.view(-1)).item()
                if torch.argmax(prediction) == torch.argmax(target):
                    correct += 1
    finally:
        # Restore the caller's mode instead of unconditionally forcing train().
        model.train(was_training)

    metrics = {
        "test_loss": float(np.mean(losses)) if n_items else float("nan"),
        "test_accuracy": correct / n_items if n_items else float("nan"),
    }
    if use_wandb:
        wandb.log(metrics)
    return metrics


# Rolling buffer with the losses of the 100 most recent iterations. Its mean
# is only read on every 100th iteration, by which point every slot has been
# written at least once.
losses = np.empty(100)
model.train()
for epoch in range(epochs_cnt):
    iters_per_epoch = len(dataloader_train)
    epoch_loss = np.zeros(iters_per_epoch)

    for i, batch in enumerate(dataloader_train):
        # Everything but the last element of a batch is a model input.
        *args, target = batch
        args = [arg.to(device) for arg in args]
        target = target.to(device)

        model.zero_grad()

        # Forward pass: softmax over dimension 0, then BCE against the
        # flattened target.
        prediction = F.softmax(model(*args), dim=0)
        loss = criterion(prediction, target.view(-1))

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # From here on `loss` is a plain Python float.
        loss = loss.item()
        losses[i % 100] = loss
        epoch_loss[i] = loss

        # Report the rolling mean once every 100 iterations.
        iter_no = i + 1
        if iter_no % 100 == 0:
            mean_loss = np.mean(losses)
            print(
                f"Epoch {epoch + 1}/{epochs_cnt}, iter: {iter_no}/{iters_per_epoch}, mean loss: {mean_loss}")
            if use_wandb:
                wandb.log({"loss": mean_loss})

    # End of epoch: log the epoch-wide mean and evaluate on the test split.
    if use_wandb:
        wandb.log({"epoch_loss": np.mean(epoch_loss)})
        loss_on_test()

# [model.get_word_embedding(word) for word in "hello_world".split(' ')]

Tweet ignored due to unreadability: Поиск от 
Tweet ignored due to unreadability: Новите 
Tweet ignored due to unreadability: 看見 
Tweet ignored due to unreadability: نظام جديد .. و جهاز جديد شكراً جزيلاً 
Tweet ignored due to unreadability: الجهاز الجديد عجيب   
Tweet ignored due to unreadability: يبدو ان طفرة الاجهزة الالكترونية القادمة ستكون بقيادة موتورولا ،، لاسيم بعد استحواذ قوقل عليها.   
Tweet ignored due to unreadability: Με συγχισες 
Tweet ignored due to unreadability: На сайте 
Tweet ignored due to unreadability: Настоящий твиттерянин как только попадает в толпу стремиться тут же как можно быстрее попасть в 
Tweet ignored due to unreadability: Доброе утро 
Tweet ignored due to unreadability: 【
Tweet ignored due to unreadability: رقم الفلو والفلورز والتويتات  للبيع لاعلى سعر 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ars86\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tweet ignored due to unreadability: قال الرئيس التنفيذي لشركة 
Tweet ignored due to unreadability: Улучшим продукты компании 
Tweet ignored due to unreadability: نفسي يوم يعدي علي تويتر من غير مشاكل فنية 
Tweet ignored due to unreadability: ツイッター検索 


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ars86\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
wandb: wandb version 0.12.6 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Epoch 1/100, iter: 100/3360, mean loss: 0.1948420729348436
Epoch 1/100, iter: 200/3360, mean loss: 0.013445660095731
Epoch 1/100, iter: 300/3360, mean loss: 0.0067745567192150705
Epoch 1/100, iter: 400/3360, mean loss: 0.010724953596168233
Epoch 1/100, iter: 500/3360, mean loss: 0.010642227034547886
Epoch 1/100, iter: 600/3360, mean loss: 0.0022055611081452753
Epoch 1/100, iter: 700/3360, mean loss: 0.0005620868948284397
Epoch 1/100, iter: 800/3360, mean loss: 0.004167603207768451
Epoch 1/100, iter: 900/3360, mean loss: 0.003927047654838134
Epoch 1/100, iter: 1000/3360, mean loss: 0.5497546947835075
Epoch 1/100, iter: 1100/3360, mean loss: 0.17919453689828516
Epoch 1/100, iter: 1200/3360, mean loss: 0.03077995244268095
Epoch 1/100, iter: 1300/3360, mean loss: 0.02314006559576228
Epoch 1/100, iter: 1400/3360, mean loss: 0.02117022492820979
Epoch 1/100, iter: 1500/3360, mean loss: 0.010701918165532334
Epoch 1/100, iter: 1600/3360, mean loss: 0.015736534022767045
Epoch 1/100, iter: 1700/3

In [15]:
# Evaluate the trained model on the training split and record the results in
# the wandb run summary.
model.eval()

print("Testing on train")

correct = 0
predictions_cnt = [0, 0, 0, 0]  # one counter per predicted class index
total = len(dataloader_train)

with torch.no_grad():
    for i, (*args, target) in enumerate(dataloader_train):
        args = [arg.to(device) for arg in args]
        target = target.to(device)

        prediction = F.softmax(model(*args), dim=0)
        predicted_class = torch.argmax(prediction)

        if predicted_class == torch.argmax(target):
            correct += 1
        predictions_cnt[predicted_class] += 1

        # Progress report every 100 items.
        if i % 100 == 0:
            print(f"Iter: {i}/{total}")

train_accuracy = correct / total
print(f"Accuracy {train_accuracy}")

if use_wandb:
    wandb.run.summary.train_accuracy = train_accuracy
    # NOTE(review): the labels below are company names; confirm they are still
    # the right labels when `task` is "text2sentiment".
    class_names = ("apple", "google", "microsoft", "twitter")
    wandb.run.summary.classified_as = dict(zip(class_names, predictions_cnt))
    wandb.finish()

Testing on train
Iter: 0/3360
Iter: 100/3360
Iter: 200/3360
Iter: 300/3360
Iter: 400/3360
Iter: 500/3360
Iter: 600/3360
Iter: 700/3360
Iter: 800/3360
Iter: 900/3360
Iter: 1000/3360
Iter: 1100/3360
Iter: 1200/3360
Iter: 1300/3360
Iter: 1400/3360
Iter: 1500/3360
Iter: 1600/3360
Iter: 1700/3360
Iter: 1800/3360
Iter: 1900/3360
Iter: 2000/3360
Iter: 2100/3360
Iter: 2200/3360
Iter: 2300/3360
Iter: 2400/3360
Iter: 2500/3360
Iter: 2600/3360
Iter: 2700/3360
Iter: 2800/3360
Iter: 2900/3360
Iter: 3000/3360
Iter: 3100/3360
Iter: 3200/3360
Iter: 3300/3360
Accuracy 0.95625


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,0.01685
_runtime,2030.0
_timestamp,1635529472.0
_step,3499.0
epoch_loss,0.05448
test_loss,0.86376
test_accuracy,0.63855
train_accuracy,0.95625


0,1
loss,▁▂▅▁▃█▂▄▅▁▃▂▆▂▂▆▂▂▃▄▃▄▁▂▂▁▃▂▂▂▄▃▂▆▁▁▂▂▂▃
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch_loss,▅▆▇██▇▇█▆▆▄▄▃▃▄▃▂▂▃▂▃▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▂▂▁▂
test_loss,█▆▆▅▅▅▃▃▂▂▂▂▂▂▂▂▂▁▂▁▁▂▂▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
test_accuracy,▁▁▁▁▁▂▂▃▄▅▄▅▅▅▆▆▅▆▆▇▇▆▆▇▇▇▇▇▇▇▇█████████


In [16]:
# Evaluate the trained model on the held-out test split.
print("Testing on test")

# Put the model in evaluation mode here rather than relying on an earlier cell
# having done so; otherwise dropout stays active whenever this cell is run on
# its own after training.
model.eval()

correct = 0
predictions_cnt = [0, 0, 0, 0]  # one counter per predicted class index
total = len(dataloader_test)

with torch.no_grad():
    for i, (*args, target) in enumerate(dataloader_test):
        args = [arg.to(device) for arg in args]
        target = target.to(device)

        prediction = F.softmax(model(*args), dim=0)
        predicted_class = torch.argmax(prediction)

        if predicted_class == torch.argmax(target):
            correct += 1
        predictions_cnt[predicted_class] += 1

        # Progress report every 100 items.
        if i % 100 == 0:
            print(f"Iter: {i}/{total}")

print(f"Accuracy {correct / total}")

Testing on test
Iter: 0/332
Iter: 100/332
Iter: 200/332
Iter: 300/332
Accuracy 0.6385542168674698
