# 1. imports

In [18]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import gc
from tqdm import tqdm
import csv
import os

%matplotlib inline

# 2. function to convert the stringified text column to a list when loading data

In [19]:
def str_text_to_list(value):
    """Parse the stringified token list stored in a CSV cell into a list of str.

    The cell is expected to look like the ``repr`` of a Python list of strings,
    e.g. ``"['hello', 'world']"``.

    Args:
        value: Raw cell text handed over by ``pd.read_csv(converters=...)``.

    Returns:
        A list of token strings. An empty list literal (``"[]"``) or an empty
        cell yields ``[]``.
    """
    import ast  # local import: only this converter needs it

    stripped = value.strip()
    try:
        # literal_eval copes with tokens that contain ", " or quote characters,
        # which a plain split on ", " would cut in the wrong place.
        parsed = ast.literal_eval(stripped)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(token) for token in parsed]

    # Fallback for cells that are not a valid Python literal: split on ", "
    # and drop the first/last character (the surrounding quote) of each item.
    inner = stripped.strip('[]')
    if not inner:
        return []
    return [item[1:-1] for item in inner.split(', ')]

# 3. take labels from data

In [20]:
# The wsd_1 file is read only to collect the distinct tag values; the position
# of a tag in `labels` is later used as its integer class id.
tags_source_frame = pd.read_csv(
    "../datasets/tonetags_wsd_1.csv",
    index_col=0,
    converters={"text": str_text_to_list},
)
labels = tags_source_frame.tags.unique().tolist()

# Drop the frame straight away so it does not stay resident in the kernel.
del tags_source_frame
gc.collect()

0

# 4. create vocab and embedding matrix from a pretrained embedding

In [21]:
glove_twitter_27B = torchtext.vocab.GloVe(name='twitter.27B', dim=50)

# Work on a copy of the token -> index mapping: the two special tokens added
# below have no row in `glove_twitter_27B.vectors`, so inserting them into the
# GloVe object's own dictionary would leave it pointing past its vector table.
vocab = dict(glove_twitter_27B.stoi)
vocab["<unk>"] = len(vocab)
vocab["<pad>"] = len(vocab)

pretrained_vectors = glove_twitter_27B.vectors
embedding_dim = pretrained_vectors.shape[1]

# Append the special rows in the same order their ids were assigned above.
# torch.cat keeps the matrix in float32; np.append would flatten it, upcast it
# to float64 and copy it twice.
embedding_tensor = torch.cat(
    (
        pretrained_vectors.to(torch.float),
        torch.zeros(1, embedding_dim, dtype=torch.float),  # row for "<unk>"
        torch.ones(1, embedding_dim, dtype=torch.float),   # row for "<pad>"
    ),
    dim=0,
)

# Kept so the name `embedding_vector` still exists; it is a NumPy view of
# `embedding_tensor` (shared memory), not a second copy.
embedding_vector = embedding_tensor.numpy()

# Displayed only as a shape check; the layer the model uses is built later.
nn.Embedding.from_pretrained(embedding_tensor, freeze=True)

Embedding(1193516, 50)

# 5. create class of datasets

In [22]:
max_length = 4096


class myDataset(Dataset):
    """Map tokenised sentences to vocabulary ids, paired with labels and context.

    Sentences longer than the length limit are dropped, and the matching label
    and context rows are dropped with them so position ``i`` always refers to
    the same sample in ``data``, ``labels`` and ``context``.

    Args:
        dataset: DataFrame with a ``text`` column (sequence of tokens per row),
            a ``tags`` column (one label per row) and optionally a ``context``
            column.
        token_to_id: Token -> id mapping containing an ``"<unk>"`` entry.
            Defaults to the notebook-level ``vocab``.
        max_len: Longest sentence (in tokens) that is kept. Defaults to the
            notebook-level ``max_length``.
    """

    def __init__(self, dataset, token_to_id=None, max_len=None):
        token_to_id = vocab if token_to_id is None else token_to_id
        max_len = max_length if max_len is None else max_len
        unk_id = token_to_id["<unk>"]

        self.data = []
        kept_positions = []  # positional row numbers of the sentences that were kept
        for position, sentence in enumerate(dataset.text):
            if len(sentence) > max_len:
                continue
            kept_positions.append(position)
            # Tokens missing from the vocabulary are mapped to "<unk>".
            self.data.append([token_to_id.get(token, unk_id) for token in sentence])

        # Select by position so skipped sentences cannot shift labels/contexts
        # onto the wrong sample.
        self.labels = dataset.tags.iloc[kept_positions]

        self.context = None
        if 'context' in dataset.columns:
            self.context = dataset.context.iloc[kept_positions]

    def __len__(self):
        """Return the number of sentences that were kept."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_ids, label_tensor)``, plus the context when available."""
        if self.context is None:
            return self.data[idx], torch.tensor(self.labels.iloc[idx])
        else:
            return self.data[idx], torch.tensor(self.labels.iloc[idx]), self.context.iloc[idx]

# 6. Dataloader part

In [23]:
batch_size = 32


def collate_fn(batch, pad_id=None, max_len=None):
    """Pad every sample of a batch to a fixed length and stack it into tensors.

    Args:
        batch: Iterable of ``(token_ids, label)`` or
            ``(token_ids, label, context)`` tuples.
        pad_id: Id appended as padding. Defaults to ``vocab["<pad>"]``.
        max_len: Target sequence length. Defaults to the notebook-level
            ``max_length``.

    Returns:
        ``(tokens, contexts, labels)`` tensors, in that order. ``contexts`` is
        an empty tensor when the samples carry no context.
    """
    pad_id = vocab["<pad>"] if pad_id is None else pad_id
    max_len = max_length if max_len is None else max_len

    data_ids = []
    labels = []
    contexts = []

    for sample in batch:
        token_ids = sample[0]
        # Pad a copy: the incoming list may be the one stored inside the
        # dataset, and appending to it would leave that sample padded for
        # good. A negative count gives an empty padding list, so sequences
        # already at or above max_len pass through unchanged.
        data_ids.append(list(token_ids) + [pad_id] * (max_len - len(token_ids)))
        labels.append(sample[1])

        if len(sample) >= 3:
            contexts.append(sample[2])

    return torch.tensor(data_ids), torch.tensor(contexts), torch.tensor(labels)

# 7. clean_corrected_text_wsd_2 dataset (from tonetags_dataset_tumblr_clean with glove.twitter.27b.50d)

## 7.1. load dataset

In [24]:
def str_context_to_list(value):
    """Parse a stringified, whitespace-separated float vector into a list.

    The expected input looks like a printed array, e.g. ``"[0.1 0.2\\n 0.3]"``.

    Args:
        value: Text of one ``context`` cell.

    Returns:
        List of floats in the order they appear; ``[]`` for an empty vector.

    Raises:
        ValueError: If any piece of the text is not a valid float.
    """
    # split() without arguments breaks on any run of whitespace, so values
    # separated only by a newline or tab parse correctly and no empty pieces
    # are produced.
    return [float(item) for item in value.strip().strip('[]').split()]


# Read the CSV, discard rows with missing values, then turn the stringified
# context vectors into lists of floats. The context column is parsed after
# dropna() so missing cells are removed before parsing.
dataset_clean_corrected_wsd_2 = (
    pd.read_csv(
        "../datasets/tonetags_dataset_tumblr_clean_corrected_text_wsd_2.csv",
        index_col=0,
        converters={"text": str_text_to_list},
    )
    .dropna()
    .assign(context=lambda frame: frame["context"].apply(str_context_to_list))
)

## 7.2. encode labels

In [25]:
# Replace every tag with its position in `labels` (the integer class id).
# A tag that is not in `labels` raises ValueError, so this cell cannot be run
# twice on the same frame.
dataset_clean_corrected_wsd_2["tags"] = dataset_clean_corrected_wsd_2["tags"].apply(
    lambda tag: labels.index(tag)
)

## 7.3. split

In [26]:
# 80/20 split, stratified on the encoded tag, with a fixed seed for reproducibility.
train_clean_corrected_wsd_2, test_clean_corrected_wsd_2 = train_test_split(
    dataset_clean_corrected_wsd_2,
    test_size=0.2,
    random_state=42,
    stratify=dataset_clean_corrected_wsd_2['tags'],
)

## 7.4. create datasets

In [27]:
# Wrap each split in the Dataset class defined above (train first, then test).
train_dataset_clean_corrected_wsd_2 = myDataset(train_clean_corrected_wsd_2)
test_dataset_clean_corrected_wsd_2 = myDataset(test_clean_corrected_wsd_2)

## 7.5. create dataloaders

In [28]:
# Both loaders drop the last incomplete batch, so every batch they yield
# contains exactly `batch_size` samples.
train_dataloader_clean_corrected_wsd_2 = DataLoader(
    train_dataset_clean_corrected_wsd_2,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

test_dataloader_clean_corrected_wsd_2 = DataLoader(
    test_dataset_clean_corrected_wsd_2,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)

In [29]:
class ToneTagsLSTM_wsd_2(nn.Module):
    """Bidirectional LSTM classifier over token ids plus a context vector.

    The LSTM outputs of every time step are flattened, concatenated with the
    context vector and passed through three linear layers that produce one
    raw score (logit) per class.

    Args:
        vocab_size: Accepted for interface compatibility; not used, because
            the embedding layer is passed in ready-made.
        embedding: ``nn.Embedding`` layer applied to the token ids.
        hidden_dim: Hidden size of the LSTM, per direction.
        context_dim: Length of the context vector of each sample.
        output_size: Number of classes.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers.
        seq_len: Fixed length of the padded token sequences. Defaults to the
            notebook-level ``max_length``.
    """

    def __init__(self, vocab_size, embedding, hidden_dim, context_dim, output_size, num_layers, dropout, seq_len=None):
        super().__init__()

        seq_len = max_length if seq_len is None else seq_len

        self.embedding = embedding

        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)

        # Every time step contributes 2 * hidden_dim features (both directions),
        # so the flattened LSTM output has hidden_dim * seq_len * 2 features.
        self.fc1 = nn.Linear(hidden_dim * seq_len * 2 + context_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, output_size)

    def forward(self, tokens, contexts):
        """Return class logits of shape ``(batch, output_size)``.

        Args:
            tokens: Integer tensor of token ids, ``(batch, seq_len)``.
            contexts: Float tensor of context vectors, ``(batch, context_dim)``.
        """
        embedded = self.embedding(tokens)
        output, _ = self.lstm(embedded)

        # Flatten using the batch dimension of the tensor itself rather than a
        # global batch size, so batches of any size (including one sample at
        # inference time) are handled correctly.
        flat_output = output.reshape(output.size(0), -1)
        features = torch.cat((flat_output, contexts), dim=1)

        # NOTE(review): there is no non-linearity between the linear layers,
        # so the three of them act as a single linear map. Left as-is to keep
        # trained weights and results comparable.
        fc1_out = self.fc1(features)
        fc2_out = self.fc2(fc1_out)
        out = self.fc3(fc2_out)

        return out

## 7.6. parameters for clean_corrected_wsd_2

In [30]:
# train_dataloader_clean_corrected_wsd_2
# test_dataloader_clean_corrected_wsd_2

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Sizes derived from objects built in earlier cells.
vocab_size = len(vocab)
output_size = len(labels)  # 19 distinct tags

# Frozen embedding layer initialised from the pretrained matrix.
embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)

# Model hyperparameters.
hidden_dim = 30
context_dim = 50
num_layers = 4
dropout = 0.2

# Optimisation settings.
lr = 5e-4
criterion = nn.CrossEntropyLoss()

## 7.7. create the model and optimizer instances for the clean_corrected_wsd_2 data

In [31]:
# train_dataloader_clean_corrected_wsd_2
# test_dataloader_clean_corrected_wsd_2

# Build the network from the hyperparameters defined above ...
model_clean_corrected_wsd_2 = ToneTagsLSTM_wsd_2(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    context_dim=context_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout,
)
# ... and move it to the selected device (Module.to returns the module itself).
model_clean_corrected_wsd_2 = model_clean_corrected_wsd_2.to(device)

optimizer_clean_corrected_wsd_2 = optim.Adam(
    params=model_clean_corrected_wsd_2.parameters(),
    lr=lr,
)

## 7.8. train and test on clean_corrected_wsd_2

In [32]:
%%time

# Ten training epochs; the mean batch loss is printed after each one.
for ep in range(1, 11):

    epoch_losses_train = []
    model_clean_corrected_wsd_2.train()

    for tokens, contexts, tags in tqdm(train_dataloader_clean_corrected_wsd_2, desc=f"Epoch {ep} training..."):
        # Move the batch to the model's device before the forward pass.
        tokens, contexts, tags = tokens.to(device), contexts.to(device), tags.to(device)

        optimizer_clean_corrected_wsd_2.zero_grad()

        predictions = model_clean_corrected_wsd_2(tokens, contexts)
        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_clean_corrected_wsd_2.step()

        epoch_losses_train.append(loss.item())

    # Release the last batch before asking the allocator to free cached memory.
    del tokens, contexts, tags

    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')

Epoch 1 training...: 100%|██████████| 2084/2084 [06:03<00:00,  5.74it/s]


[Train Epoch 1] Loss: 2.7893472200620657


Epoch 2 training...: 100%|██████████| 2084/2084 [05:58<00:00,  5.81it/s]


[Train Epoch 2] Loss: 2.4650996071134554


Epoch 3 training...: 100%|██████████| 2084/2084 [06:05<00:00,  5.71it/s]


[Train Epoch 3] Loss: 2.4317184411518404


Epoch 4 training...: 100%|██████████| 2084/2084 [06:11<00:00,  5.61it/s]


[Train Epoch 4] Loss: 2.4173328384556836


Epoch 5 training...: 100%|██████████| 2084/2084 [06:12<00:00,  5.60it/s]


[Train Epoch 5] Loss: 2.4133007140855187


Epoch 6 training...: 100%|██████████| 2084/2084 [06:13<00:00,  5.59it/s]


[Train Epoch 6] Loss: 2.408822317670266


Epoch 7 training...: 100%|██████████| 2084/2084 [06:10<00:00,  5.63it/s]


[Train Epoch 7] Loss: 2.4047703767005864


Epoch 8 training...: 100%|██████████| 2084/2084 [06:12<00:00,  5.60it/s]


[Train Epoch 8] Loss: 2.400300346043197


Epoch 9 training...: 100%|██████████| 2084/2084 [06:13<00:00,  5.58it/s]


[Train Epoch 9] Loss: 2.3878835375043574


Epoch 10 training...: 100%|██████████| 2084/2084 [06:13<00:00,  5.58it/s]


[Train Epoch 10] Loss: 2.357204598851945
CPU times: total: 1h 1min 43s
Wall time: 1h 1min 42s


## 7.9. delete some data

In [33]:
# Training is finished: drop the loaders, datasets, frames and the optimizer
# (in the same order as before), then release memory.
del (
    train_dataloader_clean_corrected_wsd_2,
    test_dataloader_clean_corrected_wsd_2,
    train_dataset_clean_corrected_wsd_2,
    test_dataset_clean_corrected_wsd_2,
    train_clean_corrected_wsd_2,
    test_clean_corrected_wsd_2,
    dataset_clean_corrected_wsd_2,
    optimizer_clean_corrected_wsd_2,
)
gc.collect()
torch.cuda.empty_cache()

# 9. export and import model

In [34]:
model_path = "../results/models/lstm_model.pt"

# Make sure the target directory exists, so the save cannot fail after a long
# training run just because the folder is missing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# The whole module object is pickled, not just its state_dict, so the class
# definition must be available wherever the file is loaded.
torch.save(model_clean_corrected_wsd_2, model_path)

# map_location puts the tensors on the device selected earlier, so a file
# saved from a GPU session also loads on a CPU-only machine.
# NOTE(review): newer PyTorch releases may need `weights_only=False` to load a
# fully pickled module — confirm against the installed version.
model_clean_corrected_wsd_2 = torch.load(model_path, map_location=device)
model_clean_corrected_wsd_2.eval()

ToneTagsLSTM_wsd_2(
  (embedding): Embedding(1193516, 50)
  (lstm): LSTM(50, 30, num_layers=4, batch_first=True, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=245810, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=19, bias=True)
)