# 1. imports

In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import gc
from tqdm import tqdm
import csv
import os

%matplotlib inline

# 2. function convert str of text to list when we upload data

In [5]:
def str_text_to_list(value):
    list_values = value.strip('[]').split(', ')
    cleaned_list_values = [item[1:-1] for item in list_values]
    return cleaned_list_values

# 3. take labels from data

In [6]:
dataset_wsd_1 = pd.read_csv("../datasets/tonetags_wsd_1.csv", index_col=0, converters={"text": str_text_to_list})

labels = dataset_wsd_1.tags.unique().tolist()

del dataset_wsd_1
gc.collect()

0

In [7]:
labels

['genuine question',
 'half joking',
 'genuine',
 'not a vent',
 'reference',
 'serious',
 'platonic',
 'inside joke',
 'sarcastic',
 'joking',
 'romantic',
 'passive aggressive',
 'copypasta',
 'ironic',
 'clickbait',
 'lyrics',
 'nothing personal',
 'not mad',
 'rhetorical']

# 4. create vocab and embedding by some pretrained embedding

In [4]:
glove_twitter_27B = torchtext.vocab.GloVe(name='twitter.27B', dim=50)

In [5]:
vocab = glove_twitter_27B.stoi
vocab["<unk>"] = len(vocab)
vocab["<pad>"] = len(vocab)

In [6]:
embedding_vector = glove_twitter_27B.vectors.numpy()
embedding_vector = np.append(embedding_vector, np.zeros(50)).reshape(-1, 50) # vector for unknown value in vocab
embedding_vector = np.append(embedding_vector, np.ones(50)).reshape(-1, 50) # vector for padding value in vocab

In [7]:
embedding_tensor = torch.tensor(embedding_vector, dtype=torch.float)

In [8]:
nn.Embedding.from_pretrained(embedding_tensor, freeze=True)

Embedding(1193516, 50)

# 5. create class of datasets

In [9]:
max_length = 4096

In [10]:
class myDataset(Dataset):
    def __init__(self, dataset):
        self.data = []
        for sentence in dataset.text:
            if len(sentence) > max_length:
                continue
            sentence_ids = []
            for token in sentence:
                try:
                    sentence_ids.append(vocab[token])
                except KeyError:
                    sentence_ids.append(vocab["<unk>"])
            self.data.append(sentence_ids)
        self.labels = dataset.tags

        self.context = None

        if 'context' in dataset.columns:
            self.context = dataset.context

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.context is None:
            return self.data[idx], torch.tensor(self.labels.iloc[idx])
        else:
            return self.data[idx], torch.tensor(self.labels.iloc[idx]), self.context.iloc[idx]

# 6. Dataloader part

In [11]:
batch_size = 32

In [12]:
def collate_fn(batch):
    data_ids = []
    labels = []
    contexts = []

    for dat in batch:

        data_ids.append(dat[0])
        labels.append(dat[1])

        if len(dat) >= 3:
            contexts.append(dat[2])

    for i in range(len(data_ids)):
        while len(data_ids[i]) < max_length:
            data_ids[i].append(vocab["<pad>"])

    return torch.tensor(data_ids), torch.tensor(contexts), torch.tensor(labels)

# 7. Create model

## 7.1. lstm for wsd_1, clean and clean_corrected datasets

In [13]:
class ToneTagsLSTM(nn.Module):
    def __init__(self, vocab_size, embedding, hidden_dim, output_size, num_layers, dropout):
        super(ToneTagsLSTM, self).__init__()
        
        # output_size = 19

        self.embedding = embedding

        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)

        self.fc1 = nn.Linear(hidden_dim * max_length * 2, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, output_size)
        # self.out = nn.Softmax(output_size, dim=1)


    def forward(self, x):

        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)


        lstm_out = output.reshape(batch_size, -1)

        fc1_out = self.fc1(lstm_out)


        fc2_out = self.fc2(fc1_out)
        out = self.fc3(fc2_out)
        # out = self.out(fc3_out)

        return out

## 7.2. parameters of this model

In [14]:
vocab_size = len(vocab)
embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
hidden_dim = 30
output_size = len(labels) # 19
num_layers = 4
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 5e-4

n_epochs = 20


criterion = nn.CrossEntropyLoss()

# 8. function to export results

In [15]:
def export_predictions(dir_name, epoch, train_predictions, train_tags, test_predictions, test_tags):

    train_predictions = train_predictions
    train_tags = train_tags
    
    test_predictions = test_predictions
    test_tags = test_tags
    
    # export_time = int(datetime.datetime.now().timestamp())
    file_path = f'../results/lstm/predictions_vs_tags/{dir_name}'
    
    if not os.path.exists(file_path):
        os.makedirs(file_path)

    with open(f'{file_path}/{epoch}.csv', 'w', newline='\n') as csvfile:
        writer = csv.writer(csvfile)
    
        writer.writerow(['train_predictions', 'train_tags', 'test_predictions', 'test_tags'])
    
        for row in zip(train_predictions, train_tags, test_predictions, test_tags):
            writer.writerow(row)

# 9. wsd_1 dataset

## 9.1. upload dataset

In [16]:
dataset_wsd_1 = pd.read_csv("../datasets/tonetags_wsd_1.csv", index_col=0, converters={"text": str_text_to_list})

## 9.2. encode labels

In [17]:
dataset_wsd_1.tags = dataset_wsd_1.tags.apply(labels.index)

## 9.3. split

In [18]:
train_wsd_1, test_wsd_1 = train_test_split(dataset_wsd_1, stratify=dataset_wsd_1['tags'], test_size=0.2, random_state=42)

## 9.4. create datasets

In [19]:
train_dataset_wsd_1, test_dataset_wsd_1 = myDataset(train_wsd_1), myDataset(test_wsd_1)

## 9.5. create dataloaders

In [20]:
train_dataloader_wsd_1 = DataLoader(train_dataset_wsd_1, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_wsd_1 = DataLoader(test_dataset_wsd_1, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

## 9.6. create instance of model and optimizer

In [21]:
# train_dataloader_wsd_1
# test_dataloader_wsd_1

model_wsd_1 = ToneTagsLSTM(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_wsd_1 = optim.Adam(model_wsd_1.parameters(), lr=lr)

## 9.7. train and test model for wsd_1 dataloader

In [22]:
%%time

for ep in range(1, n_epochs + 1):

    model_wsd_1.train()
    epoch_losses_train = []
    
    train_predictions = []
    train_tags = []
    
    test_predictions = []
    test_tags = []

    for tokens, contexts, tags in tqdm(train_dataloader_wsd_1, desc=f"Epoch {ep} training..."):
        optimizer_wsd_1.zero_grad()

        tokens = tokens.to(device)
        tags = tags.to(device)

        predictions = model_wsd_1(tokens)
        
        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_wsd_1.step()

        epoch_losses_train.append(loss.item())

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())
    
    del tokens
    del tags
    
    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')


    model_wsd_1.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_wsd_1, desc=f"Epoch {ep} testing..."):
            tokens = tokens.to(device)
            tags = tags.to(device)

            predictions = model_wsd_1(tokens)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())
        
        del tokens
        del tags
    
        gc.collect()
        torch.cuda.empty_cache()
    
    export_predictions('clean_wsd_1', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)
    
    
    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2113/2113 [06:14<00:00,  5.65it/s]


[Train Epoch 1] Loss: 2.805278515849585


testing...: 100%|██████████| 528/528 [00:34<00:00, 15.18it/s]
training...: 100%|██████████| 2113/2113 [06:03<00:00,  5.81it/s]


[Train Epoch 2] Loss: 2.66751756061043


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.84it/s]
training...: 100%|██████████| 2113/2113 [06:07<00:00,  5.75it/s]


[Train Epoch 3] Loss: 2.662090325660182


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.84it/s]
training...: 100%|██████████| 2113/2113 [06:08<00:00,  5.74it/s]


[Train Epoch 4] Loss: 2.6586376140312846


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.51it/s]
training...: 100%|██████████| 2113/2113 [06:15<00:00,  5.63it/s]


[Train Epoch 5] Loss: 2.657022865135637


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.34it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.59it/s]


[Train Epoch 6] Loss: 2.6562974638020433


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.31it/s]
training...: 100%|██████████| 2113/2113 [06:16<00:00,  5.61it/s]


[Train Epoch 7] Loss: 2.6559402564663035


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.15it/s]
training...: 100%|██████████| 2113/2113 [06:16<00:00,  5.61it/s]


[Train Epoch 8] Loss: 2.6558686480723


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.13it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.60it/s]


[Train Epoch 9] Loss: 2.65585996266287


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.35it/s]
training...: 100%|██████████| 2113/2113 [06:18<00:00,  5.58it/s]


[Train Epoch 10] Loss: 2.655735472960321


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.52it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.59it/s]


[Train Epoch 11] Loss: 2.655558586459108


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.09it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.59it/s]


[Train Epoch 12] Loss: 2.655555355904286


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.19it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.59it/s]


[Train Epoch 13] Loss: 2.655660432941023


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.25it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.60it/s]


[Train Epoch 14] Loss: 2.6554809091669993


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.27it/s]
training...: 100%|██████████| 2113/2113 [06:19<00:00,  5.56it/s]


[Train Epoch 15] Loss: 2.655359375042426


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.23it/s]
training...: 100%|██████████| 2113/2113 [06:19<00:00,  5.56it/s]


[Train Epoch 16] Loss: 2.655297502656712


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.05it/s]
training...: 100%|██████████| 2113/2113 [06:17<00:00,  5.60it/s]


[Train Epoch 17] Loss: 2.6553875470240422


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.11it/s]
training...: 100%|██████████| 2113/2113 [06:21<00:00,  5.54it/s]


[Train Epoch 18] Loss: 2.655260761033513


testing...: 100%|██████████| 528/528 [00:28<00:00, 18.22it/s]
training...: 100%|██████████| 2113/2113 [06:16<00:00,  5.61it/s]


[Train Epoch 19] Loss: 2.655318316609951


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.07it/s]
training...: 100%|██████████| 2113/2113 [06:20<00:00,  5.55it/s]


[Train Epoch 20] Loss: 2.655146322146559


testing...: 100%|██████████| 528/528 [00:29<00:00, 18.09it/s]


CPU times: total: 2h 16min 29s
Wall time: 2h 16min 41s


## 9.8. delete some data

In [23]:
del train_dataloader_wsd_1
del test_dataloader_wsd_1
del train_dataset_wsd_1
del test_dataset_wsd_1
del train_wsd_1
del test_wsd_1
del dataset_wsd_1

del model_wsd_1
del optimizer_wsd_1
gc.collect()
torch.cuda.empty_cache()

# 10. clean dataset

## 10.1. upload dataset

In [24]:
dataset_clean = pd.read_csv("../datasets/tonetags_dataset_tumblr_clean.csv", converters={"text": str_text_to_list})

## 10.2. encode labels

In [25]:
dataset_clean.tags = dataset_clean.tags.apply(labels.index)

## 10.3. split

In [26]:
train_clean, test_clean = train_test_split(dataset_clean, stratify=dataset_clean['tags'], test_size=0.2, random_state=42)

## 10.4. create datasets

In [27]:
train_dataset_clean, test_dataset_clean = myDataset(train_clean), myDataset(test_clean)

## 10.5. create dataloaders

In [28]:
train_dataloader_clean = DataLoader(train_dataset_clean, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_clean = DataLoader(test_dataset_clean, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

## 10.6. create instance of model and optimizer for clean dataloader

In [29]:
# train_dataloader_clean
# test_dataloader_clean

model_clean = ToneTagsLSTM(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_clean = optim.Adam(model_clean.parameters(), lr=lr)

## 10.7. train and test model of clean dataloader

In [30]:
%%time

for ep in range(1, n_epochs + 1):

    model_clean.train()
    epoch_losses_train = []

    train_predictions = []
    train_tags = []

    test_predictions = []
    test_tags = []


    for tokens, contexts, tags in tqdm(train_dataloader_clean, desc=f"Epoch {ep} training..."):
        optimizer_clean.zero_grad()

        tokens = tokens.to(device)
        tags = tags.to(device)

        predictions = model_clean(tokens)

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())

        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_clean.step()

        epoch_losses_train.append(loss.item())

    del tokens
    del tags

    gc.collect()
    torch.cuda.empty_cache()
    
    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')

    model_clean.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_clean, desc=f"Epoch {ep} testing..."):

            tokens = tokens.to(device)
            tags = tags.to(device)

            predictions = model_clean(tokens)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())

        del tokens
        del tags
    
        gc.collect()
        torch.cuda.empty_cache()

    export_predictions('clean', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)


    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2131/2131 [06:48<00:00,  5.22it/s]


[Train Epoch 1] Loss: 2.9324233685228984


testing...: 100%|██████████| 532/532 [00:34<00:00, 15.23it/s]
training...: 100%|██████████| 2131/2131 [06:05<00:00,  5.83it/s]


[Train Epoch 2] Loss: 2.668667636304772


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.64it/s]
training...: 100%|██████████| 2131/2131 [06:06<00:00,  5.82it/s]


[Train Epoch 3] Loss: 2.663657796712953


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.47it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 4] Loss: 2.658495580014462


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.42it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 5] Loss: 2.6569677045903592


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.43it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 6] Loss: 2.656304714994104


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.39it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 7] Loss: 2.6563027666969963


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.30it/s]
training...: 100%|██████████| 2131/2131 [06:08<00:00,  5.78it/s]


[Train Epoch 8] Loss: 2.6562280000402914


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.44it/s]
training...: 100%|██████████| 2131/2131 [06:10<00:00,  5.76it/s]


[Train Epoch 9] Loss: 2.6558438677678136


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.35it/s]
training...: 100%|██████████| 2131/2131 [06:10<00:00,  5.75it/s]


[Train Epoch 10] Loss: 2.656099052621635


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.30it/s]
training...: 100%|██████████| 2131/2131 [06:11<00:00,  5.73it/s]


[Train Epoch 11] Loss: 2.656040229242433


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.31it/s]
training...: 100%|██████████| 2131/2131 [06:08<00:00,  5.78it/s]


[Train Epoch 12] Loss: 2.6556100495149138


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.22it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 13] Loss: 2.6555823185588205


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.40it/s]
training...: 100%|██████████| 2131/2131 [06:10<00:00,  5.75it/s]


[Train Epoch 14] Loss: 2.6555437976795986


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.39it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 15] Loss: 2.6554465754834324


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.31it/s]
training...: 100%|██████████| 2131/2131 [06:10<00:00,  5.76it/s]


[Train Epoch 16] Loss: 2.6554870738607232


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.17it/s]
training...: 100%|██████████| 2131/2131 [06:10<00:00,  5.76it/s]


[Train Epoch 17] Loss: 2.6554272491458657


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.31it/s]
training...: 100%|██████████| 2131/2131 [06:08<00:00,  5.78it/s]


[Train Epoch 18] Loss: 2.655510802718744


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.29it/s]
training...: 100%|██████████| 2131/2131 [06:09<00:00,  5.77it/s]


[Train Epoch 19] Loss: 2.6553108075704444


testing...: 100%|██████████| 532/532 [00:28<00:00, 18.36it/s]
training...: 100%|██████████| 2131/2131 [06:08<00:00,  5.79it/s]


[Train Epoch 20] Loss: 2.655403370702843


testing...: 100%|██████████| 532/532 [00:29<00:00, 18.29it/s]


CPU times: total: 2h 14min 42s
Wall time: 2h 14min 51s


## 10.8. delete some data

In [31]:
del train_dataloader_clean
del test_dataloader_clean
del train_dataset_clean
del test_dataset_clean
del train_clean
del test_clean
del dataset_clean

del model_clean
del optimizer_clean
gc.collect()
torch.cuda.empty_cache()

# 11. clean_corrected dataset

## 11.1. upload dataset

In [32]:
dataset_clean_corrected = pd.read_csv("../datasets/tonetags_dataset_tumblr_clean_corrected_text.csv", converters={"text": str_text_to_list})

## 11.2. encode labels

In [33]:
dataset_clean_corrected.tags =dataset_clean_corrected.tags.apply(labels.index)

## 11.3. split

In [34]:
train_clean_corrected, test_clean_corrected = train_test_split(dataset_clean_corrected, stratify=dataset_clean_corrected['tags'], test_size=0.2, random_state=42)

## 11.4. create datasets

In [35]:
train_dataset_clean_corrected, test_dataset_clean_corrected = myDataset(train_clean_corrected), myDataset(test_clean_corrected)

## 11.5. create dataloaders

In [36]:
train_dataloader_clean_corrected = DataLoader(train_dataset_clean_corrected, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_clean_corrected = DataLoader(test_dataset_clean_corrected, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

## 11.6. create instance of model and optimizer for clean_corrected data

In [37]:
# train_dataloader_clean_corrected
# test_dataloader_clean_corrected

model_clean_corrected = ToneTagsLSTM(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_clean_corrected = optim.Adam(model_clean_corrected.parameters(), lr=lr)

## 11.7. train and test for clean_corrected dataloader

In [38]:
%%time

for ep in range(1, n_epochs + 1):

    model_clean_corrected.train()
    epoch_losses_train = []

    train_predictions = []
    train_tags = []

    test_predictions = []
    test_tags = []


    for tokens, contexts, tags in tqdm(train_dataloader_clean_corrected, desc=f"Epoch {ep} training..."):
        optimizer_clean_corrected.zero_grad()

        tokens = tokens.to(device)
        tags = tags.to(device)

        predictions = model_clean_corrected(tokens)

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())

        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_clean_corrected.step()

        epoch_losses_train.append(loss.item())

    del tokens
    del tags

    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')


    model_clean_corrected.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_clean_corrected, desc=f"Epoch {ep} testing..."):

            tokens = tokens.to(device)
            tags = tags.to(device)

            predictions = model_clean_corrected(tokens)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())
                
        del tokens
        del tags
    
        gc.collect()
        torch.cuda.empty_cache()

    export_predictions('clean_corrected', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)


    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2127/2127 [06:36<00:00,  5.37it/s]


[Train Epoch 1] Loss: 2.890346392141467


testing...: 100%|██████████| 531/531 [00:34<00:00, 15.22it/s]
training...: 100%|██████████| 2127/2127 [06:03<00:00,  5.85it/s]


[Train Epoch 2] Loss: 2.669441573206225


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.92it/s]
training...: 100%|██████████| 2127/2127 [06:04<00:00,  5.84it/s]


[Train Epoch 3] Loss: 2.6654624486115264


testing...: 100%|██████████| 531/531 [00:27<00:00, 18.97it/s]
training...: 100%|██████████| 2127/2127 [06:09<00:00,  5.76it/s]


[Train Epoch 4] Loss: 2.659454311403132


testing...: 100%|██████████| 531/531 [00:31<00:00, 16.75it/s]
training...: 100%|██████████| 2127/2127 [06:07<00:00,  5.79it/s]


[Train Epoch 5] Loss: 2.6571426220101597


testing...: 100%|██████████| 531/531 [00:29<00:00, 18.18it/s]
training...: 100%|██████████| 2127/2127 [06:08<00:00,  5.78it/s]


[Train Epoch 6] Loss: 2.656355250201205


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.43it/s]
training...: 100%|██████████| 2127/2127 [06:10<00:00,  5.74it/s]


[Train Epoch 7] Loss: 2.6561366569059914


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.34it/s]
training...: 100%|██████████| 2127/2127 [06:07<00:00,  5.79it/s]


[Train Epoch 8] Loss: 2.6558452970722666


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.42it/s]
training...: 100%|██████████| 2127/2127 [06:08<00:00,  5.78it/s]


[Train Epoch 9] Loss: 2.655698408809244


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.33it/s]
training...: 100%|██████████| 2127/2127 [06:06<00:00,  5.80it/s]


[Train Epoch 10] Loss: 2.655714885135968


testing...: 100%|██████████| 531/531 [00:29<00:00, 18.28it/s]
training...: 100%|██████████| 2127/2127 [06:08<00:00,  5.77it/s]


[Train Epoch 11] Loss: 2.6555357545432092


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.39it/s]
training...: 100%|██████████| 2127/2127 [06:07<00:00,  5.80it/s]


[Train Epoch 12] Loss: 2.655589084194813


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.33it/s]
training...: 100%|██████████| 2127/2127 [06:09<00:00,  5.76it/s]


[Train Epoch 13] Loss: 2.6555189022616594


testing...: 100%|██████████| 531/531 [00:29<00:00, 17.72it/s]
training...: 100%|██████████| 2127/2127 [06:10<00:00,  5.74it/s]


[Train Epoch 14] Loss: 2.6552355753517958


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.36it/s]
training...: 100%|██████████| 2127/2127 [06:09<00:00,  5.76it/s]


[Train Epoch 15] Loss: 2.655410554748664


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.33it/s]
training...: 100%|██████████| 2127/2127 [06:10<00:00,  5.74it/s]


[Train Epoch 16] Loss: 2.6552791731089678


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.37it/s]
training...: 100%|██████████| 2127/2127 [06:08<00:00,  5.77it/s]


[Train Epoch 17] Loss: 2.6552592954783583


testing...: 100%|██████████| 531/531 [00:28<00:00, 18.32it/s]
training...: 100%|██████████| 2127/2127 [06:07<00:00,  5.79it/s]


[Train Epoch 18] Loss: 2.655429122702415


testing...: 100%|██████████| 531/531 [00:29<00:00, 18.13it/s]
training...: 100%|██████████| 2127/2127 [06:08<00:00,  5.77it/s]


[Train Epoch 19] Loss: 2.655268999013645


testing...: 100%|██████████| 531/531 [00:29<00:00, 18.25it/s]
training...: 100%|██████████| 2127/2127 [06:09<00:00,  5.75it/s]


[Train Epoch 20] Loss: 2.655285018791769


testing...: 100%|██████████| 531/531 [00:29<00:00, 18.19it/s]


CPU times: total: 2h 14min 12s
Wall time: 2h 14min 29s


## 11.8. delete some data

In [39]:
del train_dataloader_clean_corrected
del test_dataloader_clean_corrected
del train_dataset_clean_corrected
del test_dataset_clean_corrected
del train_clean_corrected
del test_clean_corrected
del dataset_clean_corrected

del model_clean_corrected
del optimizer_clean_corrected
gc.collect()
torch.cuda.empty_cache()

# 12. wsd_2 dataset (from tonetags_dataset_tumblr_clean with glove.6b.50d)

## 12.1. function to convert str of context to list when we upload dataset 

In [40]:
def str_context_to_list(value):
    list_values = value.strip('[]').split(' ')
    cleaned_list_values = []
    for item in list_values:
        item.strip()
        # if '\n' in item:
        #     item.replace('\n', '')
        if item != "":
            cleaned_list_values.append(float(item))
    return cleaned_list_values

## 12.2. upload dataset

In [41]:
# dataset_wsd_2 = pd.read_csv("../datasets/tonetags_wsd_2.csv", index_col=0, converters={"text": str_text_to_list, 'context': str_context_to_list})
dataset_wsd_2 = pd.read_csv("../datasets/tonetags_wsd_2.csv", index_col=0, converters={"text": str_text_to_list})

In [42]:
dataset_wsd_2 = dataset_wsd_2.dropna()

In [43]:
dataset_wsd_2.context = dataset_wsd_2.context.apply(str_context_to_list)

## 12.3. encode labels

In [44]:
dataset_wsd_2.tags = dataset_wsd_2.tags.apply(labels.index)

## 12.4. split

In [45]:
train_wsd_2, test_wsd_2 = train_test_split(dataset_wsd_2, stratify=dataset_wsd_2['tags'], test_size=0.2, random_state=42)

## 12.5. create datasets

In [46]:
train_dataset_wsd_2, test_dataset_wsd_2 = myDataset(train_wsd_2), myDataset(test_wsd_2)

## 12.6. create dataloaders

In [47]:
train_dataloader_wsd_2 = DataLoader(train_dataset_wsd_2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_wsd_2 = DataLoader(test_dataset_wsd_2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

## 12.7. class of model for wsd_2 data

In [48]:
class ToneTagsLSTM_wsd_2(nn.Module):
    def __init__(self, vocab_size, embedding, hidden_dim, context_dim, output_size, num_layers, dropout):
        super(ToneTagsLSTM_wsd_2, self).__init__()
        
        # output_size = 19

        self.embedding = embedding

        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)

        self.fc1 = nn.Linear(hidden_dim * max_length * 2 + context_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, output_size)
        # self.out = nn.Softmax(output_size, dim=1)


    def forward(self, tokens, contexts):

        embedded = self.embedding(tokens)
        output, (hidden, cell) = self.lstm(embedded)

        lstm_out = torch.cat((output.reshape(batch_size, -1), contexts), dim=1)

        fc1_out = self.fc1(lstm_out)


        fc2_out = self.fc2(fc1_out)
        out = self.fc3(fc2_out)
        # out = self.out(fc3_out)

        return out

## 12.8. parameters for wsd_2

In [49]:
# train_dataloader_wsd_2
# test_dataloader_wsd_2

vocab_size = len(vocab)
embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
hidden_dim = 30 # 30
context_dim = 50 # 50
output_size = len(labels) # 19
num_layers = 4
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 5e-4

criterion = nn.CrossEntropyLoss()

## 12.9. create instance of model and optimizer to wsd_2 data

In [50]:
# train_dataloader_wsd_2
# test_dataloader_wsd_2

model_wsd_2 = ToneTagsLSTM_wsd_2(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    context_dim=context_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_wsd_2 = optim.Adam(model_wsd_2.parameters(), lr=lr)

## 12.10. train and test to wsd_2

In [51]:
%%time

for ep in range(1, n_epochs + 1):

    model_wsd_2.train()
    epoch_losses_train = []

    train_predictions = []
    train_tags = []

    test_predictions = []
    test_tags = []


    for tokens, contexts, tags in tqdm(train_dataloader_wsd_2, desc=f"Epoch {ep} training..."):
        optimizer_wsd_2.zero_grad()

        tokens = tokens.to(device)
        contexts = contexts.to(device)
        tags = tags.to(device)

        predictions = model_wsd_2(tokens, contexts)

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())

        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_wsd_2.step()

        epoch_losses_train.append(loss.item())

    del tokens
    del contexts
    del tags

    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')


    model_wsd_2.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_wsd_2, desc=f"Epoch {ep} testing..."):

            tokens = tokens.to(device)
            contexts = contexts.to(device)
            tags = tags.to(device)

            predictions = model_wsd_2(tokens, contexts)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())

        del tokens
        del contexts
        del tags
    
        gc.collect()
        torch.cuda.empty_cache()

    export_predictions('clean_wsd_2_grove_6b_50d', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)


    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2088/2088 [06:29<00:00,  5.36it/s]


[Train Epoch 1] Loss: 2.767507827944226


testing...: 100%|██████████| 521/521 [00:35<00:00, 14.88it/s]
training...: 100%|██████████| 2088/2088 [06:02<00:00,  5.76it/s]


[Train Epoch 2] Loss: 2.5141488100719633


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.42it/s]
training...: 100%|██████████| 2088/2088 [06:04<00:00,  5.73it/s]


[Train Epoch 3] Loss: 2.4816451814325378


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.83it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.68it/s]


[Train Epoch 4] Loss: 2.468050942338746


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.02it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.68it/s]


[Train Epoch 5] Loss: 2.4602553957495195


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.95it/s]
training...: 100%|██████████| 2088/2088 [06:06<00:00,  5.70it/s]


[Train Epoch 6] Loss: 2.4505234683267916


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.00it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.69it/s]


[Train Epoch 7] Loss: 2.4330518263395717


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.03it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.68it/s]


[Train Epoch 8] Loss: 2.4031816224828075


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.07it/s]
training...: 100%|██████████| 2088/2088 [06:09<00:00,  5.65it/s]


[Train Epoch 9] Loss: 2.3657538027028013


testing...: 100%|██████████| 521/521 [00:28<00:00, 17.97it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.68it/s]


[Train Epoch 10] Loss: 2.3253698586504123


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.77it/s]
training...: 100%|██████████| 2088/2088 [06:08<00:00,  5.66it/s]


[Train Epoch 11] Loss: 2.2774917310910205


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.60it/s]
training...: 100%|██████████| 2088/2088 [06:05<00:00,  5.71it/s]


[Train Epoch 12] Loss: 2.2291325576803236


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.69it/s]


[Train Epoch 13] Loss: 2.1847436595922227


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]
training...: 100%|██████████| 2088/2088 [06:06<00:00,  5.69it/s]


[Train Epoch 14] Loss: 2.1440634625853248


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.77it/s]
training...: 100%|██████████| 2088/2088 [06:06<00:00,  5.70it/s]


[Train Epoch 15] Loss: 2.102017056211201


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.86it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.68it/s]


[Train Epoch 16] Loss: 2.0684688228871173


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.74it/s]
training...: 100%|██████████| 2088/2088 [06:06<00:00,  5.70it/s]


[Train Epoch 17] Loss: 2.033398964395925


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]
training...: 100%|██████████| 2088/2088 [06:08<00:00,  5.67it/s]


[Train Epoch 18] Loss: 2.003809565442732


testing...: 100%|██████████| 521/521 [00:28<00:00, 17.97it/s]
training...: 100%|██████████| 2088/2088 [06:07<00:00,  5.69it/s]


[Train Epoch 19] Loss: 1.9773295355025835


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.76it/s]
training...: 100%|██████████| 2088/2088 [06:06<00:00,  5.69it/s]


[Train Epoch 20] Loss: 1.946366485508009


testing...: 100%|██████████| 521/521 [00:30<00:00, 16.91it/s]


CPU times: total: 2h 13min 43s
Wall time: 2h 13min 58s


## 12.11. delete some data

In [52]:
del train_dataloader_wsd_2
del test_dataloader_wsd_2
del train_dataset_wsd_2
del test_dataset_wsd_2
del train_wsd_2
del test_wsd_2
del dataset_wsd_2

del model_wsd_2
del optimizer_wsd_2
gc.collect()
torch.cuda.empty_cache()

# 13. clean_corrected_wsd_1 dataset

## 13.1. upload dataset

In [53]:
dataset_clean_corrected_wsd_1 = pd.read_csv("../datasets/tonetags_dataset_tumblr_clean_corrected_text_wsd_1.csv", index_col=0, converters={"text": str_text_to_list})

## 13.2. encode labels

In [54]:
dataset_clean_corrected_wsd_1.tags = dataset_clean_corrected_wsd_1.tags.apply(labels.index)

## 13.3. split

In [55]:
train_clean_corrected_wsd_1, test_clean_corrected_wsd_1 = train_test_split(dataset_clean_corrected_wsd_1, stratify=dataset_clean_corrected_wsd_1['tags'], test_size=0.2, random_state=42)

## 13.4. create datasets

In [56]:
train_dataset_clean_corrected_wsd_1, test_dataset_clean_corrected_wsd_1 = myDataset(train_clean_corrected_wsd_1), myDataset(test_clean_corrected_wsd_1)

## 13.5. create dataloaders

In [57]:
train_dataloader_clean_corrected_wsd_1 = DataLoader(train_dataset_clean_corrected_wsd_1, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_clean_corrected_wsd_1 = DataLoader(test_dataset_clean_corrected_wsd_1, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

## 13.6. create instance of model and optimizer

In [58]:
# train_dataloader_clean_corrected_wsd_1
# test_dataloader_clean_corrected_wsd_1

model_clean_corrected_wsd_1 = ToneTagsLSTM(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_clean_corrected_wsd_1 = optim.Adam(model_clean_corrected_wsd_1.parameters(), lr=lr)

## 13.7. train and test model for clean_corrected_wsd_1 dataloader

In [59]:
%%time

for ep in range(1, n_epochs + 1):

    model_clean_corrected_wsd_1.train()
    epoch_losses_train = []

    train_predictions = []
    train_tags = []

    test_predictions = []
    test_tags = []

    for tokens, contexts, tags in tqdm(train_dataloader_clean_corrected_wsd_1, desc=f"Epoch {ep} training..."):
        optimizer_clean_corrected_wsd_1.zero_grad()

        tokens = tokens.to(device)
        tags = tags.to(device)

        predictions = model_clean_corrected_wsd_1(tokens)

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())

        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_clean_corrected_wsd_1.step()

        epoch_losses_train.append(loss.item())

    del tokens
    del tags

    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')


    model_clean_corrected_wsd_1.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_clean_corrected_wsd_1, desc=f"Epoch {ep} testing..."):
            tokens = tokens.to(device)
            tags = tags.to(device)

            predictions = model_clean_corrected_wsd_1(tokens)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())


        del tokens
        del tags

        gc.collect()
        torch.cuda.empty_cache()

    export_predictions('clean_corrected_wsd_1', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)


    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2108/2108 [06:27<00:00,  5.44it/s]


[Train Epoch 1] Loss: 2.8090018590442143


testing...: 100%|██████████| 527/527 [00:34<00:00, 15.15it/s]
training...: 100%|██████████| 2108/2108 [05:58<00:00,  5.87it/s]


[Train Epoch 2] Loss: 2.667332948837606


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.57it/s]
training...: 100%|██████████| 2108/2108 [06:05<00:00,  5.78it/s]


[Train Epoch 3] Loss: 2.661266172972757


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.38it/s]
training...: 100%|██████████| 2108/2108 [06:04<00:00,  5.78it/s]


[Train Epoch 4] Loss: 2.657799835449378


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.37it/s]
training...: 100%|██████████| 2108/2108 [06:04<00:00,  5.78it/s]


[Train Epoch 5] Loss: 2.656422679179986


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.27it/s]
training...: 100%|██████████| 2108/2108 [06:05<00:00,  5.76it/s]


[Train Epoch 6] Loss: 2.6561248088697327


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.48it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.76it/s]


[Train Epoch 7] Loss: 2.6558852054362507


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.23it/s]
training...: 100%|██████████| 2108/2108 [06:09<00:00,  5.71it/s]


[Train Epoch 8] Loss: 2.655960283763495


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.33it/s]
training...: 100%|██████████| 2108/2108 [06:07<00:00,  5.74it/s]


[Train Epoch 9] Loss: 2.655698644707732


testing...: 100%|██████████| 527/527 [00:29<00:00, 18.02it/s]
training...: 100%|██████████| 2108/2108 [06:08<00:00,  5.72it/s]


[Train Epoch 10] Loss: 2.6555731334993915


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.36it/s]
training...: 100%|██████████| 2108/2108 [06:07<00:00,  5.74it/s]


[Train Epoch 11] Loss: 2.655876872209263


testing...: 100%|██████████| 527/527 [00:29<00:00, 18.15it/s]
training...: 100%|██████████| 2108/2108 [06:07<00:00,  5.74it/s]


[Train Epoch 12] Loss: 2.6555347925571606


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.29it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.76it/s]


[Train Epoch 13] Loss: 2.6554675830609433


testing...: 100%|██████████| 527/527 [00:29<00:00, 18.14it/s]
training...: 100%|██████████| 2108/2108 [06:07<00:00,  5.73it/s]


[Train Epoch 14] Loss: 2.6555339380731167


testing...: 100%|██████████| 527/527 [00:29<00:00, 17.94it/s]
training...: 100%|██████████| 2108/2108 [06:07<00:00,  5.74it/s]


[Train Epoch 15] Loss: 2.655467842968863


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.30it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.74it/s]


[Train Epoch 16] Loss: 2.655425106325457


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.43it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.75it/s]


[Train Epoch 17] Loss: 2.655214038361182


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.23it/s]
training...: 100%|██████████| 2108/2108 [06:05<00:00,  5.76it/s]


[Train Epoch 18] Loss: 2.6551138277976745


testing...: 100%|██████████| 527/527 [00:29<00:00, 17.73it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.75it/s]


[Train Epoch 19] Loss: 2.655184167272904


testing...: 100%|██████████| 527/527 [00:28<00:00, 18.29it/s]
training...: 100%|██████████| 2108/2108 [06:06<00:00,  5.76it/s]


[Train Epoch 20] Loss: 2.6551423164427392


testing...: 100%|██████████| 527/527 [00:29<00:00, 18.16it/s]


CPU times: total: 2h 13min 37s
Wall time: 2h 13min 48s


## 13.8. delete some data

In [60]:
del train_dataloader_clean_corrected_wsd_1
del test_dataloader_clean_corrected_wsd_1
del train_dataset_clean_corrected_wsd_1
del test_dataset_clean_corrected_wsd_1
del train_clean_corrected_wsd_1
del test_clean_corrected_wsd_1
del dataset_clean_corrected_wsd_1

del model_clean_corrected_wsd_1
del optimizer_clean_corrected_wsd_1
gc.collect()
torch.cuda.empty_cache()

# 14. clean_corrected_text_wsd_2 dataset (from tonetags_dataset_tumblr_clean with glove.twitter.27b.50d)

## 14.1. upload dataset

In [61]:
def str_context_to_list(value):
    list_values = value.strip('[]').split(' ')
    cleaned_list_values = []
    for item in list_values:
        item.strip()
        # if '\n' in item:
        #     item.replace('\n', '')
        if item != "":
            cleaned_list_values.append(float(item))
    return cleaned_list_values

In [62]:
dataset_clean_corrected_wsd_2 = pd.read_csv("../datasets/tonetags_dataset_tumblr_clean_corrected_text_wsd_2.csv", index_col=0, converters={"text": str_text_to_list})

In [63]:
dataset_clean_corrected_wsd_2 = dataset_clean_corrected_wsd_2.dropna()

In [64]:
dataset_clean_corrected_wsd_2.context = dataset_clean_corrected_wsd_2.context.apply(str_context_to_list)

## 14.2. encode labels

In [65]:
dataset_clean_corrected_wsd_2.tags = dataset_clean_corrected_wsd_2.tags.apply(labels.index)

## 14.3. split

In [66]:
train_clean_corrected_wsd_2, test_clean_corrected_wsd_2 = train_test_split(dataset_clean_corrected_wsd_2, stratify=dataset_clean_corrected_wsd_2['tags'], test_size=0.2, random_state=42)

## 14.4. create datasets

In [67]:
train_dataset_clean_corrected_wsd_2, test_dataset_clean_corrected_wsd_2 = myDataset(train_clean_corrected_wsd_2), myDataset(test_clean_corrected_wsd_2)

## 14.5. create dataloaders

In [68]:
train_dataloader_clean_corrected_wsd_2 = DataLoader(train_dataset_clean_corrected_wsd_2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

test_dataloader_clean_corrected_wsd_2 = DataLoader(test_dataset_clean_corrected_wsd_2, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, drop_last=True)

In [69]:
class ToneTagsLSTM_wsd_2(nn.Module):
    def __init__(self, vocab_size, embedding, hidden_dim, context_dim, output_size, num_layers, dropout):
        super(ToneTagsLSTM_wsd_2, self).__init__()

        # output_size = 19

        self.embedding = embedding

        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True, dropout=dropout, batch_first=True)

        self.fc1 = nn.Linear(hidden_dim * max_length * 2 + context_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, output_size)
        # self.out = nn.Softmax(output_size, dim=1)


    def forward(self, tokens, contexts):

        embedded = self.embedding(tokens)
        output, (hidden, cell) = self.lstm(embedded)

        lstm_out = torch.cat((output.reshape(batch_size, -1), contexts), dim=1)

        fc1_out = self.fc1(lstm_out)


        fc2_out = self.fc2(fc1_out)
        out = self.fc3(fc2_out)
        # out = self.out(fc3_out)

        return out

## 14.6. parameters for clean_corrected_wsd_2

In [70]:
# train_dataloader_clean_corrected_wsd_2
# test_dataloader_clean_corrected_wsd_2

vocab_size = len(vocab)
embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=True)
hidden_dim = 30 # 30
context_dim = 50 # 50
output_size = len(labels) # 19
num_layers = 4
dropout = 0.2
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lr = 5e-4

criterion = nn.CrossEntropyLoss()

## 14.7. create instance of model and optimizer to clean_corrected_wsd_2 data

In [71]:
# train_dataloader_clean_corrected_wsd_2
# test_dataloader_clean_corrected_wsd_2

model_clean_corrected_wsd_2 = ToneTagsLSTM_wsd_2(
    vocab_size=vocab_size,
    embedding=embedding,
    hidden_dim=hidden_dim,
    context_dim=context_dim,
    output_size=output_size,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer_clean_corrected_wsd_2 = optim.Adam(model_clean_corrected_wsd_2.parameters(), lr=lr)

## 14.8. train and test to clean_corrected_wsd_2

In [72]:
%%time

for ep in range(1, n_epochs + 1):

    model_clean_corrected_wsd_2.train()
    epoch_losses_train = []

    train_predictions = []
    train_tags = []

    test_predictions = []
    test_tags = []


    for tokens, contexts, tags in tqdm(train_dataloader_clean_corrected_wsd_2, desc=f"Epoch {ep} training..."):
        optimizer_clean_corrected_wsd_2.zero_grad()

        tokens = tokens.to(device)
        contexts = contexts.to(device)
        tags = tags.to(device)

        predictions = model_clean_corrected_wsd_2(tokens, contexts)

        train_predictions.extend(predictions.tolist())
        train_tags.extend(tags.tolist())

        loss = criterion(predictions, tags)

        loss.backward()
        optimizer_clean_corrected_wsd_2.step()

        epoch_losses_train.append(loss.item())

    del tokens
    del contexts
    del tags

    gc.collect()
    torch.cuda.empty_cache()

    print(f'[Train Epoch {ep}] Loss: {np.mean(epoch_losses_train)}')


    model_clean_corrected_wsd_2.eval()

    with torch.no_grad():
        for tokens, contexts, tags in tqdm(test_dataloader_clean_corrected_wsd_2, desc=f"Epoch {ep} testing..."):

            tokens = tokens.to(device)
            contexts = contexts.to(device)
            tags = tags.to(device)

            predictions = model_clean_corrected_wsd_2(tokens, contexts)

            test_predictions.extend(predictions.tolist())
            test_tags.extend(tags.tolist())

        del tokens
        del contexts
        del tags

        gc.collect()
        torch.cuda.empty_cache()

    export_predictions('clean_corrected_wsd_2_grove_twitter_27b_50d', epoch=ep, train_predictions=train_predictions, train_tags=train_tags, test_predictions=test_predictions, test_tags=test_tags)


    del train_predictions, train_tags, test_predictions, test_tags
    gc.collect()
    torch.cuda.empty_cache()

training...: 100%|██████████| 2084/2084 [06:33<00:00,  5.29it/s]


[Train Epoch 1] Loss: 2.7682194369997037


testing...: 100%|██████████| 521/521 [00:35<00:00, 14.61it/s]
training...: 100%|██████████| 2084/2084 [06:01<00:00,  5.77it/s]


[Train Epoch 2] Loss: 2.4662772797462806


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.35it/s]
training...: 100%|██████████| 2084/2084 [06:04<00:00,  5.72it/s]


[Train Epoch 3] Loss: 2.430239568344691


testing...: 100%|██████████| 521/521 [00:28<00:00, 18.05it/s]
training...: 100%|██████████| 2084/2084 [06:06<00:00,  5.69it/s]


[Train Epoch 4] Loss: 2.4172535695407302


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.47it/s]
training...: 100%|██████████| 2084/2084 [06:10<00:00,  5.62it/s]


[Train Epoch 5] Loss: 2.4135968928831324


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.90it/s]
training...: 100%|██████████| 2084/2084 [06:06<00:00,  5.68it/s]


[Train Epoch 6] Loss: 2.4088925724981385


testing...: 100%|██████████| 521/521 [00:28<00:00, 17.98it/s]
training...: 100%|██████████| 2084/2084 [06:07<00:00,  5.67it/s]


[Train Epoch 7] Loss: 2.405474543857483


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.90it/s]
training...: 100%|██████████| 2084/2084 [06:06<00:00,  5.69it/s]


[Train Epoch 8] Loss: 2.401792000442915


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.91it/s]
training...: 100%|██████████| 2084/2084 [06:07<00:00,  5.68it/s]


[Train Epoch 9] Loss: 2.3970668214639637


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.93it/s]
training...: 100%|██████████| 2084/2084 [06:07<00:00,  5.67it/s]


[Train Epoch 10] Loss: 2.38907526968308


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.85it/s]
training...: 100%|██████████| 2084/2084 [06:06<00:00,  5.69it/s]


[Train Epoch 11] Loss: 2.368809774546614


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.47it/s]
training...: 100%|██████████| 2084/2084 [06:12<00:00,  5.59it/s]


[Train Epoch 12] Loss: 2.3387055324386954


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]
training...: 100%|██████████| 2084/2084 [06:13<00:00,  5.58it/s]


[Train Epoch 13] Loss: 2.3069427030557863


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]
training...: 100%|██████████| 2084/2084 [06:11<00:00,  5.61it/s]


[Train Epoch 14] Loss: 2.2823664967020734


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.57it/s]
training...: 100%|██████████| 2084/2084 [06:11<00:00,  5.61it/s]


[Train Epoch 15] Loss: 2.2511108836834812


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.58it/s]
training...: 100%|██████████| 2084/2084 [06:11<00:00,  5.61it/s]


[Train Epoch 16] Loss: 2.22712613862444


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.89it/s]
training...: 100%|██████████| 2084/2084 [06:09<00:00,  5.64it/s]


[Train Epoch 17] Loss: 2.2023610246158607


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.69it/s]
training...: 100%|██████████| 2084/2084 [06:07<00:00,  5.67it/s]


[Train Epoch 18] Loss: 2.176966857429658


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.81it/s]
training...: 100%|██████████| 2084/2084 [06:09<00:00,  5.65it/s]


[Train Epoch 19] Loss: 2.163046132694508


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.60it/s]
training...: 100%|██████████| 2084/2084 [06:10<00:00,  5.62it/s]


[Train Epoch 20] Loss: 2.1386220577010264


testing...: 100%|██████████| 521/521 [00:29<00:00, 17.78it/s]


CPU times: total: 2h 14min 24s
Wall time: 2h 14min 36s


## 14.9. delete some data

In [73]:
del train_dataloader_clean_corrected_wsd_2
del test_dataloader_clean_corrected_wsd_2
del train_dataset_clean_corrected_wsd_2
del test_dataset_clean_corrected_wsd_2
del train_clean_corrected_wsd_2
del test_clean_corrected_wsd_2
del dataset_clean_corrected_wsd_2

del model_clean_corrected_wsd_2
del optimizer_clean_corrected_wsd_2
gc.collect()
torch.cuda.empty_cache()