<a href="https://colab.research.google.com/github/amaslov455/nlp_project/blob/main/LSTM_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import torch.optim as optim

In [3]:
PATH_TO_FOLDER = '/content/drive/MyDrive/diplom_project/data'
SEED = 1234

### Create LSTM model and all functions to use it

In [4]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.rnn = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        #text = [sent len, batch size]
        
        embedded = self.dropout(self.embedding(text))
        
        #embedded = [sent len, batch size, emb dim]
        
        #pack sequence
        # lengths need to be on CPU!
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'), enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        
        #hidden = [num layers * num directions, batch size, hid dim]
        #cell = [num layers * num directions, batch size, hid dim]
        
        #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
        #and apply dropout
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
                
        #hidden = [batch size, hid dim * num directions]
        return self.fc(hidden)

In [5]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [6]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [7]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

### Prepare data and use model

In [8]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [9]:
import pandas as pd
df_check = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_train.csv'))

list_of_columns = [x for x in list(df_check.columns) if x not in ['sentence', 'santiment']]

print(list_of_columns)

['joined_spm_500', 'joined_spm_1000', 'joined_spm_1500', 'joined_spm_2000', 'joined_spm_2500', 'joined_spm_3000', 'joined_spm_3500', 'joined_spm_4000', 'joined_spm_4500', 'joined_spm_5000', 'joined_spm_5500', 'joined_spm_6000', 'joined_spm_6500', 'joined_spm_7000', 'joined_spm_7500', 'joined_spm_8000', 'joined_spm_8500', 'joined_spm_9000', 'joined_spm_9500', 'joined_spm_10000', 'joined_spm_10500', 'joined_spm_11000']


In [10]:
df_train = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_train.csv'))
df_test = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_test.csv'))
df_valid = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_valid.csv'))

print(f'Length train dataframe: {len(df_train)}')
print(f'Length test dataframe: {len(df_test)}')
print(f'Length validation dataframe: {len(df_valid)}')

Length train dataframe: 8544
Length test dataframe: 2210
Length validation dataframe: 1101


In [11]:
import time

dict_w_results = {
    'name':[],
    'number_of_tokens':[],
    'test_acc':[],
    'best_valid_acc':[],
    'epoches':[],
    'time_all_secs':[],
    'time_for_epoch_secs':[]
}

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

for current_column in list_of_columns:
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    ## Load data
    tokenize = lambda x: x.split()

    TEXT = data.Field(sequential=True,
                      use_vocab=True,
                      tokenize=tokenize,
                      include_lengths=True)
    LABEL = data.LabelField()

    fields_data = {current_column:('text', TEXT),'santiment':('label', LABEL)} 

    train_data, valid_data, test_data = data.TabularDataset.splits(
        path = PATH_TO_FOLDER,
        train = 'SST5_SPM_train.csv',
        test = 'SST5_SPM_test.csv',
        validation = 'SST5_SPM_valid.csv',
        format = 'csv',
        fields = fields_data
    )

    ## Create vocab
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)

    ## Create iterator
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        sort = False,
        batch_size = BATCH_SIZE,
        device = device)

    ## Create model RNN LSTM
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    HIDDEN_DIM = 256
    OUTPUT_DIM = len(LABEL.vocab)
    N_LAYERS = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = RNN(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)

    ## Count trainable parameters of our model


    # print(f'The model has {count_parameters(model):,} trainable parameters')

    ## Set vectors PAD and UNK tokens
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    ## Set optimizer and loss function
    optimizer = optim.Adam(model.parameters())

    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    MIN_EPOCHS = 5
    MAX_EPOCHS = 12

    best_valid_loss = float('inf')
    best_valid_acc = float(0)
    time_all = 0

    #Loop train model
    for epoch in range(MAX_EPOCHS):
        start_time = time.time()
        
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        end_time = time.time()

        time_all = time_all + end_time - start_time
        
        if epoch > MIN_EPOCHS:
          if valid_loss >= best_valid_loss:
            break
          else:
            best_valid_loss = valid_loss
            best_valid_acc = valid_acc

        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'tut5-model.pt')
        
        # print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        # print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

        # print(cur_vocab_words, valid_acc, valid_loss)

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'{current_column} - Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    dict_w_results['name'].append(current_column)
    dict_w_results['number_of_tokens'].append(len(TEXT.vocab))
    dict_w_results['test_acc'].append('%.3f' % test_acc)
    dict_w_results['best_valid_acc'].append('%.3f' % best_valid_acc)
    dict_w_results['epoches'].append(epoch + 1)
    dict_w_results['time_all_secs'].append('%.3f' % time_all)
    dict_w_results['time_for_epoch_secs'].append('%.3f' % (time_all / (epoch + 1)))

device: cuda
Tesla T4
joined_spm_500 - Test Loss: 1.466 | Test Acc: 37.11%
joined_spm_1000 - Test Loss: 1.491 | Test Acc: 35.70%
joined_spm_1500 - Test Loss: 1.489 | Test Acc: 37.22%
joined_spm_2000 - Test Loss: 1.454 | Test Acc: 37.42%
joined_spm_2500 - Test Loss: 1.456 | Test Acc: 39.21%
joined_spm_3000 - Test Loss: 1.460 | Test Acc: 37.97%
joined_spm_3500 - Test Loss: 1.467 | Test Acc: 39.02%
joined_spm_4000 - Test Loss: 1.440 | Test Acc: 39.07%
joined_spm_4500 - Test Loss: 1.461 | Test Acc: 39.26%
joined_spm_5000 - Test Loss: 1.441 | Test Acc: 38.32%
joined_spm_5500 - Test Loss: 1.488 | Test Acc: 36.71%
joined_spm_6000 - Test Loss: 1.455 | Test Acc: 37.13%
joined_spm_6500 - Test Loss: 1.449 | Test Acc: 37.93%
joined_spm_7000 - Test Loss: 1.431 | Test Acc: 38.87%
joined_spm_7500 - Test Loss: 1.457 | Test Acc: 38.39%
joined_spm_8000 - Test Loss: 1.455 | Test Acc: 37.54%
joined_spm_8500 - Test Loss: 1.417 | Test Acc: 40.82%
joined_spm_9000 - Test Loss: 1.461 | Test Acc: 40.20%
joined_

In [12]:
df_to_export = pd.DataFrame.from_dict(dict_w_results)
# df_to_export.to_csv(os.path.join(PATH_TO_FOLDER, 'LSTM_results.csv'), index = False)

In [13]:
df_to_export.applymap(lambda x: str(x).replace('.',','))

Unnamed: 0,name,number_of_tokens,test_acc,best_valid_acc,epoches,time_all_secs,time_for_epoch_secs
0,joined_spm_500,542,371,332,9,49044,5449
1,joined_spm_1000,1042,357,393,9,43947,4883
2,joined_spm_1500,1543,372,381,10,46262,4626
3,joined_spm_2000,2043,374,378,9,39703,4411
4,joined_spm_2500,2543,392,351,8,34316,4290
5,joined_spm_3000,3043,380,364,9,37637,4182
6,joined_spm_3500,3542,390,374,8,32633,4079
7,joined_spm_4000,4043,391,361,8,32104,4013
8,joined_spm_4500,4541,393,379,8,31722,3965
9,joined_spm_5000,5042,383,369,8,31283,3910
