<a href="https://colab.research.google.com/github/amaslov455/nlp_project/blob/main/CNN_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os

import torch
from torchtext.legacy import data
from torchtext.legacy import datasets
import random
import torch.optim as optim

In [2]:
PATH_TO_FOLDER = '/content/drive/MyDrive/diplom_project/data'
SEED = 1234

### Create CNN model and all functions to use it

In [3]:
# Create model

import torch.nn as nn
import torch.nn.functional as F

class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        
        #text = [sent len, batch size]
        
        text = text.permute(1, 0)
                
        #text = [batch size, sent len]

        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [4]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    top_pred = preds.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [5]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.text)
        
        loss = criterion(predictions, batch.label)
        
        acc = categorical_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [6]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text)
            
            loss = criterion(predictions, batch.label)
            
            acc = categorical_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [7]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


### Use model

In [8]:
import pandas as pd
df_check = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_train.csv'))

list_of_columns = [x for x in list(df_check.columns) if x not in ['sentence', 'santiment']]

print(list_of_columns)

['joined_spm_500', 'joined_spm_1000', 'joined_spm_1500', 'joined_spm_2000', 'joined_spm_2500', 'joined_spm_3000', 'joined_spm_3500', 'joined_spm_4000', 'joined_spm_4500', 'joined_spm_5000', 'joined_spm_5500', 'joined_spm_6000', 'joined_spm_6500', 'joined_spm_7000', 'joined_spm_7500', 'joined_spm_8000', 'joined_spm_8500', 'joined_spm_9000', 'joined_spm_9500', 'joined_spm_10000', 'joined_spm_10500', 'joined_spm_11000']


In [9]:
df_train = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_train.csv'))
df_test = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_test.csv'))
df_valid = pd.read_csv(os.path.join(PATH_TO_FOLDER, 'SST5_SPM_valid.csv'))

print(f'Length train dataframe: {len(df_train)}')
print(f'Length test dataframe: {len(df_test)}')
print(f'Length validation dataframe: {len(df_valid)}')

Length train dataframe: 8544
Length test dataframe: 2210
Length validation dataframe: 1101


In [10]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [11]:
dict_w_results = {
    'name':[],
    'number_of_tokens':[],
    'test_acc':[],
    'best_valid_acc':[],
    'epoches':[],
    'time_all_secs':[],
    'time_for_epoch_secs':[]
}

BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))

for current_column in list_of_columns:
    torch.manual_seed(SEED)
    torch.backends.cudnn.deterministic = True

    ## Load data
    tokenize = lambda x: x.split()

    TEXT = data.Field(sequential=True, use_vocab=True, tokenize=tokenize)
    LABEL = data.LabelField()

    fields_data = {current_column:('text', TEXT),'santiment':('label', LABEL)} 

    train_data, valid_data, test_data = data.TabularDataset.splits(
        path = PATH_TO_FOLDER,
        train = 'SST5_SPM_train.csv',
        test = 'SST5_SPM_test.csv',
        validation = 'SST5_SPM_valid.csv',
        format = 'csv',
        fields = fields_data
    )

    ## Create vocab
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)

    ## Create iterator
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        sort = False,
        batch_size = BATCH_SIZE, 
        device = device)

    ## Create model CNN
    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [2,3,4]
    OUTPUT_DIM = len(LABEL.vocab)
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)

    ## Count trainable parameters of our model


    # print(f'The model has {count_parameters(model):,} trainable parameters')

    ## Set vectors PAD and UNK tokens
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    ## Set optimizer and loss function
    optimizer = optim.Adam(model.parameters())

    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    MIN_EPOCHS = 5
    MAX_EPOCHS = 12

    best_valid_loss = float('inf')
    best_valid_acc = float(0)
    time_all = 0

    #Loop train model
    for epoch in range(MAX_EPOCHS):
        start_time = time.time()
        
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
        end_time = time.time()

        time_all = time_all + end_time - start_time
        
        if epoch > MIN_EPOCHS:
          if valid_loss >= best_valid_loss:
            break
          else:
            best_valid_loss = valid_loss
            best_valid_acc = valid_acc

        # if valid_loss < best_valid_loss:
        #     best_valid_loss = valid_loss
        #     torch.save(model.state_dict(), 'tut5-model.pt')
        
        # print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        # print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        # print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

        # print(cur_vocab_words, valid_acc, valid_loss)

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'{current_column} - Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

    dict_w_results['name'].append(current_column)
    dict_w_results['number_of_tokens'].append(len(TEXT.vocab))
    dict_w_results['test_acc'].append('%.3f' % test_acc)
    dict_w_results['best_valid_acc'].append('%.3f' % best_valid_acc)
    dict_w_results['epoches'].append(epoch + 1)
    dict_w_results['time_all_secs'].append('%.3f' % time_all)
    dict_w_results['time_for_epoch_secs'].append('%.3f' % (time_all / (epoch + 1)))

device: cuda
Tesla P100-PCIE-16GB
joined_spm_500 - Test Loss: 1.403 | Test Acc: 37.25%
joined_spm_1000 - Test Loss: 1.401 | Test Acc: 37.96%
joined_spm_1500 - Test Loss: 1.443 | Test Acc: 35.30%
joined_spm_2000 - Test Loss: 1.432 | Test Acc: 36.81%
joined_spm_2500 - Test Loss: 1.463 | Test Acc: 36.59%
joined_spm_3000 - Test Loss: 1.458 | Test Acc: 36.52%
joined_spm_3500 - Test Loss: 1.432 | Test Acc: 36.64%
joined_spm_4000 - Test Loss: 1.418 | Test Acc: 37.70%
joined_spm_4500 - Test Loss: 1.446 | Test Acc: 37.02%
joined_spm_5000 - Test Loss: 1.458 | Test Acc: 36.36%
joined_spm_5500 - Test Loss: 1.411 | Test Acc: 37.74%
joined_spm_6000 - Test Loss: 1.438 | Test Acc: 36.95%
joined_spm_6500 - Test Loss: 1.447 | Test Acc: 37.92%
joined_spm_7000 - Test Loss: 1.451 | Test Acc: 37.31%
joined_spm_7500 - Test Loss: 1.467 | Test Acc: 37.51%
joined_spm_8000 - Test Loss: 1.457 | Test Acc: 38.26%
joined_spm_8500 - Test Loss: 1.432 | Test Acc: 37.30%
joined_spm_9000 - Test Loss: 1.426 | Test Acc: 36

In [12]:
# Export results to CSV
df_to_export = pd.DataFrame.from_dict(dict_w_results)
df_to_export.to_csv(os.path.join(PATH_TO_FOLDER, 'CNN_results.csv'), index = False)

In [14]:
df_to_export.applymap(lambda x: str(x).replace('.',','))

Unnamed: 0,name,number_of_tokens,test_acc,best_valid_acc,epoches,time_all_secs,time_for_epoch_secs
0,joined_spm_500,542,373,389,10,11727,1173
1,joined_spm_1000,1042,380,384,9,9535,1059
2,joined_spm_1500,1543,353,403,9,9187,1021
3,joined_spm_2000,2043,368,374,9,9057,1006
4,joined_spm_2500,2543,366,410,9,9051,1006
5,joined_spm_3000,3043,365,364,9,9166,1018
6,joined_spm_3500,3542,366,394,9,9294,1033
7,joined_spm_4000,4043,377,398,9,9250,1028
8,joined_spm_4500,4541,370,405,9,9452,1050
9,joined_spm_5000,5042,364,366,9,9654,1073
