In [68]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from torchmetrics import F1Score

import pandas as pd
import numpy as np
import re

from torch.utils.data import random_split

from textfn import *
from classes import *


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# Read the preprocessed training set from disk.
df = pd.read_csv('dataset/train_processed.csv')
# Each 'text' cell is run through the regex below, which deletes single quotes,
# square brackets and whitespace; what remains is split on commas into a list.
# NOTE(review): presumably the column holds stringified token lists — confirm
# against the preprocessing step that wrote the CSV.
df['text'] = df['text'].map(
    lambda raw: re.sub(r'\'|\[|\]|\s', '', raw).split(',')
)

In [63]:
# Single configuration dict for the whole notebook.
# (A previous, smaller `params` dict defined right above this one was dead code:
# it was overwritten immediately and carried a conflicting max_seq_length.)
params = {
    # Global
    'num_epochs': 100,
    'batch_size': 32,
    # Use the first CUDA device when one is available, otherwise the CPU.
    'device': torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    'split_seed': 42,
    # Fraction handed to the train split; the dev split gets the complement.
    'train_dev_split': 0.65,
    # Vocabulary
    'vocab_size': 50000,
    'embedding_dim': 50,
    'max_seq_length': 20,
    # Model
    'hidden_dim': 256,
    'output_dim': 1,
    # Optimizer
    'optim_lr': 0.01,
}

The vocabulary needs to be selected, either:

- From GloVe, by looping over the embedding file and loading its entries
- From the training dataset, by word count and usage or other metrics (to select a defined number of words to translate)

In [66]:
def _read_glove(glove_path, vocab_size, embedding_dim, keep=None):
    """Fill an embedding matrix with vectors read from a GloVe text file.

    Args:
        glove_path: Path to a text file with one ``word v1 v2 ...`` entry per line.
        vocab_size: Number of rows in the matrix and maximum number of words stored.
        embedding_dim: Number of values expected per word vector.
        keep: Optional collection of words. When given, words outside it are
            skipped and do not consume a row.

    Returns:
        ``(embeddings, word_to_index)``: a ``(vocab_size, embedding_dim)`` NumPy
        array and a dict mapping each stored word to its row. Rows that were
        never filled stay all-zero.
    """
    embeddings = np.zeros((vocab_size, embedding_dim))
    word_to_index = {}
    index = 0
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            if index >= vocab_size:
                break
            values = line.split()
            if not values:
                # Blank line: nothing to parse.
                continue
            word = values[0]
            if keep is not None and word not in keep:
                continue
            # float32 keeps the precision of the pretrained values; float16 would
            # round them to roughly three significant digits.
            embeddings[index] = np.asarray(values[1:], dtype="float32")
            word_to_index[word] = index
            # Advance only when a row was actually written, so indices stay dense.
            index += 1
    return embeddings, word_to_index


def load_vocab(vocab_size, df, glove_path, embedding_dim, load_type='1'):
    """Build an embedding tensor and a word-to-row index from a GloVe file.

    Args:
        vocab_size: Requested number of words / embedding rows.
        df: Object whose ``df['text']`` iterates over sentences, each an
            iterable of words (only used when ``load_type == '1'``).
        glove_path: Path to the GloVe text file.
        embedding_dim: Dimension of the vectors stored in ``glove_path``.
        load_type: ``'1'`` keeps the ``vocab_size`` most frequent words of
            ``df['text']`` that also appear in the GloVe file; ``'2'`` keeps the
            first ``vocab_size`` entries of the GloVe file.

    Returns:
        ``(vocab_size, embeddings, word_to_index)``. ``vocab_size`` is lowered to
        the number of distinct dataset words when ``load_type == '1'`` and the
        request exceeds it. ``embeddings`` is a float64 tensor of shape
        ``(vocab_size, embedding_dim)``; rows with no matching GloVe entry are
        zero. ``word_to_index`` maps each stored word to its row.

    Raises:
        ValueError: If ``load_type`` is neither ``'1'`` nor ``'2'``.
    """
    from collections import Counter

    if load_type == '1':
        # Count word usage across the dataset in a single pass.
        word_counts = Counter(word for sentence in df['text'] for word in sentence)
        if len(word_counts) < vocab_size:
            print('[I] - Vocab size given is too high, scalling down from {} to {}'.format(vocab_size, len(word_counts)))
            vocab_size = len(word_counts)
        # A set gives O(1) membership tests while scanning the GloVe file.
        # Ties at the cut-off are resolved by first appearance in the dataset.
        top_words = {word for word, _ in word_counts.most_common(vocab_size)}
        embeddings, word_to_index = _read_glove(glove_path, vocab_size, embedding_dim, keep=top_words)
    elif load_type == '2':
        embeddings, word_to_index = _read_glove(glove_path, vocab_size, embedding_dim)
    else:
        raise ValueError("load_type must be '1' or '2', got {!r}".format(load_type))

    return vocab_size, torch.tensor(embeddings), word_to_index

# Build the embedding matrix and the word index from the GloVe file whose
# dimension matches the configured 'embedding_dim'; load_type '2' is selected.
vocab_size, vocab, word_to_index = load_vocab(
    vocab_size=params['vocab_size'],
    df=df,
    glove_path='glove_pretrained/glove.6B.{}d.txt'.format(params["embedding_dim"]),
    embedding_dim=params["embedding_dim"],
    load_type='2',
)

TODO: get the vocab size from the dataframe and match each word with its corresponding vector, or with the UNK/PAD tokens.


In [82]:
# Wrap the dataframe in the project's CdatasetGlove dataset, passing the
# configured maximum sequence length and train=True.
# NOTE(review): presumably sequences are padded/truncated to that length —
# confirm in the CdatasetGlove definition.
train_data = CdatasetGlove(df, params['max_seq_length'], train=True)

In [83]:
# Seeded random split into a train part and a dev part, sized by the
# 'train_dev_split' fraction and its complement.
# NOTE(review): `train_data` is both the input and an output here, so re-running
# only this cell would split the already-split subset again.
train_data, dev_data = random_split(
    train_data,
    [params['train_dev_split'], 1-params['train_dev_split']],
    generator=torch.Generator().manual_seed(params['split_seed']),
)

In [84]:
# Training batches are reshuffled on every pass over the loader.
train_loader = DataLoader(
    dataset=train_data,
    batch_size=params['batch_size'],
    shuffle=True,
)
# Dev batches keep a fixed order.
dev_loader = DataLoader(
    dataset=dev_data,
    batch_size=params['batch_size'],
    shuffle=False,
)

In [87]:
# Build the model and move it to the configured device BEFORE creating the
# optimizer, so the optimizer tracks the on-device parameters. The batches and
# the metric are sent to params['device'] as well, so the model must live there
# too or the forward pass fails on a CUDA machine.
# NOTE(review): assumes SimpleLSTMGloVe is a torch.nn.Module (it is used via
# .parameters(), .train() and call syntax) — confirm in classes.py.
model = SimpleLSTMGloVe(params, vocab).to(params['device'])

# Training components looked up by key in `train`.
model_params = {
    'optimizer': torch.optim.Adam(model.parameters(), lr=params['optim_lr']),
    # NOTE(review): BCELoss expects probabilities in [0, 1]; presumably the
    # model ends with a sigmoid — confirm.
    'loss_fn'  : torch.nn.BCELoss(),
    'metric'   : F1Score(task='binary').to(params['device'])
}

[autoreload of classes failed: Traceback (most recent call last):
  File "c:\Users\Shadow\anaconda3\envs\tgpu\Lib\site-packages\IPython\extensions\autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "c:\Users\Shadow\anaconda3\envs\tgpu\Lib\site-packages\IPython\extensions\autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "c:\Users\Shadow\anaconda3\envs\tgpu\Lib\site-packages\IPython\extensions\autoreload.py", line 397, in update_generic
    update(a, b)
  File "c:\Users\Shadow\anaconda3\envs\tgpu\Lib\site-packages\IPython\extensions\autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "c:\Users\Shadow\anaconda3\envs\tgpu\Lib\site-packages\IPython\extensions\autoreload.py", line 323, in update_instances
    object.__setattr__(ref, "__class__", new)
TypeError: __class__ assignment: 'CdatasetGlove' object layout differs from 'CdatasetGlove'
]


NameError: name 'SimpleLSTMGloVe' is not defined

In [None]:
def train(model, loader, params, model_params):
    """Train ``model`` over ``loader`` for ``params['num_epochs']`` epochs.

    Args:
        model: Callable module exposing ``train()``; called as ``model(X)``.
        loader: Sized iterable yielding ``(X, Y)`` batches of tensors.
        params: Dict providing ``'num_epochs'`` and ``'device'``.
        model_params: Dict providing ``'optimizer'``, ``'loss_fn'`` and
            ``'metric'``; the last two are called as ``fn(Yhat, Y)``.

    Returns:
        ``(model, accs, losses)`` where ``accs`` and ``losses`` are lists of
        Python floats holding the per-epoch mean of the metric and of the loss
        over all batches.

    Raises:
        ValueError: If ``loader`` yields no batches (the per-epoch means would
            otherwise divide by zero).
    """
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("loader yields no batches; cannot train on an empty dataset")

    optimizer = model_params['optimizer']
    loss_fn = model_params['loss_fn']
    metric = model_params['metric']
    device = params['device']

    accs = []
    losses = []
    for _ in range(params['num_epochs']):
        model.train()
        epoch_acc = 0.0
        epoch_loss = 0.0

        for X, Y in loader:
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()
            Yhat = model(X)
            loss = loss_fn(Yhat, Y)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            # Detach so the metric never holds on to the autograd graph, and
            # convert to a plain float so the history is device-independent.
            epoch_acc += float(metric(Yhat.detach(), Y))

        accs.append(epoch_acc / num_batches)
        losses.append(epoch_loss / num_batches)
    return model, accs, losses
