In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam, SGD

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from torchtext import *
from torchtext.data import *

import nltk
#nltk.download('punkt')
from nltk import word_tokenize

# Sentence column: tokenised with NLTK's word_tokenize and lower-cased.
# include_lengths=True makes each batch carry a (token_ids, lengths) pair,
# and batch_first=True puts the batch dimension first.
txt_field = data.Field(
    tokenize=word_tokenize,
    lower=True,
    include_lengths=True,
    batch_first=True,
)
# Label column: one non-sequential value per example, used without a vocabulary.
label_field = data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
)

# Load both CSV files from the working directory. Columns are mapped in file
# order to (label, sentence); skip_header=True is passed for the header row.
train, test = TabularDataset.splits(
    path='./',
    train='train.csv',
    test='test.csv',
    format='csv',
    fields=[('label', label_field), ('sentence', txt_field)],
    skip_header=True,
)

# The vocabulary is built from the training split only (min_freq=5).
txt_field.build_vocab(train, min_freq=5)

# Batches of 32, keyed on sentence length and sorted by it inside each batch.
train_iter, test_iter = data.BucketIterator.splits(
    (train, test),
    batch_size=32,
    sort_key=lambda example: len(example.sentence),
    sort_within_batch=True,
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Split sizes, vocabulary size and one raw example from each split.
print(f'Number of training samples: {len(train.examples)}')
print(f'Number of testing samples: {len(test.examples)}')
print(f"Unique tokens in TEXT vocabulary: {len(txt_field.vocab)}")
print(f'Example of training data:\n {vars(train.examples[0])}\n')
print(f'Example of testing data:\n {vars(test.examples[1])}\n')

# Pull the first batch from the training iterator and unpack the
# (token ids, lengths) pair stored under `sentence`.
batch = next(iter(train_iter))
sent, sent_len = batch.sentence

# Each tensor is followed by a blank line, except the last one.
print('label tensor', batch.label.shape)
print(batch.label, end='\n\n')
print('sentence length tensor', sent_len.shape)
print(sent_len, end='\n\n')
print('sentence tensor', sent.shape)
print(sent)

Number of training samples: 40000
Number of testing samples: 3000
Unique tokens in TEXT vocabulary: 38331
Example of training data:
 {'label': '1', 'sentence': ['i', 'guess', 'those', 'who', 'have', 'been', 'in', 'a', 'one-sided', 'relationship', 'of', 'some', 'sort', 'before', 'will', 'be', 'able', 'identify', 'with', 'the', 'lead', 'character', 'minako', 'yuko', 'tanaka', ',', 'a', '50', 'year', 'old', 'woman', 'who', 'is', 'still', 'in', 'the', 'pink', 'of', 'good', 'health', ',', 'as', 'demonstrated', 'by', 'her', 'daily', ',', 'grinding', 'routine', 'of', 'waking', 'up', 'extremely', 'early', 'in', 'the', 'morning', 'to', 'prepare', 'for', 'her', 'milk', 'delivery', 'work', ',', 'where', 'she', 'has', 'to', 'lug', 'bottles', 'of', 'megmilk', 'in', 'a', 'bag', 'in', 'a', 'route', 'around', 'her', 'town', 'like', 'clockwork', ',', 'to', 'exchange', 'empty', 'bottles', 'for', 'full', 'ones', ',', 'and', 'to', 'collect', 'payment', 'and', 'issue', 'receipt', '.', 'and', 'there', "'s",

In [5]:
class Text_RNN(nn.Module):
    """Multi-layer RNN sentence classifier.

    Token ids are embedded, fed through ``nn.RNN`` as a packed sequence, and
    the final hidden state of the last layer goes through dropout, a linear
    layer and a sigmoid.

    Args:
        n_vocab: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_hidden: hidden size of the RNN.
        n_layers: number of stacked RNN layers.
        dropout: dropout probability, used inside the RNN and before the
            linear layer.
        output_size: number of output units of the linear layer.
    """

    def __init__(self, n_vocab, embedding_dim, n_hidden, n_layers, dropout, output_size):
        super().__init__()
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.emb = nn.Embedding(n_vocab, embedding_dim)
        self.rnn = nn.RNN(
                input_size=embedding_dim,
                hidden_size=n_hidden,
                num_layers=n_layers,
                dropout=dropout,
                batch_first=True
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(n_hidden, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sent, sent_len):
        """Return one sigmoid output row per sentence.

        Args:
            sent: token ids, shape (batch_size, max_sent_len).
            sent_len: true length of every row of ``sent``, shape (batch_size,).

        Returns:
            Tensor of shape (batch_size, output_size) with values in [0, 1].
        """
        sent_emb = self.emb(sent)  # batch_size, max_sent_len, embedding_dim

        # pack_padded_sequence needs the lengths on the CPU even when the
        # embeddings live on another device.
        lengths = torch.as_tensor(sent_len).cpu()

        # Packing lets the RNN stop at each sentence's real end, so padding
        # never reaches the hidden state. enforce_sorted=False accepts batches
        # in any length order; the hidden state comes back in input order.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            sent_emb, lengths, batch_first=True, enforce_sorted=False
        )
        # Only the final hidden state is used, so the per-step outputs are
        # never unpacked.
        _, hidden = self.rnn(packed_embedded)

        # hidden: n_layers, batch_size, n_hidden -> keep the last layer
        last_hidden = self.dropout(hidden[-1])
        return self.sigmoid(self.fc(last_hidden))

In [None]:
def save_checkpoint(save_path, model, optimizer, val_loss):
    """Write model state, optimizer state and a validation loss to disk.

    Args:
        save_path: destination file; when ``None`` nothing is written.
        model: object whose ``state_dict()`` is stored under 'model_state_dict'.
        optimizer: object whose ``state_dict()`` is stored under
            'optimizer_state_dict'.
        val_loss: value stored under 'val_loss'.

    Returns:
        None.
    """
    # Identity test: ``None`` is a singleton and ``==`` may be overloaded.
    if save_path is None:
        return

    checkpoint = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'val_loss': val_loss}
    torch.save(checkpoint, save_path)

    print(f'Model saved to ==> {save_path}')

def load_checkpoint(model, optimizer, save_path):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        model: receives the 'model_state_dict' entry via ``load_state_dict``.
        optimizer: receives the 'optimizer_state_dict' entry via
            ``load_state_dict``.
        save_path: checkpoint file to read with ``torch.load``.

    Returns:
        The value stored under 'val_loss' in the checkpoint.
    """
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # PyTorch version and its defaults; only load checkpoints you trust.
    checkpoint = torch.load(save_path)

    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    stored_val_loss = checkpoint['val_loss']

    print(f'Model loaded from <== {save_path}')
    return stored_val_loss


def TRAIN(model, train_loader, valid_loader,  num_epochs, eval_every, total_step, criterion, optimizer, val_loss, device, save_name):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    Each batch yielded by the loaders must expose ``.sentence`` as a
    ``(token_ids, lengths)`` pair and ``.label`` as a tensor.

    Args:
        model: module called as ``model(token_ids, lengths)``.
        train_loader: iterable of training batches, re-iterated every epoch.
        valid_loader: sized iterable of validation batches.
        num_epochs: number of passes over ``train_loader``.
        eval_every: run validation every this many optimisation steps.
        total_step: total number of steps; only shown in the progress line.
        criterion: loss callable ``criterion(outputs, labels)``.
        optimizer: optimizer updating ``model``'s parameters.
        val_loss: best validation loss so far, or ``None`` to start from +inf.
        device: device for the model, token ids and labels.
        save_name: checkpoint path handed to ``save_checkpoint``.

    Returns:
        None.
    """
    best_val_loss = float("Inf") if val_loss is None else val_loss
    running_loss = 0.0
    global_step = 0

    model.to(device)
    for epoch in range(num_epochs):  # loop over the dataset multiple times

        # Iterating a loader yields the batch itself. A (index, batch) pair
        # only appears when the loader is wrapped in enumerate().
        for batch in train_loader:
            model.train()  # validation below switches the model to eval mode

            # Move tensors one by one instead of calling .to() on the batch
            # object. The lengths are left where they are because sequence
            # packing expects them on the CPU.
            text, text_lengths = batch.sentence
            outputs = model(text.to(device), text_lengths)
            labels = _match_targets(batch.label.to(device), outputs)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1

            running_loss += loss.item()

            # Evaluate the model every `eval_every` steps
            if global_step % eval_every == 0:
                average_val_loss = _validation_loss(model, valid_loader, criterion, device)
                average_train_loss = running_loss / eval_every

                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, total_step, average_train_loss, average_val_loss))

                running_loss = 0.0
                if average_val_loss < best_val_loss:
                    best_val_loss = average_val_loss
                    save_checkpoint(save_name, model, optimizer, best_val_loss)

    print('Finished Training')


def _match_targets(labels, outputs):
    """Give ``labels`` the dtype and shape of ``outputs`` when they hold the same number of elements."""
    # Losses such as BCELoss need float targets shaped like the predictions,
    # e.g. integer labels of shape (batch,) against outputs of shape
    # (batch, 1). Class-index targets for multi-class outputs have a different
    # element count and pass through untouched.
    if outputs.is_floating_point() and labels.numel() == outputs.numel():
        return labels.to(outputs.dtype).reshape(outputs.shape)
    return labels


def _validation_loss(model, valid_loader, criterion, device):
    """Return the mean per-batch loss over ``valid_loader`` with the model in eval mode."""
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for val_batch in valid_loader:
            val_text, val_lengths = val_batch.sentence
            val_outputs = model(val_text.to(device), val_lengths)
            val_labels = _match_targets(val_batch.label.to(device), val_outputs)
            total_loss += criterion(val_outputs, val_labels).item()
    return total_loss / len(valid_loader)

In [8]:
# Everything in this cell is placed on the CPU.
device = 'cpu'

# Hyper-parameters; the vocabulary size comes from the text field's vocabulary.
n_vocab = len(txt_field.vocab)
embedding_dim = 64
n_hidden = 128
n_layers = 2
dropout = 0.5
lr = 0.001
output_size = 1

# Loss, model and optimizer.
criterion = nn.BCELoss()
model = Text_RNN(
    n_vocab=n_vocab,
    embedding_dim=embedding_dim,
    n_hidden=n_hidden,
    n_layers=n_layers,
    dropout=dropout,
    output_size=output_size,
).to(device)
optimizer = Adam(model.parameters(), lr=lr)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 5877062244 bytes. Buy new RAM!
