# Our first RNN model (The baseline model)

For **Visualization and Recommendation system teams**

Please first install following stuff to run this notebook:

1) Pytorch

pip3 install torch torchvision

2) TorchText

pip3 install torchtext

3) Then English model

sudo python3 -m spacy download en
pip3 install transformers


## Data preprocessing

Using TorchText is very helpful in data preprocessing of this dataset.

In [2]:
import torch
from torchtext import data
#import spacy
SEED = 1234
#import en_core_web_sm
#spacy = en_core_web_sm.load()

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

#nlp = spacy.load("en_core_web_sm")
#doc = nlp("This is a sentence.")

TEXT = data.Field(tokenize = 'spacy')
LABEL = data.LabelField(dtype = torch.float)
FILENAME = data.Field()

print (TEXT)
print("Hello")

<torchtext.data.field.Field object at 0x7ff84b78b908>
Hello


**Next I wrote custom class to include additional stuff for IMDB dataset.**

In [3]:
import os
import glob
import io




class IMDB(data.Dataset):

    urls = ['/home/usman/Downloads/aclImdb_v1.tar.gz']
    name = 'imdb'
    dirname = 'aclImdb'

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, path, text_field, label_field, **kwargs):
        """Create an IMDB dataset instance given a path and fields.
        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        #print(text_field)
        fname_data = data.RawField()
        
        fields = [('text', text_field), ('label', label_field), ('aname', fname_data)]
        examples = []
        #dddddddddd
        for label in ['pos', 'neg']:
            for fname in glob.iglob(os.path.join(path, label, '*.txt')):
                with io.open(fname, 'r', encoding="utf-8") as f:
                    text = f.readline()
                #print(fname)
                #print(text)
                
                examples.append(data.Example.fromlist([text, label,fname.split('aclImdb')[1]], fields))
                #print(examples[0].fname)
                #mmmmmmmmmmmmm

        super(IMDB, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, text_field, label_field, root='.data',
               train='train', test='test', **kwargs):
        """Create dataset objects for splits of the IMDB dataset.
        Arguments:
            text_field: The field that will be used for the sentence.
            label_field: The field that will be used for label data.
            root: Root dataset storage directory. Default is '.data'.
            train: The directory that contains the training examples
            test: The directory that contains the test examples
            Remaining keyword arguments: Passed to the splits method of
                Dataset.
        """
        
        return super(IMDB, cls).splits(
            root=root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

    @classmethod
    def iters(cls, batch_size=32, device=0, root='.data', vectors=None, **kwargs):
        """Create iterator objects for splits of the IMDB dataset.
        Arguments:
            batch_size: Batch_size
            device: Device to create batches on. Use - 1 for CPU and None for
                the currently active GPU device.
            root: The root directory that contains the imdb dataset subdirectory
            vectors: one of the available pretrained vectors or a list with each
                element one of the available pretrained vectors (see Vocab.load_vectors)
            Remaining keyword arguments: Passed to the splits method.
        """
        TEXT = data.Field()
        LABEL = data.Field(sequential=False)
        
        #usman123
        train, test = cls.splits(TEXT, LABEL, root=root, **kwargs)

        TEXT.build_vocab(train, vectors=vectors)
        LABEL.build_vocab(train)
        #yyyyyyyy
        return data.BucketIterator.splits(
            (train, test), batch_size=batch_size, device=device)

In [4]:
#from torchtext import datasets

train_data, test_data = IMDB.splits(TEXT, LABEL)

We can see how many examples are in each split by checking their length.

In [5]:
print('Total training examples:', len(train_data))
print('Total testing examples:', len(test_data))
print(vars(test_data.examples[0]))

Total training examples: 25000
Total testing examples: 25000
{'text': ['The', 'Ali', 'G', 'character', 'works', 'brilliantly', 'within', 'the', 'confines', 'of', 'a', 'comedy', 'show', ',', 'but', 'as', 'a', 'movie', ',', 'it', 'does', "n't", 'work', 'in', 'the', 'same', 'way.<br', '/><br', "/>Don't", 'get', 'me', 'wrong', '-', 'this', 'is', 'a', 'very', 'funny', 'movie', ',', 'full', 'of', 'biting', ',', 'witty', 'dialogue', ',', 'that', 'caricatures', 'the', 'modern', 'British', 'chav', 'wonderfully', 'well', ',', 'whilst', 'providing', 'the', 'viewer', 'with', 'a', 'hilarious', ',', 'if', 'unrealistic', 'story.<br', '/><br', '/>One', 'problem', 'with', 'this', 'film', 'is', 'that', 'the', 'script', 'and', 'content', 'is', 'either', 'fantastically', 'brilliant', ',', 'or', 'it', "'s", 'embarrassing', 'to', 'watch', '.', 'When', 'I', 'say', 'embarrassing', ',', 'I', 'do', "n't", 'mean', 'funny', 'embarrassing', 'a', 'la', 'Office', 'or', 'Extras', ',', 'but', 'rather', ',', 'you', "'l

One example to help you access the training and testing objects.

For **Viualization and Recommendation system Teams**:
you can use these objects for your working

Splitting training data into 75/25 train/valid split.

For **Recommendation system Teams**:
you can use these objects for your working

In [6]:
import random

train_data, valid_data = train_data.split(random_state = random.seed(SEED))

print('Total training examples:', len(train_data))
print('Total validation examples: ',len(valid_data))
print('Total testing examples: ',len(test_data))

Total training examples: 17500
Total validation examples:  7500
Total testing examples:  25000


**Now preparing data for the baseline NLP model**

In [7]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)
#FILENAME.build_vocab(train_data)

In [8]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(vars(train_data.examples[0]))
#usman:check iters
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    device = device)
print(vars(test_iterator))

{'text': ['<', 'br', '/><br', '/>Dull', 'Demi', ',', 'going', 'thru', 'the', 'motions', '.', 'Ditto', 'Prochnow', '.', 'Ominous', 'portents', 'that', 'elicit', 'yawns', '.', 'Michael', 'Biehn', 'trying', 'to', 'be', 'dynamic', ',', 'which', 'ai', "n't", 'his', 'shtick.<br', '/><br', '/>To', 'quote', 'Buffy', 'Summers', ',', '"', 'If', 'the', 'apocalypse', 'comes', '...', 'beep', 'me', '.', '"<br', '/><br', '/>Going', 'back', 'to', 'sleep', 'now', '.'], 'aname': '/train/neg/7944_2.txt', 'label': 'neg'}
{'batch_size_fn': None, 'train': False, 'random_shuffler': <torchtext.data.utils.RandomShuffler object at 0x7ff7b83e9f60>, '_iterations_this_epoch': 0, 'dataset': <__main__.IMDB object at 0x7ff7cd8f4358>, 'shuffle': False, 'iterations': 0, 'sort_within_batch': True, 'batch_size': 64, 'device': device(type='cpu'), 'repeat': False, 'sort': True, '_restored_from_state': False, '_random_state_this_epoch': None, 'sort_key': <function IMDB.sort_key at 0x7ff86002e488>}


**The baseline Model definition**

In [9]:
import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        
        super().__init__()
        
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
       
        embedded = self.embedding(text)
        
        output, hidden = self.rnn(embedded)
 
        assert torch.equal(output[-1,:,:], hidden.squeeze(0))
        
        return self.fc(hidden.squeeze(0))
    
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

Training process

In [13]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

#util funtions
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    #round predictions to the closest integer
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [14]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    #batch = next(iterator.__iter__())
    #print(batch.text)
    #print(batch.aname)
    #print(batch.text)
    #usman: check eval
    
    for batch in iterator:
            
        optimizer.zero_grad()
                
        predictions = model(batch.text).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = binary_accuracy(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    #sssssssss
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            #print(predictions)
            #print(batch.text.shape)
            #print(TEXT.vocab.itos[11])
            #sssdsdssd
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



def evaluate_test(model, iterator, criterion):
    
    
    import csv

    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    #sssssssss
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.text).squeeze(1)
            rounded_preds_us = torch.round(torch.sigmoid(predictions))
            
            #print(predictions)
            #print(batch.text.shape)
            #print(TEXT.vocab.itos[11])
            
            loss = criterion(predictions, batch.label)
            
            acc = binary_accuracy(predictions, batch.label)
            #print(predictions)
            #print(rounded_preds_us)
            #usman:check valid later
            
            with open('sentiment_results_file_baseline.csv', mode='a') as sent_file:
                file_writer = csv.writer(sent_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                
                for x in range(0,len(predictions)):
                    file_writer.writerow([batch.aname[x], rounded_preds_us[x].item(),batch.label[x].item()])

            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

**Training**

In [16]:
N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'baselineModel.pt')
    
    print('Epoch: ',str(epoch+1),' | Epoch Time:',str(epoch_mins),'m ',str(epoch_secs),'s')
    print('\tTrain Loss: ',str(train_loss),' | Train Acc: ',str(train_acc*100),'%')
    print('\t Val. Loss: ',str(valid_loss),' |  Val. Acc: ',str(valid_acc*100),'%')

KeyboardInterrupt: 

Once we have trainned model, now evalting on the testing 25,000 reviews.

In [17]:
model.load_state_dict(torch.load('baselineModel.pt'))

import csv

with open('sentiment_results_file_baseline.csv', mode='w') as sent_file:
    file_writer = csv.writer(sent_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    
    file_writer.writerow(['Review File Path', 'Our Prediction', 'Ground Truth (1: pos, 0:neg)'])                
    




test_loss, test_acc = evaluate_test(model, test_iterator, criterion)

print('Test Loss: ',str(test_loss),' | Test Acc:',str(test_acc*100),'%')
print('Results are also written in the csv file')

Test Loss:  0.7097292280258121  | Test Acc: 47.179507675683105 %
Results are also written in the csv file
