In [1]:
from google.colab import drive
drive.mount('drive')

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


# Import Libraries

In [2]:
import numpy as np
import torch
import torchtext
from torchtext.legacy.data import Field, TabularDataset, BucketIterator
import re
import pandas as pd
from sklearn import model_selection
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import torch.nn as nn
import torch.optim as optim
pd.set_option('max_colwidth',99999)
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#Data Loading

In [3]:
amazon_df = pd .read_csv('/content/drive/My Drive/capstone_project/Data/amazon_cells_labelled.txt', delimiter='\t', names = ['customer_reviews', 'label'])
yelp_df =  pd .read_csv('/content/drive/My Drive/capstone_project/Data/yelp_labelled.txt', delimiter='\t', names = ['customer_reviews', 'label'])
imdb_df =  pd .read_csv('/content/drive/My Drive/capstone_project/Data/imdb_labelled.txt', delimiter='\t', names = ['customer_reviews', 'label'])
final_data = pd.concat([amazon_df,imdb_df,yelp_df])

In [4]:
#splitting data to train and test sets
train, test = model_selection.train_test_split(final_data, test_size=0.3, random_state = 42)
train.reset_index(drop = True, inplace = True), test.reset_index(drop = True, inplace = True)
train.shape, test.shape

((1923, 2), (825, 2))

In [5]:
train.to_csv('/content/drive/My Drive/capstone_project/Data/amazon_train.csv', index = False)
test.to_csv('/content/drive/My Drive/capstone_project/Data/amazon_test.csv', index = False)

#Pre-Processing

In [6]:
def clean(text):
    
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) 
    text = re.sub(r'https?:/\/\S+', ' ', text) 
    
    return text.strip()
def tokenizer(s): 
    return [w.lower() for w in word_tokenize(clean(s))]

# Create torchtext Tabular Dataset

In [7]:
TEXT = torchtext.legacy.data.Field(tokenize = tokenizer)

LABEL = torchtext.legacy.data.LabelField(dtype = torch.float)

datafields = [('reviews', TEXT), ('label', LABEL)]

trn, tst = torchtext.legacy.data.TabularDataset.splits(path = '/content/drive/My Drive/capstone_project/Data/', 
                                                train = 'amazon_train.csv',
                                                test = 'amazon_test.csv',    
                                                format = 'csv',
                                                skip_header = True,
                                                fields = datafields)

#Loading the data into batches

In [8]:
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
                                (trn, tst),
                                batch_size = 64,
                                sort_key=lambda x: len(x.reviews),
                                sort_within_batch=False)

# Word Embeddings

## Load pretrained GloVe word vectors and build vocabulary

In [9]:
TEXT.build_vocab(trn, max_size=25000,
                 vectors="glove.6B.100d",
                 unk_init=torch.Tensor.normal_)

LABEL.build_vocab(trn)

# A two-layer Bidirectional Gated Recurrent Unit Model

In [10]:
class Bi_GRU(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, 
                 output_dim, n_layers, bidirectional, dropout):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.biGru = nn.GRU(embedding_dim, hidden_dim, num_layers = n_layers, 
                           bidirectional = bidirectional, dropout=dropout)
        
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        
        self.dropout = nn.Dropout(dropout)

        
    def forward(self, text):
        
        embedded = self.dropout(self.embedding(text))
        
        output, hidden = self.biGru(embedded)
        
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
       
        return self.fc(hidden.squeeze(0))

In [11]:
input_dim = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 20
output_dim = 1
n_layers = 2
bidirectional = True
dropout = 0.5
model=Bi_GRU(input_dim,embedding_dim,hidden_dim,output_dim,n_layers,bidirectional,dropout)

In [12]:
model

Bi_GRU(
  (embedding): Embedding(4232, 100)
  (biGru): GRU(100, 20, num_layers=2, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=40, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [13]:
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

## Train the model

In [14]:

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()
def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0

  model.train()
  
  for batch in iterator:
      
      optimizer.zero_grad()
      
      predictions = model(batch.reviews).squeeze(1)
      
      loss = criterion(predictions, batch.label)
      
      rounded_preds = torch.round(torch.sigmoid(predictions))
      correct = (rounded_preds == batch.label).float() 
      
      acc = correct.sum() / len(correct)
      
      loss.backward()
      
      optimizer.step()
      
      epoch_loss += loss.item()
      epoch_acc += acc.item()

      
  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
num_epochs = 50
loss = []
acc = []

for epoch in range(num_epochs):
  train_loss, train_acc= train(model, train_iterator, optimizer, criterion)#
  print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% |')
  acc.append(train_acc)
  loss.append(train_loss)

| Epoch: 01 | Train Loss: 0.697 | Train Acc: 48.62% |
| Epoch: 02 | Train Loss: 0.684 | Train Acc: 55.59% |
| Epoch: 03 | Train Loss: 0.682 | Train Acc: 56.38% |
| Epoch: 04 | Train Loss: 0.665 | Train Acc: 60.62% |
| Epoch: 05 | Train Loss: 0.631 | Train Acc: 65.51% |
| Epoch: 06 | Train Loss: 0.548 | Train Acc: 74.24% |
| Epoch: 07 | Train Loss: 0.466 | Train Acc: 80.39% |
| Epoch: 08 | Train Loss: 0.436 | Train Acc: 81.13% |
| Epoch: 09 | Train Loss: 0.364 | Train Acc: 84.76% |
| Epoch: 10 | Train Loss: 0.376 | Train Acc: 85.42% |
| Epoch: 11 | Train Loss: 0.301 | Train Acc: 88.51% |
| Epoch: 12 | Train Loss: 0.339 | Train Acc: 86.98% |
| Epoch: 13 | Train Loss: 0.340 | Train Acc: 87.03% |
| Epoch: 14 | Train Loss: 0.248 | Train Acc: 90.98% |
| Epoch: 15 | Train Loss: 0.259 | Train Acc: 91.57% |
| Epoch: 16 | Train Loss: 0.211 | Train Acc: 92.89% |
| Epoch: 17 | Train Loss: 0.207 | Train Acc: 93.15% |
| Epoch: 18 | Train Loss: 0.193 | Train Acc: 93.20% |
| Epoch: 19 | Train Loss: 0.

## Test the model

In [16]:
epoch_loss = 0
epoch_acc = 0
from sklearn.metrics import confusion_matrix
nb_classes = 2

confusion_matrix = torch.zeros(nb_classes, nb_classes)
model.eval()

with torch.no_grad():
  for batch in test_iterator:
    predictions = model(batch.reviews).squeeze(1)

    loss = criterion(predictions, batch.label)
    classes = batch.label
    rounded_preds = torch.round(torch.sigmoid(predictions))
    correct = (rounded_preds == batch.label).float() 
    a = correct.tolist()

    acc = correct.sum()/len(correct)

    epoch_loss += loss.item()
    epoch_acc += acc.item()
    for t, p in zip(classes.view(-1), rounded_preds.view(-1)):
      confusion_matrix[t.long(), p.long()] += 1


test_loss = epoch_loss / len(test_iterator)
test_acc = epoch_acc / len(test_iterator)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.569 | Test Acc: 85.25%


In [17]:
print(confusion_matrix)

tensor([[340.,  50.],
        [ 72., 363.]])


# Save model

In [18]:
torch.save(model,'/content/drive/My Drive/capstone_project/Data/rnn.h5')

In [None]:
##loading the model
#loaded_model = torch.load('/content/drive/My Drive/capstone_project/Data/rnn.h5')