<a href="https://colab.research.google.com/github/abhiiyer/END-/blob/main/Assignment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random

import torch  # tensor library
from torchtext import data  # fields, datasets and iterators for text
from torchtext import datasets  # bundled corpora (IMDB is used below)

SEED = 1234  # fixed seed so repeated runs give the same results

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # ask cuDNN for deterministic kernels

# `include_lengths=True` makes every batch carry (token_ids, lengths); the
# lengths are what the model later hands to pack_padded_sequence.
TEXT = data.Field(tokenize = 'spacy', include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

# Reverse the token order of every training example in place. This happens
# before the train/validation split, so validation examples are reversed too.
# NOTE(review): test_data is NOT reversed here, so test-set evaluation sees
# natural word order while training sees reversed order -- confirm intended.
for example in train_data.examples:
    example.text.reverse()

# Seed Python's RNG and pass its state explicitly. The previous form,
# `random_state=random.seed(SEED)`, handed `None` to split() (random.seed
# returns None) and was reproducible only through the seeding side effect.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state = random.getstate())

MAX_VOCAB_SIZE = 25_000

# Build the vocabulary from the training split only and attach pre-trained
# 100-dimensional GloVe vectors; words without a GloVe vector are initialised
# from a normal distribution.
TEXT.build_vocab(train_data,
                 max_size = MAX_VOCAB_SIZE,
                 vectors = "glove.6B.100d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

BATCH_SIZE = 64

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# `sort_within_batch=True` orders each batch by length, which the model's
# pack_padded_sequence call relies on.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)



In [None]:
# Show the token list of the first training example (reversed by the setup cell).
print(train_data.examples[0].text)

['.', 'involved', 'everyone', 'for', 'embarrassed', 'deeply', 'felt', 'actually', 'I', 'where', 'experiences', 'going', '-', 'movie', 'unfortunate', 'those', 'of', 'one', "'s", 'It', '.', 'one', 'this', 'save', "n't", 'ca', 'Rapp', 'Anthony', 'Even', '.', 'charming', 'and', 'witty', 'are', 'that', 'lyrics', 'have', 'you', ')', 'e.', '(', 'and', ',', 'score', 'written', '-', 'well', 'a', 'have', 'you', ')', 'd.', '(', ',', 'equipment', 'sound', 'decent', 'have', 'you', ')', 'c.', '(', ',', 'dance', 'can', 'actors', 'your', ')', 'b.', '(', ',', 'sing', 'can', 'actors', 'your', ')', 'a.', '(', 'that', 'sure', 'make', 'probably', 'should', 'you', ',', 'playhouse', 'community', 'local', 'the', 'at', 'or', 'Broadway', 'on', 'whether', ',', 'stage', 'on', 'or', 'film', 'on', 'whether', ',', 'musical', 'a', 'make', 'to', 'going', "'re", 'you', 'If', '.', 'pass', 'free', 'a', 'gets', 'it', 'mean', "n't", 'does', '"', 'independent', '"', "'s", 'it', 'because', 'Just', '.', 'unwatchable', 'truly'

In [None]:
from torch import nn

class AdamNetV21(nn.Module):
  """Embedding -> (multi-layer, optionally bidirectional) LSTM -> linear head.

  The three layers are kept in ``self.module_list`` in the order
  ``[embedding, lstm, fc]`` so they can be addressed by index from outside
  (e.g. ``model.module_list[0]`` is the embedding layer).

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each embedding vector.
    hidden_dim: LSTM hidden size per direction.
    n_layers: number of stacked LSTM layers.
    is_bidirectional: run the LSTM in both directions when True.
    dropout: dropout probability passed to ``nn.LSTM``.
    output_dim: number of output scores per sequence.
    padding_idx: index of the padding token, forwarded to ``nn.Embedding``.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,is_bidirectional=True,
              dropout=0.0, output_dim=1, padding_idx=None):
    super().__init__()

    # forward() needs this to decide how to assemble the final hidden state.
    self.is_bidirectional = is_bidirectional

    embedding = nn.Embedding(vocab_size, embedding_dim,
                             padding_idx=padding_idx)

    lstm = nn.LSTM(embedding_dim, hidden_dim,
                   num_layers=n_layers, bidirectional=is_bidirectional,
                   dropout=dropout)

    # A bidirectional LSTM yields one final hidden state per direction, so
    # the classifier input is twice as wide in that case.
    num_directions = 2 if is_bidirectional else 1
    fc = nn.Linear(num_directions * hidden_dim, output_dim)

    self.module_list = nn.ModuleList([embedding, lstm, fc])

  def forward(self, input_sequence, sequence_length):
    """Score a batch of padded token-id sequences.

    Args:
      input_sequence: token ids; the LSTM is built without ``batch_first``,
        so this is laid out as (seq_len, batch).
      sequence_length: true length of each sequence, handed to
        ``pack_padded_sequence`` (which, with its default settings, expects
        lengths in descending order).

    Returns:
      Tensor of shape (batch, output_dim) with unnormalised scores.
    """
    embedding, lstm, fc = self.module_list

    embeddings = embedding(input_sequence)
    packed_embeddings = nn.utils.rnn.pack_padded_sequence(embeddings, sequence_length)
    _, (hidden_state, _) = lstm(packed_embeddings)

    if self.is_bidirectional:
      # The last two entries are the final forward and backward states of
      # the top layer.
      output = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)
    else:
      # Only one direction: the last entry is the top layer's final state.
      output = hidden_state[-1, :, :]

    return fc(output)
      
    
    

In [None]:
import torch.optim as optim

# --- Hyperparameters ---------------------------------------------------------
vocab_size = len(TEXT.vocab)
glove_vectors = TEXT.vocab.vectors
# Read the embedding width from the loaded vectors instead of hard-coding it,
# so it cannot drift out of sync with the pre-trained embedding file.
embedding_dim = glove_vectors.shape[1]
hidden_dim = 256
num_layers = 3
dropout = 0.2
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]

model = AdamNetV21(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim=hidden_dim,
                  n_layers=num_layers,  dropout=dropout,
                  padding_idx=pad_idx)
print(model)

# --- Initialise word embeddings ---------------------------------------------
# module_list[0] is the embedding layer. Copy the pre-trained vectors in, then
# zero the rows for the <unk> and <pad> tokens. The writes happen under
# no_grad so autograd does not record them.
embedding_layer = model.module_list[0]
with torch.no_grad():
    embedding_layer.weight.copy_(glove_vectors)
    embedding_layer.weight[unk_idx].zero_()
    embedding_layer.weight[pad_idx].zero_()

# --- Loss, device placement, optimizer --------------------------------------
# BCEWithLogitsLoss takes raw scores, so the model has no final sigmoid.
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)

optimizer = optim.Adam(model.parameters())

AdamNetV21(
  (module_list): ModuleList(
    (0): Embedding(25002, 100, padding_idx=1)
    (1): LSTM(100, 256, num_layers=3, dropout=0.2, bidirectional=True)
    (2): Linear(in_features=512, out_features=1, bias=True)
  )
)


In [None]:
def accuracy(scores, y):
    """Return the fraction of entries where the thresholded prediction equals ``y``.

    ``scores`` are raw logits: they are squashed with a sigmoid and rounded
    to 0/1 before being compared element-wise against ``y``. The result is a
    zero-dimensional tensor.
    """
    hits = torch.sigmoid(scores).round().eq(y)
    return hits.sum() / len(hits)

def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and report mean metrics.

    Each batch must expose ``batch.text`` as a ``(token_ids, lengths)`` pair
    and ``batch.label`` as the targets. The lengths are moved to the CPU
    before the model is called.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches, i.e. ``len(iterator)``.
    """
    model.train()  # switch on training-time behaviour such as dropout

    loss_total, acc_total = 0, 0
    for batch in iterator:
        tokens, lengths = batch.text
        labels = batch.label

        optimizer.zero_grad()  # drop gradients left over from the previous step

        logits = model(tokens, lengths.cpu()).squeeze(1)
        batch_loss = criterion(logits, labels)
        batch_acc = accuracy(logits, labels)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without updating it.

    The model is put in eval mode and all forward passes run with gradient
    tracking disabled. Each batch must expose ``batch.text`` as a
    ``(token_ids, lengths)`` pair and ``batch.label`` as the targets.

    Returns:
        ``(mean_loss, mean_accuracy)``, each averaged over the number of
        batches, i.e. ``len(iterator)``.
    """
    model.eval()  # switch off training-only behaviour such as dropout

    loss_total, acc_total = 0, 0
    with torch.no_grad():
        for batch in iterator:
            tokens, lengths = batch.text
            labels = batch.label

            logits = model(tokens, lengths.cpu()).squeeze(1)

            loss_total += criterion(logits, labels).item()
            acc_total += accuracy(logits, labels).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches


In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, returned as ``(mins, secs)``."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [None]:
num_epochs = 10
best_valid_loss = float('inf')  # any finite validation loss beats this
checkpoint_path = 'tut2-model.pt'
best_epoch = None  # 1-based epoch whose weights are stored in the checkpoint

for epoch in range(num_epochs):

    start_time = time.time()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Keep only the weights that achieved the lowest validation loss so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch + 1
        torch.save(model.state_dict(), checkpoint_path)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

# Report test performance for the model that was actually selected: restore
# the best-validation checkpoint rather than using whatever the final epoch
# left behind. Skipped when no checkpoint was written during this run.
if best_epoch is not None:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

test_loss, test_accuracy = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}\nTest Acc: {test_accuracy*100:.2f}%')

Epoch: 01 | Epoch Time: 1m 5s
	Train Loss: 0.526 | Train Acc: 74.43%
	 Val. Loss: 0.487 |  Val. Acc: 77.91%
Epoch: 02 | Epoch Time: 1m 7s
	Train Loss: 0.345 | Train Acc: 85.53%
	 Val. Loss: 0.349 |  Val. Acc: 85.79%
Epoch: 03 | Epoch Time: 1m 8s
	Train Loss: 0.272 | Train Acc: 89.00%
	 Val. Loss: 0.402 |  Val. Acc: 82.80%
Epoch: 04 | Epoch Time: 1m 8s
	Train Loss: 0.205 | Train Acc: 92.28%
	 Val. Loss: 0.298 |  Val. Acc: 87.91%
Epoch: 05 | Epoch Time: 1m 8s
	Train Loss: 0.166 | Train Acc: 93.78%
	 Val. Loss: 0.357 |  Val. Acc: 87.50%
Epoch: 06 | Epoch Time: 1m 8s
	Train Loss: 0.122 | Train Acc: 95.77%
	 Val. Loss: 0.340 |  Val. Acc: 87.96%
Epoch: 07 | Epoch Time: 1m 8s
	Train Loss: 0.157 | Train Acc: 94.91%
	 Val. Loss: 0.340 |  Val. Acc: 86.55%
Epoch: 08 | Epoch Time: 1m 8s
	Train Loss: 0.089 | Train Acc: 97.07%
	 Val. Loss: 0.387 |  Val. Acc: 87.39%
Epoch: 09 | Epoch Time: 1m 8s
	Train Loss: 0.056 | Train Acc: 98.27%
	 Val. Loss: 0.420 |  Val. Acc: 88.10%
Epoch: 10 | Epoch Time: 1m 8

In [None]:
import spacy
# Load spaCy's English pipeline; predict_sentiment below uses only its tokenizer.
# NOTE(review): 'en' is a shortcut name; newer spaCy releases may require the
# full package name (e.g. 'en_core_web_sm') -- confirm against the installed version.
nlp = spacy.load('en')

def predict_sentiment(model, sentence):
    """Return the model's sigmoid score for a single raw sentence.

    The sentence is tokenised with the module-level spaCy tokenizer, mapped
    to vocabulary indices through ``TEXT.vocab.stoi``, shaped as one sequence
    of shape (seq_len, 1) and passed to ``model`` together with its length.

    NOTE(review): tokens are fed in the order they appear in ``sentence``.
    The training examples in this notebook are token-reversed before
    training, so callers may need to reverse the sentence themselves --
    confirm intended usage.

    Args:
        model: module called as ``model(token_ids, lengths)`` that returns a
            single logit.
        sentence: raw text to score.

    Returns:
        Python float in [0, 1]: the sigmoid of the model's logit.

    Raises:
        ValueError: if ``sentence`` yields no tokens.
    """
    model.eval()

    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    if not tokenized:
        # A zero-length sequence cannot be packed for the LSTM; fail early
        # with a clear message instead of an error from deep inside the model.
        raise ValueError("sentence must contain at least one token")

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(device)
    tensor = tensor.unsqueeze(1)  # add the batch dimension: (seq_len, 1)
    length_tensor = torch.LongTensor([len(indexed)])  # lengths stay on the CPU

    # Inference only: no need to build an autograd graph.
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor, length_tensor))
    return prediction.item()

In [None]:
# Words given in reverse order, matching the token reversal applied to the
# training examples in the data-setup cell.
predict_sentiment(model, "great is film This")


0.9997666478157043

In [None]:
# The same sentence in natural word order, for comparison with the reversed input.
predict_sentiment(model, "This film is great")


0.9999008178710938