<a href="https://colab.research.google.com/github/alecbidaran/Pytorch_excersies/blob/main/pytorch_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import numpy as np
from tqdm import tqdm
from string import punctuation
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Seed PyTorch's global RNG. As the last expression of the cell, the returned
# Generator object is what the notebook displays.
torch.manual_seed(123)

<torch._C.Generator at 0x7fce09fbac30>

In [None]:
import random
from torchtext.legacy import data, datasets

In [None]:
# Reviews are tokenized with the "basic_english" tokenizer, and
# include_lengths=True makes each batch's `.text` a (tokens, lengths) pair,
# which the training code unpacks. Labels use torch.float because they are fed
# directly to the loss function together with the model's float outputs.
TEXT_FIELD = data.Field(tokenize=data.get_tokenizer("basic_english"), include_lengths=True)
LABEL_FIELD = data.LabelField(dtype=torch.float)

train_dataset, test_dataset = datasets.IMDB.splits(TEXT_FIELD, LABEL_FIELD)

# random.seed() returns None, so `random_state=random.seed(123)` handed None to
# split() and the seeding only happened as a side effect of evaluating the
# argument. Seed first, then pass the resulting RNG state explicitly.
random.seed(123)
train_dataset, valid_dataset = train_dataset.split(random_state=random.getstate())

In [None]:
# Upper bound passed as `max_size` when building the text vocabulary.
MAX_VOCAULARY_SIZE = 25000

# Both vocabularies are built from the training split only; the label
# vocabulary is built without a size cap.
LABEL_FIELD.build_vocab(train_dataset)
TEXT_FIELD.build_vocab(train_dataset, max_size=MAX_VOCAULARY_SIZE)

In [None]:
batch_size = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One BucketIterator per split, returned in the same order as the datasets
# tuple. sort_within_batch=True is presumably required because the model packs
# its input with enforce_sorted left at its default -- TODO confirm.
(train_data_iterator,
 valid_data_iterator,
 test_data_iterator) = data.BucketIterator.splits(
    (train_dataset, valid_dataset, test_dataset),
    batch_size=batch_size,
    sort_within_batch=True,
    device=device,
)

In [None]:
# Make torch.cuda.FloatTensor the default tensor type whenever CUDA is
# available. This is a process-wide setting that affects every later cell.
# NOTE(review): set_default_tensor_type is reported as deprecated in recent
# PyTorch releases -- confirm against the version this notebook targets.
if torch.cuda.is_available():
    torch.set_default_tensor_type(torch.cuda.FloatTensor)
from torch.nn.utils.rnn import pack_padded_sequence, PackedSequence

def cuda_pack_padded_sequence(input, lengths, batch_first=False, enforce_sorted=True):
    """Pack a padded batch into a PackedSequence, keeping `lengths` on the CPU.

    Args:
        input: padded batch tensor; the batch axis is 0 when `batch_first`
            is true and 1 otherwise.
        lengths: per-sequence lengths (anything `torch.as_tensor` accepts).
            They are converted to an int64 tensor and moved to the CPU.
        batch_first: forwarded to the underlying packing routine.
        enforce_sorted: when true, `input` is packed as given and no sort
            indices are recorded; when false, the batch is first reordered by
            decreasing length and the permutation is stored on the result.

    Returns:
        A `PackedSequence` built from the packed data, the batch sizes and
        the sort permutation (``None`` when `enforce_sorted` is true).
    """
    cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()

    permutation = None
    if not enforce_sorted:
        # Reorder the batch by decreasing length and remember the permutation.
        cpu_lengths, permutation = torch.sort(cpu_lengths, descending=True)
        permutation = permutation.to(input.device)
        input = input.index_select(0 if batch_first else 1, permutation)

    packed_data, packed_batch_sizes = torch._C._VariableFunctions._pack_padded_sequence(
        input, cpu_lengths, batch_first
    )
    return PackedSequence(packed_data, packed_batch_sizes, permutation)

In [None]:
from torch import nn 

In [None]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier over token-index sequences.

    Pipeline: embedding -> dropout -> bidirectional LSTM -> linear layer
    applied to the concatenated final hidden states of both directions.

    Args:
        vocabulary_size: number of rows in the embedding table.
        embedding_dimension: size of each embedding vector.
        hidden_dimension: LSTM hidden size per direction.
        output_dimension: number of outputs of the final linear layer.
        dropout: dropout probability applied to the embedded tokens.
        pad_index: index passed to ``nn.Embedding`` as ``padding_idx``.
    """

    def __init__(self, vocabulary_size, embedding_dimension, hidden_dimension, output_dimension, dropout, pad_index):
        super().__init__()
        self.embedding = nn.Embedding(vocabulary_size, embedding_dimension, padding_idx=pad_index)
        # `dropout` is deliberately not forwarded to nn.LSTM: with num_layers=1
        # it has no effect there and only triggers a UserWarning.
        self.lstm1 = nn.LSTM(embedding_dimension, hidden_dimension, num_layers=1, bidirectional=True)
        self.linear = nn.Linear(hidden_dimension * 2, output_dimension)
        # Honour the configured probability instead of nn.Dropout's default.
        self.dropout = nn.Dropout(dropout)

    def forward(self, sequence, sequence_lengths=None):
        """Compute one output row per sequence in the batch.

        Args:
            sequence: token-index tensor laid out as (seq_len, batch); the
                LSTM and the packing call both use batch_first=False.
            sequence_lengths: length of each sequence in the batch. When
                omitted, every sequence is assumed to span the full seq_len.

        Returns:
            Tensor of shape (batch, output_dimension) holding the raw outputs
            of the linear layer (no sigmoid is applied here).
        """
        if sequence_lengths is None:
            # One full-length entry per batch column; for a batch of one this
            # is the same single-element tensor the original code produced.
            sequence_lengths = torch.LongTensor([sequence.shape[0]] * sequence.shape[1])

        embedded = self.dropout(self.embedding(sequence))
        if torch.cuda.is_available():
            # Notebook-level helper that moves the lengths to the CPU first.
            packed = cuda_pack_padded_sequence(embedded, sequence_lengths)
        else:
            packed = nn.utils.rnn.pack_padded_sequence(embedded, sequence_lengths)

        # Only the final hidden state is used, so the packed per-step outputs
        # are discarded rather than padded back out.
        _, (hidden_state, _) = self.lstm1(packed)
        # Concatenate the last two hidden-state slices (the two directions of
        # the single layer) along the feature axis.
        final_hidden = torch.cat((hidden_state[-2, :, :], hidden_state[-1, :, :]), dim=1)
        return self.linear(final_hidden)
# Fixed hyper-parameters.
EMBEDDING_DIMENSION = 100
HIDDEN_DIMENSION = 32
OUTPUT_DIMENSION = 1
DROPOUT = 0.5

# Values read from the text vocabulary.
INPUT_DIMENSION = len(TEXT_FIELD.vocab)
PAD_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.pad_token]

lstm_model = LSTM(
    vocabulary_size=INPUT_DIMENSION,
    embedding_dimension=EMBEDDING_DIMENSION,
    hidden_dimension=HIDDEN_DIMENSION,
    output_dimension=OUTPUT_DIMENSION,
    dropout=DROPOUT,
    pad_index=PAD_INDEX,
)
    



  "num_layers={}".format(dropout, num_layers))


In [None]:
UNK_INDEX = TEXT_FIELD.vocab.stoi[TEXT_FIELD.unk_token]

# Overwrite the embedding rows of the unknown and padding tokens with zeros.
for special_index in (UNK_INDEX, PAD_INDEX):
    lstm_model.embedding.weight.data[special_index] = torch.zeros(EMBEDDING_DIMENSION)

In [None]:
# Adam over all model parameters, using the optimizer's default settings.
# NOTE: this rebinds the name `optim`, shadowing the `torch.optim` module alias
# imported at the top of the notebook.
optim = torch.optim.Adam(lstm_model.parameters())
# Binary cross-entropy on raw logits: the model's forward() returns the linear
# layer's output without applying a sigmoid.
loss_func = nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
lstm_model = lstm_model.to(device)
loss_func = loss_func.to(device)

In [None]:
def accuracy_metric(predictions, ground_truth):
    """
    Fraction of predictions that match the ground truth.

    `predictions` are raw logits: they are passed through a sigmoid and rounded
    to 0 or 1 before being compared element-wise with `ground_truth`. The
    result is returned as a tensor.
    """
    probabilities = torch.sigmoid(predictions)
    hard_labels = torch.round(probabilities)
    # Cast the boolean matches to float so they can be summed and divided.
    matches = (hard_labels == ground_truth).float()
    return matches.sum() / len(matches)

In [None]:
def train(model, data_iterator, optim, loss_func):
    """Run one training epoch over `data_iterator`.

    Args:
        model: module called as ``model(sequence, sequence_lengths)``; its
            output is squeezed along dim 1 before the loss is computed.
        data_iterator: iterable of batches exposing ``.text`` (a
            ``(sequence, lengths)`` pair) and ``.label``; must support ``len()``.
        optim: optimizer over the model's parameters.
        loss_func: callable taking ``(predictions, labels)``.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, each averaged over the number
        of batches.
    """
    epoch_loss = 0
    epoch_accuracy = 0
    model.train()
    for batch in data_iterator:
        # Reset gradients for every batch. Doing it once before the loop let
        # gradients from all earlier batches accumulate into each step.
        optim.zero_grad()
        sequence, sequence_lengths = batch.text
        # Use the `model` argument rather than the global `lstm_model`.
        preds = model(sequence, sequence_lengths).squeeze(1)
        batch_loss = loss_func(preds, batch.label)
        batch_accuracy = accuracy_metric(preds, batch.label)
        batch_loss.backward()
        optim.step()
        epoch_loss += batch_loss.item()
        epoch_accuracy += batch_accuracy.item()
    return epoch_loss / len(data_iterator), epoch_accuracy / len(data_iterator)

In [None]:
def validate(model, data_iterator, loss_func):
    """Evaluate `model` on every batch of `data_iterator` without gradients.

    The model is switched to eval mode and left in that mode. Each batch must
    expose ``.text`` as a ``(sequence, lengths)`` pair and ``.label``.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, each averaged over the number
        of batches.
    """
    model.eval()
    batch_losses = []
    batch_accuracies = []

    with torch.no_grad():
        for batch in data_iterator:
            tokens, lengths = batch.text
            logits = model(tokens, lengths).squeeze(1)
            batch_losses.append(loss_func(logits, batch.label).item())
            batch_accuracies.append(accuracy_metric(logits, batch.label).item())

    n_batches = len(data_iterator)
    return sum(batch_losses) / n_batches, sum(batch_accuracies) / n_batches

In [None]:
num_epochs = 10
best_validation_loss = float('inf')

for ep in range(num_epochs):
    time_start = time.time()

    # One pass over the training data, then one pass over the validation data.
    training_loss, train_accuracy = train(lstm_model, train_data_iterator, optim, loss_func)
    validation_loss, validation_accuracy = validate(lstm_model, valid_data_iterator, loss_func)

    time_delta = time.time() - time_start

    # Checkpoint the weights only when the validation loss improves.
    if validation_loss < best_validation_loss:
        best_validation_loss = validation_loss
        torch.save(lstm_model.state_dict(), 'lstm_model.pt')

    print(f'epoch number: {ep+1} | time elapsed: {time_delta}s')
    print(f'training loss: {training_loss:.3f} | training accuracy: {train_accuracy*100:.2f}%')
    print(f'validation loss: {validation_loss:.3f} |  validation accuracy: {validation_accuracy*100:.2f}%')
    print()

epoch number: 1 | time elapsed: 22.244837045669556s
training loss: 0.666 | training accuracy: 58.77%
validation loss: 0.628 |  validation accuracy: 65.58%

epoch number: 2 | time elapsed: 19.85049057006836s
training loss: 0.603 | training accuracy: 67.03%
validation loss: 0.587 |  validation accuracy: 68.89%

epoch number: 3 | time elapsed: 19.678786277770996s
training loss: 0.506 | training accuracy: 75.60%
validation loss: 0.708 |  validation accuracy: 69.80%

epoch number: 4 | time elapsed: 19.827998876571655s
training loss: 0.445 | training accuracy: 79.16%
validation loss: 0.541 |  validation accuracy: 73.47%

epoch number: 5 | time elapsed: 19.677167654037476s
training loss: 0.399 | training accuracy: 82.22%
validation loss: 0.553 |  validation accuracy: 77.97%

epoch number: 6 | time elapsed: 19.56511950492859s
training loss: 0.365 | training accuracy: 83.69%
validation loss: 0.463 |  validation accuracy: 78.73%

epoch number: 7 | time elapsed: 19.64629888534546s
training loss: 

In [None]:
def test(model, senteces):
    """Return the model's sigmoid score for a single raw sentence.

    The sentence is tokenized with the "basic_english" tokenizer, mapped to
    vocabulary indices through ``TEXT_FIELD.vocab.stoi`` and fed to the model
    as a single-sequence batch.

    Args:
        model: module that accepts a (seq_len, 1) tensor of token indices.
        senteces: the raw sentence string to score.

    Returns:
        The sigmoid of the model output as a Python float.

    Raises:
        ValueError: if tokenization yields no tokens.
    """
    tokens = data.get_tokenizer("basic_english")(senteces)
    if not tokens:
        # An empty sequence cannot be packed; fail early with a clear message.
        raise ValueError("cannot score an empty sentence: tokenization produced no tokens")
    token_ids = [TEXT_FIELD.vocab.stoi[token] for token in tokens]
    # Add a batch axis of size one: (seq_len,) -> (seq_len, 1).
    model_input = torch.LongTensor(token_ids).to(device).unsqueeze(1)

    # Score in eval mode and without autograd so dropout is disabled and the
    # result does not depend on the mode the model was left in. The previous
    # mode is restored afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            pred = torch.sigmoid(model(model_input))
    finally:
        model.train(was_training)
    return pred.item()

In [None]:
# Print the model's score for two hand-written example reviews.
for review in ("This film is horrible", "This film will be houseful for weeks"):
    print(test(lstm_model, review))

0.0017328496323898435
0.9640718698501587
