# Import the libraries

In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [51]:
# url = 'https://raw.githubusercontent.com/anhphuongnguyenquynh/ML-DL-NLP-AI_Code_Practices/main/deep_learning/data/temperature.csv'
# temperature = pd.read_csv(url)
# temperature.head()

# Prepare data

In [52]:
# Toy corpus: every sentence is paired with its sentiment label
# (1 for positive, 0 for negative), then split into two parallel lists.
labeled_sentences = [
  ("i love this movie", 1),
  ("this film is amazing", 1),
  ("i didn't like it", 0),
  ("it was terrible", 0),
]
sentences = [text for text, _ in labeled_sentences]
sentiment = [label for _, label in labeled_sentences]

## Create vocabulary
Create a simple vocabulary that maps each word to an integer index.
The words in our sentences are converted to these indices, which are then fed as input to the neural network.

In [53]:
# Word -> index lookup table. "<PAD>" takes index 0 and the remaining words
# are numbered in the order they are listed here.
vocab_tokens = [
  "<PAD>",
  "i", "love", "this", "movie",
  "film", "is", "amazing",
  "didn't", "like", "it",
  "was", "terrible",
]
vocab = {token: index for index, token in enumerate(vocab_tokens)}

## Tokenize, Encode, Pad sentences

In [54]:
# Tokenize each sentence on whitespace and replace every word by its vocabulary index.
encoded_sentences = [
  [vocab[word] for word in sentence.split()]
  for sentence in sentences
]

# Length of the longest encoded sentence; all sentences are padded up to it.
max_length = max(map(len, encoded_sentences))

# Right-pad with the <PAD> index so that every sentence has exactly max_length entries.
padded_sentences = [
  encoded + [vocab["<PAD>"]] * (max_length - len(encoded))
  for encoded in encoded_sentences
]

In [55]:
# Inspect the index-encoded sentences (not yet padded at this point).
encoded_sentences

[[1, 2, 3, 4], [3, 5, 6, 7], [1, 8, 9, 10], [10, 11, 12]]

In [56]:
# Inspect the length of the longest encoded sentence (the padding target).
max_length

4

In [57]:
# Inspect the padded sentences; shorter ones end with the <PAD> index 0.
padded_sentences

[[1, 2, 3, 4], [3, 5, 6, 7], [1, 8, 9, 10], [10, 11, 12, 0]]

## Convert data to tensors

In [58]:
# Token indices become a 64-bit integer tensor (looked up by nn.Embedding);
# labels become a 32-bit float tensor (the target type used with BCEWithLogitsLoss).
inputs = torch.tensor(padded_sentences, dtype=torch.long)
labels = torch.tensor(sentiment, dtype=torch.float)

In [59]:
# Show the padded index tensor: one row per sentence, one column per token position.
print(inputs)

tensor([[ 1,  2,  3,  4],
        [ 3,  5,  6,  7],
        [ 1,  8,  9, 10],
        [10, 11, 12,  0]])


In [60]:
# Show the float label tensor: one entry per sentence (1. positive, 0. negative).
print(labels)

tensor([1., 1., 0., 0.])


# LSTM Model

In [61]:
class SimpleLSTM(nn.Module):
  """Embedding -> single-layer LSTM -> linear head producing logits per sequence.

  Args:
    vocab_size: Number of distinct token indices (rows of the embedding table).
    embedding_dim: Size of each token embedding vector.
    hidden_dim: Size of the LSTM hidden state.
    output_dim: Number of logits produced for each sequence.
    padding_idx: Index of the padding token. Its embedding is initialised to
      zeros and receives no gradient updates. Defaults to 0, the index this
      notebook's vocabulary assigns to "<PAD>"; pass None to treat every
      index as a regular, trainable token.

  Note:
    Padded positions are still fed through the LSTM; only their embedding is
    neutralised. Sequences are not packed.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=0):
    super().__init__()
    # padding_idx keeps the <PAD> row at zero instead of a random, trainable vector.
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
    # batch_first is left at its default (False): the LSTM consumes
    # tensors shaped (seq_len, batch, embedding_dim).
    self.lstm = nn.LSTM(embedding_dim, hidden_dim)
    self.fc = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Return raw (pre-sigmoid) logits for a batch of index sequences.

    Args:
      x: Integer tensor of token indices shaped (seq_len, batch).

    Returns:
      Tensor of shape (batch, output_dim).
    """
    embedded = self.embedding(x)
    _, (hidden, _) = self.lstm(embedded)
    # hidden is (num_layers, batch, hidden_dim); index the last layer's final
    # state, which keeps the batch dimension even for a batch of one.
    logits = self.fc(hidden[-1])
    return logits

# Train the model

In [62]:
# Seed PyTorch's random number generator before the model is built so that the
# weight initialisation (and hence the training run) can be repeated.
SEED = 42
torch.manual_seed(SEED)

# Binary sentiment classifier: a single output logit per sentence.
model = SimpleLSTM(vocab_size = len(vocab), embedding_dim = 10, hidden_dim = 32, output_dim = 1)
# BCEWithLogitsLoss applies the sigmoid itself, so the model outputs raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [63]:
epochs = 100
# `inputs` is (batch, max_length) but the LSTM is fed (max_length, batch);
# the transpose does not change between epochs, so take it once.
sequence_first_inputs = inputs.t()
for epoch in range(epochs):
  optimizer.zero_grad()  # drop gradients accumulated in the previous step
  predictions = model(sequence_first_inputs).squeeze(1)  # (batch, 1) -> (batch,)
  loss = criterion(predictions, labels)
  loss.backward()
  optimizer.step()
  # Report the loss after every 10th epoch.
  completed = epoch + 1
  if completed % 10 == 0:
    print(f"Epoch {completed}/{epochs}, Loss: {loss.item():.4f}")

Epoch 10/100, Loss: 0.6397
Epoch 20/100, Loss: 0.5903
Epoch 30/100, Loss: 0.5260
Epoch 40/100, Loss: 0.4412
Epoch 50/100, Loss: 0.3358
Epoch 60/100, Loss: 0.2218
Epoch 70/100, Loss: 0.1256
Epoch 80/100, Loss: 0.0678
Epoch 90/100, Loss: 0.0399
Epoch 100/100, Loss: 0.0264


# Test the model

In [64]:
# Put the model in evaluation mode before inference; this turns off
# training-only behaviour in layers that have it (e.g. dropout).
model.eval()
with torch.no_grad():
  test_sentences = ["i love this movie", "i didn't like it"]
  # Same preprocessing as for the training data: encode with vocab, right-pad to max_length.
  test_encoded_sentences = [[vocab[word] for word in sentence.split()] for sentence in test_sentences]
  padded_test_sentences = [sentence + [vocab["<PAD>"]] * (max_length - len(sentence)) for sentence in test_encoded_sentences]
  test_inputs = torch.LongTensor(padded_test_sentences)
  # The model returns logits; the sigmoid maps them to probabilities of the positive class.
  test_predictions = torch.sigmoid(model(test_inputs.t()).squeeze(1))
  print("Test predictions:", test_predictions)

Test prdictions: tensor([0.9798, 0.0178])
