In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tensorflow.keras.datasets import imdb
from keras.preprocessing import sequence
import matplotlib.pyplot as plt

Using TensorFlow backend.


In [4]:
# --- configuration -----------------------------------------------------------
# Number of distinct word indices requested from the IMDB loader; also used as
# the size of the embedding table.
VOCAB_SIZE = 88584
# Every review is padded/truncated to this many tokens.
MAXLEN = 250
# Mini-batch size used by the data loaders.
BATCH_SIZE = 500

# --- data --------------------------------------------------------------------
# Downloads the dataset on first use, so this cell needs network access.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=VOCAB_SIZE
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
 2293760/17464789 [==>...........................] - ETA: 2: - ETA: 29:0 - ETA: 21:1 - ETA: 16:2 - ETA: 19:0 - ETA: 14:4 - ETA: 13:2 - ETA: 12:2 - ETA: 11:3 - ETA: 10:4 - ETA: 10:1 - ETA: 9:3 - ETA: 9: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 7: - ETA: 14:4 - ETA: 13:0 - ETA: 12:3 - ETA: 12:5 - ETA: 13:5 - ETA: 13:0 - ETA: 12:4 - ETA: 16:1 - ETA: 15:5 - ETA: 15:0 - ETA: 14:4 - ETA: 14:2 - ETA: 14:0 - ETA: 13:4 - ETA: 13:4 - ETA: 13:5 - ETA: 13:2 - ETA: 12:4 - ETA: 12:2 - ETA: 11:5 - ETA: 11:3 - ETA: 11:1 - ETA: 11:0 - ETA: 10:4 - ETA: 10:5 - ETA: 10:4 - ETA: 10:4 - ETA: 10:1 - ETA: 10:1 - ETA: 9:5 - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 9: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 8: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 7: - ETA: 8: - ETA: 10:0 - ETA: 33:0 - ETA: 32:4 - ETA: 32:3 

OSError: [WinError 10051] A socket operation was attempted to an unreachable network

In [None]:
# Bring every review to exactly MAXLEN tokens: shorter reviews are padded,
# longer ones are truncated.
train_data = sequence.pad_sequences(train_data, maxlen=MAXLEN)
test_data = sequence.pad_sequences(test_data, maxlen=MAXLEN)

In [None]:
class IMDBDataset(torch.utils.data.Dataset):
  """In-memory dataset pairing token-id sequences with their labels.

  Args:
    data: sized, array-like collection of equal-length integer sequences.
      Stored as a single int64 tensor in `self.x`.
    labels: sized, array-like collection of integer labels, one per
      sequence. Stored as an int64 tensor in `self.y`.

  Raises:
    ValueError: if `data` and `labels` do not have the same length.
  """

  def __init__(self, data, labels):
    if len(data) != len(labels):
      raise ValueError(
          'data and labels must have the same length, got %d and %d'
          % (len(data), len(labels)))
    # Going through one NumPy array avoids the very slow element-by-element
    # conversion torch performs on a Python list of arrays.
    self.x = torch.tensor(np.asarray(data), dtype=torch.int64)
    self.y = torch.tensor(np.asarray(labels), dtype=torch.int64)
    # Number of samples. Taking it from the tensor (rather than from the last
    # loop index) keeps the final sample reachable and makes an empty dataset
    # valid.
    self.n_items = len(self.x)

  def __getitem__(self, index):
    """Return the `(sequence, label)` tensor pair stored at `index`."""
    return self.x[index], self.y[index]

  def __len__(self):
    """Return the number of samples in the dataset."""
    return self.n_items

In [None]:
# Wrap the padded training reviews and their labels as a torch Dataset.
train_dataset = IMDBDataset(data=train_data, labels=train_labels)

In [None]:
# Wrap the padded test reviews and their labels as a torch Dataset.
test_dataset = IMDBDataset(data=test_data, labels=test_labels)

In [None]:
# Both datasets are plain in-memory tensors, so worker processes add no
# benefit. With num_workers > 0 on Windows the workers are spawned and cannot
# import a Dataset class that was defined inside the notebook, so batches are
# loaded in the main process instead.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                           shuffle=True, num_workers=0)
# Evaluation metrics do not depend on sample order, so the test split is read
# in its stored order.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE,
                                          shuffle=False, num_workers=0)

In [None]:
# Sanity check: pull one batch and display the shape of its input tensor
# (should be (BATCH_SIZE, MAXLEN) given the padding and batch size used above).
next(iter(train_loader))[0].shape

In [None]:
class TextSentiment(nn.Module):
  """Binary sentiment classifier: embedding -> single-layer LSTM -> linear -> sigmoid.

  Args:
    vocab_size: number of rows in the embedding table. When omitted, the
      notebook-level `VOCAB_SIZE` constant is used.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    dropout: dropout probability applied to the final hidden state.
  """

  def __init__(self, vocab_size=None, embedding_dim=16, hidden_dim=32,
               dropout=0.2):
    super(TextSentiment, self).__init__()
    if vocab_size is None:
      # Resolved at construction time so `TextSentiment()` keeps working.
      vocab_size = VOCAB_SIZE
    self.hidden_dim = hidden_dim
    # Index 0 is the padding token; its embedding stays at zero.
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, batch_first=True)

    # dropout layer
    self.dropout = nn.Dropout(dropout)

    # linear and sigmoid layers
    self.fc = nn.Linear(self.hidden_dim, 1)
    self.sig = nn.Sigmoid()

  def init_hidden(self, batch_size):
    """Return a zeroed `(h_0, c_0)` pair for a batch of `batch_size` sequences.

    Both tensors have shape `(1, batch_size, hidden_dim)` and share the dtype
    and device of the model parameters.
    """
    weight = next(self.parameters())
    shape = (1, batch_size, self.hidden_dim)
    return (torch.zeros(shape, dtype=weight.dtype, device=weight.device),
            torch.zeros(shape, dtype=weight.dtype, device=weight.device))

  def forward(self, x, hidden=None):
    """Score a batch of token-id sequences.

    Args:
      x: integer tensor of shape `(batch, seq_len)`.
      hidden: optional `(h_0, c_0)` pair as returned by `init_hidden`; when
        `None` the LSTM starts from zeros.

    Returns:
      A pair `(probs, hidden)` where `probs` has shape `(batch,)` with values
      in (0, 1) and `hidden` is the LSTM's final `(h_n, c_n)` state.
    """
    # Kept for compatibility with code that reads the attribute.
    self.batch_size = x.size(0)
    embedded = self.embedding(x)
    _, hidden = self.lstm(embedded, hidden)
    # h_n of the last (here: only) layer, shape (batch, hidden_dim).
    last_hidden = hidden[0][-1]
    # Dropout regularises the features fed to the classifier. Applying it to
    # the single output logit instead would force ~20% of training
    # predictions to exactly sigmoid(0) = 0.5 and rescale the rest.
    logits = self.fc(self.dropout(last_hidden))
    out = self.sig(logits).squeeze(-1)
    return out, hidden



In [None]:
# Instantiate the sentiment classifier with its default hyperparameters.
model = TextSentiment()

In [None]:
# Training setup: number of passes over the training set, binary
# cross-entropy on the model's sigmoid outputs, and Adam as the optimizer.
n_epochs = 200
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
def get_correct_num(preds, labels):
  """Count how many predictions match `labels` after thresholding at 0.5.

  Args:
    preds: tensor of predicted probabilities. It is not modified.
    labels: tensor of 0/1 targets with the same shape as `preds`.

  Returns:
    The number of matching positions as a Python int.
  """
  # Threshold into a fresh tensor: the caller's `preds` stays intact, and a
  # probability of exactly 0.5 is assigned to the positive class instead of
  # being left as 0.5 (which could never equal a label).
  predicted = (preds.detach() >= 0.5).to(labels.dtype)
  return torch.eq(predicted, labels).sum().item()

In [None]:
# Maximum gradient norm; larger gradients are rescaled before the update.
clip = 5
for epoch in range(1, n_epochs+1):
  # Make sure dropout is active even if the model was switched to eval mode
  # by an earlier cell run.
  model.train()
  total_loss = 0
  total_correct = 0
  for sentence, labels in train_loader:
    # Fresh zero state per batch: reviews are independent of each other.
    h = model.init_hidden(len(sentence))
    optimizer.zero_grad()
    preds, h = model(sentence, h)
    loss = criterion(preds, labels.float())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    total_loss += loss.item()
    # Metrics are computed on a detached tensor so they never touch the
    # autograd graph.
    total_correct += get_correct_num(preds.detach(), labels)
  # 'loss' is the sum of the per-batch mean losses over the epoch.
  print('epoch:', epoch, 'loss:', total_loss, 'accuracy:',total_correct/len(train_dataset))

In [None]:
# Evaluate on the test split. Dropout must be disabled for evaluation, so the
# model is put in eval mode for the duration of the loop and its previous mode
# is restored afterwards.
was_training = model.training
model.eval()
try:
  with torch.no_grad():
    test_loss = 0
    test_correct = 0
    for sentence, labels in test_loader:
      h = model.init_hidden(len(sentence))
      preds, h = model(sentence, h)
      loss = criterion(preds, labels.float())
      test_loss += loss.item()
      test_correct += get_correct_num(preds, labels)
    # 'val_loss' is the sum of the per-batch mean losses.
    print('val_loss:', test_loss, 'val_accuracy:',test_correct/len(test_dataset))
finally:
  model.train(was_training)