In [3]:
import torch
from torch import nn
import portalocker
import torchtext.datasets as datasets
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader 
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator


# Download the IMDB dataset
train_iter, test_iter = datasets.IMDB()

# Create a tokenizer

#tokenizer = get_tokenizer('basic_english')

In [4]:
# torch.cuda.is_available() checks and returns a Boolean True if a GPU is available, else it'll return False
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [5]:
device = torch.device("cuda")

from nltk.corpus import stopwords
import string

punct = set(string.punctuation)                 # remove punctuations
stop_words = set(stopwords.words('english'))    # remove stopwords

# Define a function to preprocess the text data
def process_data(data):
    label = data[0]
    text = data[1]
    tokens = preprocess(text)
    return label, tokens

# Define a function to tokenize and remove stop words
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if token not in punct]
    return tokens

# Create vocabulary from the training data
mapped_train_iter = map(process_data, train_iter)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/auslei/nltk_data'
    - '/home/auslei/dev/machine_learning_deep_learning/.env/nltk_data'
    - '/home/auslei/dev/machine_learning_deep_learning/.env/share/nltk_data'
    - '/home/auslei/dev/machine_learning_deep_learning/.env/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
# Define an iterator that yields the tokens, this is then used to build vocab
def token_iterator(sentences):
    for _, s in sentences:
        yield s

# tokens to index
def tokens_to_idex(vocab, sentences):
    token_idx = [torch.LongTensor(vocab.lookup_indices(preprocess(sentence))) for sentence in sentences]
    return token_idx
        
# Define a collate function to pad the sequences to the same length
def collate_fn(batch):
    labels, texts = zip(*batch)
    texts = [torch.LongTensor([vocab[token.lower()] for token in text]) for text in texts] #sentence tokens to index
    texts = pad_sequence(texts, batch_first=True)
    labels = torch.LongTensor(labels)
    return texts, labels

In [None]:
vocab = build_vocab_from_iterator(token_iterator(mapped_train_iter), specials=["<unk>"])
vocab.set_default_index(0)
# Print the size of the vocabulary
print(f"Vocabulary size: {len(vocab)}")

In [None]:
vocab.lookup_indices(["dsdfa", "item"])
vocab.lookup_tokens([69940, 6905])

In [None]:
train_loader = DataLoader(train_iter, batch_size=64, shuffle=True, collate_fn=collate_fn) #create a train loaders.

## LSTM

In [None]:
# define LSTM-based model
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, (hidden, cell) = self.lstm(embedded)
        hidden_last = hidden[-1, :]
        return torch.sigmoid(self.fc(hidden_last))

In [None]:
from torch import optim
from torch.utils.data import DataLoader 

# define model hyperparameters
INPUT_DIM = len(vocab) # input dimension will be the vocab
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 1

# initialize model
model = LSTMClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, NUM_LAYERS, OUTPUT_DIM)

In [None]:
# define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

#train_len = len(sentence_iterator(train_iter))

# define function to calculate accuracy
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    accuracy = correct.sum() / len(correct)
    return accuracy


# train the model
for epoch in range(10):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for i, (texts, labels) in enumerate(train_loader):
        predictions = model(texts)
        #print(texts.shape, predictions.shape, labels.shape)
        #print(texts.dtype, predictions.dtype, labels.dtype)
        loss = criterion(predictions, labels.reshape(-1, 1).to(torch.float32))
        #acc = binary_accuracy(predictions, labels)
        
        optimizer.zero_grad()        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        #epoch_acc += acc.item()
    
    #epoch_loss /= train_len
    #epoch_acc /= train_len
    print(f'Epoch {epoch}: Loss={epoch_loss:.3f}')#, Accuracy={epoch_acc:.3f}')

In [None]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()

input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()


In [None]:
input, target.unsqueeze(0)

In [None]:
# Example of target with class probabilities
input = torch.randn(3, 1, requires_grad=True)
target = torch.randn(3, 1).softmax(dim=1)
output = loss(input, target)
output.backward()

In [None]:
input, target