In [39]:
import portalocker
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer object from torchtext, created once at notebook scope.
tokenizer = get_tokenizer(tokenizer='basic_english')

# Load the IMDB train and test splits; each yields (label, text) samples.
train_iter, test_iter = IMDB(split=('train', 'test'))

In [40]:
# Pick the compute device once: CUDA when torch reports a usable GPU, otherwise the CPU.
is_cuda = torch.cuda.is_available()  # plain bool
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU not available, CPU used


In [41]:
from nltk.corpus import stopwords
import string

import nltk

# Set of individual punctuation characters; a token is dropped later only when the
# whole token equals one of these characters.
punct = set(string.punctuation)

# English stop words. The corpus is distributed separately from the nltk package, so
# fetch it on first use instead of failing with LookupError on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

# Define a function to preprocess the text data
def preprocess(data):
    """Turn one ``(label, text)`` sample into ``(label, tokens)``.

    Args:
        data: Indexable sample whose item 0 is the label and item 1 is the
            review text as a string.

    Returns:
        Tuple of the unchanged label and the token list that
        ``remove_stop_words`` produces from the lower-cased text.
    """
    return data[0], remove_stop_words(data[1].lower())

# Define a function to tokenize and remove stop words
def remove_stop_words(text):
    """Split text on whitespace and drop stop words and bare punctuation tokens.

    Args:
        text: String to tokenise; it is lower-cased before splitting.

    Returns:
        List of the remaining tokens, in their original order.

    Note:
        A token is removed only when the whole token is a member of
        ``stop_words`` or ``punct``; punctuation attached to a word
        (e.g. ``"great!"``) is kept as part of that token.
    """
    return [
        word
        for word in text.lower().split()
        if not (word in stop_words or word in punct)
    ]

# Lazily preprocess the training samples into (label, tokens) pairs.
# This is a one-shot iterator: once consumed it yields nothing further.
mapped_train_iter = (preprocess(sample) for sample in train_iter)

In [42]:
# Define an iterator that yields the tokens, this is then used to build vocab
def token_iterator(sentences):
    """Yield only the second item of each two-item pair.

    Args:
        sentences: Iterable of ``(label, tokens)`` pairs; the label is ignored.

    Yields:
        The tokens of every pair, lazily and in order.
    """
    yield from (tokens for _label, tokens in sentences)
        
# Define a collate function to pad the sequences to the same length
def collate_fn(batch):
    """Collate ``(label, text)`` samples into padded index and label tensors.

    Args:
        batch: Sequence of ``(label, text)`` pairs. ``text`` may be a list of
            tokens or a raw string; raw strings are tokenised with
            ``remove_stop_words`` so lookups use word tokens like the ones the
            vocabulary was built from.

    Returns:
        Tuple ``(texts, labels)``. ``texts`` is a ``LongTensor`` of shape
        ``(batch, longest_sequence)``, right-padded with index 0. ``labels`` is
        a ``LongTensor`` of zero-based class indices.
    """
    labels, texts = zip(*batch)
    indexed = []
    for text in texts:
        # Iterating a raw string would look up single characters, not words.
        tokens = remove_stop_words(text) if isinstance(text, str) else text
        indexed.append(torch.LongTensor([vocab[token] for token in tokens]))
    texts = pad_sequence(indexed, batch_first=True)
    # The IMDB datapipe labels samples 1 / 2; CrossEntropyLoss needs class indices
    # starting at 0, so shift them down by one.
    labels = torch.LongTensor(labels) - 1
    return texts, labels

In [43]:
vocab = build_vocab_from_iterator(token_iterator(mapped_train_iter), min_freq=1, specials=["<unk>"])
# Without a default index, looking up a token that never occurred in the training
# data raises a RuntimeError; map such tokens to "<unk>" instead.
vocab.set_default_index(vocab["<unk>"])

# Print the size of the vocabulary
print(f"Vocabulary size: {len(vocab)}")

Vocabulary size: 158726


In [44]:
# Feed the loader preprocessed (label, tokens) samples so collate_fn looks up word
# tokens rather than the characters of a raw review string. Materialising the split
# once also avoids re-tokenising every epoch and gives the loader a length.
train_data = [preprocess(sample) for sample in train_iter]
assert train_data, "train_iter yielded no samples"
train_loader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collate_fn)

## LSTM

In [45]:
# define LSTM-based model
class LSTMClassifier(nn.Module):
    """Embedding -> stacked LSTM -> linear head sequence classifier.

    Args:
        input_dim: Number of rows in the embedding table.
        embedding_dim: Width of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_dim: Number of values the linear head produces per sequence.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, text):
        """Score a batch of index sequences.

        Args:
            text: Integer tensor of token indices, batch dimension first
                (the LSTM is constructed with ``batch_first=True``).

        Returns:
            Output of the linear head applied to the final hidden state of
            the last LSTM layer.
        """
        token_vectors = self.embedding(text)
        _, (final_hidden, _) = self.lstm(token_vectors)
        # The hidden state is indexed by layer first; [-1] picks the last layer.
        return self.fc(final_hidden[-1])

In [46]:
from torch import optim
from torch.utils.data import DataLoader 

# Model hyperparameters
INPUT_DIM = len(vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
NUM_LAYERS = 2
OUTPUT_DIM = 2

# Build the classifier from the hyperparameters above
model = LSTMClassifier(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    output_dim=OUTPUT_DIM,
)

In [None]:
# Adam with its default settings over every model parameter; the training objective
# is cross-entropy on the model's output logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

#train_len = len(sentence_iterator(train_iter))

# define function to calculate accuracy
def binary_accuracy(preds, y):
    """Fraction of predictions in a batch that match the targets.

    Args:
        preds: Raw model outputs. Either one logit per sample, shape
            ``(batch,)`` or ``(batch, 1)``, or one logit per class, shape
            ``(batch, num_classes)``.
        y: Targets of shape ``(batch,)``: 0/1 values for the single-logit
            case, class indices for the per-class case.

    Returns:
        Zero-dimensional float tensor holding the batch accuracy.
    """
    if preds.dim() > 1 and preds.size(-1) > 1:
        # Per-class logits: the prediction is the arg-max class. Rounding a sigmoid
        # here would yield a (batch, num_classes) tensor that cannot be compared
        # element-wise with the (batch,) targets.
        predicted = preds.argmax(dim=-1)
    else:
        predicted = torch.round(torch.sigmoid(preds))
        if predicted.dim() > y.dim():
            # (batch, 1) -> (batch,) so the comparison is element-wise, not broadcast.
            predicted = predicted.squeeze(-1)
    correct = (predicted == y).float()
    accuracy = correct.sum() / len(correct)
    return accuracy


# train the model
NUM_EPOCHS = 10

# nn.Module.to moves the parameters in place, so the optimizer created earlier keeps
# tracking the same Parameter objects.
model.to(device)

for epoch in range(NUM_EPOCHS):
    epoch_loss = 0.0
    epoch_acc = 0.0
    # Counted while iterating: a loader over an iterable-style dataset may not support len().
    num_batches = 0
    model.train()
    for texts, labels in train_loader:
        # Inputs must live on the same device as the model.
        texts, labels = texts.to(device), labels.to(device)

        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        num_batches += 1

    # Average the per-batch metrics; the max() guards against an empty loader.
    epoch_loss /= max(num_batches, 1)
    epoch_acc /= max(num_batches, 1)
    print(f'Epoch {epoch}: Loss={epoch_loss:.3f}, Accuracy={epoch_acc:.3f}')

In [34]:
# Scratch check: print the first (label, text) sample from train_iter, then stop.
# Note: this leaves `label` and `text` bound at notebook scope.
for label, text in train_iter:
    print(label, text)
    break
    
# Loader over a materialised copy of the split; no collate_fn is passed, so
# DataLoader's default collation is used.
train_dataloader = DataLoader(list(train_iter), batch_size = 8, shuffle = True)

1 I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, e

In [23]:
# Scratch check: print the first batch produced by train_dataloader, then stop.
for batch in train_dataloader:
    print(batch)
    break

TypeError: object of type 'generator' has no len()

In [26]:
# Scratch check: materialise train_iter into a list to inspect what it yields.
list(train_iter)

[]

In [46]:
# Scratch check: display the train_iter object itself.
train_iter

ShardingFilterIterDataPipe

In [41]:
# Scratch check: print the first (label, tokens) pair from mapped_train_iter, then stop.
# mapped_train_iter is a one-shot iterator, so items already consumed are not replayed.
for x, y in mapped_train_iter:
    print(x, y)
    break