A **Recurrent Neural Network (RNN)** is a type of neural network designed to handle sequential data. Unlike traditional neural networks, RNNs have connections that form directed cycles, allowing information to persist.

**Where can they be used?**
Sequential Data Processing: Ideal for data where the order of elements matters (e.g., sentences, time series).
Capturing Dependencies: Can learn dependencies between different elements in a sequence


**Limitations**
Vanishing/Exploding Gradients: Difficulty in learning long-range dependencies.
Short-Term Memory: Struggle with maintaining information over long sequences.



In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from datasets import load_dataset

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
import string
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('all')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\

True

#### Seed for Reproducibility

In [2]:
SEED = 1234

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
np.random.seed(SEED)


In [3]:
dataset = load_dataset("imdb")

In [4]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})


#### Prepare the data

In [5]:
train = pd.DataFrame(dataset["train"])
test = pd.DataFrame(dataset["test"])

df = pd.concat([train, test], ignore_index=True)
df.head()


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [6]:
train_text, temp_text, train_labels, temp_labels= train_test_split(df['text'], df['label'], test_size=0.3, random_state=SEED,stratify=df['label'])
val_text, test_text, val_labels, test_labels= train_test_split(temp_text, temp_labels, test_size=0.5, random_state=SEED,stratify=temp_labels)

In [7]:
print(f"Training samples: {len(train_text)}")
print(f"Validation samples: {len(val_text)}")
print(f"Testing samples: {len(test_text)}")

Training samples: 35000
Validation samples: 7500
Testing samples: 7500


### Building a vocab

Before training the RNN, we need to convert the text data into numerical representations. We'll perform the following preprocessing steps:

Cleaning the Text

Tokenization

Removing Stopwords

Lemmatization

Building the Vocabulary

In [8]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'<.*?>', '', text) # Remove HTML tags
    text = re.sub(r'http\S+|www.\S+', '', text) # Remove URLs
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    text = text.strip() # Remove extra whitespace
    text = text.lower() # Lowercase

    #create tokens
    tokens = nltk.word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return tokens

train_tokens = train_text.apply(clean_text)
test_tokens = test_text.apply(clean_text)
val_tokens = val_text.apply(clean_text)

from collections import Counter


all_tokens = [token for tokens in train_tokens for token in tokens] #flatten the list of tokens
token_count = Counter(all_tokens) #count the frequency of each token
MAX_VOCAB_SIZE = 10000 #set the maximum vocab size to the number of tokens
vocab = ['<PAD>','<UNK>'] + [word for word, freq in token_count.most_common(MAX_VOCAB_SIZE - 2)] #create the vocab
wordtoidx = {word: idx for idx, word in enumerate(vocab)} #create the word to index mapping
unk_idx = wordtoidx['<UNK>']
pad_idx = wordtoidx['<PAD>']

In [9]:
#ENCODING THE TOKENS

def encode_tokens(tokens, wordtoidx,max_len = 200):
    encoded = [wordtoidx.get(token,unk_idx) for token in tokens] 
    if len(encoded) < max_len:
        encoded += [pad_idx] * (max_len - len(encoded))
    else:
        encoded = encoded[:max_len]
    return np.array(encoded)

#encode all the datasets
train_encoded = np.array([encode_tokens(tokens, wordtoidx) for tokens in train_tokens])
val_encoded = np.array([encode_tokens(tokens, wordtoidx) for tokens in val_tokens])
test_encoded = np.array([encode_tokens(tokens, wordtoidx) for tokens in test_tokens])

#convert them to numpy array
X_train = np.array(train_encoded.tolist())
X_valid = np.array(val_encoded.tolist())
X_test = np.array(test_encoded.tolist())

y_train = np.array(train_labels)
y_valid = np.array(val_encoded)
y_test = np.array(test_labels)

print(f"X_train shape: {X_train.shape}")
print(f"X_valid shape: {X_valid.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (35000, 200)
X_valid shape: (7500, 200)
X_test shape: (7500, 200)


#### Create a dataloader

In [10]:
class IMDBDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]
    


train_dataset = IMDBDataset(X_train, y_train)
val_dataset = IMDBDataset(X_valid, y_valid)
test_dataset = IMDBDataset(X_test, y_test)

#CREATING DATALOADERS
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

### RNN MODEL

In [11]:
#model architecture

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx):
        
        # SimpleRNN inherits from nn.Module and is structured to take a sequence of tokenized text data, 
        # pass it through an embedding layer and RNN layer, and then output a prediction.
        super(SimpleRNN, self).__init__() 

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        hidden = hidden.squeeze(0)
        out = self.fc(hidden)
        return out


VOCAB_SIZE = len(vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
PAD_IDX = pad_idx

model = SimpleRNN(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, PAD_IDX)


#define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

print(model)

SimpleRNN(
  (embedding): Embedding(10000, 100, padding_idx=0)
  (rnn): RNN(100, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)


### Training Model

In [12]:
def binary_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e., if you got 19/20 right, this returns 0.95
    """
    # Apply sigmoid to predictions
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()  # Convert into float for division 
    acc = correct.sum() / len(correct)
    return acc


In [13]:
def train_model(model, loader, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for texts, labels in tqdm(loader, desc="Training"):
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        predictions = model(texts).squeeze(1)
        
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

def evaluate_model(model, loader, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for texts, labels in tqdm(loader, desc="Evaluating"):
            texts, labels = texts.to(device), labels.to(device)
            
            predictions = model(texts).squeeze(1)
            
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

In [14]:
def train_model(model, loader, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for texts, labels in tqdm(loader, desc="Training"):
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        predictions = model(texts).squeeze(1)
        
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)

def evaluate_model(model, loader, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
        for texts, labels in tqdm(loader, desc="Evaluating"):
            texts, labels = texts.to(device), labels.to(device)
            
            predictions = model(texts).squeeze(1)
            
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(loader), epoch_acc / len(loader)


In [17]:
N_EPOCHS = 5
best_valid_loss = float('inf')

train_losses = []
train_accuracies = []
valid_losses = []
valid_accuracies = []

for epoch in range(N_EPOCHS):
    print(f'Epoch {epoch+1}/{N_EPOCHS}')
    
    train_loss, train_acc = train_model(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_model(model, val_loader, criterion)
    
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)
    valid_losses.append(valid_loss)
    valid_accuracies.append(valid_acc)
    
    # Save the model if validation loss decreases
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_rnn_model.pt')
        print("Saved Best Model")
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')


Epoch 1/5


Training: 100%|██████████| 2188/2188 [10:30<00:00,  3.47it/s]
Evaluating:   0%|          | 0/469 [00:00<?, ?it/s]


ValueError: Target size (torch.Size([16, 200])) must be the same as input size (torch.Size([16]))

In [None]:
#hi
