In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [16]:
df=pd.read_csv('IMDB Dataset.csv')

In [17]:
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
df=df.replace('positive', 1)
df=df.replace('negative', 0)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [19]:
data=list(zip(df['review'], df['sentiment']))
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [20]:
# Sample IMDb dataset
class IMDBDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        tokens = self.tokenizer(text, truncation=True, padding='max_length', max_length=128, return_tensors='pt')
        return {'input_ids': tokens.input_ids.flatten(), 'attention_mask': tokens.attention_mask.flatten(), 'label': torch.tensor(label, dtype=torch.float)}

In [21]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocess the dataset
train_dataset = IMDBDataset(train_data, tokenizer)
test_dataset = IMDBDataset(test_data, tokenizer)

In [108]:
# Define the RNN model
class SimpleRNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        output, hidden = self.rnn(embedded)
        return self.fc(output[:,-1,:])

In [109]:
# Parameters
INPUT_DIM = tokenizer.vocab_size
EMBEDDING_DIM = 100
NUM_LAYERS = 1
HIDDEN_DIM = 256
OUTPUT_DIM = 1

In [110]:
# Initialize model
model = SimpleRNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, NUM_LAYERS)

# Data Loader
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training Preparation
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

In [111]:
# Training Loop
model.train()
for epoch in range(3):
    for batch in train_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        optimizer.zero_grad()

        output = model(input_ids, attention_mask)
        loss = criterion(output.squeeze(1), labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1} completed')

Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


In [114]:
# Evaluation (Inference) Example
model.eval()
with torch.no_grad():
    sample_texts = ["Excellent. This is a great film!", "I hate that charector. I did not like the way he was potrayed. And there was a lot of violence"]
    sample_encodings = tokenizer(sample_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
    sample_input_ids = sample_encodings['input_ids']
    sample_attention_mask = sample_encodings['attention_mask']
    predictions = model(input_ids=sample_input_ids, attention_mask=sample_attention_mask).squeeze(1)
    predictions = torch.sigmoid(predictions)
    print(predictions)  # Output probabilities for positive sentiment

tensor([0.5378, 0.5095])


In [115]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")
data = ["Excellent. This is a great film!", "I hate that charector. I did not like the way he was potrayed. And there was a lot of violence"]
sentiment_pipeline(data)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998705387115479},
 {'label': 'NEGATIVE', 'score': 0.9978775978088379}]

HW:
Read about how to train pretrained models of hugging face transformer!

https://huggingface.co/docs/transformers/en/training
