In [1]:
import torch
from torch import nn
import numpy as np

In [2]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace

In [3]:
docs = ['go india',
		'india india',
		'hip hip hurray',
		'jeetega bhai jeetega india jeetega',
		'bharat mata ki jai',
		'kohli kohli',
		'sachin sachin',
		'dhoni dhoni',
		'modi ji ki jai',
		'inquilab zindabad']

In [4]:
tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()

In [5]:
tokenizer.train_from_iterator(docs)

In [6]:
type(docs)

list

In [7]:
vocab = tokenizer.get_vocab()
print(vocab)

{'sachin': 7, 'ji': 13, 'bhai': 8, 'zindabad': 16, 'hurray': 11, 'jai': 4, 'india': 0, 'inquilab': 12, 'ki': 5, 'dhoni': 2, 'kohli': 6, 'go': 10, 'bharat': 9, 'modi': 15, 'jeetega': 1, 'mata': 14, 'hip': 3}


In [8]:
print(tokenizer.get_vocab_size())

17


In [9]:
from collections import Counter
word_counts = Counter(word for doc in docs for word in doc.split())
print(word_counts)

Counter({'india': 4, 'jeetega': 3, 'hip': 2, 'ki': 2, 'jai': 2, 'kohli': 2, 'sachin': 2, 'dhoni': 2, 'go': 1, 'hurray': 1, 'bhai': 1, 'bharat': 1, 'mata': 1, 'modi': 1, 'ji': 1, 'inquilab': 1, 'zindabad': 1})


In [10]:
# Convert texts to sequences of token IDs
sequences = [tokenizer.encode(doc).ids for doc in docs]

# Output sequences
print(sequences)

[[10, 0], [0, 0], [3, 3, 11], [1, 8, 1, 0, 1], [9, 14, 5, 4], [6, 6], [7, 7], [2, 2], [15, 13, 5, 4], [12, 16]]


In [11]:
import torch.nn.functional as F

In [12]:
# Step 1: Tokenize and Integer Encode the Text
# Create a vocabulary from the dataset
all_words = " ".join(docs).split()
vocab = {word: idx+1 for idx, (word, _) in enumerate(Counter(all_words).items())}

In [13]:
all_words

['go',
 'india',
 'india',
 'india',
 'hip',
 'hip',
 'hurray',
 'jeetega',
 'bhai',
 'jeetega',
 'india',
 'jeetega',
 'bharat',
 'mata',
 'ki',
 'jai',
 'kohli',
 'kohli',
 'sachin',
 'sachin',
 'dhoni',
 'dhoni',
 'modi',
 'ji',
 'ki',
 'jai',
 'inquilab',
 'zindabad']

In [14]:
vocab

{'go': 1,
 'india': 2,
 'hip': 3,
 'hurray': 4,
 'jeetega': 5,
 'bhai': 6,
 'bharat': 7,
 'mata': 8,
 'ki': 9,
 'jai': 10,
 'kohli': 11,
 'sachin': 12,
 'dhoni': 13,
 'modi': 14,
 'ji': 15,
 'inquilab': 16,
 'zindabad': 17}

In [15]:
# Integer encode the documents
encoded_docs = [[vocab[word] for word in doc.split()] for doc in docs]

In [16]:
encoded_docs

[[1, 2],
 [2, 2],
 [3, 3, 4],
 [5, 6, 5, 2, 5],
 [7, 8, 9, 10],
 [11, 11],
 [12, 12],
 [13, 13],
 [14, 15, 9, 10],
 [16, 17]]

In [17]:
# Step 2: Pad Sequences using torch.nn.functional.pad
# Find the maximum length of sequences for padding
max_len = max(len(seq) for seq in encoded_docs)

In [18]:
max_len

5

In [19]:
# Pad the sequences manually
padded_docs = [seq + [0] * (max_len - len(seq)) for seq in encoded_docs]
padded_docs

[[1, 2, 0, 0, 0],
 [2, 2, 0, 0, 0],
 [3, 3, 4, 0, 0],
 [5, 6, 5, 2, 5],
 [7, 8, 9, 10, 0],
 [11, 11, 0, 0, 0],
 [12, 12, 0, 0, 0],
 [13, 13, 0, 0, 0],
 [14, 15, 9, 10, 0],
 [16, 17, 0, 0, 0]]

In [20]:
# Convert to tensor
padded_docs_tensor = torch.tensor(padded_docs)

In [21]:
padded_docs_tensor

tensor([[ 1,  2,  0,  0,  0],
        [ 2,  2,  0,  0,  0],
        [ 3,  3,  4,  0,  0],
        [ 5,  6,  5,  2,  5],
        [ 7,  8,  9, 10,  0],
        [11, 11,  0,  0,  0],
        [12, 12,  0,  0,  0],
        [13, 13,  0,  0,  0],
        [14, 15,  9, 10,  0],
        [16, 17,  0,  0,  0]])

In [22]:
import pandas as pd

In [87]:
df = pd.read_csv("IMDB Dataset.csv", chunksize=30000)

In [88]:
df = df.get_chunk()

In [89]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [90]:
df.shape

(30000, 2)

In [91]:
df["sentiment"].unique()

array(['positive', 'negative'], dtype=object)

In [92]:
mapping_dict = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(mapping_dict)

In [93]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [94]:
import re
# Function to preprocess and tokenize reviews
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove any non-alphabetical characters
    return text.split()  # Tokenize by whitespace

In [95]:
# Apply preprocessing
df['processed_reviews'] = df['review'].apply(preprocess_text)

In [96]:
df.head()

Unnamed: 0,review,sentiment,processed_reviews
0,One of the other reviewers has mentioned that ...,1,"[one, of, the, other, reviewers, has, mentione..."
1,A wonderful little production. <br /><br />The...,1,"[a, wonderful, little, production, br, br, the..."
2,I thought this was a wonderful way to spend ti...,1,"[i, thought, this, was, a, wonderful, way, to,..."
3,Basically there's a family where a little boy ...,0,"[basically, theres, a, family, where, a, littl..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"[petter, matteis, love, in, the, time, of, mon..."


In [97]:
# Create a vocabulary (assign unique integer to each word)
all_words = [word for review in df['processed_reviews'] for word in review]
vocab = {word: idx+1 for idx, (word, _) in enumerate(Counter(all_words).items())}  # Start indexing from 1

In [98]:
# Function to convert text to integer sequence
def encode_reviews(reviews):
    return [[vocab.get(word, 0) for word in review] for review in reviews]  # 0 is used for unknown words

In [99]:
# Integer encode the reviews
encoded_reviews = encode_reviews(df['processed_reviews'])

In [100]:
print(encoded_reviews[:2])

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 3, 29, 30, 8, 31, 32, 33, 12, 34, 35, 36, 37, 38, 39, 2, 40, 41, 42, 43, 19, 44, 3, 45, 46, 47, 32, 21, 22, 48, 49, 50, 51, 3, 52, 53, 54, 55, 21, 50, 56, 57, 58, 26, 59, 60, 61, 62, 54, 40, 35, 22, 63, 43, 3, 64, 65, 2, 3, 66, 28, 67, 22, 68, 12, 20, 8, 22, 3, 69, 70, 60, 3, 71, 72, 73, 74, 75, 67, 76, 77, 78, 79, 80, 81, 82, 83, 2, 3, 84, 85, 86, 3, 87, 88, 89, 90, 37, 91, 92, 93, 94, 22, 48, 95, 78, 3, 96, 97, 80, 22, 98, 60, 99, 100, 101, 102, 103, 104, 105, 37, 106, 107, 108, 109, 110, 111, 37, 112, 113, 18, 114, 115, 116, 28, 117, 118, 119, 3, 120, 121, 2, 3, 50, 22, 122, 60, 3, 123, 8, 67, 124, 85, 4, 125, 126, 127, 128, 129, 130, 131, 51, 132, 133, 128, 134, 128, 135, 136, 137, 138, 3, 29, 13, 117, 139, 140, 31, 32, 20, 93, 141, 67, 34, 142, 117, 143, 119, 117, 34, 144, 51, 67, 145, 20, 117, 146, 147, 117, 148, 49, 149, 51, 12, 37, 150, 151, 60, 3, 95, 152, 2, 153, 40, 48, 1

In [101]:
# Set max_len to the maximum sequence length you want
max_len = max(len(seq) for seq in encoded_reviews)  # Get the maximum length of reviews

In [102]:
# Function to manually pad sequences
def pad_sequences(sequences, max_len):
    padded_sequences = []
    for seq in sequences:
        padded_seq = seq + [0] * (max_len - len(seq))  # Pad with zeros if sequence is shorter than max_len
        padded_sequences.append(padded_seq)
    return torch.tensor(padded_sequences).float()

In [103]:
# Apply padding
padded_reviews_tensor = pad_sequences(encoded_reviews, max_len)

In [104]:
# Show some padded sequences
print(padded_reviews_tensor[:2])

tensor([[  1.,   2.,   3.,  ...,   0.,   0.,   0.],
        [ 49., 191., 192.,  ...,   0.,   0.,   0.]])


In [105]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Use the last output for classification
        return out


# Model parameters
input_size = 1  # Single feature per token (since we're using integer encoding)
hidden_size = 128  # Number of hidden units in the RNN
output_size = 1  # Output size (binary classification)

# Create model instance
model = SimpleRNN(input_size, hidden_size, output_size)

In [106]:
from torch.utils.data import DataLoader, TensorDataset

# Convert sentiment labels to tensor
labels = torch.tensor(df['sentiment'].values)

In [107]:
# Create a DataLoader
dataset = TensorDataset(padded_reviews_tensor, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [108]:
# Define loss and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [109]:
# Function to calculate accuracy
def calculate_accuracy(predictions, labels):
    # Apply sigmoid to output logits to get probabilities
    predicted_probs = torch.sigmoid(predictions)
    # Convert probabilities to class labels (0 or 1) using a threshold of 0.5
    predicted_classes = (predicted_probs > 0.5).float()
    # Compare predicted classes to true labels and calculate accuracy
    correct_predictions = (predicted_classes == labels).float()
    accuracy = correct_predictions.sum() / len(correct_predictions)
    return accuracy

In [110]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for batch_data, batch_labels in dataloader:
        # Ensure labels are of type float32
        batch_labels = batch_labels.float()

        # Forward pass
        outputs = model(batch_data.unsqueeze(-1))  # Add feature dimension
        loss = criterion(outputs.squeeze(), batch_labels)  # Squeeze output to match label shape
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Calculate accuracy
        accuracy = calculate_accuracy(outputs.squeeze(), batch_labels)
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy.item()*100:.2f}%")

Epoch [1/10], Loss: 0.6935, Accuracy: 37.50%
Epoch [2/10], Loss: 0.6791, Accuracy: 68.75%
Epoch [3/10], Loss: 0.6997, Accuracy: 43.75%
Epoch [4/10], Loss: 0.6814, Accuracy: 56.25%
Epoch [5/10], Loss: 0.7072, Accuracy: 43.75%
Epoch [6/10], Loss: 0.7323, Accuracy: 31.25%
Epoch [7/10], Loss: 0.6996, Accuracy: 43.75%
Epoch [8/10], Loss: 0.6909, Accuracy: 62.50%
Epoch [9/10], Loss: 0.6709, Accuracy: 68.75%
Epoch [10/10], Loss: 0.8058, Accuracy: 18.75%
