In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import re


In [9]:
# Load the sampled training reviews and convert the raw polarity labels
# (1 = negative, 2 = positive) into 0/1 targets.
# NOTE(review): this is a machine-specific absolute path; point DATA_PATH at
# your local copy (ideally relative to a configurable data directory).
DATA_PATH = r"C:\Users\abhis\OneDrive\Desktop\week4\Sequence Modelling Basics\data\cleaned_sampled_train.csv"
LABEL_MAP = {1: 0, 2: 1}  # map labels to 0 and 1 for BCE loss

df = pd.read_csv(DATA_PATH)

# Keep only rows whose review text is a real string. Missing values are read
# as float NaN, so this single filter also drops rows with no review text.
# .copy() gives an independent frame so the column assignments below (and in
# later cells) never write to a slice of the raw frame.
is_text = df['cleaned_review'].apply(lambda x: isinstance(x, str))
df = df[is_text].copy()

# Series.map() turns every value missing from the mapping into NaN without
# complaint; fail loudly instead of letting NaN labels reach the loss.
unexpected_labels = set(df['polarity'].unique()) - set(LABEL_MAP)
if unexpected_labels:
    raise ValueError(
        f"Unexpected polarity values (expected 1 or 2): {sorted(unexpected_labels, key=str)}"
    )
df['polarity'] = df['polarity'].map(LABEL_MAP)

# Check unique values after mapping
print("Unique polarity values:", df['polarity'].unique())

# Check dtype (should be int or float before tensor conversion)
print("Polarity dtype:", df['polarity'].dtype)

print(df.head())
print(f"Dataset size: {len(df)}")


Unique polarity values: [0 1]
Polarity dtype: int64
                                      cleaned_review  \
0  the light bulb does not light anything, has a ...   
1  i purchased the flowtron bf 190 replacement bu...   
2  it doesn't work well in the hamilton beech sin...   
3  it took me forever to get through this book. i...   
4  this was a little of a deception, is smaller t...   

                      title  polarity  
0       PIAA SUPER LED BULB         0  
1               unsatisfied         0  
2  Refillable Coffee Filter         0  
3         Not to our liking         0  
4               Not so good         0  
Dataset size: 799998


In [10]:
def simple_tokenizer(text):
    """Lower-case ``text`` and split it on whitespace.

    Anything that is not a ``str`` (e.g. ``None`` or a float NaN) produces an
    empty token list.
    """
    if not isinstance(text, str):
        return []
    # A bare split() already ignores leading/trailing whitespace.
    return text.lower().split()


In [11]:
def simple_tokenizer(text):
    """Split a review into lower-cased, whitespace-delimited tokens.

    NOTE: this redefines (and therefore shadows) the ``simple_tokenizer``
    declared in the previous cell. The non-string guard is kept here so both
    definitions behave the same regardless of which cell ran last.

    Args:
        text: The review text.

    Returns:
        list[str]: The tokens, or ``[]`` when ``text`` is not a string
        (for example ``None`` or a float NaN from a missing review).
    """
    if not isinstance(text, str):
        return []
    text = text.lower().strip()
    tokens = text.split()
    return tokens

# Tokenise every cleaned review and store the token lists next to the text.
tokenized_reviews = df['cleaned_review'].apply(simple_tokenizer)
df['tokens'] = tokenized_reviews

# Quick look at the first few token lists.
print(df['tokens'].head())


0    [the, light, bulb, does, not, light, anything,...
1    [i, purchased, the, flowtron, bf, 190, replace...
2    [it, doesn't, work, well, in, the, hamilton, b...
3    [it, took, me, forever, to, get, through, this...
4    [this, was, a, little, of, a, deception,, is, ...
Name: tokens, dtype: object


In [12]:
from collections import Counter

# Flatten the per-review token lists into one corpus-wide list and count
# how often each token occurs.
all_tokens = []
for doc_tokens in df['tokens']:
    all_tokens.extend(doc_tokens)
token_counts = Counter(all_tokens)

# Special tokens: padding and out-of-vocabulary placeholder.
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

# Cap the vocabulary at 20,000 entries; two of those slots are taken by the
# special tokens, so only the (vocab_size - 2) most frequent words are kept.
vocab_size = 20000
most_common_tokens = token_counts.most_common(vocab_size - 2)

# Word -> index lookup: 0 and 1 are the special tokens, real words start at 2
# in descending frequency order.
word2idx = {PAD_TOKEN: 0, UNK_TOKEN: 1}
word2idx.update(
    (word, position)
    for position, (word, _count) in enumerate(most_common_tokens, start=2)
)

print(f"Vocabulary size (including PAD & UNK): {len(word2idx)}")

# Map tokens to indices, replace unknown tokens with UNK index
def tokens_to_indices(tokens):
    """Translate a list of tokens into their vocabulary indices.

    Looks each token up in the module-level ``word2idx``; tokens that are not
    in the vocabulary receive the index of ``UNK_TOKEN``.
    """
    unk_index = word2idx[UNK_TOKEN]
    indices = []
    for token in tokens:
        indices.append(word2idx.get(token, unk_index))
    return indices

# Convert every token list into its list of vocabulary indices.
indexed_reviews = df['tokens'].apply(tokens_to_indices)
df['indexed_tokens'] = indexed_reviews

print(df['indexed_tokens'].head())


Vocabulary size (including PAD & UNK): 20000
0    [2, 359, 3692, 107, 16, 359, 3498, 40, 5, 1024...
1    [4, 201, 2, 1, 1, 1, 717, 3692, 19, 210, 10101...
2    [10, 160, 130, 91, 11, 2, 6605, 1, 521, 2809, ...
3    [10, 297, 58, 1814, 6, 51, 146, 8, 131, 10, 10...
4    [8, 14, 5, 101, 7, 5, 1, 9, 1122, 64, 4, 1, 39...
Name: indexed_tokens, dtype: object


In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length for every review; adjust based on review length stats.
max_len = 150

# padding='post' / truncating='post': shorter reviews get trailing 0s (the PAD
# token index), longer reviews keep only their first max_len indices.
padded_matrix = pad_sequences(
    df['indexed_tokens'],
    maxlen=max_len,
    padding='post',
    truncating='post',
)

# Store one fixed-length row per review in the frame.
df['padded_tokens'] = list(padded_matrix)

print(df['padded_tokens'].head())
print(f"Padded sequences shape example: {df['padded_tokens'].iloc[0].shape}")


0    [2, 359, 3692, 107, 16, 359, 3498, 40, 5, 1024...
1    [4, 201, 2, 1, 1, 1, 717, 3692, 19, 210, 10101...
2    [10, 160, 130, 91, 11, 2, 6605, 1, 521, 2809, ...
3    [10, 297, 58, 1814, 6, 51, 146, 8, 131, 10, 10...
4    [8, 14, 5, 101, 7, 5, 1, 9, 1122, 64, 4, 1, 39...
Name: padded_tokens, dtype: object
Padded sequences shape example: (150,)


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset
class ReviewDataset(Dataset):
    """Map-style dataset pairing token-id sequences with binary labels.

    Args:
        texts: Rectangular collection of equal-length integer sequences
            (list of lists, list of 1-D arrays, or a 2-D array). Stored as a
            ``torch.long`` tensor.
        labels: One label per sequence. May be a pandas Series, a NumPy array
            or a plain Python sequence; values are taken positionally and
            stored as ``torch.float32`` (float targets for BCE loss).

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels):
        # Build one NumPy array first: converting a Python list of per-row
        # arrays directly with torch.tensor is much slower.
        self.texts = torch.tensor(np.asarray(texts), dtype=torch.long)
        # np.asarray handles Series, ndarray and list alike, so callers are
        # not forced to pass a pandas object exposing `.values`.
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.float32)

        # Catch misaligned inputs here rather than as an IndexError (or a
        # silently truncated dataset) somewhere inside the training loop.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of (sequence, label) pairs."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair at position ``idx``."""
        return self.texts[idx], self.labels[idx]


# Prepare Dataset
dataset = ReviewDataset(df['padded_tokens'].tolist(), df['polarity'])

# Create DataLoader
batch_size = 128
# Seed a dedicated generator so the shuffled batch order is the same after
# every Restart & Run All, without touching torch's global RNG state.
SEED = 42
shuffle_rng = torch.Generator().manual_seed(SEED)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_rng,
)

# Inspect a sample batch
sample_batch = next(iter(dataloader))
print("Sample batch shapes:")
print("Texts:", sample_batch[0].shape)
print("Labels:", sample_batch[1].shape)


Sample batch shapes:
Texts: torch.Size([128, 150])
Labels: torch.Size([128])
