In [1]:
import pandas as pd  # Data manipulation library
import numpy as np  # Numerical operations
import re  # Regular expressions for text processing
import torch  # PyTorch for deep learning
from sklearn.model_selection import train_test_split  # Splitting dataset
from transformers import AutoTokenizer  # Tokenizer from Hugging Face

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the IMDB reviews CSV (path is relative to the working directory)
DATASET_PATH = "./imdb_dataset.csv"
df = pd.read_csv(DATASET_PATH)

# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index; passing index_col=0 to read_csv would drop it -- confirm that
# nothing downstream relies on that column before changing it.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Data Cleaning
def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: lowercase the text, replace each HTML tag with a space,
    drop every character that is not an ASCII letter, digit, or whitespace,
    then collapse whitespace runs to a single space and trim both ends.

    Tags are replaced with a space (not removed outright) so that words
    separated only by markup, e.g. ``"end.<br /><br />Start"``, are not fused
    into a single word once the punctuation is stripped.

    Args:
        text: The review text to clean (a ``str``).

    Returns:
        The cleaned, lowercase string.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r"<.*?>", " ", text)  # Swap HTML tags for a space to keep word boundaries
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
    return re.sub(r"\s+", " ", text).strip()  # Squeeze the gaps left by removed tags

# Overwrite the 'review' column with the output of clean_text for each row
cleaned_reviews = df["review"].map(clean_text)
df["review"] = cleaned_reviews

# NOTE(review): a notebook cell renders only its final expression, so this
# preview is shown only if it is the last statement of its cell.
df.head()

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Load pre-trained BERT tokenizer


def tokenize_text(text, max_length=512):
    """Encode one text with the module-level BERT tokenizer.

    The text is truncated to ``max_length`` tokens and padded to that same
    length (``padding="max_length"``), so every call yields equally sized
    outputs.

    Args:
        text: The string to encode.
        max_length: Target sequence length used for both truncation and
            padding. Defaults to 512, the value that was previously
            hard-coded, so existing callers are unaffected.
            NOTE(review): values above 512 presumably exceed what
            bert-base-uncased accepts downstream -- verify before raising it.

    Returns:
        The tokenizer's encoding with PyTorch tensors
        (``return_tensors="pt"``); the notebook output lists its keys as
        ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    """
    return tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

# Encode each review and store the per-row encodings in a new 'tokens' column
df["tokens"] = df["review"].apply(tokenize_text)

df["tokens"]


0        [input_ids, token_type_ids, attention_mask]
1        [input_ids, token_type_ids, attention_mask]
2        [input_ids, token_type_ids, attention_mask]
3        [input_ids, token_type_ids, attention_mask]
4        [input_ids, token_type_ids, attention_mask]
                            ...                     
49995    [input_ids, token_type_ids, attention_mask]
49996    [input_ids, token_type_ids, attention_mask]
49997    [input_ids, token_type_ids, attention_mask]
49998    [input_ids, token_type_ids, attention_mask]
49999    [input_ids, token_type_ids, attention_mask]
Name: tokens, Length: 50000, dtype: object

In [None]:
# Splitting the dataset into training and testing sets
# train_texts, test_texts, train_labels, test_labels = train_test_split(
#     df["tokens"], df["sentiment"], test_size=0.2, random_state=42
# )

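# # Convert labels to torch tensors for deep learning.
# # NOTE: 'sentiment' holds the strings "positive"/"negative"; map them to integers
# # (e.g. 1/0) first, because torch.tensor cannot be built from string values.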
# train_labels = torch.tensor(train_labels.values)
# test_labels = torch.tensor(test_labels.values)

# print("Dataset ready for training!")