# Movie Ratings Sentiment Analysis

### **Import Libraries**

In [1]:
import random
import numpy as np
import pandas as pd
import torch
import nltk
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm

from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel

In [2]:
# Fetch the NLTK stopword corpus; the "Stopwords" preprocessing step reads it
# through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [21]:
# Fetch the WordNet corpus; the "Lemmatization" step builds a WordNetLemmatizer
# on top of it.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    The same value is handed, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and PyTorch's
    CUDA generator.

    Args:
        seed: Integer seed forwarded unchanged to each library.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

def count_param(module, trainable=False):
    """Count the scalar parameters held by a module.

    Args:
        module: Object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``).
        trainable: When true, only parameters whose ``requires_grad`` flag
            is set are counted; otherwise all parameters are counted.

    Returns:
        The summed ``numel()`` of the selected parameters.
    """
    params = module.parameters()
    if trainable:
        # Narrow the iterator to parameters that receive gradients.
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)

def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            that each carry an ``'lr'`` entry.

    Returns:
        The ``'lr'`` value of the first group, or ``None`` when there are
        no parameter groups.
    """
    # Only the first group is consulted; any later groups are ignored.
    return next((group['lr'] for group in optimizer.param_groups), None)

def metrics_to_string(metric_dict):
    """Render a metrics mapping as a single space-separated line.

    Args:
        metric_dict: Mapping of metric name to numeric value.

    Returns:
        A string of ``name:value`` pairs in the mapping's iteration order,
        each value formatted with two decimals (e.g. ``"acc:0.91 f1:0.88"``).
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [4]:
# Set random seed: fixes the Python, NumPy and PyTorch generators (see set_seed)
# so later stochastic steps can be repeated.
set_seed(26012025)

In [32]:
# Both the tokenizer and the encoder come from the same pretrained checkpoint.
# Note that no separate BertConfig object is created here.
PRETRAINED_NAME = 'bert-base-uncased'

# Tokenizer matching the checkpoint's vocabulary.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

# BertModel is instantiated here, not BertForSequenceClassification.
model = BertModel.from_pretrained(PRETRAINED_NAME)

In [6]:
# Display the module tree of the loaded BERT model.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

## Data Acquisition

### Load Dataset

In [7]:
import kagglehub

# Kaggle handle of the IMDB movie-ratings sentiment dataset.
DATASET_HANDLE = "yasserh/imdb-movie-ratings-sentiment-analysis"

# dataset_download returns the location of the downloaded files, printed below.
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/imdb-movie-ratings-sentiment-analysis?dataset_version_number=1...


100%|██████████| 20.6M/20.6M [00:00<00:00, 138MB/s] 

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/yasserh/imdb-movie-ratings-sentiment-analysis/versions/1


In [8]:
# movie.csv sits directly inside the downloaded dataset directory.
dataset_path = "{}/movie.csv".format(path)

# Load the reviews into a DataFrame.
df = pd.read_csv(dataset_path)

# Preview the first rows.
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Data Cleaning

In [9]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [10]:
# Summary statistics of the numeric columns (only `label` here); a mean close
# to 0.5 indicates the two classes are nearly balanced.
df.describe()

Unnamed: 0,label
count,40000.0
mean,0.499525
std,0.500006
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


#### Duplicates

In [11]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
print("Total Duplicates: {}".format(df.duplicated().sum()))

# keep=False flags every member of a duplicate group, so this listing contains
# the first occurrences as well as their repeats.
duplicates = df.loc[df.duplicated(keep=False)]
print(duplicates)

Total Duplicates: 277
                                                    text  label
202    "Go Fish" garnered Rose Troche rightly or wron...      0
253    A fun romp...a lot of good twists and turns! (...      1
266    Mean spirited, and down right degrading adapta...      0
339    (Spoilers)<br /><br />Oh sure it's based on Mo...      0
654    You do realize that you've been watching the E...      0
...                                                  ...    ...
39229  I Enjoyed Watching This Well Acted Movie Very ...      1
39246  You do realize that you've been watching the E...      0
39508  (Spoilers)<br /><br />Oh sure it's based on Mo...      0
39681  It's been a long time since I last saw a movie...      0
39746  Goodnight, Mister Tom begins in an impossibly ...      0

[546 rows x 2 columns]


In [12]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# The result is rebound to `df` instead of mutating with inplace=True, so the
# cell reads as a plain transformation. The original index labels are kept
# (no reset), so gaps remain where rows were removed.
df = df.drop_duplicates()

# Sanity check: no duplicated rows should remain (expected output: 0).
print(df.duplicated().sum())

0


#### Missing Values

In [13]:
# Check for missing values in each column: isnull() marks missing cells and
# sum() totals them per column, so all zeros means nothing needs imputing.
print(df.isnull().sum())

text     0
label    0
dtype: int64


## Data Preprocessing

### Tokenization

In [34]:
# Split every review into tokens with the BERT tokenizer loaded earlier and
# store the resulting list of strings in a new `token` column.
df['token'] = df['text'].apply(tokenizer.tokenize)

In [35]:
# Inspect the tokenized reviews.
df['token']

Unnamed: 0,token
0,"[i, grew, up, (, b, ., 1965, ), watching, and,..."
1,"[when, i, put, this, movie, in, my, dvd, playe..."
2,"[why, do, people, who, do, not, know, what, a,..."
3,"[even, though, i, have, great, interest, in, b..."
4,"[im, a, die, hard, dad, ##s, army, fan, and, n..."
...,...
39995,"["", western, union, "", is, something, of, a, f..."
39996,"[this, movie, is, an, incredible, piece, of, w..."
39997,"[my, wife, and, i, watched, this, movie, becau..."
39998,"[when, i, first, watched, flat, ##liner, ##s, ..."


### Stopwords

In [36]:
from nltk.corpus import stopwords

# English stopword list held as a set for fast membership tests
# (pass a different language name to stopwords.words to change it).
stop_words = set(stopwords.words('english'))

# Overwrite `token` with the same lists minus any token found in stop_words.
df['token'] = df['token'].apply(
    lambda tokens: [tok for tok in tokens if tok not in stop_words]
)

In [37]:
# Inspect the token lists after stopword removal.
df['token']

Unnamed: 0,token
0,"[grew, (, b, ., 1965, ), watching, loving, thu..."
1,"[put, movie, dvd, player, ,, sat, coke, chips,..."
2,"[people, know, particular, time, past, like, f..."
3,"[even, though, great, interest, biblical, movi..."
4,"[im, die, hard, dad, ##s, army, fan, nothing, ..."
...,...
39995,"["", western, union, "", something, forgotten, c..."
39996,"[movie, incredible, piece, work, ., explores, ..."
39997,"[wife, watched, movie, plan, visit, sicily, st..."
39998,"[first, watched, flat, ##liner, ##s, ,, amazed..."


### Lemmatization

In [38]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Lemmatize each remaining token and keep the result in a separate `lemmas`
# column; `token` itself is left untouched.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so it
# presumably treats every token as a noun. That would explain why forms such as
# "watching" come through unchanged. Confirm this is intended.
df['lemmas'] = df['token'].apply(
    lambda tokens: [lemmatizer.lemmatize(tok) for tok in tokens]
)

In [39]:
# Inspect the lemmatized token lists.
df['lemmas']

Unnamed: 0,lemmas
0,"[grew, (, b, ., 1965, ), watching, loving, thu..."
1,"[put, movie, dvd, player, ,, sat, coke, chip, ..."
2,"[people, know, particular, time, past, like, f..."
3,"[even, though, great, interest, biblical, movi..."
4,"[im, die, hard, dad, ##s, army, fan, nothing, ..."
...,...
39995,"["", western, union, "", something, forgotten, c..."
39996,"[movie, incredible, piece, work, ., explores, ..."
39997,"[wife, watched, movie, plan, visit, sicily, st..."
39998,"[first, watched, flat, ##liner, ##s, ,, amazed..."


## Feature Engineering

In [40]:
# Length of each review's `token` list. At this point `token` holds the
# tokenizer output after stopword removal, so the count covers those tokens
# (punctuation and '##' pieces included), not whitespace-separated words.
df['word_count'] = df['token'].map(len)

In [41]:
# Inspect the per-review token counts.
df['word_count']

Unnamed: 0,word_count
0,122
1,253
2,126
3,40
4,158
...,...
39995,433
39996,35
39997,183
39998,154


## Split Dataset

In [None]:
# Split the dataset into train (80%), validation (10%) and test (10%).

from sklearn.model_selection import train_test_split

# NOTE(review): this cell previously read df['fitur'], but no column of that
# name is created anywhere in the notebook, so it raised KeyError. The raw
# review text is used as the model input instead. Confirm this is the intended
# feature column.
FEATURE_COLUMN = 'text'

# Step 1: keep 80% for training and hold out the remaining 20%.
# Stratifying on the label preserves the class balance in every subset.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    df[FEATURE_COLUMN],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Step 2: split the hold-out in half, giving 10% validation and 10% test
# of the full dataset.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=42,
    stratify=y_holdout,
)

print("Train set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

## Model

### Test Pretrained Model

In [43]:
# Smoke test: push one hand-written review through the pretrained encoder.
text = "This movie was absolutely amazing! I loved every minute of it."

# Encode the review as PyTorch tensors.
inputs = tokenizer(text, return_tensors="pt")

# Forward pass with gradient tracking switched off.
with torch.no_grad():
    outputs = model(**inputs)

# Final-layer hidden states returned by the model.
last_hidden_state = outputs.last_hidden_state

# Average pooling over dim=1 (presumably the token axis) to get one vector
# for the whole sentence.
sentence_embedding = last_hidden_state.mean(dim=1)

# Demonstration only: the embedding is collapsed to a single number and
# compared with 0.5. No classifier has been trained, so the printed label
# carries no real sentiment information.
sentiment_score = sentence_embedding.mean()

print("Positive sentiment" if sentiment_score > 0.5 else "Negative sentiment")

# Note: a classifier trained on a labelled dataset is needed for meaningful
# predictions. This cell only shows how to obtain a text representation from
# BERT that such a classifier could take as input.

# Note: This is a basic example. You would typically train a classifier on a labelled dataset
# to get meaningful sentiment predictions. The above code just shows how to get a text representation
# from BERT which can then be used as input for a classifier.

Negative sentiment


### Fine Tuning Model

# -----