<a href="https://www.kaggle.com/code/arfin1403/sentiment-analyze-gru-tf-idf-vectorizer?scriptVersionId=281223788" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# imports

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
from nltk.corpus import stopwords 
from collections import Counter
import string
import re
import seaborn as sns
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader,TensorDataset
import torch.optim as optim

In [2]:
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


# Load the dataset

In [3]:
df=pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

In [4]:
df=df.sample(frac =.10)

In [5]:
df.head(2)

Unnamed: 0,review,sentiment
40423,This is one of the funniest movies I have ever...,positive
8906,Strained and humorless (especially in light of...,negative


# Data Preprocessing

In [6]:
#Lower Case
df["review"]=df["review"].str.lower()

In [7]:
import re

def remove_urls(text):
    url_pattern = r'(https?://\S+|www\.\S+)'
    return re.sub(url_pattern, '', text)

df["review"] = df["review"].apply(remove_urls)


In [8]:
#REMOVE PUNCTUATIONS AND EMOJI
import re

def remove_punctuations(text):
    text=re.sub(r"[^A-Za-z0-9\s]","",text)
    return text

In [9]:
df["review"] = df["review"].apply(remove_punctuations)

In [10]:
#REMOVE HTML
import re

def remove_html(text):
    text=re.sub(r'<.*?>', '', text)
    return text

In [11]:
df["review"] = df["review"].apply(remove_html)

In [12]:
#REMOVE STOPWORDS

def remove_stopword(text):
    stop_words = stopwords.words('english')  # Specify 'english' for English stopwords
    temp_text = word_tokenize(text)

    for word in temp_text:
        if word in stop_words:
            text=text.replace(word,"")
    return text

In [13]:
df["review"] = df["review"].apply(remove_stopword)

In [14]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def Stemming(text):
    ps = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_words = [ps.stem(token) for token in tokens]
    return ' '.join(stemmed_words)

df["review"] = df["review"].apply(Stemming)


In [15]:
df.head(3)

Unnamed: 0,review,sentiment
40423,one funnest move ever seen n opnon rob low h b...,positive
8906,stred humorless especilli light rr dubiou psyc...,negative
4644,shme tht th sere hsnt remster nd produc vdeo w...,positive


# **Changing the Target values to categorical value**

In [16]:
df["sentiment"].replace("positive",0,inplace=True)
df["sentiment"].replace("negative",1,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["sentiment"].replace("positive",0,inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["sentiment"].replace("negative",1,inplace=True)
  df["sentiment"].replace("negative",1,inplace=True)


In [17]:
Y=df["sentiment"]

# **Text Vectorization**

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
X =tf.fit_transform(df['review']).toarray()

# **Split the dataset**

In [19]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.20,random_state=0)

In [20]:
X_train.shape

(4000, 51098)

In [21]:
shape=X_train.shape

In [22]:
shape[1]

51098

In [23]:
X_test.shape

(1000, 51098)

In [24]:
type(X_train)

numpy.ndarray

In [25]:
type(Y_train)

pandas.core.series.Series

In [26]:
Y_train = Y_train.to_numpy()
Y_test = Y_test.to_numpy()

In [27]:
X_train.ndim

2

# **Create Tensor Datasets**

In [28]:
train_set = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(Y_train).float())
test_set = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(Y_test).float())

# **Data Loader (Load Data in Batches)**

In [29]:
train_loader = DataLoader(train_set, shuffle=True, batch_size=64)
test_loader = DataLoader(test_set, shuffle=True, batch_size=64)

# **GRU**

In [30]:
class GRUModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        
        # GRU Layer
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        
        # Fully Connected Layer
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # GRU forward pass
        out, _ = self.gru(x, h0)
        
        # Take last time step output (sequence end)
        out = self.fc(out[:, -1, :])
        return out


# **Hyperparameters**

In [31]:
input_dim = shape[1] # Updated to match TF-IDF feature size
hidden_dim = 128
output_dim = 1  # Binary classification (positive or negative sentiment)
num_layers = 1
num_epochs = 10
batch_size = 64
learning_rate = 0.001

# **Initialize model, criterion, and optimizer**

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GRUModel(input_dim, hidden_dim, output_dim, num_layers).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# **Training**

In [33]:
for epoch in range(num_epochs):
    model.train()
    for X_batch, Y_batch in train_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        
        # Add an additional dimension for the sequence length
        X_batch = X_batch.unsqueeze(1)
        
        outputs = model(X_batch)
        
        # Apply sigmoid activation to get probabilities
        outputs = torch.sigmoid(outputs.squeeze())
        
        # Compute the loss
        loss = criterion(outputs, Y_batch)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/10], Loss: 0.6316
Epoch [2/10], Loss: 0.4008
Epoch [3/10], Loss: 0.1254
Epoch [4/10], Loss: 0.0979
Epoch [5/10], Loss: 0.0558
Epoch [6/10], Loss: 0.0253
Epoch [7/10], Loss: 0.0117
Epoch [8/10], Loss: 0.0102
Epoch [9/10], Loss: 0.0066
Epoch [10/10], Loss: 0.0038


# **EVALUATION**

In [34]:
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for X_batch, Y_batch in test_loader:
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        
        # Add an additional dimension for the sequence length
        X_batch = X_batch.unsqueeze(1)
        
        outputs = model(X_batch)
        predicted = (torch.sigmoid(outputs.squeeze()) > 0.5).float()
        total += Y_batch.size(0)
        correct += (predicted == Y_batch).sum().item()
    
    accuracy = correct / total
    print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 81.70%
