## Download the data from
https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [None]:
#!pip install "numpy<2" --upgrade

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from collections import Counter
import re
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

In [None]:
# Choose the compute device once; later cells move the model and batches onto it.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

In [None]:
# NOTE: machine-specific absolute path -- point this at your local copy of the Kaggle CSV.
base_csv ='/Users/bilgesipal/PycharmProjects/boun_MIS48D/Lecture 8/IMDB Dataset.csv'
# Only the first 3000 reviews are used, so stop parsing after 3000 rows
# instead of loading the whole file and discarding most of it.
df = pd.read_csv(base_csv, nrows=3000)

In [None]:
# Raw review texts (features) and their sentiment labels as NumPy arrays.
X = df['review'].values
y = df['sentiment'].values

In [None]:
# Stratified split keeps the positive/negative ratio equal in both parts.
# random_state is fixed so the split -- and therefore the vocabulary, the
# training run and the reported metrics -- is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [None]:
# One pass replaces the previous three substitutions: `\W` covers both
# punctuation/symbols and whitespace, `\d` covers decimal digits.
_STRIP_RE = re.compile(r"[\W\d]+")


def preprocess_string(s):
    """Return *s* with punctuation, whitespace and digits removed.

    Only word characters (regex ``\\w``) that are not decimal digits are
    kept, i.e. letters and the underscore. Case is left untouched.

    Args:
        s: The string (typically a single whitespace-delimited token) to clean.

    Returns:
        The cleaned string; it may be empty if nothing survives.
    """
    return _STRIP_RE.sub('', s)

In [None]:
def tokenize(x_train, y_train, x_val, y_val, max_vocab=1000):
    """Build a vocabulary from the training reviews and integer-encode both splits.

    Each review is lower-cased, split on whitespace, and every token is
    cleaned with ``preprocess_string``. The vocabulary is built from the
    training split only and holds the ``max_vocab`` most frequent tokens that
    are neither empty nor English stop words. Token ids start at 1, so 0
    stays free for padding. Tokens outside the vocabulary are dropped.

    Args:
        x_train: Iterable of training review strings.
        y_train: Iterable of training labels; ``'positive'`` maps to 1,
            anything else to 0.
        x_val: Iterable of validation/test review strings.
        y_val: Iterable of validation/test labels (same mapping).
        max_vocab: Maximum number of tokens kept in the vocabulary.

    Returns:
        A tuple ``(train_ids, train_labels, val_ids, val_labels, vocab)``
        where ``*_ids`` are lists of lists of token ids, ``*_labels`` are
        NumPy arrays of 0/1, and ``vocab`` maps token -> id.
    """
    stop_words = set(stopwords.words('english'))

    def clean_words(sent):
        # Clean each token exactly once; callers filter what they need.
        return [preprocess_string(word) for word in sent.lower().split()]

    # Frequencies come from the training split only, so nothing about the
    # validation text leaks into the vocabulary.
    corpus = Counter(
        word
        for sent in x_train
        for word in clean_words(sent)
        if word and word not in stop_words
    )
    onehot_dict = {
        word: idx + 1
        for idx, (word, _) in enumerate(corpus.most_common(max_vocab))
    }

    def encode_reviews(reviews):
        return [
            [onehot_dict[word] for word in clean_words(sent) if word in onehot_dict]
            for sent in reviews
        ]

    def encode_labels(labels):
        return np.array([1 if label == 'positive' else 0 for label in labels])

    return (
        encode_reviews(x_train),
        encode_labels(y_train),
        encode_reviews(x_val),
        encode_labels(y_val),
        onehot_dict,
    )


In [None]:
# Replace the raw text/label arrays with their encoded versions and keep the vocabulary.
# NOTE: this rebinds x_train/x_test to lists of token ids, so the cell must not be
# re-run without re-running the split first.
x_train, y_train, x_test, y_test, vocab = tokenize(
    x_train=x_train,
    y_train=y_train,
    x_val=x_test,
    y_val=y_test,
)

In [None]:
def padding_(sentences, seq_len):
    """Left-pad (or truncate) each token-id sequence to exactly ``seq_len``.

    Sequences shorter than ``seq_len`` are right-aligned and padded with
    zeros on the left; longer sequences keep only their first ``seq_len``
    ids. Empty sequences become an all-zero row.

    Args:
        sentences: Sequence of token-id sequences.
        seq_len: Width of every output row.

    Returns:
        Integer NumPy array of shape ``(len(sentences), seq_len)``.
    """
    padded = np.zeros((len(sentences), seq_len), dtype=int)
    for row, tokens in zip(padded, sentences):
        kept = np.array(tokens)[:seq_len]
        if len(kept) == 0:
            continue
        # `row` is a view into `padded`, so this writes in place.
        row[seq_len - len(kept):] = kept
    return padded

In [None]:
# Bring every encoded review to a fixed length of 500 token ids.
x_train_pad, x_test_pad = (padding_(split, 500) for split in (x_train, x_test))

In [None]:
def _to_dataset(features, labels):
    """Wrap padded token ids and labels as int64 tensors in a TensorDataset."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long),
    )


train_data = _to_dataset(x_train_pad, y_train)
valid_data = _to_dataset(x_test_pad, y_test)

batch_size = 50

# Both loaders shuffle their samples on every pass.
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True)

In [None]:
# Sanity check: pull a single training batch and show the tensor shapes.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print(*(tensor.shape for tensor in (sample_x, sample_y)))


In [None]:
class SentimentRNN(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear -> sigmoid classifier.

    The prediction for a sequence is read from the last timestep only, so
    inputs are expected to be left-padded (real tokens at the end).

    Args:
        no_layers: Number of stacked LSTM layers.
        vocab_size: Number of embedding rows (vocabulary size incl. padding id).
        hidden_dim: LSTM hidden-state width.
        embedding_dim: Size of each token embedding.
        output_dim: Number of output units of the final linear layer.
        drop_prob: Dropout probability applied between LSTM layers.
    """

    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, output_dim=1, drop_prob=0.5):
        super().__init__()

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM layer. Inter-layer dropout only exists when there are at
        # least two layers; passing it for a single layer just triggers a
        # PyTorch warning, so it is disabled in that case.
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            no_layers,
            batch_first=True,
            dropout=drop_prob if no_layers > 1 else 0.0,
        )

        # Dropout applied to the LSTM output before the classifier
        self.dropout = nn.Dropout(0.3)

        # Fully-connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Sigmoid for binary output
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """Run a batch of token-id sequences through the network.

        Args:
            x: Integer tensor of shape ``(batch, seq_len)``.
            hidden: ``(h0, c0)`` tuple as produced by ``init_hidden``.

        Returns:
            ``(sig_out, hidden)`` where ``sig_out`` has shape ``(batch,)``
            and holds the sigmoid of the last output unit at the last
            timestep, and ``hidden`` is the LSTM's final ``(h, c)`` state.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)

        # Only the last timestep is used for the prediction, so the
        # classifier head runs on that slice alone rather than on every
        # timestep followed by discarding all but one.
        last_step = lstm_out[:, -1, :]

        out = self.dropout(last_step)
        out = self.fc(out)
        sig_out = self.sig(out)

        # Keep the last output unit -> shape (batch,)
        sig_out = sig_out[:, -1]

        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h0, c0)`` for ``batch_size`` sequences.

        The states are created on the same device and with the same dtype as
        the model's parameters, so this works wherever the model was moved.
        """
        reference = next(self.parameters())
        shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        c0 = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
        return (h0, c0)


In [None]:
# Model hyper-parameters.
embedding_dim = 64  # size of each token embedding vector
hidden_dim = 256  # LSTM hidden-state width
no_layers = 2  # number of stacked LSTM layers
output_dim = 1  # a single output unit
vocab_size = len(vocab) + 1  # one extra embedding row for the padding id

In [None]:
model = SentimentRNN(
    no_layers=no_layers,
    vocab_size=vocab_size,
    hidden_dim=hidden_dim,
    embedding_dim=embedding_dim,
    drop_prob=0.5,
)

# Move the parameters to the selected device (GPU when available).
model.to(device)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)


def acc(pred,label):
    """Count how many rounded predictions equal their labels.

    Args:
        pred: Tensor of probabilities; rounded to the nearest integer.
        label: Tensor of ground-truth labels.

    Returns:
        The number of matching positions as a Python int (a count, not a ratio).
    """
    rounded = torch.round(pred.squeeze())
    matches = rounded == label.squeeze()
    return matches.sum().item()

In [None]:
clip = 5  # maximum gradient norm used for clipping
epochs = 1
valid_loss_min = np.inf
# Per-epoch history of mean loss and accuracy
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")

    # ----- training -----
    train_losses = []
    train_acc = 0.0
    model.train()

    for batch_i, (inputs, labels) in enumerate(train_loader):
        inputs, labels = inputs.to(device), labels.to(device)

        # Every batch holds independent, shuffled reviews, so each one starts
        # from a fresh zero state sized to the actual batch. This also keeps
        # a final batch smaller than `batch_size` from crashing the LSTM.
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()
        output, h = model(inputs, h)

        # loss and backprop (output is already 1-D, one probability per review)
        loss = criterion(output, labels.float())
        loss.backward()
        train_losses.append(loss.item())

        # accuracy: acc() returns the number of correct predictions
        accuracy = acc(output, labels)
        train_acc += accuracy

        # clip gradients before the update to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()


        print(f"  {batch_i} batch - loss: {loss.item():.4f}, acc: {accuracy/len(labels):.4f}")

    # ----- validation -----
    val_losses = []
    val_acc = 0.0
    model.eval()

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            val_h = model.init_hidden(inputs.size(0))

            output, val_h = model(inputs, val_h)
            val_loss = criterion(output, labels.float())
            val_losses.append(val_loss.item())

            accuracy = acc(output, labels)
            val_acc += accuracy

    # Loss is averaged over batches, accuracy over individual samples.
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc / len(train_loader.dataset)
    epoch_val_acc = val_acc / len(valid_loader.dataset)

    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)

    # epoch summary
    print(f"  Train loss: {epoch_train_loss:.4f}  |  Val loss: {epoch_val_loss:.4f}")
    print(f"  Train acc:  {epoch_train_acc:.4f}  |  Val acc:  {epoch_val_acc:.4f}")


In [None]:
def predict_text(text):
    """Return the model's probability that *text* is a positive review.

    The text is lower-cased, split on whitespace and cleaned with
    ``preprocess_string`` -- the same steps used to build the vocabulary --
    then encoded, padded to 500 ids and fed through the model with a fresh
    hidden state. Words that are not in the vocabulary are ignored.

    Args:
        text: Raw review text.

    Returns:
        The sigmoid output of the model as a Python float.
    """
    # Lower-case first: the vocabulary holds lower-cased tokens only, so
    # capitalised words would otherwise be silently dropped.
    cleaned = (preprocess_string(word) for word in text.lower().split())
    word_seq = [vocab[word] for word in cleaned if word in vocab]

    pad_ = padding_([word_seq], 500)
    inputs = torch.tensor(pad_, dtype=torch.long).to(device)

    # Predict in eval mode (dropout off) without tracking gradients, then
    # restore whatever mode the model was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            h = model.init_hidden(1)
            output, h = model(inputs, h)
    finally:
        model.train(was_training)
    return output.item()

In [None]:
# Pick one review from the loaded frame and display its text.
index = 32
print(df.loc[index, 'review'])
print('=' * 70)

In [None]:

# Compare the stored label with the model's prediction for the chosen review.
print(f'Actual sentiment is  : {df["sentiment"][index]}')
print('='*70)
pro = predict_text(df['review'][index])
if pro > 0.5:
    status = "positive"
else:
    status = "negative"
    # Report the confidence in the predicted (negative) class.
    pro = 1 - pro
print(f'Predicted sentiment is {status} with a probability of {pro}')