# Sentiment Analysis : NLP

#### References : 
* Paper : https://aclanthology.org/D14-1181/
* Data : https://www.kaggle.com/datasets/bittlingmayer/amazonreviews

#### Dependancy : 

* pip install bz2file

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd
import random
import bz2
# Preprocess text data

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

from collections import Counter

from tqdm import tqdm
tqdm.pandas()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
print(torch.cuda.get_device_name(0))

cuda
NVIDIA GeForce RTX 2060


In [3]:
# Read data
def get_data_and_labels(filename):
    labels = []
    texts = []
    for row in bz2.BZ2File(filename):
        x = row.decode("utf-8")
        labels.append(int(x[9]) - 1)
        texts.append(x[10:].strip())
    labels = labels[:int(len(labels) * 0.01)]
    texts = texts[:int(len(texts) * 0.01)]
    return np.array(labels), texts

train_labels, train_texts = get_data_and_labels("../dataset/amazon-reviews-sentiment-analysis/train.ft.txt.bz2")
test_labels, test_texts = get_data_and_labels("../dataset/amazon-reviews-sentiment-analysis/test.ft.txt.bz2")

# converting into dataframe 
train_df = pd.DataFrame(zip(train_texts, train_labels), columns=["text", "label"])
test_df = pd.DataFrame(zip(test_texts, test_labels), columns=["text", "label"])

In [4]:
display(train_df.sample(5))
print(train_df.label.value_counts())

Unnamed: 0,text,label
1934,"A booklet- not a textbook!: At 60 pages,this i...",0
20230,A good book for understand higher dimensions: ...,1
203,Not Like the Old Formula: The formulation of t...,0
16172,GoFit 75cm Pro Stability Ball: The worst purch...,0
8170,This movie stinks!: This movie proves that hol...,0


label
1    18180
0    17820
Name: count, dtype: int64


In [5]:
display(test_df.sample(5))
print(test_df.label.value_counts())

Unnamed: 0,text,label
1438,A Lesson in Human Tragedy: This piece is ultim...,1
823,Intriguing beginning and then schmaltz: I won'...,0
390,"Beyond the mega-hit, preview of better things ...",1
2595,Very disappointed!: Very disappointed with thi...,0
851,"Worse than ""The Quest"": The beginning of the f...",0


label
1    2049
0    1951
Name: count, dtype: int64


In [6]:
stop_words = stopwords.words('english')
stemmer = PorterStemmer()

In [7]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return tokens

In [8]:
train_df['tokens'] = train_df['text'].progress_apply(preprocess_text)
test_df['tokens'] = test_df['text'].progress_apply(preprocess_text)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36000/36000 [00:42<00:00, 837.66it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:04<00:00, 836.43it/s]


In [9]:
# Build vocabulary (afterwords the tensors made out of this need to be converted on CUDA device otherwise it will raise an error)
word_to_idx = {}
for tokens in tqdm(train_df['tokens']):
    for token in tokens:
        if token not in word_to_idx:
            word_to_idx[token] = len(word_to_idx)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36000/36000 [00:00<00:00, 158173.82it/s]


In [10]:
# Convert tokens to numerical indices wrt to word_to_idx
train_df['indices'] = train_df['tokens'].progress_apply(lambda x: [word_to_idx[token] for token in x])
test_df['indices'] = test_df['tokens'].progress_apply(lambda x: [word_to_idx[token] if token in word_to_idx.keys() else 17 for token in x])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36000/36000 [00:00<00:00, 100103.38it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:00<00:00, 82506.16it/s]


In [11]:
train_df.head()

Unnamed: 0,text,label,tokens,indices
0,Stuning even for the non-gamer: This sound tra...,1,"[stune, even, non-gam, :, sound, track, beauti...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,The best soundtrack ever to anything.: I'm rea...,1,"[best, soundtrack, ever, anyth, ., :, 'm, read...","[24, 41, 23, 42, 17, 3, 43, 44, 45, 46, 47, 24..."
2,Amazing!: This soundtrack is my favorite music...,1,"[amaz, !, :, soundtrack, favorit, music, time,...","[82, 7, 3, 41, 83, 19, 84, 71, 85, 17, 86, 87,..."
3,Excellent Soundtrack: I truly like this soundt...,1,"[excel, soundtrack, :, truli, like, soundtrack...","[135, 41, 3, 136, 109, 41, 137, 138, 18, 19, 1..."
4,"Remember, Pull Your Jaw Off The Floor After He...",1,"[rememb, ,, pull, jaw, floor, hear, :, 've, pl...","[178, 71, 179, 180, 181, 182, 3, 94, 20, 18, 7..."


In [12]:
# Pad sequences
max_len = max(len(seq) for seq in train_df['indices'])
train_df['padded'] = train_df['indices'].apply(lambda x: x + [0]*(max_len-len(x)))
test_df['padded'] = test_df['indices'].apply(lambda x: x + [0]*(max_len-len(x)))

In [13]:
train_df.head()

Unnamed: 0,text,label,tokens,indices,padded
0,Stuning even for the non-gamer: This sound tra...,1,"[stune, even, non-gam, :, sound, track, beauti...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,The best soundtrack ever to anything.: I'm rea...,1,"[best, soundtrack, ever, anyth, ., :, 'm, read...","[24, 41, 23, 42, 17, 3, 43, 44, 45, 46, 47, 24...","[24, 41, 23, 42, 17, 3, 43, 44, 45, 46, 47, 24..."
2,Amazing!: This soundtrack is my favorite music...,1,"[amaz, !, :, soundtrack, favorit, music, time,...","[82, 7, 3, 41, 83, 19, 84, 71, 85, 17, 86, 87,...","[82, 7, 3, 41, 83, 19, 84, 71, 85, 17, 86, 87,..."
3,Excellent Soundtrack: I truly like this soundt...,1,"[excel, soundtrack, :, truli, like, soundtrack...","[135, 41, 3, 136, 109, 41, 137, 138, 18, 19, 1...","[135, 41, 3, 136, 109, 41, 137, 138, 18, 19, 1..."
4,"Remember, Pull Your Jaw Off The Floor After He...",1,"[rememb, ,, pull, jaw, floor, hear, :, 've, pl...","[178, 71, 179, 180, 181, 182, 3, 94, 20, 18, 7...","[178, 71, 179, 180, 181, 182, 3, 94, 20, 18, 7..."


In [14]:
# Convert to tensors
X_train = torch.tensor(np.array(train_df['padded'].tolist()))
y_train = torch.tensor(np.array(train_df['label'].tolist())).long()

X_test = torch.tensor(np.array(test_df['padded'].tolist()))
y_test = torch.tensor(np.array(test_df['label'].tolist())).long()

 ### Building a model

In [15]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dims, n_filters, filter_sizes, output_dim, dropout, pad_idx):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dims, padding_idx=pad_idx) 
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size= (filter_sizes[0], embedding_dims))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size= (filter_sizes[1], embedding_dims))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size= (filter_sizes[2], embedding_dims))
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, text):
        #text = [batch size, sent len]
        embedded = self.embedding(text)
        #embedded = [batch size, sent len, emb dim]
        embedded = embedded.unsqueeze(1)
        #embedded = [batch size, 1, sent len, emb dim]
        convd_0 = F.relu(self.conv_0(embedded).squeeze(3))
        convd_1 = F.relu(self.conv_1(embedded).squeeze(3))        
        convd_2 = F.relu(self.conv_2(embedded).squeeze(3))  
        
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        pooled_0 = F.max_pool1d(convd_0, convd_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(convd_1, convd_1.shape[2]).squeeze(2)        
        pooled_2 = F.max_pool1d(convd_2, convd_2.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
        return self.fc(cat)

In [16]:
# Define model hyperparameters
vocab_size = len(word_to_idx)
embedding_dims = 100
n_filters = 100
filter_sizes = [3, 4, 5]
output_dim = 2
dropout = 0.5
pad_idx = 0
batch_size = 32
learning_rate = 0.001
num_epochs = 10

In [17]:
X_train = X_train.to(device)
y_train = y_train.to(device)
X_test = X_test.to(device)
y_test = y_test.to(device)

train_data = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [18]:
# Instantiate model and move to device
model = CNN(vocab_size, embedding_dims, n_filters, filter_sizes, output_dim, dropout, pad_idx).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [19]:
from tqdm import tqdm

In [20]:
for epoch in range(num_epochs):
    train_loss = 0
    train_correct = 0
    train_total = 0
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        train_total += labels.size(0)
        train_correct += (predicted == labels).sum().item()
    train_acc = 100 * train_correct / train_total

    test_loss = 0
    test_correct = 0
    test_total = 0
    model.eval()
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()
    test_acc = 100 * test_correct / test_total

    print(f"Epoch {epoch+1}/{num_epochs}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%, Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%")

Epoch 1/10: Train Loss: 610.1081, Train Acc: 72.23%, Test Loss: 47.1772, Test Acc: 83.30%
Epoch 2/10: Train Loss: 426.2246, Train Acc: 82.83%, Test Loss: 43.3729, Test Acc: 85.08%
Epoch 3/10: Train Loss: 341.8595, Train Acc: 87.00%, Test Loss: 42.1496, Test Acc: 85.92%
Epoch 4/10: Train Loss: 282.1348, Train Acc: 89.72%, Test Loss: 41.4804, Test Acc: 86.33%
Epoch 5/10: Train Loss: 229.3114, Train Acc: 91.83%, Test Loss: 49.1287, Test Acc: 84.40%
Epoch 6/10: Train Loss: 183.7502, Train Acc: 93.78%, Test Loss: 50.7152, Test Acc: 85.42%
Epoch 7/10: Train Loss: 133.9479, Train Acc: 95.50%, Test Loss: 61.5720, Test Acc: 84.58%
Epoch 8/10: Train Loss: 103.6976, Train Acc: 96.61%, Test Loss: 67.8914, Test Acc: 84.65%
Epoch 9/10: Train Loss: 74.6697, Train Acc: 97.59%, Test Loss: 81.6929, Test Acc: 84.28%
Epoch 10/10: Train Loss: 63.6842, Train Acc: 98.01%, Test Loss: 88.5330, Test Acc: 84.53%


In [21]:
def evaluate(model, X_test, y_test):
    with torch.no_grad():
        outputs = model(X_test)
        predicted = torch.argmax(outputs, dim=1)
        correct = (predicted == y_test).sum().item()
        total = y_test.size(0)
        accuracy = correct / total
        return accuracy

accuracy = evaluate(model, X_test, y_test)
print('Accuracy on test set: {:.2f}%'.format(accuracy*100))

Accuracy on test set: 84.52%


In [22]:
word_to_idx['<unk>'] = len(word_to_idx)

In [31]:
def predict_sentiment(model, sentence, word_to_idx):
    # Tokenize and preprocess the input text
    sentence = preprocess_text(sentence)
    indexed = [word_to_idx.get(word, word_to_idx['<unk>']) for word in sentence]
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device)

    # Pad the input sequence to be at least as long as the largest filter size
    max_filter_size = max([3,4,5])#max(model.filter_sizes)
    if tensor.size(1) < max_filter_size:
        padding = torch.zeros(1, max_filter_size - tensor.size(1), dtype=torch.long).to(device)
        tensor = torch.cat([tensor, padding], dim=1)

    # Evaluate the model and return the predicted sentiment
    with torch.no_grad():
        model.eval()
        outputs = model(tensor)
        prediction = torch.sigmoid(outputs).squeeze().tolist()
        sentiment = 'positive' if prediction[0] <= 0.5 else 'negative'
        return sentiment


text = "i am a bad boy"
sentiment = predict_sentiment(model, text, word_to_idx)
print(f'The sentiment of "{text}" is "{sentiment}"')

The sentiment of "i am a bad boy" is "negative"
