## Installed Packages

In [1]:
import sys
!{sys.executable} -m pip install contractions
!{sys.executable} -m pip install gensim==4.2.0
!pip install scikit-learn
!pip install torch torchvision torchaudio

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-2.0.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [2]:
## Importing and installing libraries

import numpy as np
import copy
import pandas as pd
import warnings 
import re
import sys
import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
import string
from torch import nn
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.nn import CrossEntropyLoss, Softmax, Linear
from torch.optim import SGD, Adam
from sklearn.metrics.pairwise import cosine_similarity
from torch.optim.lr_scheduler import ReduceLROnPlateau
from nltk.stem import WordNetLemmatizer
from gensim.models import KeyedVectors
from gensim import utils
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import classification_report
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from statistics import mean
from os import path
import os.path
import gensim
import gensim.downloader
from sklearn.svm import LinearSVC

nltk.download('punkt')

warnings.filterwarnings('ignore')

import contractions

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## 1. Dataset Generation

In [3]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Colab Notebooks/

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks


In [4]:
#fields required in the balanced dataframe from the original dataset
input_column=["review_body","star_rating"]

#reading the original dataset to filter the columns that are required
input_df =pd.read_csv('./amazon_reviews_us_Beauty_v1_00.tsv',usecols=input_column,sep='\t',error_bad_lines=False)

In [5]:
#Creating 3 different classes to get 20000 data from each class to avoid computational burden

class_one_df =(input_df[(input_df['star_rating'] == 1) | (input_df['star_rating'] == 2) ]).sample(n=20000)
class_one_df['class']=1

class_two_df =(input_df[(input_df['star_rating'] == 3)]).sample(n=20000)
class_two_df['class']=2

class_three_df =(input_df[(input_df['star_rating'] == 4) | (input_df['star_rating'] == 5) ]).sample(n=20000)
class_three_df['class']=3

#Combining all the data received from each class into a single balanced dataframe

amazon_balanced_df = pd.concat([class_one_df, class_two_df, class_three_df])

#Resetting the index as we have retrieved different data according to the classes created.
#Therefore, we will have irregular or unsorted index keys. 
#We will reset the index to the new and incremental values from 0

amazon_balanced_df = amazon_balanced_df.reset_index(drop=True)

# Created a new dataframe consisting of the two columns (star_rating and review_body) 
#along with class one assigned to them on the basis of star_rating. We are also resetting the index

### Data Cleaning 

### Handling null values

In [6]:
#We are changing all null values to an empty string

amazon_balanced_df = amazon_balanced_df.fillna('')

In [7]:
#Uncleaned data copy
amazon_df=amazon_balanced_df.copy()

### Convert all reviews into lowercase

In [8]:
# Converting all review body into lowercase

amazon_balanced_df['review_body'] = amazon_balanced_df['review_body'].str.lower()

### Remove the HTML from the reviews

In [9]:
# Removing all the html tags from each review body 

amazon_balanced_df['review_body']=amazon_balanced_df['review_body'].apply(lambda x : re.sub('<.*?>','',str(x)))


### Remove the URLs from the reviews

In [10]:
# Removing all the URLs from each review body 

amazon_balanced_df['review_body'] = amazon_balanced_df['review_body'].apply(lambda y: re.split('https:\/\/.*', str(y))[0])


### Remove non-alphabetical characters

In [11]:
# Removing all the non alphabetic chaarcters(symbols, numbers) from each review body 

amazon_balanced_df['review_body'] = amazon_balanced_df['review_body'].apply(lambda z: " ".join([re.sub('[^A-Za-z]+','', z) for z in nltk.word_tokenize(z)]))


### Remove extra spaces

In [12]:
# Will remove leading and trailing spaces
amazon_balanced_df['review_body'] = amazon_balanced_df['review_body'].str.strip()


### Perform contractions on the review_body

In [13]:
## This will elongate the short form used in sentences like (I'll ---> I will)

amazon_balanced_df['without_contraction'] = amazon_balanced_df['review_body'].apply(lambda a: [contractions.fix(word) for word in a.split()])
amazon_balanced_df['review_body'] = [' '.join(map(str, x)) for x in amazon_balanced_df['without_contraction']]


## Remove Punctuations 

In [14]:
amazon_balanced_df['review_body'] = amazon_balanced_df['review_body'].str.replace(r'[^\w\s]+', '')

## 2. Word Embedding

### (a) Downloading pretrained word2vec-google-news-300

In [None]:
# word2vec_model = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# word2vec_model.save('Gensim_word2vec_model.kv')

In [15]:
from gensim.models import KeyedVectors
word2vec_model= KeyedVectors.load("Gensim_word2vec_model.kv")

### (b) Training word2vec model on our own dataset

In [16]:
class dataEmbed:
    def __init__(self, data_set):
      self.data_set = data_set

    def __iter__(self):
        for x in self.data_set:
            yield utils.simple_preprocess(x)

In [None]:
# sentence_embed = dataEmbed(amazon_balanced_df.review_body)
# # window=13
# # vector_size=300
# # min_count=9
# embed_word2vec = Word2Vec(sentences=sentence_embed, vector_size=300, min_count=9, window=13)
# model = embed_word2vec.wv

### Process to extract word2vec embeddings

In [18]:
### To concatenate first 10 Word2Vec vectors for each review as the input feature

embedding_space_concat = []
for i in range(60000):
    vectorWord = np.zeros((1,300))  # change the size of the vector
    listword = amazon_df['review_body'][i].split(" ")
    for item in listword[:10]:
        if item in word2vec_model:
            vectorWord = np.concatenate([vectorWord, np.expand_dims(word2vec_model[item], axis=0)], axis=0)

    vectorWord = vectorWord[1:]
    if len(vectorWord)<10:
        for i in range(10 - len(vectorWord)):
            vectorWord = np.concatenate([vectorWord, np.zeros((1,300))], axis=0)         
    embedding_space_concat.append(vectorWord)
    
embedding_dataset_concat = np.array(embedding_space_concat)
embedding_dataset_concat = embedding_dataset_concat.reshape(embedding_dataset_concat.shape[0], embedding_dataset_concat.shape[1]*embedding_dataset_concat.shape[2])

In [19]:
print(embedding_dataset_concat.shape)

(60000, 3000)


In [20]:
P_train, P_test, Q_train, Q_test = train_test_split(embedding_dataset_concat, amazon_df['class'], test_size=0.20, random_state=42, stratify=amazon_df['class'])

Q_train = Q_train.reset_index(drop=True)
Q_test = Q_test.reset_index(drop=True)


print(P_train.shape, P_test.shape, Q_train.shape, Q_test.shape)

(48000, 3000) (12000, 3000) (48000,) (12000,)


In [None]:
type(P_train)

numpy.ndarray

## 4. Feedforward Neural Networks

In [21]:
from torch.utils.data import Dataset, DataLoader

In [22]:
#Creating a dataloader using torch
class dataloader(torch.utils.data.Dataset):
    def __init__(self, dataset_record, label_record):
        self.dataset = dataset_record
        self.labels = label_record
        
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, index):
        dataset = self.dataset[index]
        labels  = self.labels[index]
        
        return dataset, labels

In [23]:
#Creating classes to define the architecure 
class feedForward(nn.Module):
    def __init__(self, output_size, input_size):
        super(feedForward, self).__init__()
        self.layer1 = nn.Linear(input_size, 300)
        self.relu1 = nn.ReLU()
        self.layer2 = nn.Linear(300, 100)
        self.relu2 = nn.ReLU()
        self.layer3 = nn.Linear(100, output_size)

    def forward(self, x):
        return  self.layer3(self.relu2(self.layer2(self.relu1(self.layer1(x)))))



In [24]:
fnn=feedForward(3,3000)
fnn

feedForward(
  (layer1): Linear(in_features=3000, out_features=300, bias=True)
  (relu1): ReLU()
  (layer2): Linear(in_features=300, out_features=100, bias=True)
  (relu2): ReLU()
  (layer3): Linear(in_features=100, out_features=3, bias=True)
)

### (b) 

In [25]:
# Convert P_train and P_test to float32 
A_word2vec_train = P_train.astype(np.float32)
A_word2vec_test  = P_test.astype(np.float32)

# Subtract 1 from B_train and B_test values
B_train = Q_train - 1
B_test = Q_test - 1

# Create PyTorch DataLoader objects for the training and testing sets
train_dataset = dataloader(A_word2vec_train, B_train)
train_set = torch.utils.data.DataLoader(train_dataset, batch_size=50)

test_dataset = dataloader(A_word2vec_test, B_test)
test_set = torch.utils.data.DataLoader(test_dataset, batch_size=50)

In [26]:
from sklearn.metrics import accuracy_score, f1_score

In [27]:
def train(reviews_dataloader_train, reviews_dataloader_test, model, num_epochs, concat=True, rnn=False, gru=False, prev_loss=float('inf')):
    y_pred_label_train = []
    y_true_label_train = []
    y_pred_label_test = []
    y_true_label_test = []
    
    # Set the device for the model
    # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # model.to(device)
    
    # Define the loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=0.001)
    softmax = Softmax(dim=1)
    
    # Define the scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    
    # Keep track of the best model
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    
    # Keep track of the previous loss
    loss_min = prev_loss
    
    # Train the model
    for epoch in range(num_epochs):
        print('\n Epoch: {}'.format(epoch))
        
        # print(reviews_dataloader_train)
        for j, (x, y) in enumerate(reviews_dataloader_train):
            y_pred = model(x)
            y_pred_label_train.append(torch.argmax(softmax(y_pred.detach()), axis=1))
            y_true_label_train.append(y.detach())
            loss = criterion(y_pred, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            # if j % 100 == 0:
            #     print('Epoch {:03} Batch {:03}/{:03} Loss: {:.4f}'.format(epoch, j, len(reviews_dataloader_train), loss.item()))
                
        # Evaluate the model on the test set
        with torch.no_grad():
            for x, y in reviews_dataloader_test:
                y_pred = model(x)
                y_pred_label_test.append(torch.argmax(softmax(y_pred.detach()), axis=1))
                y_true_label_test.append(y.detach())

        # Calculate accuracy and f1-score
        y_pred_train = torch.cat(y_pred_label_train)
        y_true_train = torch.cat(y_true_label_train)
        y_pred_test = torch.cat(y_pred_label_test)
        y_true_test = torch.cat(y_true_label_test)
        
        train_acc = accuracy_score(y_true_train.cpu().numpy(), y_pred_train.cpu().numpy())
        test_acc = accuracy_score(y_true_test.cpu().numpy(), y_pred_test.cpu().numpy())
        train_f1 = f1_score(y_true_train.cpu().numpy(), y_pred_train.cpu().numpy(), average='macro')
        test_f1 = f1_score(y_true_test.cpu().numpy(), y_pred_test.cpu().numpy(), average='macro')

        print('Epoch: {:03}, Loss: {:.4f}, Train Acc: {:.4f}, Test Acc: {:.4f}'.format(epoch, loss.item(), train_acc, test_acc))
        
        # Update the learning rate
        scheduler.step()
        
        # Save the best model based on test accuracy
        if test_acc > best_acc:
            best_acc = test_acc
            best_model_wts = copy.deepcopy(model.state_dict())
            
        # Save the model checkpoint
        # if loss.item() < loss_min:
        #     print(f'Loss decreased from {loss_min:.4f} to {loss.item():.4f}. Saving model...')
        #     torch.save(model.state_dict(), 'model_checkpoint.pt')
        #     loss


In [28]:
train(train_set, test_set, fnn, 20)


 Epoch: 0
Epoch: 000, Loss: 0.9506, Train Acc: 0.5246, Test Acc: 0.5460

 Epoch: 1
Epoch: 001, Loss: 0.8462, Train Acc: 0.5612, Test Acc: 0.5457

 Epoch: 2
Epoch: 002, Loss: 0.7152, Train Acc: 0.6004, Test Acc: 0.5366

 Epoch: 3
Epoch: 003, Loss: 0.5106, Train Acc: 0.6424, Test Acc: 0.5287

 Epoch: 4
Epoch: 004, Loss: 0.4892, Train Acc: 0.6795, Test Acc: 0.5248

 Epoch: 5
Epoch: 005, Loss: 0.5019, Train Acc: 0.7056, Test Acc: 0.5249

 Epoch: 6
Epoch: 006, Loss: 0.4329, Train Acc: 0.7307, Test Acc: 0.5245

 Epoch: 7
Epoch: 007, Loss: 0.3555, Train Acc: 0.7528, Test Acc: 0.5241

 Epoch: 8
Epoch: 008, Loss: 0.2691, Train Acc: 0.7723, Test Acc: 0.5237

 Epoch: 9
Epoch: 009, Loss: 0.1962, Train Acc: 0.7894, Test Acc: 0.5233

 Epoch: 10
Epoch: 010, Loss: 0.1898, Train Acc: 0.8041, Test Acc: 0.5228

 Epoch: 11
Epoch: 011, Loss: 0.1857, Train Acc: 0.8167, Test Acc: 0.5224

 Epoch: 12
Epoch: 012, Loss: 0.1792, Train Acc: 0.8277, Test Acc: 0.5219

 Epoch: 13
Epoch: 013, Loss: 0.1708, Train Acc: