In [34]:
!wget https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Video_Games.json.gz

--2024-05-24 16:37:44--  https://datarepo.eng.ucsd.edu/mcauley_group/data/amazon_v2/categoryFiles/Video_Games.json.gz
Resolving datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)... 132.239.8.30
Connecting to datarepo.eng.ucsd.edu (datarepo.eng.ucsd.edu)|132.239.8.30|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 522823613 (499M) [application/x-gzip]
Saving to: 'Video_Games.json.gz.1'

Video_Games.json.gz  25%[====>               ] 129.24M  10.5MB/s    eta 22s    ^C


In [35]:
!pip install transformers datasets accelerate requests regex bitsandbytes peft



In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
import logging
import warnings
from typing import Dict, List
from datasets import Dataset, load_dataset, disable_caching
disable_caching() ## disable huggingface cache
from torch.utils.data import Dataset, Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from IPython.display import Markdown
import matplotlib.pyplot as plt
%matplotlib inline
from functools import partial
import copy
from transformers import BertTokenizer, BertModel, pipeline, AutoModelForQuestionAnswering, AutoModelForCausalLM, AutoTokenizer, DefaultDataCollator, DataCollatorForSeq2Seq, AdamW, get_linear_schedule_with_warmup, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import os
import bitsandbytes
import nltk
import gzip
import json
import pandas as pd
import numpy as np

nltk.download('punkt')

logging.getLogger().setLevel(logging.CRITICAL)

warnings.filterwarnings('ignore')

if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [37]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [38]:
df = getDF('/kaggle/working/Video_Games.json.gz')

df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,1.0,True,"06 9, 2014",A21ROB4YDOZA5P,0439381673,Mary M. Clark,I used to play this game years ago and loved i...,Did not like this,1402272000,,,
1,3.0,True,"05 10, 2014",A3TNZ2Q5E7HTHD,0439381673,Sarabatya,The game itself worked great but the story lin...,Almost Perfect,1399680000,,,
2,4.0,True,"02 7, 2014",A1OKRM3QFEATQO,0439381673,Amazon Customer,I had to learn the hard way after ordering thi...,DOES NOT WORK WITH MAC OS unless it is 10.3 or...,1391731200,15,,
3,1.0,True,"02 7, 2014",A2XO1JFCNEYV3T,0439381673,ColoradoPartyof5,The product description should state this clea...,does not work on Mac OSX,1391731200,11,,
4,4.0,True,"01 16, 2014",A19WLPIRHD15TH,0439381673,Karen Robinson,I would recommend this learning game for anyon...,Roughing it,1389830400,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2565344,5.0,True,"08 1, 2018",ANGB54K3888S4,B01HJEBIAA,josh,"Love it, work good",Works good,1533081600,,,
2565345,5.0,True,"07 17, 2018",A3TEVKR0ZVQB2T,B01HJEBIAA,Prime Member,I do a lot of copy/paste and other keyboard sh...,Great mouse for work and gaming,1531785600,,,[https://images-na.ssl-images-amazon.com/image...
2565346,5.0,True,"07 6, 2018",ABE7YPWEHNVJZ,B01HJEBIAA,Amazon Customer,One year in and it's still working great! Hig...,Five Stars,1530835200,,,
2565347,5.0,True,"06 12, 2018",A3ES9QBK3G192O,B01HJEBIAA,Lina Marmolejos,EXCELENTE,Five Stars,1528761600,,,


In [39]:
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'style', 'image'],
      dtype='object')

In [40]:
df.drop(['verified', 'reviewTime', 'reviewerID', 'asin', 'reviewerName', 'summary', 'unixReviewTime', 'vote', 'style', 'image'], axis=1, inplace=True)

df

Unnamed: 0,overall,reviewText
0,1.0,I used to play this game years ago and loved i...
1,3.0,The game itself worked great but the story lin...
2,4.0,I had to learn the hard way after ordering thi...
3,1.0,The product description should state this clea...
4,4.0,I would recommend this learning game for anyon...
...,...,...
2565344,5.0,"Love it, work good"
2565345,5.0,I do a lot of copy/paste and other keyboard sh...
2565346,5.0,One year in and it's still working great! Hig...
2565347,5.0,EXCELENTE


In [41]:
def read_and_shuffle(df):
    # Random shuffle.
    df.sample(frac=1)
    return df

In [42]:
class AmazonReviewsDataset(Dataset):
    def __init__(self, df, maxlen):
        self.df = df
        # A reset reindexes from 1 to len(df), the shuffled df frames are sparse.
        self.df.reset_index(drop=True, inplace=True)
        self.tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
        self.maxlen = maxlen

    def __len__(self):
        return(len(self.df))

    def __getitem__(self, index):
        review = self.df.loc[index, 'reviewText']

        # Classes start from 0.
        label = int(self.df.loc[index, 'overall']) - 1

        # Use BERT tokenizer since it needs to be able to match the tokens to the pre trained words.
        tokens = self.tokenizer.tokenize(review)

        # BERT inputs typically start with a '[CLS]' tag and end with a '[SEP]' tag. For
        tokens = ['[CLS]'] + tokens + ['[SEP]']

        if len(tokens) < self.maxlen:
            # Add the ['PAD'] token
            tokens = tokens + ['[PAD]' for item in range(self.maxlen-len(tokens))]
        else:
            # Truncate the tokens at maxLen - 1 and add a '[SEP]' tag.
            tokens = tokens[:self.maxlen-1] + ['[SEP]']

        # BERT tokenizer converts the string tokens to their respective IDs.
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Converting to pytorch tensors.
        tokens_ids_tensor = torch.tensor(token_ids)

        # Masks place a 1 if token != PAD else a 0.
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

In [43]:
def get_train_and_val_split(df, splitRatio=0.8):
    train=df.sample(frac=splitRatio,random_state=200)
    val=df.drop(train.index)
    print("Number of Training Samples: ", len(train))
    print("Number of Validation Samples: ", len(val))
    return(train, val)

In [44]:
def get_max_length(reviews):
    return len(max(reviews, key=len))

In [45]:
def get_accuracy(logits, labels):
    # get the index of the max value in the row.
    predictedClass = logits.max(dim = 1)[1]

    # get accuracy by averaging over entire batch.
    acc = (predictedClass == labels).float().mean()
    return acc

In [46]:
def trainFunc(net, loss_func, opti, train_loader, test_loader, config):
    best_acc = 0
    for ep in range(config["epochs"]):
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            opti.zero_grad()
            #seq, attn_masks, labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu)
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            logits = net(seq, attn_masks)
            loss = loss_func(m(logits), labels)

            loss.backward()
            opti.step()
            print("Iteration: ", it+1)

            if (it + 1) % config["printEvery"] == 0:
                acc = get_accuracy(m(logits), labels)
                if not os.path.exists(config["outputFolder"]):
                    os.makedirs(config["outputFolder"])

                # Since a single epoch could take well over hours, we regularly save the model even during evaluation of training accuracy.
                torch.save(net.state_dict(), os.path.join(projectFolder, config["outputFolder"], config["outputFileName"]))
                print("Iteration {} of epoch {} complete. Loss : {} Accuracy : {}".format(it+1, ep+1, loss.item(), acc))
                print("Saving at", os.path.join(projectFolder, config["outputFolder"], config["outputFileName"]))

        # perform validation at the end of an epoch.
        val_acc, val_loss = evaluate(net, loss_func, val_loader, config)
        print(" Validation Accuracy : {}, Validation Loss : {}".format(val_acc, val_loss))
        if val_acc > best_acc:
            print("Best validation accuracy improved from {} to {}, saving model...".format(best_acc, val_acc))
            best_acc = val_acc
            torch.save(net.state_dict(), os.path.join(projectFolder, config["outputFolder"], config["outputFileName"] + "_valTested_" + str(best_acc)))
     

In [47]:
def evaluate(net, loss_func, dataloader, config):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            #seq, attn_masks, labels = seq.cuda(args.gpu), attn_masks.cuda(args.gpu), labels.cuda(args.gpu)
            seq, attn_masks, labels = seq.to(device), attn_masks.to(device), labels.to(device)

            logits = net(seq, attn_masks)
            mean_loss += loss_func(m(logits), labels)
            mean_acc += get_accuracy(m(logits), labels)
            print("Validation iteration", count+1)
            count += 1

            '''
            The entire validation set was around 0.1 million entries,
            the validationFraction param controls what fraction of the shuffled
            validation set you want to validate the results on.
            '''
            if count > config["validationFraction"] * len(val_set):
                break
    return mean_acc / count, mean_loss / count

In [48]:
config = {
    "splitRatio" : 0.8,
    "maxLength" : 100,
    "printEvery" : 100,
    "outputFolder" : "Models",
    "outputFileName" : "AmazonReviewClassifier.dat",
    "threads" : 4,
    "batchSize" : 64,
    "validationFraction" : 0.0005,
    "epochs" : 5,
    "forceCPU" : False
    }

config["device"] = device

In [49]:
class SentimentClassifier(nn.Module):
    def __init__(self, num_classes, device, freeze_bert = True):
        super(SentimentClassifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.device = device

        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        self.cls_layer = nn.Linear(768, num_classes)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        cont_reps, _ = self.bert_layer(seq, attention_mask = attn_masks)

        #Obtaining the representation of [CLS] head
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits.to(self.device)


In [50]:
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

print("Configuration is: ", config)

df = read_and_shuffle(df)

df

Loading BERT tokenizer...
Configuration is:  {'splitRatio': 0.8, 'maxLength': 100, 'printEvery': 100, 'outputFolder': 'Models', 'outputFileName': 'AmazonReviewClassifier.dat', 'threads': 4, 'batchSize': 64, 'validationFraction': 0.0005, 'epochs': 5, 'forceCPU': False, 'device': device(type='cuda')}


Unnamed: 0,overall,reviewText
0,1.0,I used to play this game years ago and loved i...
1,3.0,The game itself worked great but the story lin...
2,4.0,I had to learn the hard way after ordering thi...
3,1.0,The product description should state this clea...
4,4.0,I would recommend this learning game for anyon...
...,...,...
2565344,5.0,"Love it, work good"
2565345,5.0,I do a lot of copy/paste and other keyboard sh...
2565346,5.0,One year in and it's still working great! Hig...
2565347,5.0,EXCELENTE


In [53]:
num_classes = df['overall'].nunique()
print("Number of Target Output Classes:", num_classes)
totalDatasetSize = len(df)

Number of Target Output Classes: 5


In [None]:
# Group by the column overall. This helps you get distribution of the Review overalls.
symbols = df.groupby('overall')

overalls_dist = []
for i in range(num_classes):
    overalls_dist.append(len(symbols.groups[i+1])/totalDatasetSize)

In [None]:
train, val = get_train_and_val_split(df, config["splitRatio"])

In [None]:
T = config["maxLength"]

In [None]:
train_set = AmazonReviewsDataset(train, T)
val_set = AmazonReviewsDataset(val, T)

In [None]:
train_loader = DataLoader(train_set, batch_size = config["batchSize"], num_workers = config["threads"])
val_loader = DataLoader(val_set, batch_size = config["batchSize"], num_workers = config["threads"])

In [None]:
# We are unfreezing the BERT layers so as to be able to fine tune and save a new BERT model that is specific to the Sizeable food reviews dataset.

net = SentimentClassifier(num_classes, config["device"], freeze_bert=False)
net.to(config["device"])
weights = torch.tensor(overalls_dist).to(config["device"])

In [None]:
# Setting the Loss function and Optimizer.
loss_func = nn.NLLLoss(weight=weights)
opti = optim.Adam(net.parameters(), lr = 2e-5)
m = nn.LogSoftmax(dim=1)

In [None]:
torch.cuda.set_device(0)
trainFunc(net, loss_func, opti, train_loader, val_loader, config)

In [None]:
data_collator = DefaultDataCollator()

In [None]:
'''trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"].select(range(1000)),
    eval_dataset=tokenized_datasets["validation"].select(range(100)),
    data_collator=data_collator,
    tokenizer=tokenizer,
)'''

In [None]:
'''trainer.train()'''