In [None]:
import json
import openpyxl
import pandas as pd
from collections import Counter
import string
import re
import csv
from gensim.models import LdaModel
from sklearn.preprocessing import normalize
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from torch import nn
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import RobertaTokenizer, RobertaModel
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
import torch.optim as optim
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

In [11]:
raw_data=[]
sentiment=[]
file_path = '../Data/Amazon Review Data.xlsx'  
workbook = openpyxl.load_workbook(file_path)
sheet = workbook.active

for row in sheet.iter_rows(values_only=True):
    raw_data.append(row[1])
    sentiment.append(row[2])

workbook.close()
raw_data=raw_data[1:]
sentiment=sentiment[1:]
raw_data[0]

'some of the figures and illustrations are deficient in e book format'

In [59]:
# Clean data for topic modelling
def pre_process_noun(sentence):
    res=word_tokenize(sentence)
    res=pos_tag(res)
    res=[lemmatizer.lemmatize(word.lower()) for word,tag in res if tag.startswith('NN') and word.isalpha() and word.isascii()]
   
    return res
# Clean data for sentiment analysis
def pre_process_adj(sentence):
    res=word_tokenize(sentence)
    res=pos_tag(res)
    res=[lemmatizer.lemmatize(word.lower()) for word,tag in res if tag.startswith('JJ') and word.isalpha() and word.isascii()]
   
    return res

In [60]:
data_list=[]
for i in raw_data:
    data_list.append(pre_process_noun(i))
print(data_list[0],len(data_list))

['figure', 'illustration', 'book', 'format'] 1235


In [None]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# Function to get embeddings from RoBERTa
def get_roberta_embedding(phrase):
    inputs = tokenizer(phrase, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    # Get the CLS token embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

In [62]:
len(data_list)

1235

In [63]:
flat_list = [word for sublist in data_list for word in sublist]

In [64]:
len(flat_list)

15374

In [65]:
distinct_word=list(set(flat_list))

In [66]:
len(distinct_word)

1996

In [67]:
# Put roberta_embedding in a dictionary
robert_dict = {}
for word in distinct_word:
    robert_dict[word]=get_roberta_embedding(word)

In [68]:
data_list_adj=[]
for i in raw_data:
    data_list_adj.append(pre_process_adj(i))
print(data_list_adj[0],len(data_list_adj))

['deficient', 'e'] 1235


In [69]:
embedding_dim = 768

In [70]:
from sklearn.preprocessing import normalize
import numpy as np
embed=[]
sen_avg_emd=[]
data_list_adj_final=[]
sentiment_final=[]
# Find average embedding for each sentence
for idx,i in enumerate(data_list):
    if i:
        new=np.array([robert_dict[ii] for ii in i ])
        embed.append(np.mean(new,axis=0))
        data_list_adj_final.append(data_list_adj[idx])
        sentiment_final.append(sentiment[idx])

embed=np.array(embed)
print(embed.shape)
sen_avg_emd=embed
sen_avg_emd.shape
print(len(data_list_adj_final))
print(len(sentiment_final))


(1198, 768)
1198
1198


In [72]:
from sklearn.cluster import KMeans
import numpy as np
output_dim = 5
kmeans = KMeans(n_clusters=output_dim, random_state=0).fit(sen_avg_emd)
kmeans.labels_

topic_embedding_init=kmeans.cluster_centers_
topic_embedding_init.shape

(5, 768)

In [73]:

def find_closest_words(robert_dict, vector, top_n=10):
    # Find 10th closest word (topic) in embedding for topics
    similarities = {}
    for word, embedding in robert_dict.items():
        cosine_similarity = np.dot(vector, embedding) / (np.linalg.norm(vector) * np.linalg.norm(embedding))
        similarities[word] = cosine_similarity
    closest_words = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return closest_words[:10]

for example_vector in topic_embedding_init:
    print(example_vector.shape)
    closest_words = find_closest_words(robert_dict, example_vector,distinct_word)
    
    print("Closest words:")
    #for word, similarity in closest_words:
        #print(f"{word}: {similarity:.4f}")
    print(', '.join([i[0] for i in closest_words]))
    print("--------------------------")

(768,)
Closest words:
song, tale, core, buck, style, leader, pocket, cause, bare, someone
--------------------------
(768,)
Closest words:
root, setting, leader, taker, pointer, binding, depth, gre, absolute, dh
--------------------------
(768,)
Closest words:
book, author, pick, perfect, flame, library, text, uk, building, cloth
--------------------------
(768,)
Closest words:
song, leader, root, tale, bare, depth, shine, setting, bind, stick
--------------------------
(768,)
Closest words:
style, song, flame, perfect, book, model, buck, translation, super, piece
--------------------------


In [75]:
sen_avg_emd.shape[0]//4*3

897

In [76]:
from collections import Counter


class TorchDataset(Dataset):
    def __init__(self, data):
        self.data = data
 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        return torch.tensor(self.data[idx],dtype=torch.float32)
    
# Batch sizes
batch_size = 40
eval_batch_size = 30


dataset = TorchDataset(sen_avg_emd[:sen_avg_emd.shape[0]//4*3])
loader = DataLoader(dataset, batch_size=batch_size)
dev_dataset = TorchDataset(sen_avg_emd[sen_avg_emd.shape[0]//4*3:])
dev_loader = DataLoader(dev_dataset, batch_size=eval_batch_size)



In [77]:
def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for review in loader:
        optimizer.zero_grad()
        outputs = model(review)
        loss = criterion(outputs, review)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    

def train_eval(model, loader, test_loader, optimizer, criterion):
    
    model.train()
    total_loss = 0
   
    for review in loader:
        
        optimizer.zero_grad()
        outputs = model(review)
        loss = criterion(outputs, review)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    a= total_loss / len(loader)

    model.eval()
    results = []
    total_loss = 0
    with torch.no_grad():
        for review in test_loader:
            
            #print(len(evidences[0]))=5
            outputs = model(review)
            loss = criterion(outputs, review)
            total_loss += loss.item()

    avg_loss = total_loss / len(test_loader)
 
    return [a, total_loss / len(test_loader)]

In [78]:
import torch.nn.functional as F
class Regre(nn.Module):
    
 
    def __init__(self, topic_embedding_init,embedding_dim, topic_dim):
        super(Regre, self).__init__()
        self.encoder = nn.Linear(embedding_dim , topic_dim)
        self.topic_dim=topic_dim
        # topic embedding
        self.topic = nn.Parameter(torch.tensor(topic_embedding_init, dtype=torch.float32))

    def forward(self, review):
        
        #h [batch, topic_dim]
        # Distribution of topics
        h_dis=self.encoder(review)
        #out [batch, embedding]
        # projection of topic and distribution
        out=torch.matmul(h_dis,self.topic)/self.topic_dim**0.5

        
        #self.topic [batch, embedding]
        return out

In [None]:

epoch = 100
lr = 0.05
import matplotlib.pyplot as plt 


# Train
                        
# plot the train & validation loss to find the # of epochs to use
for i in range(1):
    model = Regre(topic_embedding_init=topic_embedding_init, embedding_dim=embedding_dim, topic_dim=output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    l1,l2=[],[]
    for epoch in range(epoch):
   
        [a,b]=train_eval(model, loader, dev_loader, optimizer, criterion)
        l1.append(a)
        l2.append(b)
    z=list(range(len(l1)))
    
      
    # plot lines 
    plt.plot(z, l1, label = "train", linestyle="-") 
    plt.plot(z, l2, label = "val", linestyle="--") 
    plt.legend() 
    plt.show()


In [96]:
for name, param in model.named_parameters():
    if name =='topic':
        final_topic_embedding=param
#final_topic_embedding_arr = final_topic_embedding.detach().cpu().numpy()
final_topic_embedding_arr = normalize(final_topic_embedding.detach().cpu().numpy())


for example_vector in final_topic_embedding_arr:
    closest_words = find_closest_words(robert_dict, example_vector,distinct_word)
    
    print("Closest words:")
    #for word, similarity in closest_words:
     #   print(f"{word}: {similarity:.4f}")
    print(', '.join([i[0] for i in closest_words]))
    print("--------------------------")

Closest words:
dentaltown, manual, wearer, jocko, eigth, hygienist, schusterthe, professionalism, leaguer, hygientist
--------------------------
Closest words:
therapist, hygiene, dental, healthcare, therapy, complexity, syndrome, trainer, surgeon, university
--------------------------
Closest words:
texbook, purchase, guidebook, textbook, workbook, book, handbook, certificate, hardcover, packet
--------------------------
Closest words:
explenation, anagram, stylistically, instagram, canada, mike, simpson, lesion, barrons, preface
--------------------------
Closest words:
for, justice, break, change, flow, clear, action, come, cream, item
--------------------------


In [82]:
final_topic=[]
import numpy as np
def similarity(vec1, vec2):

    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
for review_embedding in sen_avg_emd:
    topic_distribution=[]
    for topic_embedding in topic_embedding_init:
        topic_distribution.append(similarity(review_embedding, topic_embedding))
    final_topic.append(topic_distribution.index(max(topic_distribution))+1)
count = Counter(final_topic)

print("Value counts:", count)

Value counts: Counter({4: 475, 1: 301, 5: 212, 2: 172, 3: 38})


In [83]:
final_topic=[]
import numpy as np
def similarity(vec1, vec2):

    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
for review_embedding in sen_avg_emd:
    topic_distribution=[]
    for topic_embedding in final_topic_embedding_arr:
        topic_distribution.append(similarity(review_embedding, topic_embedding))
    final_topic.append(topic_distribution.index(max(topic_distribution))+1)
count = Counter(final_topic)

print("Value counts:", count)


Value counts: Counter({3: 672, 4: 521, 5: 5})


In [85]:
posi=[]
neu=[]
nega=[]
for idx,i in enumerate(sentiment_final):
    if ''.join(i)=='negative':
        nega.extend(data_list_adj_final[idx])
    if ''.join(i)=='positive':
        posi.extend(data_list_adj_final[idx])
    if ''.join(i)=='neutral':
        neu.extend(data_list_adj_final[idx])
    

In [86]:
doc3=[[],[],[]]
doc2=[[],[],[]]
senti_idx_dict={}
senti_idx_dict['positive']=0
senti_idx_dict['negative']=1
senti_idx_dict['neutral']=2
for idx,i in enumerate(final_topic):
    if i==3:
        doc3[senti_idx_dict[''.join(sentiment_final[idx])]].extend(data_list_adj_final[idx])
    if i==2:
        doc2[senti_idx_dict[''.join(sentiment_final[idx])]].extend(data_list_adj_final[idx])
   

In [87]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

senti_doc=[posi,neu,nega]

dictionary = corpora.Dictionary(senti_doc)
corpus = [dictionary.doc2bow(text) for text in senti_doc]

tfidf = TfidfModel(corpus)

for doc in tfidf[corpus]:
    tfidf_val=[[dictionary[id], freq] for id, freq in doc]
    sorted_data = sorted(tfidf_val, key=lambda x: x[1],reverse=True)
    print([i[0] for i in sorted_data[:10]])


['dental', 'easy', 'orthodontic', 'more', 'useful', 'aesthetic', 'effective', 'excellent', 'medical', 'successful']
['pdf', 'bent', 'excited', 'extended', 'glossy', 'inguinal', 'kindle', 'minimal', 'overpriced', 'primary']
['dental', 'spanish', 'more', 'many', 'first', 'most', 'present', 'least', 'local', 'poor']


In [89]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

doc3.extend(doc2)


dictionary = corpora.Dictionary(doc3)
corpus = [dictionary.doc2bow(text) for text in doc3]


tfidf = TfidfModel(corpus)


for doc in tfidf[corpus][:3]:
    tfidf_val=[[dictionary[id], freq] for id, freq in doc]
    sorted_data = sorted(tfidf_val, key=lambda x: x[1],reverse=True)
    print([i[0] for i in sorted_data[:10]])

['dental', 'great', 'good', 'easy', 'more', 'successful', 'orthodontic', 'useful', 'medical', 'helpful']
['dental', 'good', 'other', 'more', 'spanish', 'many', 'great', 'first', 'most', 'few']
['pdf', 'fine', 'good', 'anatomical', 'bent', 'extended', 'glossy', 'horrible', 'kindle', 'unglued']
