In [None]:
!pip install nltk
!pip install --upgrade gensim 
!pip install keybert[all]
!pip install yake
!pip install pyldavis
!pip install numexpr

In [None]:
import requests
import time
import random
from bs4 import BeautifulSoup
import re
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import random
import nltk
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
from keybert import KeyBERT
import string
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
import pyLDAvis

  from collections import Iterable


In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
pyLDAvis.enable_notebook()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
def _parse_review_div(div):
  """Turn one review card (a bs4 Tag) into a plain dict.

  Raises AttributeError, IndexError or ValueError when an expected element
  is missing or its text does not have the expected format.
  """
  name = div.find(class_=re.compile(r'.*profile-name.*')).text.strip()
  title = div.find("a", class_=re.compile(r'.*review-title.*')).text.strip()
  text = div.find(class_=re.compile(r"review-text-content")).find("span").text.strip()

  # The star widget reads like "4.0 out of 5 stars": first number is the
  # rating, last number is the scale.
  rating_text = div.find(title=re.compile(r"\d+\.?\d? out of \d stars")).text.strip()
  rating, out_of = re.findall(r"(\d+\.?\d?).*(\d+\.?\d?)", rating_text)[0]

  # Keep only the "<day> <month name> <year>" part of the date line.
  # NOTE: %B is locale dependent, so this expects English month names.
  date_text = div.find(class_=r'review-date').text.strip()
  date_text = re.findall(r'\d{1,2}.*\d{4}', date_text)[0]
  review_date = datetime.strptime(date_text, '%d %B %Y').date()

  return dict(
      name=name,
      title=title,
      review=text,
      rating=float(rating),
      rating_out_of=float(out_of),
      date=review_date,
  )

def parse_reviews(page_html):
  """Extract all customer reviews from the HTML of one review page.

  Args:
    page_html: HTML source of the page, as a string.

  Returns:
    A list of dicts with the keys name, title, review, rating,
    rating_out_of and date, one per review card that could be parsed.
    Cards with a missing or malformed field are skipped with a warning
    instead of aborting the whole page.
  """
  import warnings  # local import: only needed for the skip notice below

  # Name the parser explicitly so results do not depend on which optional
  # parser libraries happen to be installed.
  soup = BeautifulSoup(page_html, "html.parser")

  reviews = []
  for div in soup.find_all(id=re.compile("customer_review.*")):
    try:
      reviews.append(_parse_review_div(div))
    except (AttributeError, IndexError, ValueError) as err:
      warnings.warn(f"Skipping review {div.get('id')!r}: {err}")
  return reviews

def scrape_reviews(url, start_page=1, end_page=2, wait_time=10):
  """Download and parse a range of review pages.

  Args:
    url: Base review URL; "&pageNumber=<n>&sortBy=recent" is appended to it.
    start_page: First page number to fetch (inclusive).
    end_page: Last page number to fetch (inclusive).
    wait_time: Upper bound, in whole seconds, of the random pause taken
      between two consecutive requests. Values below 1 disable the pause.

  Returns:
    The reviews of all pages, in page order, as returned by parse_reviews.
  """
  reviews = []
  for page in range(start_page, end_page+1):
    # Throttle between requests. The pause used to sit after the loop, so
    # pages were fetched back-to-back, and an invalid wait_time raised only
    # after everything had been scraped, throwing the results away.
    if page > start_page and wait_time >= 1:
      time.sleep(random.randint(1, wait_time))
    page_url = f"{url}&pageNumber={page}&sortBy=recent"
    response = requests.get(page_url)
    reviews.extend(parse_reviews(response.text))
  return reviews

In [None]:
# Review listing of the product to analyse; scrape_reviews appends the page
# number and sort order to this URL.
url = "https://www.amazon.in/SeCro-USB-Audio-Sound-Card/product-reviews/B07WSBKPXX/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

# Inclusive range of review pages to download.
start_page = 1
end_page = 30
# Upper bound (seconds) of the random pause used by scrape_reviews.
wait_time = 10

reviews = scrape_reviews(url, 
                         start_page=start_page, 
                         end_page=end_page, 
                         wait_time=wait_time)


In [None]:
import os

# Reuse previously saved reviews when a cached copy exists, so the analysis
# can be re-run without scraping again. On a first run the file does not exist
# yet (it is written by the following cell); in that case keep the freshly
# scraped `reviews` instead of failing with FileNotFoundError.
if os.path.exists('./reviews.csv'):
  reviews = pd.read_csv('./reviews.csv').to_dict('records')

In [None]:
# Persist the reviews, one row per review dict and without the row index,
# so later runs can load them from disk.
pd.DataFrame(reviews).to_csv("reviews.csv", index=False)

In [None]:
pd.read_csv("./reviews.csv")

Unnamed: 0,name,title,review,rating,rating_out_of,date
0,Hema,Was not good don't buy this,It was not good.. For earphones connection it...,1.0,5.0,2021-11-09
1,Mahesh kumar,Just ok,Jaldi kharab ho gaya,3.0,5.0,2021-11-07
2,amey ghatge,Product not working,Within one month this product has stopped work...,1.0,5.0,2021-10-09
3,Lobzang Dorjay,Superb product,Sound quality is just great if we use earphone...,3.0,5.0,2021-09-04
4,Kaustubh bhoir,SeCro USB HUB Audio Sound,Very nice long cable with USB ports not to was...,5.0,5.0,2021-09-01
...,...,...,...,...,...,...
144,R.S.Robin,"Great audio quality, affordable price!",Great audio product. I have connected my midi ...,5.0,5.0,2019-11-25
145,Rahul pa,Best under 1000 Rs .,Under 300 this is the best sound card .Build ...,5.0,5.0,2019-11-24
146,Madhu Bala,GOOD QUALITY PRODUCT,crystal clear audio qualityand usb hub working...,5.0,5.0,2019-11-22
147,Rajat Khanduri,Fine,Fine,3.0,5.0,2019-11-11


# Just Simple TF-IDF

In [None]:
def pos_filter(tokens):
  """Keep the tokens whose part-of-speech tag starts with NN, JJ, IN or RB.

  Args:
    tokens: List of word strings; tagged with nltk.pos_tag.

  Returns:
    The words, in their original order, whose tag begins with one of the
    four two-letter prefixes above.
  """
  wanted_prefixes = ('NN', 'JJ', 'IN', 'RB')
  kept = []
  for word, tag in nltk.pos_tag(tokens):
    # All prefixes are two characters long, so startswith() is the same
    # check as comparing tag[:2] on tags of length >= 2.
    if tag.startswith(wanted_prefixes):
      kept.append(word)
  return kept

def tfidf_preprocess(texts):
  """Lower-case, tokenize and strip punctuation from raw review texts.

  Args:
    texts: Iterable of review strings.

  Returns:
    One list of word tokens per input text, in input order.
  """
  punctuation_chars = set(string.punctuation)
  processed = []
  for text in texts:
    # Full stops become spaces first so that words glued together by a
    # missing space ("laptop.The") are split by the tokenizer.
    tokens = word_tokenize(text.lower().replace(".", " "))
    # Drop tokens made up solely of punctuation characters. The previous
    # `word not in string.punctuation` was a substring test against one long
    # string, so multi-character tokens such as "``", "''" or "--" slipped
    # through.
    processed.append([
      tok for tok in tokens
      if not all(ch in punctuation_chars for ch in tok)
    ])
  return processed

# Token lists (one per review) used as input for the TF-IDF vectorizer.
reviews_texts = tfidf_preprocess([r['review'] for r in reviews])
# remove punctuation
# preprocess [just nouns and adjectives]
# lemmatize

In [None]:
[r['review'] for r in reviews[:10]]

['It was not good.. For earphones  connection it was not working correctly',
 'Jaldi kharab ho gaya',
 'Within one month this product has stopped working. Does anyone from company read this? Let me know how to replace or return this product?',
 'Sound quality is just great if we use earphones a d headphones.',
 'Very nice long cable with USB ports not to waste money for usb ports very nice loved it I recommend to buy this',
 'Super dislike it. Will never take if for free aslo',
 "It's Amazing...",
 'This is not much better than the DAC on my laptop.The major issue is it keeps getting disconnected randomly and frequently from the laptop USB port.I have nothing plugged into the USB ports on the device and it just keeps happening. The AUX port on device works well.otherwise the packing and shipping had no issues. Device has a green LED to tell you its switched on.',
 'Ok',
 'Audio port not working, usb port only working. Very bad, Sound port not working, mice port not working, only usb wo

In [None]:
reviews_texts[:10]

[['it',
  'was',
  'not',
  'good',
  'for',
  'earphones',
  'connection',
  'it',
  'was',
  'not',
  'working',
  'correctly'],
 ['jaldi', 'kharab', 'ho', 'gaya'],
 ['within',
  'one',
  'month',
  'this',
  'product',
  'has',
  'stopped',
  'working',
  'does',
  'anyone',
  'from',
  'company',
  'read',
  'this',
  'let',
  'me',
  'know',
  'how',
  'to',
  'replace',
  'or',
  'return',
  'this',
  'product'],
 ['sound',
  'quality',
  'is',
  'just',
  'great',
  'if',
  'we',
  'use',
  'earphones',
  'a',
  'd',
  'headphones'],
 ['very',
  'nice',
  'long',
  'cable',
  'with',
  'usb',
  'ports',
  'not',
  'to',
  'waste',
  'money',
  'for',
  'usb',
  'ports',
  'very',
  'nice',
  'loved',
  'it',
  'i',
  'recommend',
  'to',
  'buy',
  'this'],
 ['super',
  'dislike',
  'it',
  'will',
  'never',
  'take',
  'if',
  'for',
  'free',
  'aslo'],
 ['it', "'s", 'amazing'],
 ['this',
  'is',
  'not',
  'much',
  'better',
  'than',
  'the',
  'dac',
  'on',
  'my',
  'la

In [None]:
# Learn a TF-IDF vocabulary of word n-grams of length 2 to 4. The token lists
# are re-joined with spaces because the vectorizer is fitted on strings.
vectorizer = TfidfVectorizer(ngram_range=(2, 4))
vectorizer.fit([" ".join(r) for r in reviews_texts])

TfidfVectorizer(ngram_range=(2, 4))

In [None]:
vectorizer.vocabulary_

{'it was': 3496,
 'was not': 7329,
 'not good': 4330,
 'good for': 2507,
 'for earphones': 2199,
 'earphones connection': 1947,
 'connection it': 1532,
 'not working': 4381,
 'working correctly': 7758,
 'it was not': 3497,
 'was not good': 7330,
 'not good for': 4333,
 'good for earphones': 2508,
 'for earphones connection': 2200,
 'earphones connection it': 1948,
 'connection it was': 1533,
 'was not working': 7334,
 'not working correctly': 4384,
 'it was not good': 3498,
 'was not good for': 7331,
 'not good for earphones': 4334,
 'good for earphones connection': 2509,
 'for earphones connection it': 2201,
 'earphones connection it was': 1949,
 'connection it was not': 1534,
 'it was not working': 3499,
 'was not working correctly': 7335,
 'jaldi kharab': 3595,
 'kharab ho': 3669,
 'ho gaya': 2788,
 'jaldi kharab ho': 3596,
 'kharab ho gaya': 3670,
 'jaldi kharab ho gaya': 3597,
 'within one': 7674,
 'one month': 4614,
 'month this': 4056,
 'this product': 6622,
 'product has': 5183

In [None]:
# Every n-gram the fitted TF-IDF vectorizer knows about.
vocab = vectorizer.vocabulary_.keys()

# Reference keyphrases: the tags given by amazon.
tags = set([
  'value for money',
  'sound card',
  'sound quality',
  'usb hub',
  'waste of money',
  'works well',
  'good quality',
  'dont buy',
  'hard drive',
  'audio jack',
  'build quality',
  'working fine',
  'usb port',
  'keeps disconnecting',
  'printer cable',
  'stopped working',
  'worth buying',
  'working properly',
  'audio quality',
  'recommend not to buy'
])

# A tag counts as found when it occurs as a substring of at least one n-gram
# in the vocabulary.
found_tags = {
  tag for tag in tags
  if any(tag in ngram for ngram in vocab)
}

In [None]:
found_tags

{'audio jack',
 'audio quality',
 'build quality',
 'dont buy',
 'good quality',
 'hard drive',
 'keeps disconnecting',
 'printer cable',
 'recommend not to buy',
 'sound card',
 'sound quality',
 'stopped working',
 'usb hub',
 'usb port',
 'value for money',
 'waste of money',
 'working fine',
 'working properly',
 'works well',
 'worth buying'}

In [None]:
missed_tags = tags - found_tags
missed_tags

set()

In [None]:
# `vectorizer.vocabulary_` maps each n-gram to its column index in the TF-IDF
# matrix, not to a frequency. Ranking by that value (as this cell did before,
# under the column name "count") only orders the n-grams alphabetically.
# Rank by the learned inverse document frequency instead: the lower the idf,
# the more reviews contain the n-gram.
(
    pd.DataFrame(
        [(ngram, vectorizer.idf_[col])
         for ngram, col in vectorizer.vocabulary_.items()],
        columns=['word', 'idf'],
    )
    .sort_values(by='idf')
    .iloc[:20]
)

Unnamed: 0,word,count
5616,your systems sounds cards,8046
5608,your systems sounds,8045
5600,your systems,8044
2809,your phone and works,8043
2753,your phone and,8042
2699,your phone,8041
7836,your money,8040
4814,your microphone go for,8039
4797,your microphone go,8038
4782,your microphone,8037


# Topic Modelling


In [None]:
def get_reviews_text(reviews):
  """Return the preprocessed token lists for a list of review dicts.

  Reads the 'review' entry of every dict and hands the resulting texts to
  preprocess_texts.
  """
  return preprocess_texts([review['review'] for review in reviews])

def pos_filter(tokens):
  """Keep the tokens whose part-of-speech tag starts with NN or JJ.

  NOTE: this redefinition replaces the earlier pos_filter in this notebook,
  which additionally kept tags starting with IN and RB.

  Args:
    tokens: List of word strings; tagged with nltk.pos_tag.

  Returns:
    The matching words in their original order.
  """
  wanted_prefixes = ('NN', 'JJ')
  kept = []
  for word, tag in nltk.pos_tag(tokens):
    # Both prefixes are two characters long, so startswith() is the same
    # check as comparing tag[:2] on tags of length >= 2.
    if tag.startswith(wanted_prefixes):
      kept.append(word)
  return kept

def preprocess_texts(texts):
  """Tokenize raw texts and keep only the words that pass pos_filter.

  Args:
    texts: Iterable of strings.

  Returns:
    One list of lower-cased tokens per input text, in input order.
  """
  punctuation_chars = set(string.punctuation)
  processed = []
  for text in texts:
    tokens = word_tokenize(text.lower())
    # Drop tokens made up solely of punctuation characters. The previous
    # `word not in string.punctuation` was a substring test against one long
    # string, so multi-character tokens such as "..." or "``" were kept and
    # handed to the POS tagger.
    words = [
      tok for tok in tokens
      if not all(ch in punctuation_chars for ch in tok)
    ]
    processed.append(pos_filter(words))
  return processed

In [None]:
# NOTE: this rebinds `reviews_texts`; the token lists built earlier for TF-IDF
# are replaced by the POS-filtered ones from get_reviews_text.
reviews_texts = get_reviews_text(reviews)
# Token <-> integer id mapping over all reviews.
reviews_dict = Dictionary(reviews_texts)
# Bag-of-words form of every review: a list of (token_id, count) pairs.
reviews_corpus = [reviews_dict.doc2bow(text) for text in reviews_texts]

In [None]:
# LDA training is randomised. Fixing the seed keeps topic ids and word weights
# stable across re-runs, so hand-written topic labels stay valid.
lda = LdaModel(reviews_corpus, id2word=reviews_dict, num_topics=10, random_state=42)

In [None]:
# topics_humanized = [(topic[0], [(reviews_dict[int(word[0])], word[1]) for word in topic[1]] )for topic in lda.show_topics(formatted=False)]
# topics_humanized

In [None]:
lda.print_topics()

[(0,
  '0.084*"good" + 0.069*"quality" + 0.024*"price" + 0.023*"sound" + 0.015*"product" + 0.013*"usb" + 0.013*"ports" + 0.009*"microphone" + 0.008*"awesome" + 0.008*"product.."'),
 (1,
  '0.045*"product" + 0.022*"i" + 0.017*"device" + 0.012*"quality" + 0.011*"money" + 0.011*"waste" + 0.011*"purpose" + 0.011*"stops" + 0.011*"mic" + 0.011*"valuable"'),
 (2,
  '0.038*"product" + 0.031*"sound" + 0.025*"money" + 0.025*"usb" + 0.025*"i" + 0.025*"cable" + 0.025*"value" + 0.019*"audio" + 0.019*"microphone" + 0.019*"work"'),
 (3,
  '0.085*"product" + 0.039*"usb" + 0.032*"i" + 0.031*"time" + 0.024*"ports" + 0.017*"quality" + 0.016*"money" + 0.016*"value" + 0.016*"second" + 0.016*"month"'),
 (4,
  '0.079*"i" + 0.041*"product" + 0.033*"usb" + 0.025*"good" + 0.022*"quality" + 0.020*"audio" + 0.019*"laptop" + 0.016*"sound" + 0.015*"cheap" + 0.013*"cable"'),
 (5,
  '0.032*"i" + 0.029*"product" + 0.023*"quality" + 0.019*"audio" + 0.015*"usb" + 0.014*"headphone" + 0.014*"money" + 0.014*"other" + 0.013

In [None]:
import pyLDAvis.gensim_models as gensimvis

In [None]:
gensimvis.prepare(lda, reviews_corpus, reviews_dict)

In [None]:
# Human-readable labels for the LDA topics, looked up by topic id further down.
# Every entry is currently commented out, so the list is empty; the lookup
# needs one label per topic (the model is trained with num_topics=10).
# NOTE(review): presumably these labels belong to an earlier LDA run - confirm
# they still match the topics before re-enabling them.
topic_names = [
    # 'usb port disconnecting',
    # 'replacement',
    # 'not durable',
    # 'usb port not working',
    # 'earphone port',
    # 'great sound',
    # 'works well',
    # 'works well',
    # 'never buy',
    # 'usb port disconnecting',
]

In [None]:
# An unseen example text, pushed through the same preprocessing as the
# training reviews and encoded with the training dictionary.
other_texts = ['audio port not working']
other_texts = preprocess_texts(other_texts)
# Bag-of-words (token_id, count) pairs for each example text.
other_corpus = [reviews_dict.doc2bow(text) for text in other_texts]

In [None]:
[[(reviews_dict[token[0]], token[1]) for token in doc] for doc in other_corpus]

[[('port', 1), ('audio', 1)]]

In [None]:
unseen_doc = other_corpus[0]

In [None]:
vector = lda[unseen_doc] 

In [None]:
vector

[(0, 0.033334162),
 (1, 0.033333812),
 (2, 0.03333736),
 (3, 0.033333454),
 (4, 0.03333918),
 (5, 0.033337966),
 (6, 0.033335414),
 (7, 0.6999748),
 (8, 0.033338476),
 (9, 0.033335377)]

In [None]:
# Label each (topic_id, weight) pair of the inferred topic distribution.
# Topics without a hand-written name in `topic_names` fall back to a generic
# "topic <id>" label instead of raising IndexError (the list may be shorter
# than the number of topics, or empty).
[
  (topic_names[topic_id] if topic_id < len(topic_names) else f"topic {topic_id}", weight)
  for topic_id, weight in vector
]

NameError: ignored

# Keybert

In [None]:
kb = KeyBERT()

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def extract_keyphrases(kb, review):
  """Extract 2- to 3-word keyphrases from the text of one review.

  Args:
    kb: Object exposing extract_keywords(doc, keyphrase_ngram_range=...),
      e.g. a KeyBERT instance.
    review: Mapping whose 'review' entry holds the review text.

  Returns:
    Whatever kb.extract_keywords returns for that text.
  """
  ngram_span = (2, 3)
  review_text = review['review']
  keyphrases = kb.extract_keywords(review_text, keyphrase_ngram_range=ngram_span)
  return keyphrases

In [None]:
keyphrases = [extract_keyphrases(kb, review) for review in reviews]

In [None]:
keyphrases

[[('good earphones connection', 0.6529),
  ('earphones connection', 0.5931),
  ('earphones connection working', 0.5829),
  ('good earphones', 0.5519),
  ('connection working correctly', 0.3248)],
 [('kharab ho gaya', 0.9353),
  ('jaldi kharab ho', 0.8297),
  ('jaldi kharab', 0.751),
  ('kharab ho', 0.7116),
  ('ho gaya', 0.6869)],
 [('product stopped working', 0.6683),
  ('product stopped', 0.6049),
  ('month product stopped', 0.5994),
  ('stopped working', 0.4646),
  ('replace return product', 0.414)],
 [('sound quality', 0.6605),
  ('sound quality just', 0.6472),
  ('great use earphones', 0.53),
  ('earphones headphones', 0.5012),
  ('use earphones', 0.4975)],
 [('long cable usb', 0.765),
  ('nice long cable', 0.738),
  ('long cable', 0.6934),
  ('cable usb', 0.6181),
  ('cable usb ports', 0.587)],
 [('dislike free aslo', 0.8212),
  ('free aslo', 0.7583),
  ('super dislike free', 0.5704),
  ('dislike free', 0.4861),
  ('super dislike', 0.391)],
 [],
 [('better dac laptop', 0.7101),
 

In [None]:
from collections import defaultdict

# Tally how often each keyphrase occurs across the keyphrase lists of all reviews.
counts = defaultdict(int)

for review_phrases in keyphrases:
  for phrase_and_score in review_phrases:
    # Each entry is indexable; position 0 holds the phrase text.
    counts[phrase_and_score[0]] += 1

In [None]:
counts.keys()

dict_keys(['good earphones connection', 'earphones connection', 'earphones connection working', 'good earphones', 'connection working correctly', 'kharab ho gaya', 'jaldi kharab ho', 'jaldi kharab', 'kharab ho', 'ho gaya', 'product stopped working', 'product stopped', 'month product stopped', 'stopped working', 'replace return product', 'sound quality', 'sound quality just', 'great use earphones', 'earphones headphones', 'use earphones', 'long cable usb', 'nice long cable', 'long cable', 'cable usb', 'cable usb ports', 'dislike free aslo', 'free aslo', 'super dislike free', 'dislike free', 'super dislike', 'better dac laptop', 'dac laptop', 'dac laptop major', 'better dac', 'laptop usb port', 'usb port working', 'bad sound port', 'usb working', 'audio port working', 'working usb port', 'stream recording purpose', 'stream recording', 'recommend stream recording', 'mic single channel', 'stereo mic', 'stops working', 'stops working hrsgiv', 'working stops', 'working stops hrs', 'start wor

In [None]:
sorted(list(counts.items()))

[('10 10', 1),
 ('10 days', 1),
 ('1st working', 1),
 ('2nd 3rd usb', 1),
 ('300 best sound', 1),
 ('3rd usb', 1),
 ('3rd usb port', 1),
 ('800 microphonei recommend', 1),
 ('able record electric', 1),
 ('accept phone usb', 1),
 ('adapter wire', 1),
 ('adapter wire good', 1),
 ('adapters audio jack', 1),
 ('adapters really', 1),
 ('adapters really low', 1),
 ('affordable 10', 1),
 ('affordable 10 10', 1),
 ('asked refund', 1),
 ('audacity otg connect', 1),
 ('audio adapter', 1),
 ('audio card', 1),
 ('audio cards', 1),
 ('audio distance', 1),
 ('audio jack', 1),
 ('audio jack need', 1),
 ('audio listening', 1),
 ('audio listening use', 1),
 ('audio port working', 2),
 ('audio product connected', 1),
 ('audio quality', 2),
 ('audio quality good', 1),
 ('audio quality great', 1),
 ('audio quality really', 1),
 ('audio qualityand usb', 1),
 ('audio sound card', 2),
 ('audio usb hub', 1),
 ('auto installation', 1),
 ('auto installation usb', 1),
 ('aux ports problem', 1),
 ('awesome produc

# filter results

In [None]:

[nltk.pos_tag(word_tokenize(tag)) for tag in list(tags)]

[[('printer', 'NN'), ('cable', 'NN')],
 [('sound', 'JJ'), ('quality', 'NN')],
 [('usb', 'JJ'), ('hub', 'NN')],
 [('dont', 'NN'), ('buy', 'NN')],
 [('build', 'NN'), ('quality', 'NN')],
 [('working', 'VBG'), ('fine', 'NN')],
 [('value', 'NN'), ('for', 'IN'), ('money', 'NN')],
 [('waste', 'NN'), ('of', 'IN'), ('money', 'NN')],
 [('good', 'JJ'), ('quality', 'NN')],
 [('hard', 'JJ'), ('drive', 'NN')],
 [('usb', 'JJ'), ('port', 'NN')],
 [('worth', 'JJ'), ('buying', 'NN')],
 [('sound', 'JJ'), ('card', 'NN')],
 [('audio', 'JJ'), ('quality', 'NN')],
 [('stopped', 'VBD'), ('working', 'VBG')],
 [('recommend', 'NN'), ('not', 'RB'), ('to', 'TO'), ('buy', 'VB')],
 [('audio', 'JJ'), ('jack', 'NN')],
 [('works', 'NNS'), ('well', 'RB')],
 [('keeps', 'NNS'), ('disconnecting', 'VBG')],
 [('working', 'VBG'), ('properly', 'RB')]]

In [None]:
def good_pos_comb(tokens):
  """Tell whether a tagged phrase has one of the accepted POS-tag sequences.

  Args:
    tokens: Sequence of (word, tag) pairs, e.g. the result of nltk.pos_tag.

  Returns:
    True when the tags, in order, are exactly JJ NN or VBD VBG; False
    otherwise.
  """
  # Further sequences (NN NN, NN IN NN, NNS RB, NNS VBG, VBG RB) are
  # currently disabled.
  accepted_sequences = (
    ['JJ', 'NN'],
    ['VBD', 'VBG'],
  )
  tag_sequence = [pair[1] for pair in tokens]
  return any(tag_sequence == accepted for accepted in accepted_sequences)

# [k for k in vectorizer.vocabulary_.keys() if good_pos_comb(nltk.pos_tag(word_tokenize(k)))]


# Keep only the extracted keyphrases whose POS-tag sequence is accepted by
# good_pos_comb.
[k for k in counts.keys() if good_pos_comb(nltk.pos_tag(word_tokenize(k)))]

# POS-tag sequences seen among the Amazon tags (candidates for good_pos_comb):
# NN IN NN
# NN NN
# JJ NN
# VBD VBG
# NNS RB
# NNS VBG
# VBG RB

['stopped working',
 'sound quality',
 'free aslo',
 'nice sound',
 'audio jack',
 'mic headphone',
 'loud microphone',
 'good product',
 'audio card',
 'usb audio',
 'usb cable',
 'stoped working',
 'excellent product',
 'good quality',
 'usb aux',
 'multiple usb',
 'sound card',
 'good kind',
 'usb headphone',
 'mic input',
 'laptop audio',
 'usb hub',
 'nice quality',
 'audio quality',
 'nice productvery',
 'super duper',
 'useless don',
 'usb port',
 'bad noise',
 'good usb',
 'easy use',
 'goodpendrive working',
 'audio distance',
 'original cover',
 'worth buying',
 'good sound',
 'cheap product',
 'sound street',
 'local sound',
 'digital piano',
 'dis product',
 'cheap nice',
 'awesome product',
 'usb time',
 'laptop recording',
 'good price',
 'high quality',
 'mic usb',
 'defective headphone',
 'good combination',
 'usb cpu',
 'small compact']

# keyword extraction

# Ideas

* Can this be phrased as a multiple question-answering problem?

* summarization + question answering