In [1]:
# import required libraries

import numpy as np
import pandas as pd

#webscraping
import requests
from bs4 import BeautifulSoup

#tokenization
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

#vectorize
from sklearn.feature_extraction.text import CountVectorizer

#split into train and test datasets
from sklearn.model_selection import train_test_split

#naive_bayes classifer
from sklearn.naive_bayes import MultinomialNB

#visualise and print model metrics
import itertools
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, precision_recall_fscore_support, roc_curve, auc

In [2]:
# function to fetch data 

reviewList = []

def get_soup(url):
    
    r = requests.get('http://localhost:8050/render.html', params={'url': url, 'wait': 2})
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

In [3]:
# stuff reviews in the list

def get_reviews(soup):
    reviews = soup.find_all('div', {'data-hook': 'review'})
    
    try:
        for item in reviews:
            
            review = {
            'product': soup.title.text.replace('Amazon.com: Customer reviews:', '').split('-')[0].strip(),
            'title': item.find('a', {'data-hook': 'review-title'}).text.strip(),
            'rating': float(item.find('i', {'data-hook': 'review-star-rating'}).text.replace('out of 5 stars', '').strip()),
            'body': item.find('span', {'data-hook': 'review-body'}).text.strip(),
            }
            
            reviewlist.append(review)
    except:
        pass

In [4]:
# driver code to fetch the jbl reviews

for x in range(1,500):
    soup = get_soup(f'https://www.amazon.com/JBL-Flip-Black-Noise-Cancelling-Speakerphone/product-reviews/B01MSYQWNY/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber={x}')
    print(f'Getting page: {x}')
    get_reviews(soup)
    print(len(reviewList))
    if not soup.find('li', {'class': 'a-disabled a-last'}):
        pass
    else:
        break

Getting page: 1
0
Getting page: 2
0
Getting page: 3
0
Getting page: 4
0
Getting page: 5
0
Getting page: 6
0
Getting page: 7
0
Getting page: 8
0
Getting page: 9
0
Getting page: 10
0
Getting page: 11
0
Getting page: 12
0
Getting page: 13
0
Getting page: 14
0
Getting page: 15
0
Getting page: 16
0
Getting page: 17
0
Getting page: 18
0
Getting page: 19
0
Getting page: 20
0
Getting page: 21
0
Getting page: 22
0
Getting page: 23
0
Getting page: 24
0
Getting page: 25
0
Getting page: 26
0
Getting page: 27
0
Getting page: 28
0
Getting page: 29
0
Getting page: 30
0
Getting page: 31
0
Getting page: 32
0
Getting page: 33
0
Getting page: 34
0
Getting page: 35
0
Getting page: 36
0
Getting page: 37
0
Getting page: 38
0
Getting page: 39
0
Getting page: 40
0
Getting page: 41
0
Getting page: 42
0
Getting page: 43
0
Getting page: 44
0
Getting page: 45
0
Getting page: 46
0
Getting page: 47
0
Getting page: 48
0
Getting page: 49
0
Getting page: 50
0
Getting page: 51
0
Getting page: 52
0
Getting page: 53
0
Ge

ConnectionError: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))

In [None]:
# prepare dataframe from the review list

df_speakers = pd.DataFrame(reviewList)
df_speakers.to_excel('JBL_Speaker.xlsx', index=False)

df_speakers.shape

In [None]:
df_speakers

In [None]:
df_speakers.head()

In [None]:
df_speakers.rating.unique()

In [None]:
df_speakers['rating'] = df_speakers['rating'].astype(int)

In [None]:
df_speakers['rating_label'] = df['rating'].apply(lambda x: 'negative' if x <= 2 else ('positive' if x >= 4 else 'neutral'))

In [None]:
df_speakers['rating_label'].unique()

In [None]:
df_speakers = df_speakers.dropna()

In [None]:
df_speakers.isna().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig = plt.figure(figsize=(12, 6))

plt.subplot(121)
sns.countplot(x='rating_label', data=df_speakers)
plt.title('Rating Distribution')

In [None]:
df_speakers['rating_label'].value_counts()

In [None]:
df_speakers['rating_label'].value_counts(normalize=True)

In [None]:
df_speakers.info()

In [None]:
df_speakers.describe()

In [None]:
# Perform tokenization and stemming on the reviews

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

In [None]:
#tokenizing reviews

df['body'] = df['body'].apply(lambda x: ' '.join([stemmer.stem(word) for 
                                                  word in word_tokenize(x.lower()) 
                                                  if word.isalnum() and word not in stop_words]))

In [None]:
# Split the data into training and testing sets with an 80/20 split

X = df['body']
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Vectorize the text data using a CountVectorizer

vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [None]:
# Train a Naive Bayes classifier

clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

In [None]:
# Make predictions on the test data

y_pred = clf.predict(X_test_vectorized)

In [None]:
# Evaluate the performance of the classifier

accuracy = accuracy_score(y_test, y_pred)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

cm = confusion_matrix(y_test, y_pred)

print("Accuracy:", round(accuracy*100, 2), '%')
print("Precision:", round(precision*100, 2), '%')
print("Recall:", round(recall*100, 2), '%')
print("F1-score:", round(f1*100, 2), '%')

In [None]:
# Plot precision, recall, and F1-score for each class
precision, recall, f1, support = precision_recall_fscore_support(y_test, y_pred)

plt.figure(figsize=(10, 5))
plt.bar(range(len(precision)), precision, width=0.25, label='Precision')
plt.bar([x + 0.25 for x in range(len(recall))], recall, width=0.25, label='Recall')
plt.bar([x + 0.5 for x in range(len(f1))], f1, width=0.25, label='F1-score')
plt.xticks(range(len(precision)), [format(i) for i in range(len(precision))])
plt.xlabel('Rating')
plt.ylabel('Score')
plt.legend()
plt.show()

In [None]:
# Define the class labels

classes = np.unique(y_test)

# Define the plot style
cmap = plt.cm.Blues
normalize = False

# Plot the confusion matrix as a heatmap
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title('Confusion matrix')
plt.colorbar()

tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], fmt),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()