Notebook Author: Ayana Andrews-Joseph\
**Data**: Pet Supplies Reviews from [Amazon product data](http://jmcauley.ucsd.edu/data/amazon/)\
**Provider**: Julian McAuley, UCSD\
**Description**: This dataset contains product reviews and metadata from Amazon, including 142.8 million reviews spanning May 1996 - July 2014.\
This dataset includes reviews (ratings, text, helpfulness votes), product metadata (descriptions, category information, price, brand, and image features), and links (also viewed/also bought graphs).\
**5-core**: Subset of the data in which all users and items have at least 5 reviews.\
**Format**: one-review-per-line in (loose) JSON.

**Definitions**: 
> **reviewerID** - ID of the reviewer, e.g. A2SUAM1J3GNN3B \
> **asin** - ID of the product, e.g. 0000013714 \
> **reviewerName** - name of the reviewer \
> **helpful** - helpfulness rating of the review, e.g. 2/3 \
> **reviewText** - text of the review \
> **overall** - rating of the product \
> **summary** - summary of the review \
> **unixReviewTime** - time of the review (unix time) \
> **reviewTime** - time of the review (raw) 

**NOTE**: Selected a *“Small” subset for experimentation*, which uses `K-cores`. These data have been reduced to extract the k-core, such that each of the remaining users and items have k reviews each. (McAuley) The concept of a k-core was introduced to study the clustering structure of social networks and to describe the evolution of random graphs.

# PART I

## Dataset

In [None]:
pip install ijson

In [None]:
# Data Manipulation & Visualization
import os
import pandas as pd
import numpy as np
import ijson
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns # used for plot interactive graph. 
sns.set_style('darkgrid')

# Text Manipulation
from textblob import TextBlob # text processing
from textblob import Blobber
import nltk
nltk.download('all') # Download stopwords list, updated lemmatizer, tokenizers, etc.
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem.wordnet import WordNetLemmatizer 



In [None]:
# Load the Pet Supplies review file (parsed as one JSON record per line)
reviews_path = "../input/amzn-reviews-pet-supplies/Pet_Supplies_5.json"
rawDat = pd.read_json(reviews_path, lines=True, orient="columns")
print(rawDat.shape)
rawDat.head()

In [None]:
# Keep only the columns used by the text-processing steps. The explicit
# .copy() makes df an independent frame, so the column assignments made on it
# later do not target a slice of rawDat (which raises SettingWithCopyWarning).
df = rawDat[["helpful", "reviewText", "overall", "summary", "asin"]].copy()

df.head()

#### Output raw dataset

In [None]:
# Save to Kaggle for export
# Every column except the reviewer's name is written out.
publish_mask = rawDat.columns != 'reviewerName'
rawDatpub = rawDat.loc[:, publish_mask]
rawDatpub.to_csv('rawData_AmazonPetSupplyReviews.csv', index=False)

In [None]:
del rawDatpub

# Overall Rating Distribution

In [None]:
# Bar chart of the number of reviews per value of `overall`
plt.figure(figsize=(8, 7))
sns.set_theme(style="whitegrid")
ax = sns.countplot(
    data=df,
    x="overall",
    edgecolor='black',
    linewidth=2,
    palette="rocket_r",
)
plt.title("Overall Rating Distribution", size=17)
plt.xlabel('Rating')
plt.ylabel('Frequency [n]')

# Label each bar with its count, placed slightly above the bar top
for bar in ax.patches:
    bar_height = bar.get_height()
    label_xy = (bar.get_x() + 0.25, bar_height + 0.75)
    ax.annotate(f'{bar_height:.0f}', label_xy)

plt.show()

### The overall rating distribution is left-skewed: 5-star ratings make up the largest share of the reviews.

## Frequencies on Product Ids

In [None]:
prod_pivot = df.asin.value_counts()
prod_pivot


## Get column integer location

In [None]:
df.columns.get_loc('summary')

## Convert comment text to lower() for text processing

In [None]:
# Lowercase both text fields ahead of the stop-word preprocessing
for text_col in ('summary', 'reviewText'):
    df[text_col] = df[text_col].str.lower()
df['summary'].head()

### Remove Punctuation

In [None]:
import string

# Characters to strip: line breaks plus every ASCII punctuation mark
nopunc = "\n\r" + string.punctuation
strip_table = str.maketrans('', '', nopunc)

# NOTE(review): the characters are deleted, not replaced by a space, so two
# words separated only by a newline or punctuation mark end up joined.
for text_col in ('summary', 'reviewText'):
    df[text_col] = df[text_col].str.translate(strip_table)
df.head(10)

In [None]:
df.reviewText[1]

## Apply `stopwords` 
Although the data set is relatively clean, we can apply stop words for further processing (words that do not count in linguistic analysis). The most common SEO stop words are pronouns, articles, prepositions, and conjunctions.

In [None]:
# datacheck for stopwords
print(stopwords.words('english'))

In [None]:
# Build the stop-word set used to filter the review text.
stop_words = set(stopwords.words('english'))

# add words that aren't in the NLTK stopwords list
add_stopwords = ['pet', 'pets']
new_stopwords_list = stop_words.union(add_stopwords)

# Words that must stay in the text even if the stop-word list contains them
# (presumably kept because negations matter for sentiment - confirm).
remove_stopwords = {'no', 'not', 'didnt'}

# Fixed: this line previously filtered on the undefined name `not_stopwords`
# and raised NameError; the exclusions are the `remove_stopwords` set above.
stop = new_stopwords_list - remove_stopwords

In [None]:
# apply stopwords
def _without_stopwords(text):
    """Return `text` with every whitespace-separated word found in `stop` dropped."""
    kept = [word for word in text.split() if word not in stop]
    return ' '.join(kept)


df['clean_text1'] = df['summary'].apply(_without_stopwords)
df['clean_text2'] = df['reviewText'].apply(_without_stopwords)
df.head()

### Tokenization and Lemmatization

In [None]:
# Init tokenizer and Lemmatizer
w_token = WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Split `text` on whitespace and return the list of lemmatized tokens."""
    tokens = w_token.tokenize(text)
    return list(map(lemmatizer.lemmatize, tokens))

df['text_lemma_sum'] = df['clean_text1'].apply(lemmatize_text)
df['text_lemma_raw'] = df['clean_text2'].apply(lemmatize_text)

In [None]:
# Transform lemma "list" to "string" structure - Remove brackets and punctuation from lemmatized list
df['text_sum'] = df['text_lemma_sum'].str.join(' ')
df['text_raw'] = df['text_lemma_raw'].str.join(' ')
df.head()

In [None]:
df.dtypes

### Add Sentiment

In [None]:
print("Lemma str of summary text loc: ", df.columns.get_loc('text_sum'),', ', "and lemma str of original review text loc: ", df.columns.get_loc('text_raw') )

In [None]:
# Sentiment of reviewers heading
def add_sentiment_to_df_sum(df):
    """Add a 'textScore_sum' column holding the TextBlob sentiment of 'text_sum'.

    The text column is read by name. The previous version used
    ``df.iloc[x][9]``, which silently picks a different column whenever the
    column order changes and relies on the deprecated integer-position
    fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'text_sum' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textScore_sum' set to the
        ``TextBlob(...).sentiment`` value of each row.

    Raises
    ------
    KeyError
        If `df` has no 'text_sum' column.
    """
    df['textScore_sum'] = [TextBlob(text).sentiment for text in df['text_sum']]
    return df

In [None]:
add_sentiment_to_df_sum(df)

In [None]:
# Sentiment of raw review text
def add_sentiment_to_df_raw(df):
    """Add a 'textScore_raw' column holding the TextBlob sentiment of 'text_raw'.

    The text column is read by name. The previous version used
    ``df.iloc[x][10]``, which silently picks a different column whenever the
    column order changes and relies on the deprecated integer-position
    fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'text_raw' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textScore_raw' set to the
        ``TextBlob(...).sentiment`` value of each row.

    Raises
    ------
    KeyError
        If `df` has no 'text_raw' column.
    """
    df['textScore_raw'] = [TextBlob(text).sentiment for text in df['text_raw']]
    return df

In [None]:
add_sentiment_to_df_raw(df)

#### Noting that polarity and subjectivity vary between using the `summary` field vs. the raw text field. We can take a closer look - plan Elasticsearch and visual analytics.

### Add POLARITY [-1.0,1.0]

In [None]:
def add_polarity_to_df_sum(df):
    """Add a 'textPolarity_sum' column with the TextBlob polarity of 'text_sum'.

    The text column is read by name. The previous version used
    ``df.iloc[x][9]``, which silently picks a different column whenever the
    column order changes and relies on the deprecated integer-position
    fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'text_sum' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textPolarity_sum' set to
        ``TextBlob(...).sentiment.polarity`` for each row.

    Raises
    ------
    KeyError
        If `df` has no 'text_sum' column.
    """
    df['textPolarity_sum'] = [
        TextBlob(text).sentiment.polarity for text in df['text_sum']
    ]
    return df

In [None]:
add_polarity_to_df_sum(df)

In [None]:
def add_polarity_to_df_raw(df):
    """Add a 'textPolarity_raw' column with the TextBlob polarity of 'text_raw'.

    The text column is read by name. The previous version used
    ``df.iloc[x][10]``, which silently picks a different column whenever the
    column order changes and relies on the deprecated integer-position
    fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'text_raw' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textPolarity_raw' set to
        ``TextBlob(...).sentiment.polarity`` for each row.

    Raises
    ------
    KeyError
        If `df` has no 'text_raw' column.
    """
    df['textPolarity_raw'] = [
        TextBlob(text).sentiment.polarity for text in df['text_raw']
    ]
    return df

In [None]:
add_polarity_to_df_raw(df)

In [None]:
def getPolarity(score):
    """Map a numeric polarity score to a sentiment label.

    Parameters
    ----------
    score : float
        Polarity value to classify.

    Returns
    -------
    str
        'Negative' when `score` is below zero, 'Neutral' when it equals zero,
        and 'Positive' otherwise.
    """
    # The original ended with an unreachable `return df` that referred to an
    # unrelated global; every path already returns above it, so it is removed.
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

In [None]:
df['Polarity_sum'] = df['textPolarity_sum'].apply(getPolarity)
df['Polarity_raw'] = df['textPolarity_raw'].apply(getPolarity)
df.head(30)

In [None]:
df.dtypes

### Get Subjectivity

In [None]:
def add_subjectivity_to_df_raw(df):
    """Add a 'textSubjectivity_raw' column with the TextBlob subjectivity of 'text_raw'.

    The text column is read by name. The previous version used
    ``df.iloc[x][10]``, which silently picks a different column whenever the
    column order changes and relies on the deprecated integer-position
    fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'text_raw' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textSubjectivity_raw' set to
        ``TextBlob(...).sentiment.subjectivity`` for each row.

    Raises
    ------
    KeyError
        If `df` has no 'text_raw' column.
    """
    df['textSubjectivity_raw'] = [
        TextBlob(text).sentiment.subjectivity for text in df['text_raw']
    ]
    return df

In [None]:
add_subjectivity_to_df_raw(df)

In [None]:
df_out = df[["asin", "overall", "text_sum", "text_raw", "textPolarity_sum", "textPolarity_raw", "Polarity_sum", "Polarity_raw", "textSubjectivity_raw"]]

df_out.head(10)

In [None]:
# Save to Kaggle for export
df_out.to_csv('nlpAmazonPetSupplyReviews.csv',index=False)

# PART II

### Create combo text field *(summary + review text)*

In [None]:
df['comboText'] = df[['text_sum', 'text_raw']].agg(' '.join, axis=1)

In [None]:
df.comboText[1]

### Dedup words

In [None]:
from collections import OrderedDict


def _unique_words(words):
    """Join `words` with single spaces, keeping only the first occurrence of each."""
    return ' '.join(OrderedDict.fromkeys(words))


df.comboText = df['comboText'].str.split().apply(_unique_words)

In [None]:
df.comboText[1]

### Get Polarity of combined text

In [None]:
df.columns.get_loc('comboText')

In [None]:
def add_polarity_to_df_combo(df):
    """Add a 'textPolarity_combo' column with the TextBlob polarity of 'comboText'.

    The text column is read by name. The previous version used
    ``df.iloc[x][18]``, which silently picks a different column whenever a
    column is added or removed upstream and relies on the deprecated
    integer-position fallback of ``Series.__getitem__``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'comboText' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, modified in place, with 'textPolarity_combo' set to
        ``TextBlob(...).sentiment.polarity`` for each row.

    Raises
    ------
    KeyError
        If `df` has no 'comboText' column.
    """
    df['textPolarity_combo'] = [
        TextBlob(text).sentiment.polarity for text in df['comboText']
    ]
    return df

In [None]:
add_polarity_to_df_combo(df)

In [None]:
df['Polarity_combo'] = df['textPolarity_combo'].apply(getPolarity)

In [None]:
df_out = df[["asin", 
             "overall", 
             "text_sum", 
             "text_raw", 
             "comboText",
             "textPolarity_sum", 
             "textPolarity_raw",
             "textPolarity_combo", 
             "Polarity_sum", 
             "Polarity_raw", 
             "Polarity_combo", 
             "textSubjectivity_raw"]]

df_out.head(10)

In [None]:
# Save to Kaggle for export
df_out.to_csv('nlpAmazonPetSupplyReviews.csv',index=False)

# PART III

In [None]:
!pip install keybert

## Using KeyBERT for keywords extraction

In [None]:
from keybert import KeyBERT

# One entire document
doc = ' '.join(np.unique(df['comboText']))

kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)

In [None]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None)

In [None]:
kw_model.extract_keywords(doc, keyphrase_ngram_range=(1,2))

In [None]:
#kw_model.extract_keywords(doc, keyphrase_ngram_range=(3, 3), stop_words='english', 
#                         use_mmr=True, diversity=0.2)

In [None]:
!pip install keyphrase-vectorizers

In [None]:
from keyphrase_vectorizers import KeyphraseTfidfVectorizer
        
# Init default vectorizer.
vectorizer = KeyphraseTfidfVectorizer()

# Print parameters
print(vectorizer.get_params())

In [None]:
# NOTE(review): doc.split() passes each whitespace-separated word of `doc` as
# its own document, so the vectorizer is fitted on single words - confirm this
# is intended rather than fitting on the full review texts.
vectorizer.fit(doc.split())

In [None]:
keyphrases = vectorizer.get_feature_names_out()

print(keyphrases)