In [1]:
from data import generate_dataframe

df = generate_dataframe(["Digital_Music_5.json"])

Loading data from: ['Digital_Music_5.json']

filename             samples
Digital_Music_5      169781 

Data loaded, 169781 total samples.


In [2]:
from preprocessing import preprocess_samples
from utils import get_product_reviews

raw_corpus_samples = list(filter(lambda x: isinstance(x, str), df['reviewText']))
corpus_samples = preprocess_samples(raw_corpus_samples)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/shy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/shy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/shy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Applying lowercase


169623it [00:00, 2031780.89it/s]


Applying remove_punctuation


169623it [00:00, 236588.09it/s]


Applying lemmatize


169623it [04:54, 575.58it/s] 


In [3]:
product_id = 'B009MA34NY'
raw_product_samples = get_product_reviews(df, product_id)
product_samples = preprocess_samples(raw_product_samples)

Applying lowercase


0it [00:00, ?it/s]


Applying remove_punctuation


0it [00:00, ?it/s]


Applying lemmatize


0it [00:00, ?it/s]


In [4]:
import n_grams
bigrams = n_grams.get_bigrams(product_samples, 10, 6)
trigrams = n_grams.get_trigrams(product_samples, 2, 17)
product_samples = n_grams.replace_n_grams(product_samples, bigrams, trigrams)

print(bigrams[:10])
print(trigrams[:10])

[nltk_data] Downloading package stopwords to /Users/shy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: Length mismatch: Expected axis has 0 elements, new values have 1 elements

In [None]:
import evaluation_metrics as em
freqs = em.get_word_freqs(product_samples)
bi_freqs = em.get_bi_freqs(product_samples)

In [None]:
from nltk import word_tokenize
import train
tf_vectorizer = train.get_tf_vectorizer()
tf_vectorizer.set_params(tokenizer=word_tokenize)

In [None]:
product_documents = tf_vectorizer.fit_transform(product_samples)

In [None]:
import train
# Compute topic coherence
n_components = range(1,10)
coherences = []
num_docs = len(product_samples)

for num in n_components:
  lda = train.get_lda(samples=product_documents, n_components=num)
  tf_feature_names = tf_vectorizer.get_feature_names_out()
  coherence = em.avg_umass(lda, tf_feature_names, train.n_top_words, freqs, bi_freqs, num_docs)[1]
  coherences.append(coherence)

for i, c in enumerate(coherences):
  print("Number of topics: ", n_components[i])
  print("Average UMass coherence: ", c)
  print()


In [None]:
from utils import plot_top_words
coherences = [abs(c) for c in coherences]
n_components = n_components[coherences.index(min(coherences))]

lda = train.get_lda(samples=product_documents, n_components=n_components)

tf_feature_names = tf_vectorizer.get_feature_names_out()
plot_top_words(lda, tf_feature_names, train.n_top_words, "Topics in LDA model")

# Exploration

In [None]:
# NOTE: Topic numbering in histogram plots and pyLDAvis are not the same
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

topic_data = pyLDAvis.sklearn.prepare(lda, product_documents, tf_vectorizer)
pyLDAvis.display(topic_data)

In [None]:
similarities, x, y = em.jaccard(lda,tf_feature_names,30)
em.print_jaccard(similarities)

In [None]:
df['asin'].value_counts()[:20]

In [None]:
tf_vectorizer.stop_words_

In [None]:
from preprocessing import preprocess_sample

raw_reviews = list(filter(lambda x: isinstance(x, str), df[df['asin'] == product_id]['reviewText']))
processed_sentences, raw_sentences = preprocess_sample(raw_reviews[337], get_raw=True)

processed_sentences

In [None]:
for i, sample in enumerate(raw_reviews):
    if len(sample) > 300 and len(sample) < 700:
        print(i,":", sample)

In [None]:
feature_names = tf_vectorizer.get_feature_names_out()
topic_words = []
for topic in lda.components_:
    top_features_ind = topic.argsort()[: -10- 1 : -1]
    topic_words.append([feature_names[i] for i in top_features_ind])

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# TODO: for now compound (a composite score) will suffice. Neutrality (neu) might suggest highly informational content.
for raw, processed in zip(raw_sentences, processed_sentences):
    vs = analyzer.polarity_scores(raw)
    probs = lda.transform(tf_vectorizer.transform([processed]))[0]
    topic = probs.argmax()
    if probs[topic] < 0.2:
        print("{} \n\t overall: {:.2f} neutral: {:.2f}, No Topic\n".format(raw, vs['compound'], vs['neu']))
    else:
        print("{} \n\t overall: {:.2f} neutral: {:.2f}, Topic {}: {}\n".format(raw, vs['compound'], vs['neu'], topic+1, ", ".join(topic_words[topic])))

In [None]:
def predict(text):
    raw_sentences = sent_tokenize(text)

    processed_sentences = raw_sentences[:]
    processed_sentences = lowercasing(processed_sentences)
    processed_sentences = punctuation_removal(processed_sentences)
    processed_sentences = lemmatize(processed_sentences)
    

    res = []
    present_topics = set()
    for raw, processed in zip(raw_sentences, processed_sentences):
        vs = analyzer.polarity_scores(raw)
        print("{} \n\t overall: {:.2f} neutral: {:.2f}\n".format(raw, vs['compound'], vs['neu']))


        probs = lda.transform(tf_vectorizer.transform([processed]))[0]
        topic = probs.argmax()
        
        res.append((raw, f"Topic {topic+1} ({round(vs['compound'],2)})"))
        present_topics.add(topic)
        
    topics = {str(i+1): ", ".join(topic_words[i]) for i in sorted(list(present_topics))}
    print(topics)
    return [res, topics]

In [None]:
import numpy as np

sentiment_vals = np.linspace(-1.0, 1.0, num=201)
color_map = {}

colors = {1: "red", 2: "orange", 3: "lime", 4: "pink", 5: "brown", 6: "green", 7: "purple", 8: "blue", 9: "cyan", 10: "yellow"}

for i, color in colors.items():
    color_map.update({f"Topic {i} ({round(val,2)})": color for val in sentiment_vals})

In [None]:
import gradio as gr
from gradio.components import Textbox, HighlightedText, JSON

gr.Interface(fn=predict, 
             inputs=Textbox(placeholder="Enter review here...", lines=5), 
             outputs=[HighlightedText().style(color_map=color_map), JSON()],
             examples=[
        ["Good indoor training shoes for running on treadmill, doing lunges and regular exercises at the gym. These are very flexible, light weight and comfortable. Grip is okay - sticky rubber is used only at the edges of heel and toe areas so I slipped a little when I worked on cable machines, resistance band, etc. on un-carpeted floor.  I would emphasize that if you do lifting as a part of your everyday routine workout I would not recommend them because mine (cushion) lasted only for six months and this is the reason I gave three stars. Other than that, I liked them!"],
        ["I've had these shoes for about a week now and have so far enjoyed using them. Considering the fact that I have wide feet, the shoes are slightly tight. However, it doesn't feel uncomfortable nor does it bothers me as I use them throughout my workouts. I know some people personally like when the shoes are a bit tighter or a bit looser so it's all in personal preference."],
        ["The picture makes the shoe look like it has a \"boxier\" toe rather than the \"pointier\" toe that it actually has. I have wider feet and generally need to buy a size or half size longer to get a comfortable width (in any brand of shoe). I was shooting for a rounder, broader toe design which is more comfortable for me, and I feel that the pictures of this shoe didn't accurately depict what I received, in that one detail. Otherwise, \"the shoe fits\" So I am wearing it."]
    ],
) \
    .launch(share=True)