In [62]:
import pandas as pd
import json
import tiktoken
from dotenv import load_dotenv

In [63]:
df = pd.read_csv('data/amazon-reviews-for-2-cats.csv')

In [64]:
df['reviews'] = df['reviews'].apply(json.loads)
df['helpful_votes'] = df['helpful_votes'].apply(json.loads)
df['verified_purchase'] = df['verified_purchase'].apply(json.loads)

In [65]:
df.head()

Unnamed: 0,asin,reviews,total_reviews,first_review_date,last_review_date,helpful_votes,verified_purchase,category
0,B00006IUTN,"[{'review_title': 'Five Stars', 'review_text':...",509,2006-08-30 14:33:21.000,2023-03-20 17:28:03.486,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1
1,B00006IUV3,"[{'review_title': 'Two Stars', 'review_text': ...",722,2004-03-14 16:51:23.000,2023-05-18 01:54:55.930,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[True, False, True, True, True, True, True, Tr...",1
2,B00009W3G4,"[{'review_title': 'Very disappointed', 'review...",896,2007-10-28 07:01:11.000,2023-02-22 05:10:19.429,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 4, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1
3,B00009W3HD,"[{'review_title': 'great', 'review_text': 'wor...",771,2007-01-22 12:21:40.000,2023-05-08 18:13:00.943,"[0, 0, 0, 2, 0, 0, 3, 0, 7, 0, 0, 0, 5, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1
4,B00009W3I4,"[{'review_title': 'Great vent!', 'review_text'...",1037,2007-08-03 16:43:11.000,2023-05-17 17:38:36.885,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4, ...","[True, True, True, True, True, True, True, Tru...",1


In [66]:
product1 = df[df['asin']=='B00006IUTN']

In [67]:
product1

Unnamed: 0,asin,reviews,total_reviews,first_review_date,last_review_date,helpful_votes,verified_purchase,category
0,B00006IUTN,"[{'review_title': 'Five Stars', 'review_text':...",509,2006-08-30 14:33:21.000,2023-03-20 17:28:03.486,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1


In [68]:
product1['reviews'][0]

[{'review_title': 'Five Stars', 'review_text': 'Just get it!'},
 {'review_title': 'They do a great job and make it easy to dump out the ...',
  'review_text': 'I use these every day in a single serve coffeepot.  I get them in a package of 12 boxes (with each box containing 40 filters).  They do a great job and make it easy to dump out the used grounds.  While my coffeemaker does not require that I use these, it is so much easier and neater that I always use them anyway.'},
 {'review_title': 'Great for one-shot coffeemaker',
  'review_text': 'These filters are hard to find in stores and I have reordered them a couple of times from Amazon.  They make the coffee taste better, help with the cleanup and keep coffee grounds from falling into your cup.  Great idea!'},
 {'review_title': 'and they are very good quality, and unbleached',
  'review_text': 'Cannot find this size filter in any of the major food store chains, nor in the shopping supercenter! Thank goodness for Amazon! Have used thes

In [69]:
# trying to find the count of helpful votes > 1 from one products. 
count = sum(1 for x in product1['helpful_votes'][0] if x != 0)
print(count)  

81


In [70]:
from collections import Counter
counts = Counter(product1['helpful_votes'][0])
sorted_counts = dict(sorted(counts.items(), key=lambda x: x[0], reverse=True))
sorted_counts

{78: 1,
 29: 1,
 16: 1,
 14: 2,
 12: 2,
 10: 1,
 8: 1,
 7: 2,
 6: 1,
 5: 2,
 4: 8,
 3: 5,
 2: 10,
 1: 44,
 0: 428}

In [71]:
! pip install nltk --quiet

In [72]:
import nltk
from nltk.corpus import stopwords
import string

In [73]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /Users/biraj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [74]:
def remove_stopwords(text):
    """Remove stopwords from a given text."""
    if not isinstance(text, str):  # Handle cases where text might be missing or None
        return ""
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)



In [75]:
def process_reviews(reviews):
    """Process list of review dictionaries by removing stopwords from each field."""
    if not isinstance(reviews, list):  # Handle cases where reviews might be missing
        return []
    
    processed_reviews = []
    for review in reviews:
        if isinstance(review, dict):
            cleaned_review = {
                "review_title": remove_stopwords(review.get("review_title", "")),
                "review_text": remove_stopwords(review.get("review_text", ""))
            }
            processed_reviews.append(cleaned_review)
    
    return processed_reviews

In [76]:
df["cleaned_reviews"] = df["reviews"].apply(process_reviews)

In [77]:
def count_words(reviews):
    """Count the total number of words in review_title and review_text combined."""
    if not isinstance(reviews, list):  # Handle cases where reviews might be missing
        return 0
    
    total_words = 0
    for review in reviews:
        if isinstance(review, dict):
            title_words = review.get("review_title", "").split()
            text_words = review.get("review_text", "").split()
            total_words += len(title_words) + len(text_words)
    
    return total_words

In [78]:
df["reviews_word_count"] = df["reviews"].apply(count_words)
df['cleaned_reviews_word_count'] = df["cleaned_reviews"].apply(count_words)

In [79]:
df

Unnamed: 0,asin,reviews,total_reviews,first_review_date,last_review_date,helpful_votes,verified_purchase,category,cleaned_reviews,reviews_word_count,cleaned_reviews_word_count
0,B00006IUTN,"[{'review_title': 'Five Stars', 'review_text':...",509,2006-08-30 14:33:21.000,2023-03-20 17:28:03.486,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'Five Stars', 'review_text':...",18603,10666
1,B00006IUV3,"[{'review_title': 'Two Stars', 'review_text': ...",722,2004-03-14 16:51:23.000,2023-05-18 01:54:55.930,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[True, False, True, True, True, True, True, Tr...",1,"[{'review_title': 'Two Stars', 'review_text': ...",23881,13347
2,B00009W3G4,"[{'review_title': 'Very disappointed', 'review...",896,2007-10-28 07:01:11.000,2023-02-22 05:10:19.429,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 4, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'disappointed', 'review_text...",28484,14936
3,B00009W3HD,"[{'review_title': 'great', 'review_text': 'wor...",771,2007-01-22 12:21:40.000,2023-05-08 18:13:00.943,"[0, 0, 0, 2, 0, 0, 3, 0, 7, 0, 0, 0, 5, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'great', 'review_text': 'wor...",28131,15119
4,B00009W3I4,"[{'review_title': 'Great vent!', 'review_text'...",1037,2007-08-03 16:43:11.000,2023-05-17 17:38:36.885,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'Great vent!', 'review_text'...",58827,32359
...,...,...,...,...,...,...,...,...,...,...,...
586,B0032KHQTS,"[{'review_title': 'A terrific value', 'review_...",573,2010-01-26 02:07:34,2023-01-12 20:33:19.505,"[0, 0, 0, 0, 0, 172, 4, 1, 0, 0, 0, 2, 0, 0, 3...","[True, True, True, True, True, True, False, Tr...",2,"[{'review_title': 'terrific value', 'review_te...",17572,9923
587,B003N3QRF8,[{'review_title': 'Just the info I had hoped f...,727,2010-07-07 21:29:23,2023-06-07 21:00:09.703,"[0, 4, 1, 1, 0, 0, 0, 1, 1, 0, 5, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'info hoped', 'review_text':...",24794,13607
588,B004GVZUUE,[{'review_title': 'Husband and wife both like ...,510,2011-02-16 23:30:34,2022-08-10 12:42:12.322,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 20, 2,...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'Husband wife like', 'review...",15016,8388
589,B004GVZUUY,"[{'review_title': 'Classic', 'review_text': 'A...",1381,2011-04-20 16:02:37,2023-05-08 22:18:48.434,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'Classic', 'review_text': 't...",34119,19641


In [85]:
df['reviews_word_count'].sum()

22913328

In [86]:
df['reviews_word_count'].mean()


38770.43654822335

In [87]:
df['cleaned_reviews_word_count'].sum()

12847913

In [88]:
df['cleaned_reviews_word_count'].mean()

21739.277495769882

In [89]:
df

Unnamed: 0,asin,reviews,total_reviews,first_review_date,last_review_date,helpful_votes,verified_purchase,category,cleaned_reviews,reviews_word_count,cleaned_reviews_word_count
0,B00006IUTN,"[{'review_title': 'Five Stars', 'review_text':...",509,2006-08-30 14:33:21.000,2023-03-20 17:28:03.486,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 4, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'Five Stars', 'review_text':...",18603,10666
1,B00006IUV3,"[{'review_title': 'Two Stars', 'review_text': ...",722,2004-03-14 16:51:23.000,2023-05-18 01:54:55.930,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[True, False, True, True, True, True, True, Tr...",1,"[{'review_title': 'Two Stars', 'review_text': ...",23881,13347
2,B00009W3G4,"[{'review_title': 'Very disappointed', 'review...",896,2007-10-28 07:01:11.000,2023-02-22 05:10:19.429,"[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 4, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'disappointed', 'review_text...",28484,14936
3,B00009W3HD,"[{'review_title': 'great', 'review_text': 'wor...",771,2007-01-22 12:21:40.000,2023-05-08 18:13:00.943,"[0, 0, 0, 2, 0, 0, 3, 0, 7, 0, 0, 0, 5, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'great', 'review_text': 'wor...",28131,15119
4,B00009W3I4,"[{'review_title': 'Great vent!', 'review_text'...",1037,2007-08-03 16:43:11.000,2023-05-17 17:38:36.885,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 4, ...","[True, True, True, True, True, True, True, Tru...",1,"[{'review_title': 'Great vent!', 'review_text'...",58827,32359
...,...,...,...,...,...,...,...,...,...,...,...
586,B0032KHQTS,"[{'review_title': 'A terrific value', 'review_...",573,2010-01-26 02:07:34,2023-01-12 20:33:19.505,"[0, 0, 0, 0, 0, 172, 4, 1, 0, 0, 0, 2, 0, 0, 3...","[True, True, True, True, True, True, False, Tr...",2,"[{'review_title': 'terrific value', 'review_te...",17572,9923
587,B003N3QRF8,[{'review_title': 'Just the info I had hoped f...,727,2010-07-07 21:29:23,2023-06-07 21:00:09.703,"[0, 4, 1, 1, 0, 0, 0, 1, 1, 0, 5, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'info hoped', 'review_text':...",24794,13607
588,B004GVZUUE,[{'review_title': 'Husband and wife both like ...,510,2011-02-16 23:30:34,2022-08-10 12:42:12.322,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 20, 2,...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'Husband wife like', 'review...",15016,8388
589,B004GVZUUY,"[{'review_title': 'Classic', 'review_text': 'A...",1381,2011-04-20 16:02:37,2023-05-08 22:18:48.434,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[True, True, True, True, True, True, True, Tru...",2,"[{'review_title': 'Classic', 'review_text': 't...",34119,19641
