In [37]:
import pandas as pd
import json
import string
import nltk

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
def read_json_as_list_of_reviews(file_location:str)-> list:
    with open(file_location, "r") as fr:
        return json.loads(fr.read())

In [4]:
reviews = read_json_as_list_of_reviews("../data/reviews_received/samsung-galaxy-a70-black-128-gb.json")

In [5]:
print(len(reviews))

464


In [6]:
print(reviews[0])

{'rating': '5', 'title': 'Super!', 'review_text': 'fabulous phone loved it....😍😘😘READ MORE', 'reviewer': 'Rahul Pal', 'review_date': 'Rahul Pal', 'review_upvotes': '965', 'review_downvotes': '201'}


In [7]:
reviews_df = pd.DataFrame(reviews)
reviews_df.head()

Unnamed: 0,rating,review_date,review_downvotes,review_text,review_upvotes,reviewer,title
0,5,Rahul Pal,201,fabulous phone loved it....😍😘😘READ MORE,965,Rahul Pal,Super!
1,5,KiranKumar Gajjala,165,Simply love it.. after 2 days of usage.Battery...,799,KiranKumar Gajjala,Samsung Nailed it. Amazing Phone
2,4,Sethukrishnan P,157,Awesome midrange smart phone with reasonably g...,760,Sethukrishnan P,Good choice
3,5,manthan pandey,14,awesome phoneREAD MORE,97,manthan pandey,Highly recommended
4,4,Flipkart Customer,22,fingerprint sensor not working properly.remain...,131,Flipkart Customer,Good choice


## Datatype conversion

In [20]:
reviews_df["review_downvotes"] = reviews_df["review_downvotes"].astype(int)
reviews_df["review_upvotes"] = reviews_df["review_upvotes"].astype(int)
reviews_df["review_text"] = reviews_df["review_text"].astype(str)

In [21]:
def get_upvote_to_downvote_ratio(row):
    if row["review_downvotes"] != 0:
        return row["review_upvotes"]/row["review_downvotes"]
    return 0

reviews_df["upvote_ratio"] = reviews_df.apply(get_upvote_to_downvote_ratio, axis=1)

In [22]:
reviews_df.head()

Unnamed: 0,rating,review_date,review_downvotes,review_text,review_upvotes,reviewer,title,upvote_ratio
0,5,Rahul Pal,201,fabulous phone loved it....😍😘😘READ MORE,965,Rahul Pal,Super!,4.800995
1,5,KiranKumar Gajjala,165,Simply love it.. after 2 days of usage.Battery...,799,KiranKumar Gajjala,Samsung Nailed it. Amazing Phone,4.842424
2,4,Sethukrishnan P,157,Awesome midrange smart phone with reasonably g...,760,Sethukrishnan P,Good choice,4.840764
3,5,manthan pandey,14,awesome phoneREAD MORE,97,manthan pandey,Highly recommended,6.928571
4,4,Flipkart Customer,22,fingerprint sensor not working properly.remain...,131,Flipkart Customer,Good choice,5.954545


In [38]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [44]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

stop_words = set(stopwords.words('english'))

def process_review_text(review_obj):
    review_text = review_obj[0]
    
    # replace the read more text
    review_text = review_text.replace("READ MORE", "")
    
    # convert to lower case
    review_text = review_text.lower()
    
    word_tokens = word_tokenize(review_text)
    print(word_tokens)
  
    filtered_review = [w.strip() for w in word_tokens if not w.strip() in stop_words and string.punctuation]
    print(filtered_review)
    
    processed_review_text = " ".join(filtered_review)
    
    # https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string
    # review_text.translate(None, string.punctuation)
    # review_text = review_text.translate(review_text.maketrans('', '', string.punctuation))
    return processed_review_text

In [None]:
r = process_review_text(["fabulous phone loved it....\ud83d\ude0d\ud83d\ude18\ud83d\ude18READ MORE"])
print(r)

In [42]:
reviews_df["processed_review_text"] = reviews_df[["review_text"]].apply(process_review_text, axis=1)
reviews_df["processed_review_text"].head(10)

0                        fabulous phone loved ... .😍😘😘
1    simply love it.. 2 days usage.battery charging...
2    awesome midrange smart phone reasonably good c...
3                                        awesome phone
4    fingerprint sensor working properly.remaining ...
5    phone camera battery display awesome fingerpri...
6                    looking hot..i loved ... ... go .
7    amazing mid range phone . 25w super fast charg...
8    words say , camera , battery , perfomance insa...
9                                              awesome
Name: processed_review_text, dtype: object