### Data Processing

Brief data exploration and limit our data to subset of reviews in year 2012.

In [3]:
import pandas as pd

data = pd.read_csv('Reviews.csv')

In [4]:
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [2]:
# Convert the timestamp to the year 

from datetime import datetime

data['Time_updated'] = [datetime.fromtimestamp(x) for x in data.Time]
data['Year'] = [x.year for x in data.Time_updated]

In [30]:
data.Year.value_counts()

2012    198064
2011    163546
2010     86092
2009     55403
2008     34144
2007     22358
2006      6686
2005      1344
2004       560
2003       133
2002        73
2000        32
2001        13
1999         6
Name: Year, dtype: int64

We decided to use the subset of year 2012 to the further analysis

In [3]:
sub_df = data[data['Year'] == 2012]

In [51]:
sub_df.to_csv('data.csv')

### TF-IDF Word Count

Define functions to identify the review top words in both high-rating group and low-rating group

In [6]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
import string

In [2]:
def tf_idf_count(df, product='Not Specified', rating='high'):
    '''
    input format:
    - product: default value is 'Not Specified' for all the products; 
               specify the product by input ProductId, support multiple ProductIds e.g.(AAAAAAAA, BBBBBBBB)
    - rating: two values of 'high' (rating score of 4 and 5) and 'low' (rating score between 1 and 3)
    '''
    # Parameter with product:
    
    if product=='Not Specified':
        df = df
    else:
        product_list = product.split(",")
        df = df[df['ProductId'].isin(product_list)]
    
    #Parameter with rating:
    
    if rating == 'high':
        df = df[df['Score']>=4]
    elif rating == 'low':
        df = df[df['Score']<=3]
    else:
        print('Please input high or low for rating parameter')
    
    # Identify corpus with the data filtered
    corpus = list(dict.fromkeys(df['Text'])) 
    
    # Lemmatization:
    lemmatizer = WordNetLemmatizer()

    # Function to convert nltk tag to wordnet tag
    
    def nltk2wn_tag(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:          
            return None

    def lemmatize_sentence(sentence):
        nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
        wn_tagged = map(lambda x: (x[0], nltk2wn_tag(x[1])), nltk_tagged)
        res_words = []
        for word, tag in wn_tagged:
            if tag is None:            
                res_words.append(word)
            else:
                res_words.append(lemmatizer.lemmatize(word, tag))
        return " ".join(res_words)
    
    # Get the corpus after lemmatization
    
    corpus_lem=[]
    for review in corpus:
        corpus_lem = corpus_lem + [lemmatize_sentence(review)]
        
    # Regex for the reviews:
    
    corpus_lem = [s.lower() for s in corpus_lem] # convert to lowercase
    
    cleanr = re.compile('<.*?>')
    corpus_lem = [re.sub(cleanr,'',s) for s in corpus_lem] # remove html tags
    
    corpus_lem = [str(s).translate(str.maketrans('', '', string.punctuation)) for s in corpus_lem] # remove punc
    
    stop = stopwords.words()+['like','great','love','good','could','even','would','need']
    # add more stopwords
    
    
    # Since the most common food in our products are coffee, cookie, and tea, we can try to generate 
    # several tags to identify the most common product
    
    category_tag = 'unknown'
    
    if product!='Not Specified':
        count_coffee = 0
        count_tea = 0
        count_cookie = 0
        count_pets = 0
        for review in corpus_lem:
            if 'coffee' in review or 'cappuccino' in review or 'latte' in review:
                count_coffee+=1
            if 'tea' in review:
                count_tea+=1
            if 'cookie' in review or 'cooky' in review:
                count_cookie+=1
            if 'dog' in review or 'cat' in review:
                count_pets+=1
        if count_coffee == max(count_coffee, count_tea, count_cookie, count_pets) and count_coffee/len(corpus_lem)>0.5:
            category_tag = 'coffee'
            stop = stop+['coffee','cappuccino','latte','taste']
        elif count_tea == max(count_coffee, count_tea, count_cookie, count_pets) and count_tea/len(corpus_lem)>0.5:
            category_tag = 'tea'
            stop = stop+['tea','taste']
        elif count_cookie == max(count_coffee, count_tea, count_cookie, count_pets) and count_cookie/len(corpus_lem)>0.5:
            category_tag = 'cookie'
            stop = stop+['cookie','cooky']
        elif count_pets == max(count_coffee, count_tea, count_cookie, count_pets) and count_pets/len(corpus_lem)>0.5:
            category_tag = 'pet_foods'
            stop = stop+['pet','cat','dog']
        else: category_tag = 'others'
    
    
    # Generate TF-IDF word vectorizer
    
    vectorizer = TfidfVectorizer(ngram_range=(2,3),
                             token_pattern=r'\b[a-zA-Z0-9]{3,}\b',
                             max_df=0.5,
                             min_df=1, stop_words=stop)

    X = vectorizer.fit_transform(corpus_lem)
    terms = vectorizer.get_feature_names()
    tf_idf = pd.DataFrame(X.toarray().transpose(), index=terms)
    tf_idf = tf_idf.sum(axis=1)
    score = pd.DataFrame(tf_idf, columns=["score"])
    score["term"] = terms
    score.sort_values(by="score", ascending=False, inplace=True)
    
    print(f'This product or These group products category is {category_tag}')
    return score

In [34]:
# test the function

tf_idf_count(sub_df, product = 'B007JFMH8M', rating = 'high').head(20)

This product or These group products category is cookie


Unnamed: 0,score,term
soft chewy,9.717186,soft chewy
oatmeal raisin,9.671538,oatmeal raisin
quaker soft,8.798857,quaker soft
soft baked,8.556104,soft baked
quaker soft baked,7.227345,quaker soft baked
baked oatmeal,6.866643,baked oatmeal
soft baked oatmeal,6.866643,soft baked oatmeal
mom voxbox,6.436194,mom voxbox
year old,4.651767,year old
definitely buy,4.586363,definitely buy


In [8]:
tf_idf_count(sub_df, product = 'B007JFMH8M', rating = 'low').head(20)

This product or These group products category is cookie


Unnamed: 0,score,term
oatmeal raisin,1.368238,oatmeal raisin
170 calorie,0.841357,170 calorie
soft baked,0.832312,soft baked
little dry,0.710769,little dry
year old,0.67973,year old
quaker soft baked,0.657443,quaker soft baked
quaker soft,0.657443,quaker soft
soft baked oatmeal,0.617879,soft baked oatmeal
baked oatmeal,0.617879,baked oatmeal
individually wrap,0.58672,individually wrap


In [29]:
sub_df.ProductId.value_counts().head(15) # get the number of reviews for each product

B007JFMH8M    913
B006MONQMC    491
B002IEZJMA    483
B002IEVJRY    483
B007Y59HVM    480
B005ZBZLT4    480
B001VJ0B0I    479
B002LANN56    460
B0041NYV8E    450
B005K4Q37A    412
B005K4Q34S    412
B005K4Q1YA    412
B005K4Q4LK    412
B003B3OOPA    401
B005HG9ET0    386
Name: ProductId, dtype: int64

In [44]:
neg_review = tf_idf_count(sub_df, product = 'B005K4Q4LK', rating = 'low') 
# randomly select a product and output to the dataset

This product or These group products category is coffee


In [43]:
pos_review = tf_idf_count(sub_df, product = 'B005K4Q4LK', rating = 'high')

This product or These group products category is coffee


In [45]:
neg_review.to_csv('neg_review.csv')

In [46]:
pos_review.to_csv('pos_review.csv')