In [24]:
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
from collections import Counter
import ast

import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import seaborn as sb

from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.stem import WordNetLemmatizer
import nltk
from nltk import ngrams
from itertools import chain

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [14]:
dat = pd.read_csv('hairdryer_cleaned.csv', header=0)
# dat = pd.read_csv('microwave_cleaned.csv', header=0)
# dat = pd.read_csv('pacifier_cleaned.csv', header=0)

In [15]:
from sklearn.feature_extraction import text
my_stopwords = text.ENGLISH_STOP_WORDS.union(['hair','dryer','dryers','hairdryer','dry',
                                              'br','product','use','using','used','amazon','did','does','do','just'])
# ['microwave','oven','product','one','use','used','just','buy','bought','does','did','do'])
# ['baby','pacifier','pacifiers','product','did','does','do','just','buy','bought'])

In [16]:
# popular single adjective word people used for different score
def get_token_adj(score, benchmark):
    df = dat[dat['star_rating'] == score]['review_body']
    
    count = len(df)
    total_text = ' '.join(df)
    total_text = total_text.lower()
    stop = set(my_stopwords)
    total_text = nltk.word_tokenize(total_text)
    total_text = [word for word in total_text if word not in stop and len(word) >= 3]
    lemmatizer = WordNetLemmatizer()
    total_text = [lemmatizer.lemmatize(w,'a') for w in total_text]
    # get adjective only
    total_text = [word for word, form in nltk.pos_tag(total_text) if form == 'JJ']
    
    text = nltk.Text(total_text)
    fdist = nltk.FreqDist(text)
    
    # return only phrase occurs more than benchmark of his reviews
    return sorted([(w,fdist[w],str(round(fdist[w]/count*100,2))+'%') for w in set(text) if fdist[w] >= count*benchmark], key=lambda x: -x[1])

In [20]:
index = ['Phrase', 'Count', 'Occur %']

for j in range(1,6):
    test = pd.DataFrame()
    d = get_token_adj(j, 0.065)
    print('score {} reviews most popular adjectives word:'.format(j))
    for i in d:
        test = test.append(pd.Series(i, index = index), ignore_index = True)
    test = test.sort_values('Count', ascending=False)
    print(test)

score 1 reviews most popular adjectives word:
   Count Occur % Phrase
0  185.0  18.56%    hot
1  171.0  17.15%   good
2  100.0  10.03%    bad
3   97.0   9.73%   high
4   96.0   9.63%  great
5   92.0   9.23%    low
6   92.0   9.23%    new
7   67.0   6.72%   long
score 2 reviews most popular adjectives word:
    Count Occur %       Phrase
0   159.0   25.9%         good
1   155.0  25.24%          hot
2    88.0  14.33%         high
3    85.0  13.84%        great
4    66.0  10.75%          low
5    63.0  10.26%          old
6    60.0   9.77%          new
7    54.0   8.79%         long
8    50.0   8.14%        heavy
9    50.0   8.14%         fine
10   48.0   7.82%        small
11   43.0    7.0%       little
12   41.0   6.68%        short
13   40.0   6.51%  retractable
14   40.0   6.51%     powerful
score 3 reviews most popular adjectives word:
    Count Occur %    Phrase
0   291.0  30.28%      good
1   172.0   17.9%       hot
2   147.0   15.3%      high
3   146.0  15.19%     great
4   135.0 

In [21]:
# frequent phrases
def get_token_ngram(score, benchmark):
    df = dat[dat['star_rating'] == score]['review_body']
        
    count = len(df)
    total_text = ' '.join(df)
    total_text = total_text.lower()
    stop = set(my_stopwords)
    total_text = nltk.word_tokenize(total_text)
    total_text = [word for word in total_text if word not in stop and len(word) >= 3]
    lemmatizer = WordNetLemmatizer()
    total_text = [lemmatizer.lemmatize(w,'v') for w in total_text]
    bigrams = ngrams(total_text,2)
    trigrams = ngrams(total_text, 3)


    # look at 2-gram and 3-gram together
    combine = chain(bigrams, trigrams)
    text = nltk.Text(combine)
    fdist = nltk.FreqDist(text)
    
    # return only phrase occurs more than benchmark of his reviews
    return sorted([(w,fdist[w],str(round(fdist[w]/count*100,2))+'%') for w in set(text) if fdist[w] >= count*benchmark], key=lambda x: -x[1])

In [25]:
index = ['Phrase', 'Count', 'Occur %']

for j in range(1,6):
    test = pd.DataFrame()
    d = get_token_ngram(j, 0.02)
    print('score {} reviews most popular 2-gram / 3-gram:'.format(j))
    for i in d:
        test = test.append(pd.Series(i, index = index), ignore_index = True)
    test = test.sort_values('Count', ascending=False)
    print(test)

score 1 reviews most popular 2-gram / 3-gram:
    Count Occur %               Phrase
0   102.0  10.23%         (stop, work)
1    67.0   6.72%       (waste, money)
2    45.0   4.51%  (retractable, cord)
3    36.0   3.61%           (n't, buy)
4    30.0   3.01%       (work, months)
5    30.0   3.01%        (work, great)
6    30.0   3.01%       (read, review)
7    29.0   2.91%        (months, ago)
8    27.0   2.71%         (n't, waste)
9    26.0   2.61%          (n't, work)
10   26.0   2.61%         (quit, work)
11   25.0   2.51%        (wall, mount)
12   23.0   2.31%  (n't, waste, money)
13   23.0   2.31%           (get, hot)
14   22.0   2.21%           (hot, air)
15   20.0   2.01%       (last, months)
score 2 reviews most popular 2-gram / 3-gram:
    Count Occur %               Phrase
0    47.0   7.65%         (stop, work)
1    39.0   6.35%  (retractable, cord)
2    33.0   5.37%         (work, fine)
3    22.0   3.58%        (wall, mount)
4    20.0   3.26%           (hot, air)
5    19.0  