In [1]:
import pickle

import requests

import pandas as pd
import numpy as np

from IPython.display import clear_output
import datetime

import spacy

from collections import Counter
from tqdm import tqdm

## Notebook Parameters
*During the analysis, several backup-files were created. If you have no backup-files, please, set all of the parameters to `True`.*

In [2]:
PARSE_REVIEWS = False
PREPROCESS_REVIEWS = False
COLLECT_TOKEN_BIGRAMS_MAPPER = False
EXTRACT_TOP_TOKENS = False

SAVE_TO_EXCEL = False

## Getting Data

In [3]:
def parser(url, first_review_num=0, shift=100, requests_num=-1):
    """ Parses reviews:
    -- url - link to the product (from `<distributor website>`);
    -- first_review_num - from which review to start (1 is for the last by date review);
    -- shift - number of reviews per 1 request (max.: 100)
    -- requests_num - number of requests to send (`-1` to get all of the available reviews)
    
    The total amount of parsed reviews will be equal to [shift x requests_num] or
    the number of available reviews.
    """
    
    offset = first_review_num
    limit = shift
    stop = requests_num
    
    data = []
    i = 0
    while True:

        clear_output()
        request_num = i + 1
        start_time = datetime.datetime.now()
        print('Request #{0} | Start time: {1}'.format(request_num, start_time))

        params = {
            'offset': offset,
            'limit': limit,
        }

        response = requests.get(url, params)
        json = response.json()

        new_data = json['data']
        if len(new_data) == 0:
            print('Stop reason: all of the reviews are parsed.')
            break
        data += new_data

        i += 1
        if i == stop:
            print('Stop reason: requests number limit has been reached.')
            break

        offset += limit
    
    return data

In [4]:
if PARSE_REVIEWS:
    
    url = '<distributor website>'
    data = parser(url)

    with open('pkl/1_parsed_reviews_list.pkl', 'wb') as file:
        pickle.dump(data, file)

else:
    with open('pkl/1_parsed_reviews_list.pkl', 'rb') as file:
        data = pickle.load(file)

### Creating Reviews Table 

In [None]:
df_full = pd.DataFrame(data)
df_full['dateCreated'] = pd.to_datetime(df_full['dateCreated'])

df_full.head(3)

# The output was hidden on purpose

In [6]:
if SAVE_TO_EXCEL:
    df_full.to_excel('xlsx/1_raw_reviews.xlsx', index=False)

Filtering out columns. 

In [7]:
cols = ['dateCreated', 'productRating', 'comment', 'userName']
df = df_full[cols].copy()

df.head()

Unnamed: 0,dateCreated,productRating,comment,userName
0,2020-02-01,4,It's nice to only put one thing on my face bef...,paichyfish
1,2020-02-01,5,Definite skin change! I'm in love with this pr...,camsheries
2,2020-02-01,5,I'm absolutely in LOVE with this product! I wi...,Kjsowers
3,2020-02-01,5,"Scent was a little weird, kind of earthy with ...",hilmel
4,2020-02-01,5,Never received this so 🤷🏼‍♀️ Don’t know how it...,sjconjurske


## Analysis
To understand, what customers like and dislike, let's extract all of the words which they wrote in their reviews and compile words rating basing on the amount of words occurrences.

### Tokens (Individual Words)

Words lemmatization, stop-words excluding.

In [8]:
NLP = spacy.load("en_core_web_sm")

def extract_tokens(text):
        
    doc = NLP(text.lower())
    words = [token.lemma_ for token in doc if token.is_punct != True and token.is_stop != True]
        
    return words

In [9]:
if PREPROCESS_REVIEWS:
    
    df['tokens'] = df['comment'].apply(extract_tokens)
    
    with open('pkl/2_df_tokens.pkl', 'wb') as file:
        pickle.dump(df, file)

else:
    with open('pkl/2_df_tokens.pkl', 'rb') as file:
        df = pickle.load(file)

df.head()

Unnamed: 0,dateCreated,productRating,comment,userName,tokens
0,2020-02-01,4,It's nice to only put one thing on my face bef...,paichyfish,"[nice, thing, face, bedtime, moisturize]"
1,2020-02-01,5,Definite skin change! I'm in love with this pr...,camsheries,"[definite, skin, change, love, product, notice..."
2,2020-02-01,5,I'm absolutely in LOVE with this product! I wi...,Kjsowers,"[absolutely, love, product, wish, size, produc..."
3,2020-02-01,5,"Scent was a little weird, kind of earthy with ...",hilmel,"[scent, little, weird, kind, earthy, faint, st..."
4,2020-02-01,5,Never received this so 🤷🏼‍♀️ Don’t know how it...,sjconjurske,"[receive, 🤷, 🏼‍, ♀, ️, know, work]"


#### Counting Tokens

In [10]:
def create_count_df(tokens_list):
    """ Returns df with token counted. """
    
    counter = Counter(tokens_list)
    
    count_df = pd.DataFrame.from_dict(counter, orient='index')
    count_df = count_df.reset_index()
    
    renamer = {'index': 'token', 0: 'count'}
    count_df = count_df.rename(columns=renamer)
    
    count_df['countShare'] = count_df['count'] / count_df['count'].sum()
    count_df['rank'] = count_df['count'].rank(ascending=False)
    
    count_df = count_df.sort_values(by='count', ascending=False)
    count_df = count_df.reset_index(drop=True)
    
    return count_df

In [11]:
tokens_list = [token for comment in df['tokens'] for token in comment]
tokens = create_count_df(tokens_list)

In [12]:
tokens.head()

Unnamed: 0,token,count,countShare,rank
0,skin,2720,0.061259,1.0
1,feel,1909,0.042994,2.0
2,love,1610,0.03626,3.0
3,smell,1464,0.032971,4.0
4,like,1264,0.028467,5.0


#### Tokens Mean Rating Estimation
To estimate a mean rating of a token:
* extract all of the reviews, where the token was met;
* estimate mean rating of the extracted reviews.

If a token was met >1 times in a review, this review rating will be accounted for only once.

In [13]:
tokens_rating = {}
for list_of_tokens, rating in zip(df['tokens'], df['productRating']):
    
    for token in set(list_of_tokens):
        
        if token in tokens_rating:
            tokens_rating[token].append(rating)
        else:
            tokens_rating[token] =[rating]

In [14]:
tokens['rating'] = tokens['token'].apply(lambda token: np.mean(tokens_rating[token]))

In [15]:
tokens.head()

Unnamed: 0,token,count,countShare,rank,rating
0,skin,2720,0.061259,1.0,4.33904
1,feel,1909,0.042994,2.0,4.438746
2,love,1610,0.03626,3.0,4.756811
3,smell,1464,0.032971,4.0,3.737434
4,like,1264,0.028467,5.0,3.847594


### Bigrams (Pairs of Words)
The tokens consist of single words hence it is hard to interpret the meaning of some of them. To annotate the tokens, let's extract bigrams and add to the `tokens_rating` a column with the bigrams where the respective token could be met. To create bigrams, we will use the dependency tree, which could be obtained using [spaCy](https://spacy.io/usage/linguistic-features) library.

#### Extracting Bigrams

In [16]:
def extract_bigrams(text):
    """ Returns list of pairs of dependent words.
    Words are lemmatised, pairs which contain stop-words deleted.
    """
    
    sentence = NLP(text.lower())
    bigrams = []
    
    for token in sentence:
        if token.is_punct != True and token.is_stop != True:
            
            for child in token.children:
                if child.is_punct != True and child.is_stop != True:
                    
                    bigram = frozenset([token.lemma_, child.lemma_])
                    if len(bigram) != 1:
                        bigrams.append(bigram)
    
    return bigrams

In [17]:
if PREPROCESS_REVIEWS:
    
    df['bigrams'] = df['comment'].apply(extract_bigrams)
    
    with open('pkl/3_df_bigrams.pkl', 'wb') as file:
        pickle.dump(df, file)

else:
    with open('pkl/3_df_bigrams.pkl', 'rb') as file:
        df = pickle.load(file)

df.head()

Unnamed: 0,dateCreated,productRating,comment,userName,tokens,bigrams
0,2020-02-01,4,It's nice to only put one thing on my face bef...,paichyfish,"[nice, thing, face, bedtime, moisturize]",[]
1,2020-02-01,5,Definite skin change! I'm in love with this pr...,camsheries,"[definite, skin, change, love, product, notice...","[(definite, skin), (change, skin), (difference..."
2,2020-02-01,5,I'm absolutely in LOVE with this product! I wi...,Kjsowers,"[absolutely, love, product, wish, size, produc...","[(product, size), (100, $), (totally, notice),..."
3,2020-02-01,5,"Scent was a little weird, kind of earthy with ...",hilmel,"[scent, little, weird, kind, earthy, faint, st...","[(little, weird), (cigarette, faint), (cigaret..."
4,2020-02-01,5,Never received this so 🤷🏼‍♀️ Don’t know how it...,sjconjurske,"[receive, 🤷, 🏼‍, ♀, ️, know, work]","[(🤷, receive), (️, know), (know, work)]"


#### Counting Bigrams

In [18]:
bigrams_list = [bigram for comment in df['bigrams'] for bigram in comment]
bigrams = create_count_df(bigrams_list).rename(columns={'token': 'bigram'})

In [19]:
bigrams.head()

Unnamed: 0,bigram,count,countShare,rank
0,"(feel, skin)",615,0.027668,1.0
1,"(face, feel)",259,0.011652,2.0
2,"(smell, like)",244,0.010977,3.0
3,"(soft, feel)",240,0.010797,4.0
4,"(make, feel)",216,0.009717,5.0


#### Bigrams Mean Rating Estimation
*The same methodology as in 'Tokens Mean Rating Estimation'.*

In [20]:
bigrams_rating = {}

for list_of_bigrams, rating in zip(df['bigrams'], df['productRating']):
    
    for bigram in set(list_of_bigrams):
        
        if bigram in bigrams_rating:
            bigrams_rating[bigram].append(rating)
        else:
            bigrams_rating[bigram] =[rating]

In [21]:
bigrams['rating'] = bigrams['bigram'].apply(lambda bigram: np.mean(bigrams_rating[bigram]))

In [22]:
bigrams.head()

Unnamed: 0,bigram,count,countShare,rank,rating
0,"(feel, skin)",615,0.027668,1.0,4.580328
1,"(face, feel)",259,0.011652,2.0,4.562992
2,"(smell, like)",244,0.010977,3.0,3.343348
3,"(soft, feel)",240,0.010797,4.0,4.591667
4,"(make, feel)",216,0.009717,5.0,4.720379


In [23]:
if SAVE_TO_EXCEL:
    bigrams.to_excel('xlsx/2_bigrams.xlsx', index=False)

### Annotating Tokens

In [24]:
if COLLECT_TOKEN_BIGRAMS_MAPPER:

    token_bigrams_mapper = {}

    for token in tokens['token']:
        token_bigrams_mapper[token] = []

        to_iterate = zip(bigrams['bigram'], bigrams['count'], bigrams['rating'])
        for bigram, count, rating in to_iterate:

            if token in bigram:
                token_bigrams_mapper[token].append((bigram, count, rating))
    
    with open('pkl/4_token_bigrams_mapper.pkl', 'wb') as file:
        pickle.dump(token_bigrams_mapper, file)

else:
    with open('pkl/4_token_bigrams_mapper.pkl', 'rb') as file:
        token_bigrams_mapper = pickle.load(file)

In [25]:
print("Example (TOP3 bigrams for 'skin` token:")
token_bigrams_mapper['skin'][0:3]

Example (TOP3 bigrams for 'skin` token:


[(frozenset({'feel', 'skin'}), 615, 4.580327868852459),
 (frozenset({'leave', 'skin'}), 193, 4.435233160621761),
 (frozenset({'look', 'skin'}), 162, 4.701863354037267)]

In [26]:
def annotate_token(token, token_bigrams_mapper, limit=9**9):
    
    bigrams = token_bigrams_mapper[token]
    
    bigram_to_str = lambda b: ' • '.join(['+'.join(b[0]), str(b[1]), str(round(b[2], 2))])
    
    bigrams = [bigram_to_str(bigram) for bigram in bigrams][0:limit]
    
    annotation = ' | '.join(bigrams)
    
    return annotation

In [27]:
annotate = lambda t: annotate_token(t, token_bigrams_mapper, 30)
tokens['annotation (bigram • count • stars)'] = tokens['token'].apply(annotate)

In [28]:
tokens.head(10)

Unnamed: 0,token,count,countShare,rank,rating,annotation (bigram • count • stars)
0,skin,2720,0.061259,1.0,4.33904,feel+skin • 615 • 4.58 | leave+skin • 193 • 4....
1,feel,1909,0.042994,2.0,4.438746,feel+skin • 615 • 4.58 | face+feel • 259 • 4.5...
2,love,1610,0.03626,3.0,4.756811,love+product • 143 • 4.79 | mask+love • 104 • ...
3,smell,1464,0.032971,4.0,3.737434,smell+like • 244 • 3.34 | smell+good • 109 • 4...
4,like,1264,0.028467,5.0,3.847594,smell+like • 244 • 3.34 | feel+like • 95 • 4.1...
5,product,1163,0.026193,6.0,4.165557,love+product • 143 • 4.79 | product+great • 80...
6,face,925,0.020832,7.0,4.164286,face+feel • 259 • 4.56 | leave+face • 52 • 4.4...
7,nice,806,0.018152,8.0,4.285714,nice+feel • 177 • 4.19 | nice+smell • 65 • 4.1...
8,morning,802,0.018062,9.0,4.44557,morning+feel • 55 • 4.53 | morning+wake • 19 •...
9,night,798,0.017972,10.0,4.400268,mask+night • 101 • 4.3 | cream+night • 90 • 4....


#### <font color='red'>Manual</font> Tokens Filtering
Some of the tokens provide no/few useful information and/or are not interpretable. After manual analysis of the tokens which 'count' is ≥30, the following tokens were left.

Each token may be considered as a separate group, but for more general review classification we also decided to (manually) group the informative tokens by their meaning.

In [29]:
tokens_by_group = {
    'aroma': ['smell', 'scent', 'fragrance'],
 
    'brand': ['<hidden on purpose>'], # <brand> is 'below_30'
    
    'effect': ['difference', 'dry', 'detox', 'help', 'greasy','sensitive',
               'light', 'strong', 'refresh', 'long', 'clean', 'fresh',
               'glow', 'clear', 'rinse', 'breakout', 'acne', 'refreshing',
               'bright', 'change', 'pore', 'hard', 'effect', 'refreshed',
               'red', 'redness'],
    
    'feature': ['soft', 'smooth', 'sticky', 'oily', 'new', 'texture',
                'thick', 'serum', 'pine', 'earthy', 'expensive', 'herbal',
                'oil', 'natural', 'tree', 'cigarette', 'creamy'],
    
    'hydrating': ['moisturizer', 'moisturize', 'moisturizing', 'hydrated',
                  'hydrating', 'moisture', 'hydrate', 'soak'],
    
    'package': ['size', 'bag', 'sample', 'small', 'big', 'pillow', 'lightweight'],
    
    'usage': ['morning', 'night', 'mask', 'overnight', 'day', 'sleep',
              'winter', 'bed', 'routine', 'week', 'nighttime', 'eye',
              'nightly', 'application']
}

informative_tokens = [token for group in tokens_by_group.values() for token in group]
print('Informative Tokens #:', len(informative_tokens))
print('Groups #:', len(tokens_by_group))

Informative Tokens #: 79
Groups #: 7


In [30]:
tokens['informative'] = 'no'
tokens.loc[tokens['count'] < 30, 'informative'] = 'below_30'
tokens.loc[tokens['token'].isin(informative_tokens), 'informative'] = 'yes'

In [31]:
groups_by_token = {token: group for group, tokens in tokens_by_group.items()for token in tokens}
tokens['group'] = tokens['token'].map(groups_by_token)

In [32]:
tokens.head()

Unnamed: 0,token,count,countShare,rank,rating,annotation (bigram • count • stars),informative,group
0,skin,2720,0.061259,1.0,4.33904,feel+skin • 615 • 4.58 | leave+skin • 193 • 4....,no,
1,feel,1909,0.042994,2.0,4.438746,feel+skin • 615 • 4.58 | face+feel • 259 • 4.5...,no,
2,love,1610,0.03626,3.0,4.756811,love+product • 143 • 4.79 | mask+love • 104 • ...,no,
3,smell,1464,0.032971,4.0,3.737434,smell+like • 244 • 3.34 | smell+good • 109 • 4...,yes,aroma
4,like,1264,0.028467,5.0,3.847594,smell+like • 244 • 3.34 | feel+like • 95 • 4.1...,no,


In [33]:
if SAVE_TO_EXCEL:
    tokens.to_excel('xlsx/3_tokens.xlsx', index=False)

### Reviews Categorization
Adding a column with the list of groups to which a review belongs.

Also, let's indicate the most popular token, which reviews contain:
* **tokenTOP1** (a token from a review, which is in the `informative_tokens` and has <u>the highest 'count'/'rank'</u>);
* **tokenTOP2** (2nd 'count'/'rank');
* **tokenTOP3**.

#### Categories

In [34]:
def extract_groups(review_tokens, groups_by_token):
    
    groups = [groups_by_token[token] for token in review_tokens
              if token in groups_by_token]
    groups = set(groups)
    
    if len(groups) == 0:
        return 'non-informative'
    else:
        return ' | '.join(groups)

In [35]:
category = lambda x: extract_groups(x, groups_by_token)
df['categories'] = df['tokens'].apply(category)

In [36]:
df.head()

Unnamed: 0,dateCreated,productRating,comment,userName,tokens,bigrams,categories
0,2020-02-01,4,It's nice to only put one thing on my face bef...,paichyfish,"[nice, thing, face, bedtime, moisturize]",[],hydrating
1,2020-02-01,5,Definite skin change! I'm in love with this pr...,camsheries,"[definite, skin, change, love, product, notice...","[(definite, skin), (change, skin), (difference...",usage | effect | feature
2,2020-02-01,5,I'm absolutely in LOVE with this product! I wi...,Kjsowers,"[absolutely, love, product, wish, size, produc...","[(product, size), (100, $), (totally, notice),...",package | effect
3,2020-02-01,5,"Scent was a little weird, kind of earthy with ...",hilmel,"[scent, little, weird, kind, earthy, faint, st...","[(little, weird), (cigarette, faint), (cigaret...",aroma | usage | effect | hydrating | feature
4,2020-02-01,5,Never received this so 🤷🏼‍♀️ Don’t know how it...,sjconjurske,"[receive, 🤷, 🏼‍, ♀, ️, know, work]","[(🤷, receive), (️, know), (know, work)]",non-informative


#### Top Tokens

In [37]:
def extract_top_token(review_tokens, tokens, top=1):
    
    mask_tokens = tokens['token'].isin(review_tokens)
    mask_informative = tokens['informative'] == 'yes'
    mask = mask_tokens & mask_informative

    rating = tokens[mask].copy()
    rating = rating.sort_values(by='count', ascending=False)

    if len(rating) == 0:
        return 'non-informative'
    elif len(rating) < top:
        return 'review contains <{} informative tokens'.format(top)
    else:
        return rating['token'].tolist()[top-1]

In [38]:
top1 = lambda x: extract_top_token(x, tokens, top=1)
top2 = lambda x: extract_top_token(x, tokens, top=2)
top3 = lambda x: extract_top_token(x, tokens, top=3)

In [39]:
df['tokenTOP1'] = df['tokens'].apply(top1)
df['tokenTOP2'] = df['tokens'].apply(top2)
df['tokenTOP3'] = df['tokens'].apply(top3)

In [40]:
df.head()

Unnamed: 0,dateCreated,productRating,comment,userName,tokens,bigrams,categories,tokenTOP1,tokenTOP2,tokenTOP3
0,2020-02-01,4,It's nice to only put one thing on my face bef...,paichyfish,"[nice, thing, face, bedtime, moisturize]",[],hydrating,moisturize,review contains <2 informative tokens,review contains <3 informative tokens
1,2020-02-01,5,Definite skin change! I'm in love with this pr...,camsheries,"[definite, skin, change, love, product, notice...","[(definite, skin), (change, skin), (difference...",usage | effect | feature,morning,mask,smooth
2,2020-02-01,5,I'm absolutely in LOVE with this product! I wi...,Kjsowers,"[absolutely, love, product, wish, size, produc...","[(product, size), (100, $), (totally, notice),...",package | effect,size,change,review contains <3 informative tokens
3,2020-02-01,5,"Scent was a little weird, kind of earthy with ...",hilmel,"[scent, little, weird, kind, earthy, faint, st...","[(little, weird), (cigarette, faint), (cigaret...",aroma | usage | effect | hydrating | feature,morning,scent,refresh
4,2020-02-01,5,Never received this so 🤷🏼‍♀️ Don’t know how it...,sjconjurske,"[receive, 🤷, 🏼‍, ♀, ️, know, work]","[(🤷, receive), (️, know), (know, work)]",non-informative,non-informative,non-informative,non-informative


In [41]:
if EXTRACT_TOP_TOKENS:
    
    df['tokenTOP1'] = df['tokens'].apply(top1)
    df['tokenTOP2'] = df['tokens'].apply(top2)
    df['tokenTOP3'] = df['tokens'].apply(top3)

    with open('pkl/5_df_top_tokens.pkl', 'wb') as file:
        pickle.dump(df, file)

else:
    with open('pkl/5_df_top_tokens.pkl', 'rb') as file:
        df = pickle.load(file)

In [42]:
if SAVE_TO_EXCEL:
    df.drop(columns=['userName', 'tokens', 'bigrams']).
    to_excel('xlsx/4_reviews_categorized.xlsx', index=False)