In [2]:
#!pip install transformers
#from transformers import pipeline
#!pip install emoji
import emoji
import json
import pandas as pd
from collections import Counter
import seaborn as sns
import numpy as np
import pandas as pd
import string
from matplotlib.pyplot import figure
import matplotlib.pyplot as plt

from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from nltk.tokenize import TweetTokenizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import stopwords

From the Google Play Store we have scraped reviews for grocery store delivery apps: Tesco, Morrisons, M&S, ASDA, Aldi, Sainsbury's and Waitrose. Scraping parameters were set to 300 reviews each for 1 and 2 stars, 250 each for 4 and 5 stars, and only 150 for 3 stars.

In [None]:
app_reviews_df = pd.read_csv("src/reviews.csv")
app_infos_df = pd.read_csv("src/apps.csv")

## EDA

<ol>
    <li>Check the number of reviews scraped for each company</li>
</ol>

In [None]:
apps = app_reviews_df['appId'].unique()

In [None]:
def check_reviews_per_star(df):
    """Print, for every app in ``df``, the number of reviews per star rating.

    Args:
        df: DataFrame with an ``appId`` column (dotted package id) and a
            ``score`` column (star rating).

    Prints a display name derived from the package id, the ``value_counts``
    of ``score`` for that app, and the total number of reviews. Returns None.
    """
    for app_id in df['appId'].unique():
        segments = app_id.split('.')
        if app_id == 'de.apptiv.business.android.aldi_uk':
            # Aldi's package id carries the brand in its last segment ("aldi_uk").
            display_name = segments[-1].split('_')[0].title()
        else:
            display_name = segments[1].title()
        print(display_name, "reviews per star:")
        star_counts = df.loc[df['appId'] == app_id, 'score'].value_counts()
        print(star_counts)
        print('Total # of reviews:', star_counts.sum())
        print('\n')

### Table 1 code

In [None]:
check_reviews_per_star(app_reviews_df)

From the reviews, fewer 2-, 3- and 4-star reviews were scraped for Morrisons and Waitrose due to availability; at the extremes (1 and 5 stars), however, the counts are the same as for the other apps, as expected. As we aim to extract recommendations, the 1-star reviews are of vital importance, but we proceed with caution given the lower sample size relative to the other apps.

<ol start='2'>
    <li>Check for missing values</li>
</ol>

In [None]:
for column in app_reviews_df.columns:
    print("Missing values in", column, ": {}".format(app_reviews_df[column].isnull().sum()))

In [None]:
app_reviews_df.loc[app_reviews_df['reviewCreatedVersion'] == '19.29.0'].head()

Further inspection shows that 'reviewCreatedVersion' is the version of the app the user reviewed. Therefore, we can potentially narrow down the comments by app version rather than by date. Alternatively, I can use the date as a proxy for the created version, so I will keep the rows where it is missing. The missing values in replyContent and repliedAt don't really matter, as these columns just represent whether the company replied or not, but they are worth visualising just in case.

In [None]:
app_reviews_df.loc[app_reviews_df['replyContent'].notnull()]['appId'].value_counts()

We can see that Waitrose is significantly better at replying to comments, with the next company, surprisingly, being Aldi (a budget supermarket). ASDA, on the other hand, has not replied to anyone. The next question is: are they replying to good reviews or bad reviews?

In [None]:
for app in apps:
    print(app)
    print(app_reviews_df.loc[(app_reviews_df['appId'] == app) & (app_reviews_df['replyContent'].notnull() == True)]['score'].value_counts())
    print('\n')    

Waitrose mainly responds to low reviews <= 3 stars with most of the responses going to 1 star, M&S responds mainly to 2 and 1 stars, morrisons loves the praise only 4&5 stars. Aldi no reply to 1 stars but only to 2 and 3 stars. Tesco likes praise but has responded to a (1) 2 star and (1) 3 star review. Sainsburys also likes praise with 2 out of 3 responses going to 4 and 5 stars.

### Table 2a - code

<ul>
    <li>Number of words per comment on avg.</li>
    <li>Number of sentences per comment on avg.</li>
</ul>

In [None]:
def sentence_segmentation(df):
    reviews_in_sents = []
    number_of_sents = 0
    for review in df['content']:
        sentences = nltk.sent_tokenize(review)
        number_of_sents += len(sentences)
        reviews_in_sents.append(sentences)
    return reviews_in_sents, number_of_sents

reviews_in_sents, number_of_sents = sentence_segmentation(app_reviews_df)
print(f'There is approx. {round(number_of_sents/len(reviews_in_sents),2)} sentences per review.')

In [None]:
app_reviews_df['reviews_in_sents'] = reviews_in_sents

<ol start="3">
    <li>Check the date of the latest update of each app and the number of reviews after that date.</li>
    <ul>
        <li>Tesco: <b>30 March 2022</b></li>
        <li>Morrison: <b>9 March 2022</b></li>
        <li>Marks and Spencers: <b>24 March 2022</b></li>    
        <li>ASDA: <b>29 March 2022</b></li>    
        <li>ALDI: <b>28 February 2022</b></li>    
        <li>Sainsbury's: <b>25 March 2022</b></li>    
        <li>Waitrose: <b>25 March 2022</b></li>
    </ul>
    <br>
   <li>Check what the recent changes made are.</li>
</ol>


In [None]:
# Print each app's title next to the value stored in column 44 of the app-info table.
# NOTE(review): column 44 is presumably the store's "recent changes" text — confirm
# against app_infos_df.columns and prefer selecting it by name.
# zip pairs the two columns positionally, so no manual counter is needed and the
# lookup no longer depends on the frame having a default 0..n-1 index.
for name, changes in zip(app_infos_df['title'], app_infos_df.iloc[:, 44]):
    print(name,'changes :', changes)

From the notes on the recent changes implemented in the apps, it's unclear whether the existing reviews were analysed as part of the process of creating or improving features; hence it will be difficult to know when an issue has been addressed by a new version of the app. In addition, given the dates on which the apps were last updated, we can't single out issues that are still present in the current version, due to the lack of reviews after the update date.

<ol start="5">
    <li>Check the date of the earliest review for each company.</li>
</ol>

In [None]:
# For every app, print the smallest value of the 'at' column among its reviews.
for app in app_reviews_df['appId'].unique():
    if app == 'de.apptiv.business.android.aldi_uk':
        # Aldi's package id carries the brand in its last segment ("aldi_uk").
        name = app.split('.')[-1].split('_')[0].title()
    else:
        name = app.split('.')[1].title()
    date = app_reviews_df.loc[app_reviews_df['appId'] == app, 'at'].min()
    print(name,':',date)

Given that some apps have reviews from as early as 2011, and given the lack of information on the changes made in each version, <b>I will narrow down the comments per company to include a maximum of the last 12 months.</b>

<ol start="6">
    <li>Check the comment with the most upvotes.</li>
</ol>

### Table 2b code

In [None]:
# For every app, show the review text(s) with the highest thumbsUpCount and the
# publication date of the first such review.
for app in apps:
    if app == 'de.apptiv.business.android.aldi_uk':
        name = app.split('.')[-1].split('_')[0].title()
    else:
        name = app.split('.')[1].title()
    is_app = app_reviews_df['appId'] == app
    votes = app_reviews_df.loc[is_app, 'thumbsUpCount'].max()
    top_content = app_reviews_df.loc[is_app & (app_reviews_df["thumbsUpCount"] == votes), 'content']
    review = top_content.value_counts()
    idx = top_content.index[0]
    # 'at' is a "date time" string; keep only the part before the space.
    date = app_reviews_df.loc[idx, 'at'].split(' ')[0]
    print(name)
    print('Date published:', date, '    ', votes, 'likes' )
    print(review)
    print('\n')

In the case of Tesco (January), Marks and Spencers (March), Asda (March), and Waitrose (March) the most liked comments are all as of 2022 i.e. recent issues. On the other hand, Sainsbury's (October), Aldi (November), and Morrisons (April) are all from 2021. <br><b>Further investigation required, perhaps look at the most liked comment in the last 3 or 6 months only</b>

<ol start="7">
    <li>Extend the dataframe to include: word count and sentence count.</li>
</ol>

In [None]:
def tokenise_comments(df):
    """Tokenise every review in ``df['content']`` with NLTK's TweetTokenizer.

    Args:
        df: DataFrame with a ``content`` column of review strings.

    Returns:
        A list with one list of tokens per review, in row order.
    """
    # One tokenizer instance is enough; the original rebuilt it for every review.
    tknzr = TweetTokenizer()
    return [tknzr.tokenize(comment) for comment in df['content']]

In [None]:
def remove_punctuations(toks_words):
    """Drop single-character punctuation and digit tokens from tokenised reviews.

    Args:
        toks_words: iterable of token lists (one list per review).

    Returns:
        A list of token lists in the same order, without any token that is
        exactly one ``string.punctuation`` or ``string.digits`` character.
    """
    unwanted = set(string.punctuation) | set(string.digits)
    return [[token for token in tokens if token not in unwanted] for tokens in toks_words]

In [None]:
app_reviews_df['sentence_count'] = [len(review) for review in app_reviews_df['reviews_in_sents']]
app_reviews_df['reviews_in_words'] = tokenise_comments(app_reviews_df)
app_reviews_df['word_count'] = [len(review) for review in remove_punctuations(app_reviews_df['reviews_in_words'])]

In [None]:
app_reviews_df.describe()

In [None]:
# Three side-by-side distributions: thumbs-up count, sentences per review and
# words per review, each drawn on its own axis of a single 1x3 figure.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it) — confirm against the installed version.
f, ax = plt.subplots(1,3,figsize=(12,4), sharey=False)
sns.distplot(app_reviews_df.thumbsUpCount, ax=ax[0])
ax[0].set_title("Thumb Up Count Distribution")
ax[0].set_xlabel("Number of Thumbs up")
sns.distplot(app_reviews_df.sentence_count, ax=ax[1])
ax[1].set_title("Sentence Count Distribution")
ax[1].set_xlabel("Number of Sentences")
sns.distplot(app_reviews_df.word_count, ax=ax[2])
ax[2].set_title("Word Count Distribution")
ax[2].set_xlabel("Number of Words")
plt.tight_layout()
plt.show()

The only abnormality was the comment with 0 words and 1 sentence, which turns out to be just a period.

In [None]:
app_reviews_df.loc[app_reviews_df['word_count'] == 0]

<ol start="8">
    <li>Count of review per app version.</li>
</ol>

In [None]:
app_reviews_df['ymd'] = [date.split(' ')[0] for date in app_reviews_df['at']]

In [None]:
app_reviews_df['time'] =[date.split(' ')[1] for date in app_reviews_df['at']]

In [None]:
app_reviews_df.loc[app_reviews_df['time'] >'00:00']

In [None]:
app_reviews_df.loc[app_reviews_df['ymd'] >'2022-03-30']

In [None]:
for app in apps:
    print(app,':', len(app_reviews_df.loc[app_reviews_df['appId'] == app]['reviewCreatedVersion'].value_counts()))

The apps seem to be updated very frequently with over 429 versions. Tesco has had 16 versions, morrisons 89, M&S 62, Aldi 73, Sainsburys 74, Waitrose 72.

### Make Sentiment Analysis

Make label of pos, neg, neutral based on score

In [None]:
# Map star ratings to a sentiment label: below 3 -> 'neg', above 3 -> 'pos',
# anything else -> 'neu'.
scores = [
    'neg' if score < 3 else 'pos' if score > 3 else 'neu'
    for score in app_reviews_df['score']
]

app_reviews_df['label'] = scores

In [None]:
app_reviews_df['label'].value_counts()

In [None]:
def get_specific_speech(comments, pos):
    """Keep, for each review, only the tokens whose POS tag is listed in ``pos``.

    Args:
        comments: iterable of token lists (one list per review).
        pos: collection of NLTK POS tags to keep, e.g. ``['NN']``.

    Returns:
        A list with one list per review holding the matching tokens, in order.
        Reviews with no matching token yield an empty list.
    """
    words = []
    for review in comments:
        # Punctuation and digits are dropped before tagging.
        tokens = remove_stopwords(review)
        # The per-review list is built once per review. The original reset it
        # for every token, so at most the last token of a review survived.
        words.append([word for word, pos_code in nltk.pos_tag(tokens) if pos_code in pos])
    return words

def remove_stopwords(toks_words, type_of_gram = 'none'):
    """Filter a token list against a small hand-picked stop list.

    Args:
        toks_words: iterable of tokens.
        type_of_gram: ``'bi'`` or ``'tri'`` selects the matching word list; any
            other value removes only ``'..'`` and ``','``.

    Returns:
        List of tokens that are not in the selected list and are not a single
        punctuation or digit character.
    """
    if type_of_gram == 'tri':
        extra = ['and', 'to', 'the']
    elif type_of_gram == 'bi':
        extra = ['to', 'the', 'have', 'on', 'in', 'is', 'this', 'and', 'i',
                 'you', 'of', 'be', 'for', 'my', 'that', 'a', 'very', 'there']
    else:
        extra = ['..', ',']
    blocked = set(extra) | set(string.punctuation) | set(string.digits)
    return [token for token in toks_words if token not in blocked]

def make_dist_plot(column, number_of_words):
    """Plot the frequency of the most common tokens across all reviews.

    Args:
        column: iterable of token lists (one list per review).
        number_of_words: how many of the most frequent tokens to plot.

    Returns:
        Whatever ``FreqDist.plot`` returns.
    """
    token_freqs = FreqDist(token for tokens in column for token in tokens)
    plot_title = f'Frequency distribution for {number_of_words} most common tokens in our collection.'
    return token_freqs.plot(number_of_words, title=plot_title)

Make review_in_words lowercase

In [None]:
make_dist_plot(app_reviews_df['reviews_in_words'], 10)

In [None]:
# Lower-case every token of every review and store the result back in the column.
all_words = [
    [word.lower() for word in review]
    for review in app_reviews_df['reviews_in_words']
]

app_reviews_df['reviews_in_words'] = all_words

Make label using VADER

In [None]:
def classifySentenceVADER(reviews_in_sens):
    """Label each review from the sum of its sentences' VADER compound scores.

    Args:
        reviews_in_sens: iterable of reviews, each a list of sentence strings.

    Returns:
        List of labels: ``'pos'`` for a positive total, ``'neg'`` for a negative
        total and ``'neu'`` for a total of exactly zero.
    """
    analyser = SIA()
    labels = []
    for sentences in reviews_in_sens:
        # Accumulate sentence by sentence so float rounding stays unchanged.
        total_compound = 0
        for sentence in sentences:
            total_compound += analyser.polarity_scores(sentence)['compound']
        if total_compound > 0:
            labels.append('pos')
        elif total_compound < 0:
            labels.append('neg')
        elif total_compound == 0.0:
            labels.append('neu')
    return labels

In [None]:
app_reviews_df['vader_label'] = classifySentenceVADER(app_reviews_df['reviews_in_sents'])

In [None]:
app_reviews_df.loc[app_reviews_df['label'] != app_reviews_df['vader_label']]['vader_label'].value_counts()

We can see that the VADER classifier is not the best: whilst it classifies a large number of reviews properly, it has clear discrepancies with the labels set by the heuristic of >3 stars positive, <3 stars negative and 3 stars neutral. There are 1243 reviews that it classifies as neutral which are not neutral in terms of stars; a closer look shows that in multiple cases the classifier is unable to determine the sentiment, so it gives a score of 0.0, i.e. neutral.

Using TextBlob to check performance and compare VADER

In [None]:
from textblob import TextBlob
pol = lambda x: TextBlob(x).sentiment.polarity
app_reviews_df['textblob_label'] = app_reviews_df['content'].apply(pol)
app_reviews_df['textblob_label'] = ['pos' if label > 0 else 'neg' if label < 0 else 'neu' for label in app_reviews_df['textblob_label']]

Using HuggingFace BERT Base model to check performance and compare VADER and TextBlob

In [None]:
# `pipeline` is imported here because the import at the top of the notebook is
# commented out; without it this cell fails with a NameError on a fresh kernel.
from transformers import pipeline

specific_model = pipeline(model="nlptown/bert-base-multilingual-uncased-sentiment")

In [None]:
bert_score = []
for review in app_reviews_df['content']:
    star = specific_model(review)[0]['label'].split(' ')[0]
    bert_score.append(star)

In [None]:
app_reviews_df['bert_score'] = bert_score

In [None]:
# Convert the BERT star prediction (stored as text) into the same three-way label
# used for the real star ratings: below 3 -> 'neg', above 3 -> 'pos', else 'neu'.
scores = [
    'neg' if stars < 3 else 'pos' if stars > 3 else 'neu'
    for stars in (int(score) for score in app_reviews_df['bert_score'])
]

app_reviews_df['bert_label'] = scores

Significant improvement in using BERT to classify the reviews, nearly cuts the mislabeling between pos/neg/neutral by half. Served as a training for use of future unsupervised problems.

In [None]:
app_reviews_df.loc[app_reviews_df['replyContent'].notna()]['replyContent']

Checkpoint

In [None]:
app_reviews_df.to_csv('src/reviews_1.csv', index=None, header=True)

Explore extracting topics from the reviews by selecting nouns

In [None]:
get_specific_speech(app_reviews_df['reviews_in_words'], ['NN'])

Not enough nouns

Normalise words

In [None]:
# Adjacent token pairs that should be treated as a single word.
COMPOUND_PAIRS = {
    ('check', 'out'), ('club', 'card'), ('log', 'out'),
    ('log', 'in'), ('spark', 'card'), ('out', 'stock'),
}


def merge_compound_tokens(tokens, pairs=COMPOUND_PAIRS):
    """Return a copy of ``tokens`` with each listed adjacent pair joined.

    Scans left to right; when ``(tokens[i], tokens[i + 1])`` is in ``pairs`` the
    two tokens are replaced by their concatenation (e.g. 'log', 'in' -> 'login').

    The original in-place loop skipped a pair starting at position 0 (its
    "previous token" branch replaced a one-element slice with itself) and
    modified the list it was enumerating. Building a new list avoids both.
    """
    merged = []
    position = 0
    while position < len(tokens):
        is_pair = (
            position + 1 < len(tokens)
            and (str(tokens[position]), str(tokens[position + 1])) in pairs
        )
        if is_pair:
            merged.append(''.join(tokens[position:position + 2]))
            position += 2
        else:
            merged.append(tokens[position])
            position += 1
    return merged


for review in app_reviews_df['reviews_in_words']:
    # Slice assignment keeps the same list object, so the column is updated in place
    # exactly as before.
    review[:] = merge_compound_tokens(review)

Lemmatization

In [None]:
ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
POS_LIST = [NOUN, VERB, ADJ, ADV]

In [None]:
lemmatizer = WordNetLemmatizer()

def nltk_pos_tagger(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; any other tag returns None.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    return None

def lemmatize_sentence(sentence):
    """Tokenise, POS-tag and lemmatise a sentence.

    Args:
        sentence: raw sentence string.

    Returns:
        List of tokens; tokens whose tag maps to a WordNet POS are lemmatised
        with that POS, all others are returned unchanged.
    """
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
    # The leftover debug print of the tagged tokens was removed so that calling
    # this function no longer writes to stdout.
    lemmatized_sentence = []
    for word, nltk_tag in nltk_tagged:
        tag = nltk_pos_tagger(nltk_tag)
        if tag is None:
            lemmatized_sentence.append(word)
        else:
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return lemmatized_sentence

print(lemmatize_sentence("I am voting for that politician in this NLTK Lemmatization example sentence"))

In [None]:
# Lemmatise the already-tokenised reviews: tag each token, map the tag to a WordNet
# POS, and lemmatise only the tokens for which such a POS exists.
lemmatized_sentence = []
for review in app_reviews_df['reviews_in_words']:
    tagged_tokens = ((token, nltk_pos_tagger(nltk_tag)) for token, nltk_tag in nltk.pos_tag(review))
    review_lem = [
        token if wn_pos is None else lemmatizer.lemmatize(token, wn_pos)
        for token, wn_pos in tagged_tokens
    ]
    lemmatized_sentence.append(review_lem)

app_reviews_df['reviews_lemmatized'] = lemmatized_sentence

In [None]:
app_reviews_df.head()

Words that are nouns

In [None]:
get_specific_speech(app_reviews_df['reviews_lemmatized'], ['NN'])

### Split df by company

In [None]:
def split_df(df, column, value):
    """Return an independent copy of the rows of ``df`` where ``df[column] == value``.

    The explicit ``.copy()`` means callers can add or modify columns on the
    result without pandas raising a SettingWithCopyWarning.
    """
    new_df = df[df[column] == value].copy()
    return new_df

def split_per_app(df, column_with_names):
    """Split ``df`` into one DataFrame per distinct value of ``column_with_names``.

    Returns:
        List of DataFrames, ordered by first appearance of each value in the
        column (the order of ``Series.unique()``).
    """
    return [split_df(df, column_with_names, app) for app in df[column_with_names].unique()]

In [None]:
# One DataFrame per app, in the order the appIds first appear in app_reviews_df.
split_reviews = split_per_app(app_reviews_df, 'appId')

# NOTE(review): the positions below assume the apps appear in the data in exactly
# this order (Tesco first, Waitrose last) — verify against
# app_reviews_df['appId'].unique() before relying on these names.
tesco_reviews = split_reviews[0]
morrisons_reviews = split_reviews[1]
marksandspencer_reviews = split_reviews[2]
asda_reviews = split_reviews[3]
aldi_reviews = split_reviews[4]
sainsburys_reviews = split_reviews[5]
waitrose_reviews = split_reviews[6]

In [None]:
# Give every per-app frame a fresh 0..n-1 index. Rebinding the result (instead of
# inplace=True) keeps the cell idempotent; Tesco is included so that all seven
# frames are treated the same way.
tesco_reviews = tesco_reviews.reset_index(drop=True)
morrisons_reviews = morrisons_reviews.reset_index(drop=True)
marksandspencer_reviews = marksandspencer_reviews.reset_index(drop=True)
asda_reviews = asda_reviews.reset_index(drop=True)
aldi_reviews = aldi_reviews.reset_index(drop=True)
sainsburys_reviews = sainsburys_reviews.reset_index(drop=True)
waitrose_reviews = waitrose_reviews.reset_index(drop=True)

Distribution of words

In [None]:
make_dist_plot(tesco_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(morrisons_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(marksandspencer_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(asda_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(aldi_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(sainsburys_reviews['reviews_lemmatized'], 25)

In [None]:
make_dist_plot(waitrose_reviews['reviews_lemmatized'], 25)

Bigrams and trigram

In [None]:
from nltk import ngrams

def get_ngram(lemma_column, number_of_words, n):
    """Build the list of n-grams over all tokens in ``lemma_column``.

    Args:
        lemma_column: either a flat sequence of token strings or a sequence of
            token lists (which is flattened first).
        number_of_words: unused; kept so existing callers keep working.
        n: size of the n-grams.

    Returns:
        List of n-tuples in order of occurrence; an empty list for empty input.
    """
    # Guard empty input: the original indexed element 0 unconditionally.
    if len(lemma_column) == 0:
        return []
    # Peek at the first element by iteration so label-indexed containers
    # (e.g. a Series whose index does not contain 0) also work.
    first = next(iter(lemma_column))
    if not isinstance(first, str):
        flat_list = [item for sublist in lemma_column for item in sublist]
    else:
        flat_list = lemma_column
    # The frequency table the original built here was never used, so it is gone.
    return list(ngrams(flat_list, n))

In [None]:
def remove_stopwords_2(lemma_column, type_of_gram):
    """Flatten tokenised reviews and drop stop words for n-gram extraction.

    Args:
        lemma_column: iterable of token lists (one list per review).
        type_of_gram: ``'Bi'`` or ``'Tri'`` selects the matching stop list; any
            other value removes only ``'..'``, ``','`` and ``'...'``.

    Returns:
        One flat list of the remaining tokens, in original order. Single
        punctuation and digit characters are always removed.
    """
    if type_of_gram == 'Tri':
        new_stopwords = ['and', 'to', 'the', 'i', 'you', 'this', 'a', 'me', 'be', 'very', 'it', '...', '..', ',',
                        'for','of', 'easy', 'quick']
    elif type_of_gram == 'Bi':
        new_stopwords = ['to', 'the', 'have', 'on', 'in', 'is', 'this', 'and', 'i',
                 'you', 'of', 'be', 'for', 'my', 'that', 'a', 'very', 'there', '...', '..', ',', 'do', 'not',
                        'it', "doesn't", 'let', 'me', 'every', 'longer', 'even', 'though', 'too', 'an', 'use',
                        'rather', 'through', 'your', 'more', 'now', 'keep', 'but', 'at','all',"can't", 'with',
                        "won't",'when', 'sort', 'please', 'need', 'app', 'easy', 'quick']
    else:
        new_stopwords = ['..', ',', '...']
    blocked = set(new_stopwords) | set(string.punctuation) | set(string.digits)
    return [token for tokens in lemma_column for token in tokens if token not in blocked]

In [None]:
def ngram_by_label(df, stopwords, number_of_grams, n_grams,  label, color):
    """Plot the most frequent n-grams among reviews carrying a given label.

    Args:
        df: reviews DataFrame with ``label`` and ``reviews_lemmatized`` columns.
        stopwords: gram-type name (``'Bi'`` / ``'Tri'``); selects the stop list
            and is also used in the plot title and axis label.
        number_of_grams: how many of the most frequent n-grams to show.
        n_grams: size of the n-grams.
        label: value of ``df['label']`` to keep.
        color: bar colour.

    Draws a horizontal bar chart and saves the current figure to 'graph.png'.
    """
    labelled_reviews = df.loc[df['label'] == label]
    tokens = remove_stopwords_2(labelled_reviews['reviews_lemmatized'], stopwords)
    gram_counts = pd.Series(get_ngram(tokens, number_of_grams, n_grams)).value_counts()
    top_grams = gram_counts.head(number_of_grams)
    # Title and labels are set first, as before; the bar plot then draws onto the
    # figure these pyplot calls created.
    plt.title(f'{number_of_grams} Most Frequently Occuring {stopwords}grams')
    plt.ylabel(f'{stopwords}gram')
    plt.xlabel('# of Occurances')
    top_grams.sort_values().plot.barh(color=color, width=.9, figsize=(12, 8))
    plt.savefig('graph.png')
### Figure 3 Code

## Tesco's bi/trigrams

In [None]:
ngram_by_label(tesco_reviews, 'Bi', 10, 2, 'neg', 'blue')

In [None]:
ngram_by_label(tesco_reviews, 'Tri', 10, 3, 'neg', 'blue')

## Morrison's bi/trigrams

In [None]:
ngram_by_label(morrisons_reviews, 'Bi', 10, 2, 'pos', 'green')

In [None]:
ngram_by_label(morrisons_reviews, 'Tri', 10, 3, 'pos', 'green')

## M&S's bi/trigrams

In [None]:
ngram_by_label(marksandspencer_reviews, 'Bi', 10, 2, 'pos', 'grey')

In [None]:
ngram_by_label(marksandspencer_reviews, 'Tri', 10, 3, 'pos', 'grey')

## Asda's bi/trigrams

In [None]:
ngram_by_label(asda_reviews, 'Bi', 10, 2, 'pos', 'lightgreen')

In [None]:
ngram_by_label(asda_reviews, 'Tri', 10, 3, 'pos', 'lightgreen')

## ALDI's bi/trigrams

In [None]:
ngram_by_label(aldi_reviews, 'Bi', 10, 2, 'pos', 'lightblue')

In [None]:
ngram_by_label(aldi_reviews, 'Tri', 10, 3, 'pos', 'lightblue')

## Sainsbury's bi/trigrams

In [None]:
ngram_by_label(sainsburys_reviews, 'Bi', 3, 2, 'neg', 'orange')

In [None]:
ngram_by_label(sainsburys_reviews, 'Tri', 4, 3, 'neg', 'orange')

## Waitrose's bi/trigrams

In [None]:
ngram_by_label(waitrose_reviews, 'Bi', 10, 2, 'pos', 'lightgreen')

In [None]:
ngram_by_label(waitrose_reviews, 'Tri', 10, 3, 'pos', 'lightgreen')

Checkpoint

In [None]:
tesco_reviews.to_csv('src/tesco_reviews_2.csv', index=None, header=True)
morrisons_reviews.to_csv('src/morrisons_reviews_2.csv', index=None, header=True)
marksandspencer_reviews.to_csv('src/marksandspencer_reviews_2.csv', index=None, header=True)
asda_reviews.to_csv('src/asda_reviews_2.csv', index=None, header=True)
aldi_reviews.to_csv('src/aldi_reviews_2.csv', index=None, header=True)
sainsburys_reviews.to_csv('src/sainsburys_reviews_2.csv', index=None, header=True)
waitrose_reviews.to_csv('src/waitrose_reviews_2.csv', index=None, header=True)

### Figure 4 code - Get wordcloud of adjectives

In [None]:
filter_for_wordcloud = get_specific_speech(sainsburys_reviews['reviews_lemmatized'], ['JJ', 'JJR', 'JJS'])
filter_for_wordcloud = [word for review in filter_for_wordcloud for word in review ]

In [None]:
fdist_filtered = FreqDist(filter_for_wordcloud)
print(fdist_filtered.most_common(30))
fdist_filtered.plot(30,title='Frequency distribution (excluding stopwords and punctuation)')

In [None]:
simple_frequencies_dict = Counter(fdist_filtered)

In [None]:
# The Sainsbury's logo is used both as the shape mask and as the colour source.
mask = np.array(Image.open("src/Sainsbury’s-logo-large.jpeg"))
# No `stopwords` argument is passed: the name `stopwords` in this notebook is the
# NLTK corpus reader rather than a collection of words, and WordCloud does not
# apply stop words when the cloud is built with generate_from_frequencies.
wordcloud = WordCloud(background_color="white", width=mask.shape[1],
               height=mask.shape[0], max_words=1000, mask=mask).generate_from_frequencies(simple_frequencies_dict)
# create coloring from image
image_colors = ImageColorGenerator(mask)
plt.figure(figsize=[5,5])
# Recolour the cloud built above (the original referenced an undefined `wc`).
plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")
# store to file
plt.savefig("news1.png", format="png") 
plt.show()

### Split date to year/month/day

In [None]:
def split_ymd(dates):
    """Split 'YYYY-MM-DD' strings into parallel integer lists.

    Args:
        dates: iterable of date strings whose first three '-'-separated parts
            are year, month and day.

    Returns:
        Tuple ``(years, months, days)`` of integer lists, in input order.
    """
    years, months, days = [], [], []
    for date in dates:
        parts = date.split('-')
        years.append(int(parts[0]))
        months.append(int(parts[1]))
        days.append(int(parts[2]))
    return years, months, days


# Same processing for every per-app frame. Sainsbury's stays last so the
# module-level `year`, `month` and `day` lists end up holding its values, as before.
for reviews_df in (tesco_reviews, morrisons_reviews, marksandspencer_reviews,
                   asda_reviews, aldi_reviews, waitrose_reviews, sainsburys_reviews):
    year, month, day = split_ymd(reviews_df['ymd'])
    reviews_df['year'] = year
    reviews_df['month'] = month
    reviews_df['day'] = day

In [None]:
# Count of Tesco reviews per label for month 3 (March).
# catplot is the current name of the removed factorplot, and x is passed by keyword
# because newer seaborn no longer accepts it positionally.
# hue_order pins which label gets which palette colour: neg=red, neu=yellow, pos=green.
with sns.axes_style('white'):
    g = sns.catplot(x="month", data=tesco_reviews, aspect=4.0, kind='count',
                    hue='label', hue_order=['neg', 'neu', 'pos'], order=[3],
                    palette=['red','yellow','green'])
    g.set_axis_labels("", "Number of Reviews")
    g.set_xticklabels(["March"])

In [None]:
tesco_reviews.loc[tesco_reviews['month'] == 3]['label'].value_counts()

In [None]:
# Count of Sainsbury's reviews per label for month 3 (March).
# catplot is the current name of the removed factorplot, and x is passed by keyword
# because newer seaborn no longer accepts it positionally.
# hue_order pins which label gets which palette colour: neg=red, neu=yellow, pos=green.
with sns.axes_style('white'):
    g = sns.catplot(x="month", data=sainsburys_reviews, aspect=4.0, kind='count',
                    hue='label', hue_order=['neg', 'neu', 'pos'], order=[3],
                    palette=['red','yellow','green'])
    g.set_axis_labels("", "Number of Reviews")
    g.set_xticklabels(["March"])

### Figure 1 code

In [None]:
# Seaborn color palette to plot pie chart
colors = sns.color_palette('pastel')

# create pie chart using matplotlib
def cm_to_inch(value):
    """Convert a length in centimetres to inches."""
    return value/2.54

explode = (0.1, 0, 0)  

# value_counts() orders its result by frequency, so the counts are re-ordered to
# pos / neg / neu to match the fixed slice labels and colours below.
# NOTE(review): month == 3 matches March of every year in the data — confirm that
# is intended.
march_counts = (sainsburys_reviews.loc[sainsburys_reviews['month'] == 3, 'label']
                .value_counts()
                .reindex(['pos', 'neg', 'neu'], fill_value=0))

fig = plt.figure(figsize=(6, 6))
plt.title("Sainsbury's Reviews"+"\n", fontsize = 16)
_, _, autotexts = plt.pie(march_counts, 
                          labels=['Positive', 'Negative', 'Neutral'], colors=['cornflowerblue', 'tomato', 'cornsilk'], 
                          autopct='%.0f%%', textprops={'fontsize': 12}, explode=explode)
for autotext in autotexts:
    autotext.set_color('black')
# Save after styling so the file matches what is displayed.
plt.savefig('graph.png')
plt.show()

In [None]:
competition_df = pd.concat([tesco_reviews, morrisons_reviews, marksandspencer_reviews, asda_reviews, aldi_reviews, waitrose_reviews], ignore_index=True, sort=False) 

In [None]:
# Seaborn color palette to plot pie chart
colors = sns.color_palette('pastel')

# create pie chart using matplotlib
def cm_to_inch(value):
    """Convert a length in centimetres to inches."""
    return value/2.54

explode = (0.1, 0, 0)  

# value_counts() orders its result by frequency, so the counts are re-ordered to
# pos / neg / neu to match the fixed slice labels and colours below.
# NOTE(review): month == 3 matches March of every year in the data — confirm that
# is intended.
march_counts = (competition_df.loc[competition_df['month'] == 3, 'label']
                .value_counts()
                .reindex(['pos', 'neg', 'neu'], fill_value=0))

fig = plt.figure(figsize=(6, 6))
plt.title("Competition's Reviews"+"\n", fontsize = 16)
_, _, autotexts = plt.pie(march_counts, 
                          labels=['Positive', 'Negative', 'Neutral'], colors=['cornflowerblue', 'tomato', 'cornsilk'], 
                          autopct='%.0f%%', textprops={'fontsize': 12}, explode=explode)
for autotext in autotexts:
    autotext.set_color('black')
# Save after styling so the file matches what is displayed.
plt.savefig('graph.png')
plt.show()

In [None]:
# Seaborn color palette to plot pie chart
colors = sns.color_palette('pastel')

# create pie chart using matplotlib
def cm_to_inch(value):
    """Convert a length in centimetres to inches."""
    return value/2.54

explode = (0.1, 0, 0)  

# value_counts() orders its result by frequency, so the counts are re-ordered to
# pos / neg / neu to match the fixed slice labels and colours below.
# NOTE(review): month == 3 matches March of every year in the data — confirm that
# is intended.
march_counts = (tesco_reviews.loc[tesco_reviews['month'] == 3, 'label']
                .value_counts()
                .reindex(['pos', 'neg', 'neu'], fill_value=0))

fig = plt.figure(figsize=(6, 6))
plt.title("Tesco's Reviews"+"\n", fontsize = 16)
_, _, autotexts = plt.pie(march_counts, 
                          labels=['Positive', 'Negative', 'Neutral'], colors=['cornflowerblue', 'tomato', 'cornsilk'], 
                          autopct='%.0f%%', textprops={'fontsize': 12}, explode=explode)
for autotext in autotexts:
    autotext.set_color('black')
# Save after styling so the file matches what is displayed.
plt.savefig('graph.png')
plt.show()

In [None]:
sainsburys_reviews.loc[sainsburys_reviews['month'] == 3]['label'].value_counts()

In [None]:
tesco_reviews.describe()

In [None]:
len(tesco_reviews.loc[(tesco_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
morrisons_reviews.describe()

In [None]:
len(morrisons_reviews.loc[(morrisons_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
marksandspencer_reviews.describe()

In [None]:
len(marksandspencer_reviews.loc[(marksandspencer_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
asda_reviews.describe()

In [None]:
len(asda_reviews.loc[(asda_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
aldi_reviews.describe()

In [None]:
len(aldi_reviews.loc[(aldi_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
sainsburys_reviews.describe()

In [None]:
len(sainsburys_reviews.loc[(sainsburys_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

In [None]:
waitrose_reviews.describe()

In [None]:
len(waitrose_reviews.loc[(waitrose_reviews['year'] == 2022)]['reviewCreatedVersion'].value_counts())

### Accuracy of models at labelling

In [None]:
print(len(sainsburys_reviews.loc[(sainsburys_reviews['label'] == sainsburys_reviews['bert_label'])])/len(sainsburys_reviews))

In [None]:
print(len(sainsburys_reviews.loc[(sainsburys_reviews['label'] == sainsburys_reviews['vader_label'])])/len(sainsburys_reviews))

In [None]:
print(len(sainsburys_reviews.loc[(sainsburys_reviews['label'] == sainsburys_reviews['textblob_label'])])/len(sainsburys_reviews))

In [None]:
sainsburys_reviews

### Figure 3a and 3b Code

In [None]:
sainsburys_reviews.groupby(['day','label'])['label'].count().unstack().plot(legend=True, color = ['red', 'orange', 'green'])
plt.title("Reviews submitted to Sainsbury's")
plt.xlabel('Day of the month')
plt.ylabel('Number of reviews')
plt.xticks(range(1,32, 2))
plt.savefig('Sainsburys_reviews.png')
plt.show()

In [None]:
competition_df.groupby(['day','label'])['label'].count().unstack().plot(legend=True, color = ['red', 'orange', 'green'])
plt.title('Reviews submitted to competitors')
plt.xlabel('Day of the month')
plt.ylabel('Number of reviews')
plt.xticks(range(1,32, 3))
plt.savefig('competition_reviews_per_stars.png')
plt.show()