In [43]:
import json
import pandas as pd
from tqdm import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter

from google_play_scraper import Sort, reviews, app

import nltk
import pandas as pd
from sklearn.utils import shuffle
from nltk.tokenize import word_tokenize, sent_tokenize, wordpunct_tokenize, regexp_tokenize, TweetTokenizer
from nltk.probability import FreqDist

from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn

From the Google Play Store we scraped reviews for grocery delivery apps: Tesco, Morrisons, M&S, ASDA, Aldi, Sainsbury's and Waitrose. The scraper was set to collect up to 250 reviews for each of the 1-, 2-, 4- and 5-star ratings and up to 150 reviews for the 3-star rating.

In [45]:
# Load the two CSV inputs from the working directory:
# reviews.csv -> individual reviews, apps.csv -> per-app metadata.
app_reviews_df = pd.read_csv(filepath_or_buffer="reviews.csv")
app_infos_df = pd.read_csv(filepath_or_buffer="apps.csv")

## EDA

<ol>
    <li>Check the number of reviews scraped for each company</li>
</ol>

In [52]:
# List the distinct app ids present in the reviews table.
app_reviews_df['appId'].unique()

array(['com.tesco.grocery.view', 'com.morrisons.atm.mobile.android',
       'com.marksandspencer.app', 'com.asda.android',
       'de.apptiv.business.android.aldi_uk', 'com.sainsburys.gol',
       'com.waitrose.groceries'], dtype=object)

In [65]:
def check_reviews_per_star(df):
    """Print, for every app in ``df``, how many reviews it has per star rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``appId`` column and a ``score`` column.

    Returns
    -------
    None
        Nothing is returned; for each app a header line
        ``"<Name> reviews per star:"`` is printed, followed by the
        ``value_counts()`` of that app's ``score`` values.

    Notes
    -----
    Apps are reported in order of first appearance in ``df``. Rows whose
    ``appId`` is missing are skipped (pandas drops NaN group keys by default).
    """
    # Package ids whose second dotted segment is not the brand name.
    name_overrides = {'de.apptiv.business.android.aldi_uk': 'Aldi'}

    # Single pass over the frame; sort=False keeps first-appearance order,
    # matching what iterating over df['appId'].unique() would give.
    for app_id, scores in df.groupby('appId', sort=False)['score']:
        if app_id in name_overrides:
            name = name_overrides[app_id]
        else:
            segments = str(app_id).split('.')
            # Normally the brand is the second segment ("com.<brand>...");
            # fall back to the whole id when it contains no dot at all.
            name = (segments[1] if len(segments) > 1 else segments[0]).title()
        print(name, "reviews per star:")
        print(scores.value_counts())

In [66]:
# Print the per-star review counts for every app in the reviews table.
check_reviews_per_star(app_reviews_df)

Tesco reviews per star:
1    250
2    250
4    250
5    250
3    150
Name: score, dtype: int64
Morrisons reviews per star:
1    250
5    250
2     89
4     87
3     84
Name: score, dtype: int64
Marksandspencer reviews per star:
1    250
2    250
4    250
5    250
3    150
Name: score, dtype: int64
Asda reviews per star:
1    250
2    250
4    250
5    250
3    150
Name: score, dtype: int64
Aldi reviews per star:
1    250
2    250
4    250
5    250
3    150
Name: score, dtype: int64
Sainsburys reviews per star:
1    250
2    250
4    250
5    250
3    150
Name: score, dtype: int64
Waitrose reviews per star:
1    250
4    250
5    250
3    130
2    115
Name: score, dtype: int64


The counts show that fewer reviews were available for Morrisons (2, 3 and 4 stars) and Waitrose (2 and 3 stars) than the scraping limits allowed. At the extremes (1 and 5 stars), however, both apps reached the same 250 reviews as the others, as expected. Since we aim to extract recommendations, the 1-star reviews are of vital importance; for the middle ratings of these two apps we proceed with caution, given the smaller sample sizes relative to the other apps.

<ol start="2">
    <li>Check the date of the latest update of each app and the number of reviews posted after that date.</li>
</ol>

In [68]:
# List the columns available in the per-app metadata table.
app_infos_df.columns

Index(['title', 'description', 'descriptionHTML', 'summary', 'summaryHTML',
       'installs', 'minInstalls', 'score', 'ratings', 'reviews', 'histogram',
       'price', 'free', 'currency', 'sale', 'saleTime', 'originalPrice',
       'saleText', 'offersIAP', 'inAppProductPrice', 'size', 'androidVersion',
       'androidVersionText', 'developer', 'developerId', 'developerEmail',
       'developerWebsite', 'developerAddress', 'privacyPolicy',
       'developerInternalID', 'genre', 'genreId', 'icon', 'headerImage',
       'screenshots', 'video', 'videoImage', 'contentRating',
       'contentRatingDescription', 'adSupported', 'containsAds', 'released',
       'updated', 'version', 'recentChanges', 'recentChangesHTML',
       'editorsChoice', 'similarApps', 'moreByDeveloper', 'appId', 'url'],
      dtype='object')

In [71]:
# Show the 'androidVersion' value recorded for each app.
# NOTE(review): this step is about the date of the latest app update; the
# 'updated' column in app_infos_df looks like the relevant field — confirm.
app_infos_df['androidVersion']

0    7.0
1    5.0
2    6.0
3    5.0
4    5.0
5    5.0
6    6.0
Name: androidVersion, dtype: float64