# Understanding Tesco Mobile reviews on Trustpilot

The scope of this project includes:
* scraping reviews from Trustpilot
* training an NLP model to predict the sentiment of a new review
* categorising reviews and tracking their scores over time

In [55]:
# import basic modules
import pandas as pd
import numpy as np 

In [57]:
# import modules
import spacy
from spacy.matcher import PhraseMatcher

AttributeError: module 'numpy.linalg.lapack_lite' has no attribute '_ilp64'

## Step 3: Categorise reviews based on their content

In [3]:
# read the reviews CSV; the three timestamp columns are parsed as datetimes
data = pd.read_csv(
    filepath_or_buffer='../data/tp_reviews_20180915_20200830.csv',
    parse_dates=['published_date', 'updated_date', 'reported_date'],
)

# preview the first rows
data.head()

Unnamed: 0,review_id,published_date,updated_date,reported_date,stars,review_text
0,5bb72dd69d37800b64319d20,2018-10-05 09:24:38+00:00,NaT,NaT,5,great service and on the one occasion i had a ...
1,5bb6969b9d37800b64317ffe,2018-10-04 22:39:23+00:00,NaT,NaT,5,Carl from the escalations team had a lovely ma...
2,5bb60cd49d37800734d8ad92,2018-10-04 12:51:32+00:00,NaT,NaT,4,Has been so straightforward since we switched ...
3,5bb2f4508c83fd0b58de76a2,2018-10-02 04:30:08+00:00,NaT,NaT,1,I made a very bad decision to sign a 12 month ...
4,5bb0b5048c83fd06e0c61921,2018-09-30 11:35:32+00:00,NaT,NaT,5,Quick advice and did the job efficiently!


In [4]:
# columns that are not needed for the categorisation step
to_drop = ['updated_date', 'reported_date']

# Build the working frame in a single, non-mutating chain: `drop` and `rename`
# each return a new DataFrame, so the raw `data` frame is left untouched and
# this cell gives the same result every time it is re-run.
df = (
    data
    .drop(columns=to_drop)
    .rename(columns={'published_date': 'published_at'})
)

# inspect
df.head()

Unnamed: 0,review_id,published_at,stars,review_text
0,5bb72dd69d37800b64319d20,2018-10-05 09:24:38+00:00,5,great service and on the one occasion i had a ...
1,5bb6969b9d37800b64317ffe,2018-10-04 22:39:23+00:00,5,Carl from the escalations team had a lovely ma...
2,5bb60cd49d37800734d8ad92,2018-10-04 12:51:32+00:00,4,Has been so straightforward since we switched ...
3,5bb2f4508c83fd0b58de76a2,2018-10-02 04:30:08+00:00,1,I made a very bad decision to sign a 12 month ...
4,5bb0b5048c83fd06e0c61921,2018-09-30 11:35:32+00:00,5,Quick advice and did the job efficiently!


In [23]:
# Initial review categories: each key is a category label and each value is the
# list of search terms assigned to that category.
categories_dict = {
    'customer service': [
        'customer service',
        'customer services',
        'customer care',
        'staff',
        'customer experience',
    ],
    'coverage': [
        'coverage',
        'signal',
        'no service',
    ],
    'internet': [
        'speed',
        '4G',
        'buffer',
        'internet',
        'data allowance',
        'data',
    ],
    'billing': [
        'upfront cost',
        'bill',
        'payment',
        'fees',
    ],
    'sim': [
        'sim only',
        'SIM',
    ],
    'payg': [
        'pay as you go',
        'topup',
        'top up',
        'top-up',
        'rocket pack',
        'rocketpack',
        'rocketpacks',
        'e-voucher',
        'payg',
    ],
    'insurance': [
        'insurance',
    ],
    'clubcard': [
        'clubcard',
        'clubcard plus',
        'club card',
    ],
}

In [49]:
# flatten categories_dict into (term, category) pairs, preserving the dict's order
pairs = [(term, cat) for cat, terms in categories_dict.items() for term in terms]

# one named Series per column of the lookup table
words = pd.Series([term for term, _ in pairs], name='term')
categories = pd.Series([cat for _, cat in pairs], name='cat')

# lookup table: one row per search term, next to the category it belongs to
categories_df = pd.DataFrame({'term': words, 'cat': categories})

# inspect
categories_df.head()

Unnamed: 0,term,cat
0,customer service,customer service
1,customer services,customer service
2,customer care,customer service
3,staff,customer service
4,customer experience,customer service


In [52]:
# Flat Python list of every search term in the lookup table (the 'term' column,
# not the category labels, despite the variable's name).
# NOTE(review): presumably this feeds the spaCy PhraseMatcher imported earlier — confirm.
cat_list = categories_df.loc[:, 'term'].tolist()