# imports

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time

import tqdm.notebook as tq
import spacy



In [None]:
spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


# Scraping

In [None]:


def trustpilot_scraper(PATH: str, n_pages):

    #Lists
    body = []
    heading = []
    rating = []
    location = []
    author = []
    date = []

    #Website Load
    page = "{}?page=".format(PATH)

    for page_number in tqdm(range(1, n_pages+1)):
        url = "{x}{y}".format(x = page, y = page_number)
        req = requests.get(url)

        time.sleep(2)
        soup = BeautifulSoup(req.text, 'html.parser')

        #initial reviews
        reviews_raw = soup.find("script", id = "__NEXT_DATA__").string
        reviews_raw = json.loads(reviews_raw)
        rev = reviews_raw["props"]["pageProps"]["reviews"]

        #get reviews into df
        for i in tqdm(range(len(rev))):
            instance = rev[i]
            body_ = GoogleTranslator(source='auto').translate(instance["text"], dest='en')
            heading_ = GoogleTranslator(source='auto').translate(instance["title"], dest='en')
            rating_ = instance["rating"]
            location_ = instance["consumer"]["countryCode"]
            author_ = instance["consumer"]["displayName"]
            date_ = pd.to_datetime(instance["dates"]["publishedDate"]).strftime("%Y-%m-%d")

            #append to the list
            body.append(body_)
            heading.append(heading_)
            rating.append(rating_)
            location.append(location_)
            author.append(author_)
            date.append(date_)

    df = {
    'Date' : date,
    'Author' : author,
    'Body' : body,
    'Heading' : heading,
    'Rating' : rating,
    'Location' : location
    }

    rev_df = pd.DataFrame(df)
    rev_df.sort_values(by = "Date", axis = 0, inplace = True, ignore_index = True)
    rev_df.drop_duplicates(subset=["Body"],keep= 'first', inplace= True)
    rev_df.reset_index(drop = True, inplace = True)

    return rev_df

In [None]:
df = trustpilot_scraper('https://dk.trustpilot.com/review/www.alka.dk',50)
df

# Expand data

In [None]:
def getClasses(rating):
    
    """Function returns if a rating classifies as positive or negative. 
    Neutral is also included"""

    #rating is 2 or less, return negative
    if int(rating) <= 2:
        return 'negative'

    #rating is 3 or less, return neutral
    elif int(rating) == 3:
        return 'neutral'

    #rating is 4 or more, return positive
    else:
        return 'positive'

In [None]:
#call function to get classes
df['class'] = df['Rating'].map(getClasses)

In [None]:
#instantiate instance of gender_guesser
detect_gender = gd.Detector()

def getGender(name_string):
    
    """
    Infers gender using gender-guesser 0.4.0 on users
    first name
    
    Hard recode of 'mostly' and 'andy'
    """
    
    #infer gender on users first name
    gender = detect_gender.get_gender(name_string.split(' ')[0])
    
    #recode if mostly female
    if gender == 'mostly_female':
        
        return 'female'
    
    #recode if mostly male
    elif gender == 'mostly_male':
        
        return 'male'
    
    #recode if androgenous
    elif gender == 'andy':
        
        return 'unknown'
    
    else:
        
        return gender

In [None]:
#call function to get first name
df['gender'] = df['Author'].map(lambda x: getGender(x))

In [None]:
def getDate(df,date_col):
    
    """
    Extracts various time features from a datetime formatted
    column
    """
    #convert to datetime
    df[date_col] = pd.to_datetime(df[date_col]) 
    #get day of week
    df['day'] = df[date_col].map(lambda x: x.day_name())
    #get day of year
    df['day_of_year'] = df[date_col].map(lambda x: x.dayofyear)
    #get day of year
    df['week_of_year'] = df[date_col].map(lambda x: x.weekofyear)
    #get month year
    df['month_year'] = df[date_col].map(lambda x: x.to_period('M'))
    #get month
    df['month'] = df[date_col].map(lambda x: x.month_name())
    #get quarter
    df['quarter'] = df[date_col].map(lambda x: x.quarter)
    #get year
    df['year'] = df[date_col].map(lambda x: x.year)
    
    return df

In [None]:
#call function to get date features
df['Body'].fillna("",inplace=True)
df = getDate(df,'Date')
#get length of review
df['review_length'] = df['Body'].map(lambda x: len(x))

## save data

In [None]:
# save dataframe as a pickle file
df.to_pickle('data.pkl')
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location,class,gender,day,day_of_year,week_of_year,month_year,month,quarter,year,review_length
0,2022-01-16,René,You get real help when you need it,You get real help when you have…,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,34
1,2022-01-16,Tonni Kristensen.,I do not think Alka is neither better nor wors...,I do not think,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,106
2,2022-01-16,inga,"I am so happy for Alka, have received a really...",I'm so happy for Alka,5,DK,positive,unknown,Sunday,16,2,2022-01,January,1,2022,85
3,2022-01-17,Ib Lorenzen,"Before we could count to 10, we had a new wind...","Quick, easy and straightforward",5,DK,positive,male,Monday,17,3,2022-01,January,1,2022,227
4,2022-01-17,Susann Taha,Easy reporting and quick feedback.,Easy reporting and fast…,5,DK,positive,female,Monday,17,3,2022-01,January,1,2022,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,2022-04-24,Martine Hocke,In connection with a trip that I had to cancel...,In connection with a trip like me…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,241
978,2022-04-24,Helle Emmerich,Super professional therapist and everything ra...,Super professional therapist and everything…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,85
979,2022-04-24,Susanne Opperby Madsen,A good insurance company they are fast and eff...,A good insurance company they are…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,74
980,2022-04-24,Torben Juel Petersen,"It was a stone's throw in the windshield, so m...",It was a stone's throw in…,4,DK,positive,male,Sunday,114,16,2022-04,April,2,2022,131


# Analysis 

## Load data

In [None]:
df = pd.read_pickle('data.pkl')
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location,class,gender,day,day_of_year,week_of_year,month_year,month,quarter,year,review_length
0,2022-01-16,Tonni Kristensen.,I do not think Alka is neither better nor wors...,I do not think,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,106
1,2022-01-16,inga,"I am so happy for Alka, have received a really...",I'm so happy for Alka,5,DK,positive,unknown,Sunday,16,2,2022-01,January,1,2022,85
2,2022-01-17,Susann Taha,Easy reporting and quick feedback.,Easy reporting and fast…,5,DK,positive,female,Monday,17,3,2022-01,January,1,2022,34
3,2022-01-17,Michelle Lundberg Badsted,"Basically, I have received good service and am...",Basically I got good…,4,DK,positive,female,Monday,17,3,2022-01,January,1,2022,303
4,2022-01-17,Leif Sørensen,Good service and quick response,God service,5,DK,positive,male,Monday,17,3,2022-01,January,1,2022,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,2022-04-24,Helle Emmerich,Super professional therapist and everything ra...,Super professional therapist and everything…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,85
978,2022-04-24,Susanne Opperby Madsen,A good insurance company they are fast and eff...,A good insurance company they are…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,74
979,2022-04-24,Torben Juel Petersen,"It was a stone's throw in the windshield, so m...",It was a stone's throw in…,4,DK,positive,male,Sunday,114,16,2022-04,April,2,2022,131
980,2022-04-24,Jan skov Kristensen,Our neighbor is in the process of building a n...,Our neighbor is getting built…,2,DK,negative,male,Sunday,114,16,2022-04,April,2,2022,612


## analysis 

## Noun Phrase Analysis

In [None]:
#instantiate spaCy model
model=spacy.load('en_core_web_lg')

#fits model using multi-threadding
df['spaCy_doc'] = [i for i in tq.tqdm(model.pipe(df['Body'], batch_size=10000,n_threads=60))]

0it [00:00, ?it/s]

In [None]:
#gets spaCy sentences
df['spaCy_sentences'] = df['spaCy_doc'].map(lambda x:[sent for sent in x.sents])

In [None]:
#gets spaCy noun chunks
df['spaCy_noun_chunk'] =  df['spaCy_sentences'].map(lambda x: [(token.root.text,token.text) for i in x for token in i.noun_chunks])

## Vader

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

def getSentiment(df,title_col,review_col):
    
    """This function uses Vader to analyse sentiment of title and review.
    Positive, neutral, negative and compound scores are added to the DataFrame
    """ 

    #Instantiate instance of Vader sentiment model
    analyser = SentimentIntensityAnalyzer()

    #define columns and source columns
    sentiment_cols = [(('title_pos','title_neu','title_neg','title_com'),title_col),
                      (('review_pos','review_neu','review_neg','review_com'),review_col)]

    #loop through columns
    for i in range(len(sentiment_cols)):

        #loop through pos,neu,neg,compound
        for col in range(len(sentiment_cols[i][:-1])):

            #get positive sentiment score
            df[sentiment_cols[i][col][0]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['pos'])
        
            #get neutral sentiment score
            df[sentiment_cols[i][col][1]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['neu'])
            
            #get negative sentiment score  
            df[sentiment_cols[i][col][2]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['neg'])
            
            #get compound sentiment score
            df[sentiment_cols[i][col][3]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['compound'])

    #return DataFrame
    return df

In [None]:
#call getSentiment function on title and review columns
df['Body'].fillna("",inplace=True)
df['Heading'].fillna("",inplace=True)
df = getSentiment(df,'Heading','Body')

In [82]:
df.explode('spaCy_noun_chunk').spaCy_noun_chunk.str[0]

0               I
0            Alka
0          others
0               I
0       treatment
          ...    
980    department
980      problems
981          Alka
981          they
981         BRAVO
Name: spaCy_noun_chunk, Length: 7062, dtype: object

In [124]:
df[df.Rating < 3].explode('spaCy_noun_chunk').explode('spaCy_noun_chunk').spaCy_noun_chunk.value_counts()[:50]

I                674
it               312
you              228
they             174
Alka             171
insurance        106
we                86
me                82
them              82
who               54
It                46
he                44
car               37
They              36
company           35
DKK               33
alka              32
We                32
damage            31
customer          30
what              30
ALKA              25
time              24
service           24
anything          22
You               22
customers         20
us                20
one               19
price             19
years             19
information       18
month             18
a customer        16
case              16
phone             14
myself            14
nothing           14
people            14
offers            13
my insurance      12
injury            12
something         12
help              11
house             11
the insurance     11
companies         11
offer        

In [123]:
df[df.Rating > 3].explode('spaCy_noun_chunk').explode('spaCy_noun_chunk').spaCy_noun_chunk.value_counts()[:50]

I               855
Alka            265
you             254
service         246
it              240
insurance       177
we              104
me               94
they             92
It               88
what             82
help             74
who              72
treatment        71
them             70
We               60
damage           60
ALKA             57
employee         57
questions        52
company          48
connection       47
experience       45
time             44
everything       43
alka             41
staff            39
They             38
answers          37
prices           36
problems         34
money            32
case             32
advice           31
car              30
insurances       29
years            29
guidance         28
conversation     28
operation        26
she              26
coverage         26
review           25
phone            25
processing       25
answer           25
policies         23
Good service     23
customer         22
things           21


In [144]:
nondf=df.explode('spaCy_noun_chunk').explode('spaCy_noun_chunk').reset_index()

In [153]:
neg = nondf[nondf.Rating < 3]

In [154]:
neg[neg.spaCy_noun_chunk=="help"]

Unnamed: 0,index,Date,Author,Body,Heading,Rating,Location,class,gender,day,...,spaCy_sentences,spaCy_noun_chunk,title_pos,title_neu,title_neg,title_com,review_pos,review_neu,review_neg,review_com
1942,157,2022-01-30,Flemming Rasmussen,The worst of the worst.\n\nHow tired are we of...,The worst of the worst.,1,DK,negative,male,Sunday,...,"[(The, worst, of, the, worst, ., \n\n), (How, ...",help,0.0,0.268,0.732,-0.8481,0.068,0.783,0.15,-0.9811
1943,157,2022-01-30,Flemming Rasmussen,The worst of the worst.\n\nHow tired are we of...,The worst of the worst.,1,DK,negative,male,Sunday,...,"[(The, worst, of, the, worst, ., \n\n), (How, ...",help,0.0,0.268,0.732,-0.8481,0.068,0.783,0.15,-0.9811
2472,202,2022-02-02,KNUD FRANDSEN,I have tried to pay without full I have called...,I have tried to get paid…,1,DK,negative,unknown,Wednesday,...,"[(I, have, tried, to, pay, without, full), (I,...",help,0.0,1.0,0.0,0.0,0.141,0.741,0.118,0.568
2473,202,2022-02-02,KNUD FRANDSEN,I have tried to pay without full I have called...,I have tried to get paid…,1,DK,negative,unknown,Wednesday,...,"[(I, have, tried, to, pay, without, full), (I,...",help,0.0,1.0,0.0,0.0,0.141,0.741,0.118,0.568
3877,279,2022-02-08,Jacob Ravn,WARNING.\nI have long been a customer of alka....,WARNING.,1,PL,negative,male,Tuesday,...,"[(WARNING, ., \n), (I, have, long, been, a, cu...",help,0.0,0.0,1.0,-0.34,0.054,0.854,0.092,-0.4759
4929,351,2022-02-15,Brian Andreasen,"Have 2 cases running at alka, one is over 1 ye...",BAD!,1,DK,negative,male,Tuesday,...,"[(Have, 2, cases, running, at, alka, ,, one, i...",help,0.0,0.0,1.0,-0.5848,0.061,0.855,0.085,-0.4261
4930,351,2022-02-15,Brian Andreasen,"Have 2 cases running at alka, one is over 1 ye...",BAD!,1,DK,negative,male,Tuesday,...,"[(Have, 2, cases, running, at, alka, ,, one, i...",help,0.0,0.0,1.0,-0.5848,0.061,0.855,0.085,-0.4261
8358,574,2022-03-10,Emil,Should change my insurance\n\nI got an insuran...,Should change my insurance,2,DK,negative,male,Thursday,...,"[(Should, change, my, insurance, \n\n), (I, go...",help,0.0,1.0,0.0,0.0,0.093,0.89,0.017,0.9459
8359,574,2022-03-10,Emil,Should change my insurance\n\nI got an insuran...,Should change my insurance,2,DK,negative,male,Thursday,...,"[(Should, change, my, insurance, \n\n), (I, go...",help,0.0,1.0,0.0,0.0,0.093,0.89,0.017,0.9459
11610,827,2022-04-06,Arne,Thought that an insurance was there to cover d...,Thought an insurance was there to…,1,DK,negative,male,Wednesday,...,"[(Thought, that, an, insurance, was, there, to...",help,0.0,1.0,0.0,0.0,0.033,0.706,0.262,-0.9962


In [43]:
import plotly.express as px