# imports

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import time
import plotly.express as px
import tqdm.notebook as tq
import spacy
from wordcloud import WordCloud, STOPWORDS
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [5]:
spacy.cli.download("en_core_web_lg")

KeyboardInterrupt: ignored

# Scraping

In [None]:


def trustpilot_scraper(PATH: str, n_pages):

    #Lists
    body = []
    heading = []
    rating = []
    location = []
    author = []
    date = []

    #Website Load
    page = "{}?page=".format(PATH)

    for page_number in tqdm(range(1, n_pages+1)):
        url = "{x}{y}".format(x = page, y = page_number)
        req = requests.get(url)

        time.sleep(2)
        soup = BeautifulSoup(req.text, 'html.parser')

        #initial reviews
        reviews_raw = soup.find("script", id = "__NEXT_DATA__").string
        reviews_raw = json.loads(reviews_raw)
        rev = reviews_raw["props"]["pageProps"]["reviews"]

        #get reviews into df
        for i in tqdm(range(len(rev))):
            instance = rev[i]
            body_ = GoogleTranslator(source='auto').translate(instance["text"], dest='en')
            heading_ = GoogleTranslator(source='auto').translate(instance["title"], dest='en')
            rating_ = instance["rating"]
            location_ = instance["consumer"]["countryCode"]
            author_ = instance["consumer"]["displayName"]
            date_ = pd.to_datetime(instance["dates"]["publishedDate"]).strftime("%Y-%m-%d")

            #append to the list
            body.append(body_)
            heading.append(heading_)
            rating.append(rating_)
            location.append(location_)
            author.append(author_)
            date.append(date_)

    df = {
    'Date' : date,
    'Author' : author,
    'Body' : body,
    'Heading' : heading,
    'Rating' : rating,
    'Location' : location
    }

    rev_df = pd.DataFrame(df)
    rev_df.sort_values(by = "Date", axis = 0, inplace = True, ignore_index = True)
    rev_df.drop_duplicates(subset=["Body"],keep= 'first', inplace= True)
    rev_df.reset_index(drop = True, inplace = True)

    return rev_df

In [None]:
df = trustpilot_scraper('https://dk.trustpilot.com/review/www.alka.dk',50)
df

# Expand data

In [None]:
def getClasses(rating):
    
    """Function returns if a rating classifies as positive or negative. 
    Neutral is also included"""

    #rating is 2 or less, return negative
    if int(rating) <= 2:
        return 'negative'

    #rating is 3 or less, return neutral
    elif int(rating) == 3:
        return 'neutral'

    #rating is 4 or more, return positive
    else:
        return 'positive'

In [None]:
#call function to get classes
df['class'] = df['Rating'].map(getClasses)

In [None]:
#instantiate instance of gender_guesser
detect_gender = gd.Detector()

def getGender(name_string):
    
    """
    Infers gender using gender-guesser 0.4.0 on users
    first name
    
    Hard recode of 'mostly' and 'andy'
    """
    
    #infer gender on users first name
    gender = detect_gender.get_gender(name_string.split(' ')[0])
    
    #recode if mostly female
    if gender == 'mostly_female':
        
        return 'female'
    
    #recode if mostly male
    elif gender == 'mostly_male':
        
        return 'male'
    
    #recode if androgenous
    elif gender == 'andy':
        
        return 'unknown'
    
    else:
        
        return gender

In [None]:
#call function to get first name
df['gender'] = df['Author'].map(lambda x: getGender(x))

In [None]:
def getDate(df,date_col):
    
    """
    Extracts various time features from a datetime formatted
    column
    """
    #convert to datetime
    df[date_col] = pd.to_datetime(df[date_col]) 
    #get day of week
    df['day'] = df[date_col].map(lambda x: x.day_name())
    #get day of year
    df['day_of_year'] = df[date_col].map(lambda x: x.dayofyear)
    #get day of year
    df['week_of_year'] = df[date_col].map(lambda x: x.weekofyear)
    #get month year
    df['month_year'] = df[date_col].map(lambda x: x.to_period('M'))
    #get month
    df['month'] = df[date_col].map(lambda x: x.month_name())
    #get quarter
    df['quarter'] = df[date_col].map(lambda x: x.quarter)
    #get year
    df['year'] = df[date_col].map(lambda x: x.year)
    
    return df

In [None]:
#call function to get date features
df['Body'].fillna("",inplace=True)
df = getDate(df,'Date')
#get length of review
df['review_length'] = df['Body'].map(lambda x: len(x))

## save data

In [None]:
# save dataframe as a pickle file
df.to_pickle('data.pkl')
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location,class,gender,day,day_of_year,week_of_year,month_year,month,quarter,year,review_length
0,2022-01-16,René,You get real help when you need it,You get real help when you have…,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,34
1,2022-01-16,Tonni Kristensen.,I do not think Alka is neither better nor wors...,I do not think,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,106
2,2022-01-16,inga,"I am so happy for Alka, have received a really...",I'm so happy for Alka,5,DK,positive,unknown,Sunday,16,2,2022-01,January,1,2022,85
3,2022-01-17,Ib Lorenzen,"Before we could count to 10, we had a new wind...","Quick, easy and straightforward",5,DK,positive,male,Monday,17,3,2022-01,January,1,2022,227
4,2022-01-17,Susann Taha,Easy reporting and quick feedback.,Easy reporting and fast…,5,DK,positive,female,Monday,17,3,2022-01,January,1,2022,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,2022-04-24,Martine Hocke,In connection with a trip that I had to cancel...,In connection with a trip like me…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,241
978,2022-04-24,Helle Emmerich,Super professional therapist and everything ra...,Super professional therapist and everything…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,85
979,2022-04-24,Susanne Opperby Madsen,A good insurance company they are fast and eff...,A good insurance company they are…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,74
980,2022-04-24,Torben Juel Petersen,"It was a stone's throw in the windshield, so m...",It was a stone's throw in…,4,DK,positive,male,Sunday,114,16,2022-04,April,2,2022,131


# Analysis 

## Load data

In [71]:
df = pd.read_pickle('data.pkl')
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location,class,gender,day,day_of_year,week_of_year,month_year,month,quarter,year,review_length
0,2022-01-16,Tonni Kristensen.,I do not think Alka is neither better nor wors...,I do not think,5,DK,positive,male,Sunday,16,2,2022-01,January,1,2022,106
1,2022-01-16,inga,"I am so happy for Alka, have received a really...",I'm so happy for Alka,5,DK,positive,unknown,Sunday,16,2,2022-01,January,1,2022,85
2,2022-01-17,Susann Taha,Easy reporting and quick feedback.,Easy reporting and fast…,5,DK,positive,female,Monday,17,3,2022-01,January,1,2022,34
3,2022-01-17,Michelle Lundberg Badsted,"Basically, I have received good service and am...",Basically I got good…,4,DK,positive,female,Monday,17,3,2022-01,January,1,2022,303
4,2022-01-17,Leif Sørensen,Good service and quick response,God service,5,DK,positive,male,Monday,17,3,2022-01,January,1,2022,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
977,2022-04-24,Helle Emmerich,Super professional therapist and everything ra...,Super professional therapist and everything…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,85
978,2022-04-24,Susanne Opperby Madsen,A good insurance company they are fast and eff...,A good insurance company they are…,5,DK,positive,female,Sunday,114,16,2022-04,April,2,2022,74
979,2022-04-24,Torben Juel Petersen,"It was a stone's throw in the windshield, so m...",It was a stone's throw in…,4,DK,positive,male,Sunday,114,16,2022-04,April,2,2022,131
980,2022-04-24,Jan skov Kristensen,Our neighbor is in the process of building a n...,Our neighbor is getting built…,2,DK,negative,male,Sunday,114,16,2022-04,April,2,2022,612


## analysis 

## Noun Phrase Analysis

In [72]:
#instantiate spaCy model
model=spacy.load('en_core_web_lg')

#fits model using multi-threadding
df['spaCy_doc'] = [i for i in tq.tqdm(model.pipe(df['Body'], batch_size=10000,n_threads=60))]

0it [00:00, ?it/s]

In [73]:
#gets spaCy sentences
df['spaCy_sentences'] = df['spaCy_doc'].map(lambda x:[sent for sent in x.sents])

In [74]:
#gets spaCy noun chunks
df['spaCy_noun_chunk'] =  df['spaCy_sentences'].map(lambda x: [(token.root.text,token.text) for i in x for token in i.noun_chunks])

## Vader

In [75]:
def getSentiment(df,title_col,review_col):
    
    """This function uses Vader to analyse sentiment of title and review.
    Positive, neutral, negative and compound scores are added to the DataFrame
    """ 

    #Instantiate instance of Vader sentiment model
    analyser = SentimentIntensityAnalyzer()

    #define columns and source columns
    sentiment_cols = [(('title_pos','title_neu','title_neg','title_com'),title_col),
                      (('review_pos','review_neu','review_neg','review_com'),review_col)]

    #loop through columns
    for i in range(len(sentiment_cols)):

        #loop through pos,neu,neg,compound
        for col in range(len(sentiment_cols[i][:-1])):

            #get positive sentiment score
            df[sentiment_cols[i][col][0]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['pos'])
        
            #get neutral sentiment score
            df[sentiment_cols[i][col][1]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['neu'])
            
            #get negative sentiment score  
            df[sentiment_cols[i][col][2]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['neg'])
            
            #get compound sentiment score
            df[sentiment_cols[i][col][3]] = df[sentiment_cols[i][-1]].map(lambda x: analyser.polarity_scores(x)['compound'])

    #return DataFrame
    return df

# interesting words

In [76]:
#call getSentiment function on title and review columns
df['Body'].fillna("",inplace=True)
df['Heading'].fillna("",inplace=True)
df = getSentiment(df,'Heading','Body')
df = df.explode(['spaCy_noun_chunk']).explode('spaCy_noun_chunk')
df['count'] =df.groupby('spaCy_noun_chunk')['spaCy_noun_chunk'].transform('count')

In [123]:
df['spaCy_noun_chunk'].fillna("",inplace=True)

In [None]:
good = ['website','service','insurance','Alka','treatment','help','employee','company','experience','damage','staff','they','conversation','money','phone']
bad  = ['house','car','company','customer','damage','service','DKK','time','price','information','injury','phone','offer','review','email','Alka','they']

In [145]:
neg=df[df['class']=='negative'].sort_values(by=['count'],ascending=False)
fig2 =px.treemap(neg[neg['spaCy_noun_chunk'].isin(bad)],path=['spaCy_noun_chunk','Heading','Body'],values='count')

In [141]:
pos=df[df['class']=='positive'].sort_values(by=['count'],ascending=False)
fig =px.treemap(pos[pos['spaCy_noun_chunk'].isin(good)],path=['spaCy_noun_chunk','Heading','Body'],values='count')

In [146]:
fig2.write_html("negative.html")

# descriptive stats

In [10]:
df = pd.read_pickle('data.pkl')
df

Unnamed: 0,Date,Author,Body,Heading,Rating,Location,class,gender,day,day_of_year,week_of_year,month_year,month,quarter,year,review_length
0,2018-01-10,Jonathan Feodor Danstrup Karlsen,Everything is simply so good! They helped me a...,Everything is simply so good,5,DK,positive,male,Wednesday,10,2,2018-01,January,1,2018,178
1,2018-01-10,Anne,Easy and at a good price.,Easy and at a good price,5,DK,positive,female,Wednesday,10,2,2018-01,January,1,2018,25
2,2018-01-11,Joan Brøgger,"Quick and easy operation. However, the origina...",Quick and easy operation,4,DK,positive,female,Thursday,11,2,2018-01,January,1,2018,113
3,2018-01-11,Mirwais,"Lovely easy, fast and transparent it will not ...",Nice easy,5,DK,positive,unknown,Thursday,11,2,2018-01,January,1,2018,162
4,2018-01-11,Margit Kolding,Always get a good treatment and answers to the...,Always a good treatment and answer…,5,DK,positive,female,Thursday,11,2,2018-01,January,1,2018,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7672,2022-04-25,Rikke Nielsen,Quick solution to the problem,Quick solution to the problem,5,DK,positive,female,Monday,115,17,2022-04,April,2,2022,29
7673,2022-04-25,Marina Winther,I sent a review via My Alka on April 20th.\nAt...,Misleading on My Alka,1,DK,negative,female,Monday,115,17,2022-04,April,2,2022,690
7674,2022-04-25,janni andersen,Really sweet employee and quick response reall...,Quick response Really sweet employee to talk to,5,DK,positive,unknown,Monday,115,17,2022-04,April,2,2022,60
7675,2022-04-25,klant,"Had to cancel a trip due to Corona, The proces...",Had to cancel a trip due to Corona,5,DK,positive,unknown,Monday,115,17,2022-04,April,2,2022,174


In [18]:
dfd=df.groupby(['gender'])['gender'].count().to_frame()
fig = px.bar(
    data_frame=dfd,
    y="gender",
).update_layout(
    xaxis_title="Rating", yaxis_title="Count"
)

fig.show()

In [27]:
df.groupby(['gender','class'])['class'].count().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,class
gender,class,Unnamed: 2_level_1
female,negative,284
female,neutral,51
female,positive,2538
male,negative,592
male,neutral,76
male,positive,2816
unknown,negative,203
unknown,neutral,31
unknown,positive,1086


In [12]:
dfd=df.groupby(['Rating'])['Rating'].count().to_frame()

In [13]:
fig = px.bar(
    data_frame=dfd,
    y="Rating",
).update_layout(
    xaxis_title="Rating", yaxis_title="Count"
)

fig.show()

In [23]:
d=df.groupby(['Date'])['Rating'].mean().to_frame()

In [24]:
fig = px.line(
    data_frame=d,
    y="Rating",
).update_layout(
    xaxis_title="date", yaxis_title="avg. rating"
)

fig.show()