In [2]:
%pip install torch bs4 tqdm transformers openpyxl

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting tqdm
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.1-py3-none-any.whl.metadata (12 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.5.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m973.2 kB/s[0m eta [36m0:00:00[0m
Coll

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime as dt
from tqdm import tqdm
from transformers import pipeline
import pandas as pd
from tqdm import tqdm


  from .autonotebook import tqdm as notebook_tqdm


## Configuration for scraping
Edit this to your needs

In [2]:
TP_LOCALE = 'nl' # use nl for Dutch reviews
TP_COMPANY_ID = 'livingdna.com'

# go to the review page on Trustpilot for your company to determine how many pages to scrape.
# Set the below parameters to match that
from_page = 1
to_page = 669
# some trustpilot have additional URL parameters (for example "?languages=all"). Specify them below
# (to only get reviews in English, set it to "?languages=en")
TP_ADDITIONAL_URL_PARAMS = '?languages=en'

## Generate an example URL, given these parameters, for testing

In [3]:
URL = f'https://{TP_LOCALE}.trustpilot.com/review/{TP_COMPANY_ID}{TP_ADDITIONAL_URL_PARAMS}?page={3}'
URL

'https://nl.trustpilot.com/review/livingdna.com?languages=en?page=3'

## Run the below code cell to scrape reviews
(no file will be created yet)

In [4]:
review_dicts = []

for i in tqdm(range(from_page, to_page + 1)):
    response = requests.get(f"https://{TP_LOCALE}.trustpilot.com/review/{TP_COMPANY_ID}?page={i}")
    web_page = response.text
    soup = BeautifulSoup(web_page, "html.parser")

    # find all article elements with data-service-review-card-paper="true" attribute
    reviews = soup.find_all("article", {"data-service-review-card-paper": "true"})

    title_class = 'typography_heading-s__f7029 typography_appearance-default__AAY17'
    rating_class = 'styles_reviewHeader__iU9Px'
    rating_data_attribute = 'data-service-review-rating'
    review_text_class = 'typography_body-l__KUYFJ typography_appearance-default__AAY17 typography_color-black__5LYEn'
    date_class = 'typography_body-m__xgxZ_ typography_appearance-default__AAY17'

    for review in reviews:
        title = review.find("h2", class_=title_class).text
        rating = review.find("div", class_=rating_class).get(rating_data_attribute)
        # first check if there is any review text
        if review.find("p", class_=review_text_class) is not None:
            review_text = review.find("p", class_=review_text_class).text
        else:
            review_text = ""
        date = review.find("p", class_=date_class).text

        review_dicts.append({
            "Title": title,
            "Rating": rating,
            "Review": review_text,
            "Date": date
        })


100%|██████████| 669/669 [00:48<00:00, 13.73it/s]


## Save the scraped data to a .json file
Running this will generate a .json file ending with `trustpilot_reviews.json` which you can open to the left

In [5]:
# save the list of dicts to a json file
df = pd.DataFrame(review_dicts)
df.to_json(f"{TP_LOCALE}_{TP_COMPANY_ID}_trustpilot_reviews.json", orient="records")


In [None]:
qa_nl_pipeline = pipeline(
    "question-answering",
    model="henryk/bert-base-multilingual-cased-finetuned-dutch-squad2",
    tokenizer="henryk/bert-base-multilingual-cased-finetuned-dutch-squad2"
)

In [None]:
df = pd.read_json("trustpilot_reviews.json")

# use the qa_nl_pipeline to add a answer column to the dataframe. The question is 'Waar klagen ze over?'
# the pipeline returns a dictionary in the following format:
# {
#  "score": 0.83,
#  "start": 0,
#  "end": 9,
#  "answer": "Amsterdam"
#}

# iterate over the dataframe and add the answer to the dataframe
for i in tqdm(range(len(df))):
    row = df.iloc[i]
    # first check if the Review is not the empty string
    if row["Review"] == "":
        df.at[i, "qa_answer"] = ""
        df.at[i, "qa_score"] = 0
        df.at[i, "qa_start"] = 0
        df.at[i, "qa_end"] = 0
        continue

    answer = qa_nl_pipeline({
        "question": "Waar klagen ze over?",
        "context": row["Review"]
    })
    df.at[i, "qa_answer"] = answer["answer"]
    df.at[i, "qa_score"] = answer["score"]
    df.at[i, "qa_start"] = answer["start"]
    df.at[i, "qa_end"] = answer["end"]


# save the dataframe as json again
df.to_json(f"trustpilot_reviews_with_qa.json", orient="records")

In [None]:
# read in the json again and save it as an excel file
df = pd.read_json("trustpilot_reviews_with_qa.json")
df.to_excel("trustpilot_reviews_with_qa.xlsx", index=False)