In [1]:
import json
from bs4 import BeautifulSoup # ! pip install beautifulsoup4
from urllib.request import urlopen
import pandas as pd

import time
from datetime import date

In [2]:
def scrape_trustpilot(page):
    """Scrape a single Trustpilot review page into a DataFrame.

    The page is downloaded, the JSON payload embedded in the
    ``<script id="__NEXT_DATA__" type="application/json">`` tag is parsed,
    and the list found under ``props -> pageProps -> reviews`` is flattened
    into one row per review.

    Parameters
    ----------
    page : str
        URL of the review page to download.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``author``, ``location``,
        ``date``, ``rating``, ``headline``, ``full_text``, ``source``
        (the value of ``page``) and ``current date`` (today, ``YYYY-MM-DD``).

    Raises
    ------
    ValueError
        If the page contains no ``__NEXT_DATA__`` script tag.

    Errors raised by ``urlopen``, ``json.loads`` or by missing keys in the
    payload are not caught here and propagate to the caller.
    """
    # The context manager closes the HTTP response even if read/decode fails.
    with urlopen(page) as response:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, 'html.parser')
    res = soup.find('script', {'id': '__NEXT_DATA__', 'type': 'application/json'})
    if res is None:
        # Fail with a clear message instead of a TypeError from "".join(None).
        raise ValueError('No __NEXT_DATA__ script tag found on {}'.format(page))

    # Iterating the tag yields its child strings; joined they form the JSON text.
    reviews = json.loads("".join(res))['props']['pageProps']['reviews']
    info = {
        'author': [review['consumer']['displayName'] for review in reviews],
        'location': [review['consumer']['countryCode'] for review in reviews],
        # Keep only the first 10 characters of the timestamp
        # (presumably the YYYY-MM-DD part of an ISO date - TODO confirm).
        'date': [review['labels']['verification']['createdDateTime'][:10] for review in reviews],
        'rating': [review['rating'] for review in reviews],
        'headline': [review['title'] for review in reviews],
        'full_text': [review['text'] for review in reviews],
        # Scalars below are broadcast by pandas to every row.
        'source': page,
        'current date': date.today().strftime("%Y-%m-%d")
    }
    info_df = pd.DataFrame(info)
    return info_df

In [None]:
Npages = 100                                           # Specify how many pages to scrape - check before scraping
company = 'T-Mobile'                                   # For record purposes - specify company name
input_url = 'https://www.trustpilot.com/review/www.t-mobile.com'

scrape_data = []                                       # Storage for output: one DataFrame per scraped page
failed_pages = []                                      # Page numbers whose scrape raised an error


t0 = time.time()
for i in range(1, Npages + 1):                         # Loop over pages 1..Npages inclusive
    if i > 1:
        page = '{}{}{}'.format(input_url, '?page=', i) # Construct URL with page numbering ?page=N
    else:
        page = input_url                               # First page - input url
    try:
        data = scrape_trustpilot(page)                 # Call scrape function
    except Exception as err:
        # Best effort: report and skip the page. Appending here would re-add
        # the previous page's frame (or fail if no page has succeeded yet).
        failed_pages.append(i)
        print('Skipping page {} ({}): {}'.format(i, page, err))
    else:
        scrape_data.append(data)                       # Add processed data into the storage list
    if i % 10 == 0:
        print('Processed {} pages in {:.4f} sec.'.format(i, time.time() - t0))

    time.sleep(1)                                      # Wait 1 second between requests

# Take a fresh timestamp so the summary is correct even when Npages < 10.
print('Processed {} pages in {:.4f} sec.'.format(Npages, time.time() - t0))

if not scrape_data:
    raise RuntimeError('No pages were scraped successfully from {}'.format(input_url))

# Combine output into 1 dataset with a fresh 0..n-1 index, then put the
# company name in a leading 'index' column (same layout as before).
df_scraped = pd.concat(scrape_data, axis=0, ignore_index=True)
df_scraped.insert(0, 'index', company)

Processed 10 pages in 11.7148 sec.
Processed 20 pages in 24.1746 sec.
