<a href="https://colab.research.google.com/github/Zalaann/FSE-Project/blob/main/Web_Scraping_from_TrustPilot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import time

def _review_to_row(instance):
    """Flatten one review dict from the page's JSON payload into a row dict."""
    consumer = instance["consumer"]
    published = pd.to_datetime(instance["dates"]["publishedDate"])
    return {
        'Date': published.strftime("%Y-%m-%d"),
        'Author': consumer["displayName"],
        'Body': instance["text"],
        'Heading': instance["title"],
        'Rating': instance["rating"],
        'Location': consumer["countryCode"],
    }


def trustpilot_scraper(PATH: str, n_pages: int, *, start_page: int = 100,
                       delay: float = 2.0, timeout: float = 30.0) -> pd.DataFrame:
    """Scrape review pages under ``PATH`` and return them as a DataFrame.

    Pages ``start_page`` through ``n_pages`` (inclusive) are requested as
    ``"<PATH>?page=<n>"``. Reviews are read from the JSON embedded in the
    page's ``<script id="__NEXT_DATA__">`` element, under
    ``props.pageProps.reviews``.

    Args:
        PATH: Base URL of the review listing, without the ``?page=`` suffix.
        n_pages: Last page number to request (inclusive).
        start_page: First page number to request. Defaults to 100, the value
            that was previously hard-coded, so existing calls are unchanged.
        delay: Seconds to sleep after every request attempt.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A DataFrame with columns ``Date`` (``YYYY-MM-DD`` string), ``Author``,
        ``Body``, ``Heading``, ``Rating`` and ``Location``, sorted by ``Date``
        with rows having a duplicate ``Body`` removed. The frame is empty
        (but has all six columns) when nothing could be scraped, including
        when ``start_page > n_pages``.
    """
    columns = ['Date', 'Author', 'Body', 'Heading', 'Rating', 'Location']

    # Constant across pages, so build it once instead of per iteration.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    rows = []

    for page_number in range(start_page, n_pages + 1):
        url = f"{PATH}?page={page_number}"

        # A timeout keeps one stalled connection from hanging the whole run,
        # and catching network errors preserves the pages already collected.
        try:
            req = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f"Failed to retrieve page {page_number}: {e}")
            time.sleep(delay)
            continue

        # Pause after every attempt, successful or not, so that a run of
        # failing pages does not turn into back-to-back requests.
        time.sleep(delay)

        if req.status_code != 200:
            print(f"Failed to retrieve page {page_number}, status code: {req.status_code}")
            continue

        soup = BeautifulSoup(req.text, 'html.parser')

        # AttributeError: script tag missing; TypeError: tag has no string;
        # ValueError: invalid JSON; KeyError: unexpected payload layout.
        try:
            reviews_raw = json.loads(soup.find("script", id="__NEXT_DATA__").string)
            rev = reviews_raw["props"]["pageProps"]["reviews"]
        except (AttributeError, KeyError, TypeError, ValueError) as e:
            print(f"Error extracting reviews on page {page_number}: {e}")
            continue

        for instance in rev:
            # Skip a single malformed review instead of losing the whole scrape.
            try:
                rows.append(_review_to_row(instance))
            except (KeyError, TypeError, ValueError) as e:
                print(f"Skipping malformed review on page {page_number}: {e}")

    # Passing `columns` fixes the column order and keeps the columns present
    # even when no rows were collected.
    rev_df = pd.DataFrame(rows, columns=columns)

    # Sort chronologically, then keep the first occurrence of each review body.
    rev_df = rev_df.sort_values(by="Date", axis=0, ignore_index=True)
    rev_df = rev_df.drop_duplicates(subset=["Body"], keep='first')
    rev_df = rev_df.reset_index(drop=True)

    return rev_df

# Example run: scrape the SHEIN review listing on Trustpilot.
url = "https://www.trustpilot.com/review/www.shein.com"
n_pages = 500
reviews_df = trustpilot_scraper(PATH=url, n_pages=n_pages)

# Persist the result; the DataFrame index is written as the first CSV column.
reviews_df.to_csv("trustpilot_reviews.csv", index=True)

# Preview the first five rows.
print(reviews_df.head(5))


         Date          Author  \
0  2022-02-17            Catz   
1  2022-02-17            Shea   
2  2022-02-17         Jacquie   
3  2022-02-17  Melanie Farmer   
4  2022-02-17        Jennifer   

                                                Body  \
0  Everything I've bought has been excellent, the...   
1  I never expected such a good quality with such...   
2                         Good clothes, good prices.   
3  Really good product and great service n good c...   
4  Great site for  just about anything! Plus they...   

                               Heading  Rating Location  
0     Everything I've bought has been…       5       GB  
1                 Best value for money       5       US  
2                         Good clothes       5       US  
3                           Very happy       5       GB  
4  Great site for  just about anything       5       US  
