<a href="https://colab.research.google.com/github/analyticswithadam/Python/blob/main/Web_Scraping_from_TrustPilot_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

def extract_reviews(page_url):
    headers = {"User-Agent": "Mozilla/5.0"}  # Mimic a browser
    response = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all articles that represent individual reviews.
    review_articles = soup.find_all('article', attrs={"data-service-review-card-paper": True})

    reviews_data = []
    for article in review_articles:
        review_text = None
        review_date = None
        rating = None

        # Extract the full review text from the <p> tag with the review text attribute.
        text_tag = article.find('p', attrs={"data-service-review-text-typography": True})
        if text_tag:
            review_text = text_tag.get_text(strip=True)

        # Extract the review date from the first <time> element in the article.
        time_tag = article.find('time')
        if time_tag:
            review_date = time_tag.get_text(strip=True)

        # Extract the rating from the parent container using the attribute.
        header_div = article.find('div', attrs={"data-service-review-rating": True})
        if header_div:
            rating = header_div.get("data-service-review-rating")

        reviews_data.append({
            "Review Text": review_text,
            "Review Date": review_date,
            "Rating": rating
        })

    return reviews_data

def extract_all_reviews(base_url, from_page=1, to_page=6):
    all_reviews = []
    for page in range(from_page, to_page + 1):
        page_url = f"{base_url}?page={page}"
        print(f"Scraping: {page_url}")
        all_reviews.extend(extract_reviews(page_url))
        sleep(1)  # Pause to avoid throttling
    return pd.DataFrame(all_reviews)

# Example usage:
base_url = "https://www.trustpilot.com/review/pepsi.com"
df_reviews = extract_all_reviews(base_url, from_page=1, to_page=6)
print(df_reviews)



Scraping: https://www.trustpilot.com/review/pepsi.com?page=1
Scraping: https://www.trustpilot.com/review/pepsi.com?page=2
Scraping: https://www.trustpilot.com/review/pepsi.com?page=3
Scraping: https://www.trustpilot.com/review/pepsi.com?page=4
Scraping: https://www.trustpilot.com/review/pepsi.com?page=5
Scraping: https://www.trustpilot.com/review/pepsi.com?page=6
                                           Review Text   Review Date Rating
0                                                 None   4 hours ago   None
1                                                 None  Feb 27, 2025   None
2                                                 None  Feb 15, 2025   None
3                                                 None  Jan 28, 2025   None
4    I am from the new generation of Pepsi drinkers...   4 hours ago      5
..                                                 ...           ...    ...
139  In the past, lots of people know Pepsi was my ...   Sep 8, 2023      1
140  Absolutely horrible, 

In [2]:
df_reviews['Review Text'][4]

'I am from the new generation of Pepsi drinkers. Now in senior drinkers Pepsi. I spend about $40.00 a month on your company product. I am laying here now with my last Pepsi ever. I need to stop and now your company taking your business position. I will no longer buy any Pepsi products or anything they invest in.'

In [5]:
from google.colab import files
# Convert the DataFrame to a CSV file and download it
df_reviews.to_csv('reviews.csv', index=False)
files.download('reviews.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>