In [19]:
# Data Collection from Google Reviews

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import urllib.parse

def scrape_google_shopping_reviews(api_key, product_url, num_pages=10, timeout=150):
    """Scrape review cards from a Google Shopping product page via ScrapingBee.

    Args:
        api_key: ScrapingBee API key sent with every request.
        product_url: Google Shopping reviews URL. A ``page`` query argument is
            appended to it for every requested page.
        num_pages: Number of pages to request, starting at page 1.
        timeout: Seconds to wait for each ScrapingBee response before the
            page is treated as failed.

    Returns:
        A list of dicts with ``title``, ``rating`` and ``text`` keys, one per
        review card found. Pages that cannot be downloaded are skipped.
    """
    # ScrapingBee API endpoint
    api_url = 'https://app.scrapingbee.com/api/v1/'

    # Use '?' when the URL has no query string yet, so that the page argument
    # is always a well-formed query parameter.
    separator = '&' if '?' in product_url else '?'

    reviews = []

    for page in range(1, num_pages + 1):
        # Pause between consecutive requests (also after a failed one) to
        # avoid being blocked; no pause is needed before the first request.
        if page > 1:
            time.sleep(2)

        paginated_url = f"{product_url}{separator}page={page}"

        # requests percent-encodes the parameter values itself, so the target
        # URL is passed unencoded.
        params = {
            'api_key': api_key,
            'url': paginated_url,
            'render_js': 'true',
            'custom_google': 'true'
        }

        # Debug: log only the target URL, never the params dict, because the
        # dict contains the secret API key.
        print(f"Requesting page {page}: {paginated_url}")

        # Send the request to ScrapingBee
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the website: {exc}")
            continue

        # Debug: Print the response status code and content
        print(f"Response Status Code: {response.status_code}")
        if response.status_code != 200:
            print(f"Failed to retrieve the website: {response.status_code}")
            print(f"Response Content: {response.content}")
            continue

        # Parse the content of the response with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract review data
        for review_element in soup.find_all('div', {'class': 'sh-dlr__content'}):
            title_element = review_element.find('span', {'class': 'sh-dlr__title'})
            rating_element = review_element.find('div', {'class': 'sh-dlr__rating-stars'})
            text_element = review_element.find('span', {'class': 'sh-dlr__review-content'})

            title = title_element.get_text(strip=True) if title_element is not None else "No title"
            # .get() instead of indexing: a rating element without an
            # aria-label attribute must not abort the whole scrape.
            rating = rating_element.get('aria-label', "No rating") if rating_element is not None else "No rating"
            text = text_element.get_text(strip=True) if text_element is not None else "No text"

            reviews.append({
                'title': title,
                'rating': rating,
                'text': text
            })

    return reviews

def save_reviews_to_csv(reviews, filename):
    """Write the review records to ``filename`` as a UTF-8 CSV file.

    ``reviews`` is turned into a DataFrame and saved without the index column.
    """
    pd.DataFrame(reviews).to_csv(filename, encoding='utf-8', index=False)

if __name__ == "__main__":
    import os

    # Target Google Shopping product reviews URL; replace to scrape another product.
    product_url = "https://www.google.com/shopping/product/388894881820589121/reviews?q=stanely+cup+reviews&rlz=1C5CHFA_enUS950US951&oq=stanely+cup+reviews&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIJCAEQABgKGIAEMgkIAhAAGAoYgAQyCQgDEAAYChiABDIKCAQQLhgKGBYYHjINCAUQABiGAxiABBiKBTINCAYQABiGAxiABBiKBTINCAcQABiGAxiABBiKBTIKCAgQABiABBiiBNIBCDM1MDlqMGo3qAIAsAIA&sourceid=chrome&ie=UTF-8&ved=2ahUKEwiR_9T6rpCGAxU4IzQIHSBUATYQ4jV6BAgFEBI"

    # The ScrapingBee API key is a secret: read it from the environment so it
    # is never stored in the notebook source or its saved outputs.
    api_key = os.environ.get("SCRAPINGBEE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the SCRAPINGBEE_API_KEY environment variable before running this cell.")

    # Scrape the reviews
    reviews = scrape_google_shopping_reviews(api_key, product_url, num_pages=10)

    # Print the reviews to verify
    for review in reviews:
        print(f"Title: {review['title']}\nRating: {review['rating']}\nReview: {review['text']}\n{'-'*80}")

    # Save the reviews to a CSV file
    save_reviews_to_csv(reviews, 'google_shopping_reviews.csv')

    print("Reviews have been saved to 'google_shopping_reviews.csv'")


Requesting: https://app.scrapingbee.com/api/v1/ with params {'api_key': '4VH7GZFPCWBY40S9KSZPMAHZA1DFAAJXUUJ9RD9ADMVJSRXLUJS5OZYQRO2E2VZ6HRDOCJJWW339ELPS', 'url': 'https://www.google.com/shopping/product/388894881820589121/reviews?q=stanely+cup+reviews&rlz=1C5CHFA_enUS950US951&oq=stanely+cup+reviews&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIJCAEQABgKGIAEMgkIAhAAGAoYgAQyCQgDEAAYChiABDIKCAQQLhgKGBYYHjINCAUQABiGAxiABBiKBTINCAYQABiGAxiABBiKBTINCAcQABiGAxiABBiKBTIKCAgQABiABBiiBNIBCDM1MDlqMGo3qAIAsAIA&sourceid=chrome&ie=UTF-8&ved=2ahUKEwiR_9T6rpCGAxU4IzQIHSBUATYQ4jV6BAgFEBI&page=1', 'render_js': 'true', 'custom_google': 'true'}
Response Status Code: 200
Requesting: https://app.scrapingbee.com/api/v1/ with params {'api_key': '4VH7GZFPCWBY40S9KSZPMAHZA1DFAAJXUUJ9RD9ADMVJSRXLUJS5OZYQRO2E2VZ6HRDOCJJWW339ELPS', 'url': 'https://www.google.com/shopping/product/388894881820589121/reviews?q=stanely+cup+reviews&rlz=1C5CHFA_enUS950US951&oq=stanely+cup+reviews&gs_lcrp=EgZjaHJvbWUyBggAEEUYOTIJCAEQABgKGIAEMgkIAhAAGA

In [11]:
# Data Collection from Amazon Reviews
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def _amazon_review_page_url(product_url, page):
    """Return ``product_url`` rewritten so that it addresses review page ``page``."""
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    parts = urlsplit(product_url)
    # Drop any "ref=..." path segment already present so that only the paging
    # ref segment remains in the path.
    segments = [segment for segment in parts.path.split('/') if not segment.startswith('ref=')]
    path = '/'.join(segments).rstrip('/') + f"/ref=cm_cr_arp_d_paging_btm_next_{page}"
    # Keep the caller's query arguments, replacing any existing pageNumber, so
    # the result has exactly one '?' and pageNumber is a real query argument.
    query = [(key, value)
             for key, value in parse_qsl(parts.query, keep_blank_values=True)
             if key != 'pageNumber']
    query.append(('pageNumber', str(page)))
    return urlunsplit((parts.scheme, parts.netloc, path, urlencode(query), parts.fragment))


def scrape_amazon_reviews(api_key, product_url, num_pages=10, timeout=150):
    """Scrape review cards from an Amazon product-reviews page via ScrapingBee.

    Args:
        api_key: ScrapingBee API key sent with every request.
        product_url: Amazon product-reviews URL. It may already carry a
            ``ref=`` path segment and a query string; both are handled when
            the per-page URL is built.
        num_pages: Number of pages to request, starting at page 1.
        timeout: Seconds to wait for each ScrapingBee response before the
            page is treated as failed.

    Returns:
        A list of dicts with ``title``, ``rating`` and ``text`` keys, one per
        review element found. Pages that cannot be downloaded are skipped.
    """
    # ScrapingBee API endpoint
    api_url = 'https://app.scrapingbee.com/api/v1/'

    reviews = []

    for page in range(1, num_pages + 1):
        # Pause between consecutive requests (also after a failed one) to
        # avoid being blocked; no pause is needed before the first request.
        if page > 1:
            time.sleep(2)

        # Build the URL of the requested review page
        paginated_url = _amazon_review_page_url(product_url, page)

        # Parameters for the API request
        params = {
            'api_key': api_key,
            'url': paginated_url,
            'render_js': 'false'
        }

        # Send the request to ScrapingBee
        try:
            response = requests.get(api_url, params=params, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to retrieve the website: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve the website: {response.status_code}")
            continue

        # Parse the content of the response with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract review data
        for review_element in soup.find_all('div', {'data-hook': 'review'}):
            title_element = review_element.find('a', {'data-hook': 'review-title'})
            rating_element = review_element.find('i', {'data-hook': 'review-star-rating'})
            text_element = review_element.find('span', {'data-hook': 'review-body'})

            title = title_element.get_text(strip=True) if title_element is not None else "No title"
            rating = rating_element.get_text(strip=True) if rating_element is not None else "No rating"
            text = text_element.get_text(strip=True) if text_element is not None else "No text"

            reviews.append({
                'title': title,
                'rating': rating,
                'text': text
            })

    return reviews

def save_reviews_to_csv(reviews, filename):
    """Persist the collected reviews as a CSV file.

    The records in ``reviews`` are loaded into a DataFrame, which is then
    written to ``filename`` in UTF-8 without the index column.
    """
    frame = pd.DataFrame(data=reviews)
    frame.to_csv(path_or_buf=filename, index=False, encoding='utf-8')

if __name__ == "__main__":
    import os

    # Target Amazon product-reviews URL; replace to scrape another product.
    product_url = "https://www.amazon.com/Stanlely-Quencher-FlowStateTM-Tumbler-Frost/product-reviews/B0CK9PT53B/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews"

    # The ScrapingBee API key is a secret: read it from the environment so it
    # is never stored in the notebook source or its saved outputs.
    api_key = os.environ.get("SCRAPINGBEE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the SCRAPINGBEE_API_KEY environment variable before running this cell.")

    # Scrape the reviews
    reviews = scrape_amazon_reviews(api_key, product_url, num_pages=10)

    # Save the reviews to a CSV file
    save_reviews_to_csv(reviews, 'amazon_reviews.csv')

    print("Reviews have been saved to 'amazon_reviews.csv'")


Reviews have been saved to 'amazon_reviews.csv'


In [12]:
# Data Collection from Target Reviews

import requests
from bs4 import BeautifulSoup
import pandas as pd

import os

# The ScrapingBee API key is a secret: read it from the environment so it is
# never stored in the notebook source or its saved outputs.
api_key = os.environ.get("SCRAPINGBEE_API_KEY")
if not api_key:
    raise RuntimeError("Set the SCRAPINGBEE_API_KEY environment variable before running this cell.")

# Target product page whose review cards are scraped.
url = "https://www.target.com/p/stanley-40oz-stainless-steel-h2-0-flowstate-quencher-tumbler-amethyst/-/A-89570975?ref=tgt_adv_xsp&AFID=google&fndsrc=tgtao&DFA=71700000086347363&CPNG=PLA_Dining%2BShopping_Traffic_Local_Traffic%7CDining_Ecomm_Home&adgroup=SC_Dining_Coffee%2FHydration&LID=700000001170770pgs&LNM=PRODUCT_GROUP&network=g&device=c&location=9032151&targetid=pla-1462249812655&gad_source=1&gclid=CjwKCAjwupGyBhBBEiwA0UcqaJvHCrxEbRRKIDVMNJNaUEUErqrz5ilPv_I_k9T_L4rpm48MisUM8xoC-rcQAvD_BwE&gclsrc=aw.ds&page=1"

params = {
    "api_key": api_key,
    "url": url,
    "render_js": "true"
}

# A timeout keeps the cell from hanging forever if ScrapingBee never answers.
response = requests.get("https://app.scrapingbee.com/api/v1/", params=params, timeout=150)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    reviews = []

    review_cards = soup.find_all("div", {"data-test": "review-card"})
    for card in review_cards:
        title = card.find("h4", {"data-test": "review-card--title"})
        # NOTE(review): this class name looks auto-generated and may change
        # between site builds - confirm the selector still matches.
        rating = card.find("span", {"class": "sc-fd6a822c-0 ivBHhT"})
        text = card.find("div", {"data-test": "review-card--text"})

        # Fall back to a placeholder for any part missing from the card.
        title_text = title.get_text(strip=True) if title else "No title"
        rating_text = rating.get_text(strip=True) if rating else "No rating"
        text_content = text.get_text(strip=True) if text else "No text"

        reviews.append({
            "title": title_text,
            "rating": rating_text,
            "text": text_content
        })

    # Print extracted reviews for debugging
    for review in reviews:
        print(review)

    df = pd.DataFrame(reviews)
    df.to_csv("target_reviews.csv", index=False)

    print("Reviews have been saved to 'target_reviews.csv'")
else:
    print(f"Failed to retrieve the website: {response.status_code}")

Reviews have been saved to 'target_reviews.csv'
