In [1]:
import undetected_chromedriver as uc
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
import re

# Step 1: Set up undetected ChromeDriver with stealth techniques
def setup_selenium():
    """Set up undetected Chrome WebDriver with stealth techniques."""
    options = uc.ChromeOptions()

    # Set a specific user agent to mimic Firefox on Linux
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
    options.add_argument(f"user-agent={user_agent}")

    # Specify the path to the Chrome binary
    

    # Disable browser detection flags
    # options.add_argument("--headless")  # Use newer headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("window-size=1920,1080")
    options.add_argument("--remote-debugging-port=9222")


    print("ChromeDriver options configured successfully.")

    # Start the Chrome WebDriver using undetected-chromedriver
    try:
        driver = uc.Chrome(options=options)
        time.sleep(10) 
        print("ChromeDriver initialized successfully.")
    except Exception as e:
        print(f"Error initializing ChromeDriver: {e}")
        return None

    # Additional stealth techniques
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        print("Stealth techniques applied successfully.")
    except Exception as e:
        print(f"Error applying stealth techniques: {e}")
        return None

    return driver


# Random sleep time for human-like behavior
def random_sleep(min_delay=2, max_delay=5):
    """Introduce random sleep times to simulate human behavior."""
    delay = random.uniform(min_delay, max_delay)
    print(f"Sleeping for {delay:.2f} seconds...")
    time.sleep(delay)

# Step 2: Extract Reviews from Page using Selenium and BeautifulSoup
def extract_reviews(page_html, url):
    """Extract reviews from a Walmart page using BeautifulSoup."""
    extracted_reviews = []
    soup = BeautifulSoup(page_html, 'html.parser')
    
    # Finding all review blocks (adjust class based on actual Walmart page structure)
    review_elements = soup.find_all('div', class_=re.compile(r'overflow-visible b--none mt\d-l ma0 dark-gray'))

    if not review_elements:
        return None  # No reviews found, return None to trigger retry

    for review in review_elements:
        product = {}

        # Extract review details
        review_rating_element = review.select_one('.w_iUH7')
        product['Review rating'] = review_rating_element.text if review_rating_element else None

        verified_purchase_element = review.select_one('.pl2.green.b.f7.self-center')
        product['Verified Purchase or not'] = verified_purchase_element.text if verified_purchase_element else None

        review_date_element = review.select_one('.f7.gray')
        product['Review date'] = review_date_element.text if review_date_element else None

        review_title_element = review.select_one('.w_kV33.w_Sl3f.w_mvVb.f5.b')
        product['Review title'] = review_title_element.text if review_title_element else None

        review_content_element = review.select_one('span.tl-m.db-m')
        product['Review Content'] = review_content_element.text.strip() if review_content_element else None

        review_name_element = review.select_one('.f7.b.mv0')
        product['Review name'] = review_name_element.text if review_name_element else None

        # Adding the URL of the review
        product['URL'] = url

        # Append the extracted product information to the list of reviews
        extracted_reviews.append(product)

    return extracted_reviews

# Step 3: Fetch Reviews with Selenium, retry if necessary
def fetch_reviews_with_retry(url, max_retries=50):
    """Fetch reviews with Selenium, relaunching ChromeDriver if necessary."""
    retries = 0
    while retries < max_retries:
        print(f"Attempt {retries + 1} of {max_retries}...")

        driver = setup_selenium()  # Re-launch ChromeDriver
        if driver is None:
            print("Failed to initialize ChromeDriver. Retrying...")
            retries += 1
            random_sleep(5, 10)  # Longer delay before retry
            continue

        try:
            driver.get(url)
            random_sleep()  # Random delay to wait for the page to load

            # Get the page source after it has fully loaded
            page_html = driver.page_source

            # Extract reviews using BeautifulSoup
            reviews = extract_reviews(page_html, url)

            if reviews:
                print(f"Successfully extracted {len(reviews)} reviews.")
                driver.quit()  # Close the driver after success
                return reviews  # Return the extracted reviews
            else:
                print(f"No reviews found on attempt {retries + 1}. Retrying...")

        except Exception as e:
            print(f"Error during review fetching: {e}. Retrying...")

        driver.quit()  # Close driver before retrying
        retries += 1
        random_sleep(5, 10)  # Random sleep between retries to avoid detection

    print(f"Failed to extract reviews after {max_retries} attempts.")
    return []

# Step 4: Fetch All Reviews (with pagination handling if necessary)
def fetch_all_reviews(url):
    """Main function to scrape reviews from all pages."""
    page = 1
    all_reviews = []

    while True:
        print(f"Fetching page {page}...")

        # Construct the URL for the current page
        page_url = f"{url}?page={page}"

        # Fetch reviews for the current page
        reviews = fetch_reviews_with_retry(page_url)

        # If no reviews were found, stop the loop
        if not reviews:
            print(f"Stopping at page {page}. No more reviews.")
            break

        # Add reviews to the total list
        all_reviews.extend(reviews)
        print(f"Reviews extracted from page {page}: {len(reviews)}")

        # Increment the page counter
        page += 1

        # Random sleep to avoid detection
        random_sleep()

    return all_reviews

# Step 5: Define Walmart URLs
urls = [
    'https://www.walmart.com/reviews/product/5129928603'
]

# Step 6: Scrape Reviews from All URLs
walmart_reviews = []

for url in urls:
    walmart_reviews.extend(fetch_all_reviews(url))

# Step 7: Convert Reviews to DataFrame and Save to CSV
df_walmart = pd.DataFrame(walmart_reviews)

# Post-processing and cleaning the DataFrame
df_walmart['Retailer'] = "Walmart"
df_walmart['scraping_date'] = pd.to_datetime('today').date()
df_walmart['Review date'] = pd.to_datetime(df_walmart['Review date']).dt.date
df_walmart['Review rating'] = df_walmart['Review rating'].str.replace(' out of 5 stars review', '').astype(float)
df_walmart.drop_duplicates(inplace=True)

# Save the DataFrame to a CSV file
df_walmart.to_csv('walmart_reviews.csv', index=False)

print("Reviews scraped and saved to 'walmart_reviews.csv'.")


Fetching page 1...
Attempt 1 of 50...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Sleeping for 2.10 seconds...
No reviews found on attempt 1. Retrying...
Sleeping for 8.48 seconds...
Attempt 2 of 50...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Sleeping for 4.14 seconds...
No reviews found on attempt 2. Retrying...
Sleeping for 5.42 seconds...
Attempt 3 of 50...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Sleeping for 4.04 seconds...
Successfully extracted 10 reviews.
Reviews extracted from page 1: 10
Sleeping for 4.27 seconds...
Fetching page 2...
Attempt 1 of 50...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Sleeping for 2.05 seconds...
Error during review fetc

KeyboardInterrupt: 

In [4]:
import undetected_chromedriver as uc
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
import re

# Step 1: Set up undetected ChromeDriver with stealth techniques
def setup_selenium():
    """Set up undetected Chrome WebDriver with stealth techniques."""
    options = uc.ChromeOptions()

    # Set a specific user agent to mimic Firefox on Linux
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
    options.add_argument(f"user-agent={user_agent}")

    # Disable browser detection flags
    # options.add_argument("--headless")  # Use newer headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("window-size=1920,1080")
    options.add_argument("--remote-debugging-port=9222")

    print("ChromeDriver options configured successfully.")

    # Start the Chrome WebDriver using undetected-chromedriver
    try:
        driver = uc.Chrome(options=options)
        # time.sleep(10)  # Wait for ChromeDriver to initialize
        print("ChromeDriver initialized successfully.")
    except Exception as e:
        print(f"Error initializing ChromeDriver: {e}")
        return None

    # Additional stealth techniques
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        print("Stealth techniques applied successfully.")
    except Exception as e:
        print(f"Error applying stealth techniques: {e}")
        return None

    return driver

# Random sleep time for human-like behavior
def random_sleep(min_delay=2, max_delay=5):
    """Introduce random sleep times to simulate human behavior."""
    delay = random.uniform(min_delay, max_delay)
    print(f"Sleeping for {delay:.2f} seconds...")
    time.sleep(delay)

# Step 2: Extract Reviews from Page using Selenium and BeautifulSoup
def extract_reviews(page_html, url):
    """Extract reviews from a Walmart page using BeautifulSoup."""
    extracted_reviews = []
    soup = BeautifulSoup(page_html, 'html.parser')

    # Finding all review blocks (adjust class based on actual Walmart page structure)
    review_elements = soup.find_all('div', class_=re.compile(r'overflow-visible b--none mt\d-l ma0 dark-gray'))

    if not review_elements:
        return None  # No reviews found, return None to trigger retry

    for review in review_elements:
        product = {}

        # Extract review details
        review_rating_element = review.select_one('.w_iUH7')
        product['Review rating'] = review_rating_element.text if review_rating_element else None

        verified_purchase_element = review.select_one('.pl2.green.b.f7.self-center')
        product['Verified Purchase or not'] = verified_purchase_element.text if verified_purchase_element else None

        review_date_element = review.select_one('.f7.gray')
        product['Review date'] = review_date_element.text if review_date_element else None

        review_title_element = review.select_one('.w_kV33.w_Sl3f.w_mvVb.f5.b')
        product['Review title'] = review_title_element.text if review_title_element else None

        review_content_element = review.select_one('span.tl-m.db-m')
        product['Review Content'] = review_content_element.text.strip() if review_content_element else None

        review_name_element = review.select_one('.f7.b.mv0')
        product['Review name'] = review_name_element.text if review_name_element else None

        # Adding the URL of the review
        product['URL'] = url

        # Append the extracted product information to the list of reviews
        extracted_reviews.append(product)

    return extracted_reviews

# Step 3: Fetch Reviews with Selenium, only restart driver if necessary
def fetch_reviews(driver, url):
    """Fetch reviews with Selenium and restart ChromeDriver only if no reviews are found."""
    try:
        driver.get(url)
        random_sleep()  # Random delay to wait for the page to load

        # Get the page source after it has fully loaded
        page_html = driver.page_source

        # Extract reviews using BeautifulSoup
        reviews = extract_reviews(page_html, url)

        if reviews:
            print(f"Successfully extracted {len(reviews)} reviews.")
            return reviews  # Return the extracted reviews
        else:
            print("No reviews found.")
            return None  # Return None if no reviews are found

    except Exception as e:
        print(f"Error during review fetching: {e}")
        return None

# Step 4: Fetch All Reviews (with pagination handling if necessary)
def fetch_all_reviews(url):
    """Main function to scrape reviews from all pages."""
    page = 1
    all_reviews = []

    driver = setup_selenium()  # Set up ChromeDriver initially

    while True:
        print(f"Fetching page {page}...")

        # Construct the URL for the current page
        page_url = f"{url}?page={page}"

        # Fetch reviews for the current page
        reviews = fetch_reviews(driver, page_url)

        # If no reviews were found, restart the driver and retry the current page
        if not reviews:
            print(f"No reviews on page {page}. Restarting ChromeDriver and retrying...")
            driver.quit()  # Quit the driver
            driver = setup_selenium()  # Restart ChromeDriver
            if driver is None:
                print("Failed to restart ChromeDriver.")
                break  # If driver fails to restart, stop the loop
            reviews = fetch_reviews(driver, page_url)  # Retry fetching reviews
            if not reviews:
                print(f"Stopping at page {page}. No more reviews found.")
                break

        # Add reviews to the total list
        all_reviews.extend(reviews)
        print(f"Reviews extracted from page {page}: {len(reviews)}")

        # Increment the page counter
        page += 1

        # Random sleep to avoid detection
        random_sleep()

    driver.quit()  # Close the driver after all reviews are fetched
    return all_reviews

# Step 5: Define Walmart URLs
urls = [
    'https://www.walmart.com/reviews/product/5129928603'
]

# Step 6: Scrape Reviews from All URLs
walmart_reviews = []

for url in urls:
    walmart_reviews.extend(fetch_all_reviews(url))

# Step 7: Convert Reviews to DataFrame and Save to CSV
df_walmart = pd.DataFrame(walmart_reviews)

# Post-processing and cleaning the DataFrame
df_walmart['Retailer'] = "Walmart"
df_walmart['scraping_date'] = pd.to_datetime('today').date()
df_walmart['Review date'] = pd.to_datetime(df_walmart['Review date']).dt.date
df_walmart['Review rating'] = df_walmart['Review rating'].str.replace(' out of 5 stars review', '').astype(float)
df_walmart.drop_duplicates(inplace=True)

# Save the DataFrame to a CSV file
df_walmart.to_csv('walmart_reviews.csv', index=False)

print("Reviews scraped and saved to 'walmart_reviews.csv'.")


ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Fetching page 1...
Sleeping for 2.89 seconds...
No reviews found.
No reviews on page 1. Restarting ChromeDriver and retrying...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.
Stealth techniques applied successfully.
Sleeping for 2.58 seconds...
No reviews found.
Stopping at page 1. No more reviews found.


KeyError: 'Review date'

In [5]:
import undetected_chromedriver as uc
import time
import random
import pandas as pd
from bs4 import BeautifulSoup
import re

# Step 1: Set up undetected ChromeDriver with stealth techniques
def setup_selenium(cookies):
    """Set up undetected Chrome WebDriver with stealth techniques and inject cookies."""
    options = uc.ChromeOptions()

    # Set a specific user agent to mimic Firefox on Linux
    user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0"
    options.add_argument(f"user-agent={user_agent}")

    # Disable browser detection flags
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("window-size=1920,1080")
    options.add_argument("--remote-debugging-port=9222")

    print("ChromeDriver options configured successfully.")

    # Start the Chrome WebDriver using undetected-chromedriver
    try:
        driver = uc.Chrome(options=options)
        time.sleep(5)  # Wait for ChromeDriver to initialize
        print("ChromeDriver initialized successfully.")
    except Exception as e:
        print(f"Error initializing ChromeDriver: {e}")
        return None

    # Inject cookies into the session
    driver.get("https://www.walmart.com")  # Open Walmart homepage to set cookies for the correct domain
    time.sleep(5)  # Wait for the page to load before injecting cookies

    for cookie in cookies:
        driver.add_cookie(cookie)

    print("Cookies injected successfully.")

    # Additional stealth techniques
    try:
        driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        print("Stealth techniques applied successfully.")
    except Exception as e:
        print(f"Error applying stealth techniques: {e}")
        return None

    return driver

# Random sleep time for human-like behavior
def random_sleep(min_delay=2, max_delay=5):
    """Introduce random sleep times to simulate human behavior."""
    delay = random.uniform(min_delay, max_delay)
    print(f"Sleeping for {delay:.2f} seconds...")
    time.sleep(delay)

# Step 2: Extract Reviews from Page using Selenium and BeautifulSoup
def extract_reviews(page_html, url):
    """Extract reviews from a Walmart page using BeautifulSoup."""
    extracted_reviews = []
    soup = BeautifulSoup(page_html, 'html.parser')

    # Finding all review blocks (adjust class based on actual Walmart page structure)
    review_elements = soup.find_all('div', class_=re.compile(r'overflow-visible b--none mt\d-l ma0 dark-gray'))

    if not review_elements:
        return None  # No reviews found, return None to trigger retry

    for review in review_elements:
        product = {}

        # Extract review details
        review_rating_element = review.select_one('.w_iUH7')
        product['Review rating'] = review_rating_element.text if review_rating_element else None

        verified_purchase_element = review.select_one('.pl2.green.b.f7.self-center')
        product['Verified Purchase or not'] = verified_purchase_element.text if verified_purchase_element else None

        review_date_element = review.select_one('.f7.gray')
        product['Review date'] = review_date_element.text if review_date_element else None

        review_title_element = review.select_one('.w_kV33.w_Sl3f.w_mvVb.f5.b')
        product['Review title'] = review_title_element.text if review_title_element else None

        review_content_element = review.select_one('span.tl-m.db-m')
        product['Review Content'] = review_content_element.text.strip() if review_content_element else None

        review_name_element = review.select_one('.f7.b.mv0')
        product['Review name'] = review_name_element.text if review_name_element else None

        # Adding the URL of the review
        product['URL'] = url

        # Append the extracted product information to the list of reviews
        extracted_reviews.append(product)

    return extracted_reviews

# Step 3: Fetch Reviews with Selenium, retry if necessary
def fetch_reviews_with_retry(url, cookies, max_retries=50):
    """Fetch reviews with Selenium, relaunching ChromeDriver if necessary."""
    retries = 0
    while retries < max_retries:
        print(f"Attempt {retries + 1} of {max_retries}...")

        driver = setup_selenium(cookies)  # Re-launch ChromeDriver with cookies
        if driver is None:
            print("Failed to initialize ChromeDriver. Retrying...")
            retries += 1
            random_sleep(5, 10)  # Longer delay before retry
            continue

        try:
            driver.get(url)
            random_sleep()  # Random delay to wait for the page to load

            # Get the page source after it has fully loaded
            page_html = driver.page_source

            # Extract reviews using BeautifulSoup
            reviews = extract_reviews(page_html, url)

            if reviews:
                print(f"Successfully extracted {len(reviews)} reviews.")
                driver.quit()  # Close the driver after success
                return reviews  # Return the extracted reviews
            else:
                print(f"No reviews found on attempt {retries + 1}. Retrying...")

        except Exception as e:
            print(f"Error during review fetching: {e}. Retrying...")

        driver.quit()  # Close driver before retrying
        retries += 1
        random_sleep(5, 10)  # Random sleep between retries to avoid detection

    print(f"Failed to extract reviews after {max_retries} attempts.")
    return []

# Step 4: Fetch All Reviews (with pagination handling if necessary)
def fetch_all_reviews(url, cookies):
    """Main function to scrape reviews from all pages."""
    page = 1
    all_reviews = []

    while True:
        print(f"Fetching page {page}...")

        # Construct the URL for the current page
        page_url = f"{url}?page={page}"

        # Fetch reviews for the current page
        reviews = fetch_reviews_with_retry(page_url, cookies)

        # If no reviews were found, stop the loop
        if not reviews:
            print(f"Stopping at page {page}. No more reviews.")
            break

        # Add reviews to the total list
        all_reviews.extend(reviews)
        print(f"Reviews extracted from page {page}: {len(reviews)}")

        # Increment the page counter
        page += 1

        # Random sleep to avoid detection
        random_sleep()

    return all_reviews

# Step 5: Convert cookie data to ChromeDriver's cookie format
def format_cookies(raw_cookies):
    """Convert raw cookie data to the format required by ChromeDriver."""
    formatted_cookies = []
    for raw_cookie in raw_cookies:
        cookie = {
            'name': raw_cookie.get('Name raw'),
            'value': raw_cookie.get('Content raw'),
            'domain': raw_cookie.get('Host raw').replace("http://", "").replace("https://", "").replace("www.", ""),
            'path': raw_cookie.get('Path raw'),
            'expiry': int(raw_cookie.get('Expires raw')) if raw_cookie.get('Expires raw') != '0' else None,
            'secure': raw_cookie.get('Send for raw') == 'true',
            'httpOnly': raw_cookie.get('HTTP only raw') == 'true'
        }
        formatted_cookies.append(cookie)
    return formatted_cookies

# Step 6: Define Walmart URLs and Raw Cookies
urls = [
    'https://www.walmart.com/reviews/product/5129928603'
]

raw_cookies = [
    {
        "Host raw": "https://drfdisvc.walmart.com/",
        "Name raw": "thx_guid",
        "Path raw": "/",
        "Content raw": "695992473ab83a5fc29f753ea1039108",
        "Expires raw": "1879753265",
        "Send for raw": "true",
        "HTTP only raw": "true"
    },
    {
        "Host raw": "https://identity.walmart.com/",
        "Name raw": "locGuestData",
        "Path raw": "/",
        "Content raw": "eyJpbnRlbnQiOiJTSElQUElORyIsImlzRXhwbGljaXQiOmZhbHNlLCJzdG9yZUludGVudCI6IlBJQ0tVUCIsIm1lcmdlRmxhZyI6ZmFsc2UsImlzRGVmYXVsdGVkIjp0cnVlLCJwaWNrdXAiOnsibm9kZUlkIjoiMzA4MSIsInRpbWVzdGFtcCI6MTcyNjk0NDA2NDk1OSwic2VsZWN0aW9uVHlwZSI6IkRFRkFVTFRFRCJ9LCJzaGlwcGluZ0FkZHJlc3MiOnsidGltZXN0YW1wIjoxNzI2OTQ0MDY0OTU5LCJ0eXBlIjoicGFydGlhbC1sb2NhdGlvbiIsImdpZnRBZGRyZXNzIjpmYWxzZSwicG9zdGFsQ29kZSI6Ijk1ODI5IiwiZGVsaXZlcnlTdG9yZUxpc3QiOlt7Im5vZGVJZCI6IjMwODEiLCJ0eXBlIjoiREVMSVZFUlkiLCJ0aW1lc3RhbXAiOjE3MjY5NDQwNjQ5NDUsImRlbGl2ZXJ5VGllciI6bnVsbCwic2VsZWN0aW9uVHlwZSI6IkRFRkFVTFRFRCIsInNlbGVjdGlvblNvdXJjZSI6bnVsbH1dLCJjaXR5IjoiU2FjcmFtZW50byIsInN0YXRlIjoiQ0EifSwicG9zdGFsQ29kZSI6eyJ0aW1lc3RhbXAiOjE3MjY5NDQwNjQ5NTksImJhc2UiOiI5NTgyOSJ9LCJtcCI6W10sIm1zcCI6eyJub2RlSWRzIjpbXSwidGltZXN0YW1wIjpudWxsfSwidmFsaWRhdGVLZXkiOiJwcm9kOnYyOjIwZGE1MWIwLWNmYzItNDZiNC05ZWNhLTZlNjAyNWZkNTBhMSJ9",
        "Expires raw": "1758480065",
        "Send for raw": "true",
        "HTTP only raw": "true"
    },
    {
        "Host raw": "https://identity.walmart.com/",
        "Name raw": "userAppVersion",
        "Path raw": "/",
        "Content raw": "us-web-1.160.0-e7b8d8136f0ca2c796140aa8f52c5f4d1b49c7b0-091816",
        "Expires raw": "1758480065",
        "Send for raw": "true",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://identity.walmart.com/",
        "Name raw": "_pxhd",
        "Path raw": "/",
        "Content raw": "ceabe07eaa553b4280b1b7d71fd5e683fb59d85a80f7cfcd82bcda47d40e63b9:7ac1ccf5-5fa1-11ef-864e-d0cd5ff06627",
        "Expires raw": "1758480089",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://tap.walmart.com/",
        "Name raw": "TS2a5e0c5c027",
        "Path raw": "/",
        "Content raw": "08b0b9c65dab20004b5d11191b366f74081ee7f3afb22ad16748024b018281741a0493d6db4b315b086396a6841130004e164d4767ef664a647e2e23e2500b46243e58ed41956075a25fea2da78841cbc46a50b226d94e84fdbe19fb96363e30",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://identity.walmart.com/",
        "Name raw": "TS013f65e0",
        "Path raw": "/",
        "Content raw": "01816dd022dff3e210457ca19a7e5e3ddc4dc273392bb7eadc6db2916ac13642a7ae25a5f16992fcb431c6403e1cac6b4dc59172ea",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://identity.walmart.com/",
        "Name raw": "TS2a5e0c5c027",
        "Path raw": "/",
        "Content raw": "08046e0832ab2000d001e8b18a638002dea1a3687a9cf89f555568439c3511f62100fab902da6ca1088d1ae757113000173177d7bdc650dfc318f021fb711441aa924d9bb7ac2cbd13c3b38e5418c457c64acd831e3d2a52b9b1d80c4c4319e3",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://identity.walmart.com/",
        "Name raw": "TS016ef4c8",
        "Path raw": "/",
        "Content raw": "01144c8cd2cb2d125761b09834229f7882e69c8abb911278bd94f099210056a4f43c6c5035a00af1f10f6a06c76a5117f331eb5e09",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://identity.walmart.com/",
        "Name raw": "TS8cb5a80e027",
        "Path raw": "/",
        "Content raw": "08faef4f78ab2000f2d968ab933894cbaafd4c637aca7a58702cba9222836e7e7263d5ec8027d5a208905f757e1130004d8cc94180ef280e204311c8d54675e2c955e1ff1b73ac150466c2416b86962e0a417ecdd37b87944c0c46187994c071",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "bsc",
        "Path raw": "/",
        "Content raw": "VqLPWatmibTk42IIYqRX0Q",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap_path",
        "Path raw": "/",
        "Content raw": "/rum.gif",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-criteo",
        "Path raw": "/",
        "Content raw": "1728213127506:1728213129585:1",
        "Expires raw": "1729422729",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-lrB",
        "Path raw": "/",
        "Content raw": "1728213129584:0:1",
        "Expires raw": "1729422729",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-lrV",
        "Path raw": "/",
        "Content raw": "1728213134484:0:1",
        "Expires raw": "1729422734",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-wmt-dw",
        "Path raw": "/",
        "Content raw": "1728213167814:1728213169142:1",
        "Expires raw": "1729422769",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-googdsp",
        "Path raw": "/",
        "Content raw": "1728213169141:0:1",
        "Expires raw": "1728213769",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "_tap-appnexus",
        "Path raw": "/",
        "Content raw": "1728213138897:0:2",
        "Expires raw": "1729422795",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "btc",
        "Path raw": "/",
        "Content raw": "Zdk8B67IkX6yJUy6I9lBhY",
        "Expires raw": "2043789216",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://beacon.walmart.com/",
        "Name raw": "b30msc",
        "Path raw": "/",
        "Content raw": "TPd2HjHFGe8sc-1wf3j8ME",
        "Expires raw": "1728215016",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://b.www.walmart.com/",
        "Name raw": "bsc",
        "Path raw": "/",
        "Content raw": "dtf2pVFcLW2woLI2E9bsUI",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://b.www.walmart.com/",
        "Name raw": "_tap_path",
        "Path raw": "/",
        "Content raw": "/rum.gif",
        "Expires raw": "0",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://b.www.walmart.com/",
        "Name raw": "_tap-criteo",
        "Path raw": "/",
        "Content raw": "1728213217898:1728213218528:1",
        "Expires raw": "1729422818",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://b.www.walmart.com/",
        "Name raw": "_tap-lrB",
        "Path raw": "/",
        "Content raw": "1728213218527:0:1",
        "Expires raw": "1729422818",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "http://.walmart.com/",
        "Name raw": "AID",
        "Path raw": "/",
        "Content raw": "wmlspartner%3D0%3Areflectorid%3D0000000000000000000000%3Alastupd%3D1728213222366",
        "Expires raw": "2043573222",
        "Send for raw": "false",
        "HTTP only raw": "true"
    },
    {
        "Host raw": "http://.walmart.com/",
        "Name raw": "com.wm.reflector",
        "Path raw": "/",
        "Content raw": "%22reflectorid%3A0000000000000000000000%40lastupd%3A1728213222366%40firstcreate%3A1728213222366%22",
        "Expires raw": "2043573222",
        "Send for raw": "false",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "https://.www.walmart.com/",
        "Name raw": "ACID",
        "Path raw": "/",
        "Content raw": "fff2fa77-0bb3-43a8-9a7c-4ae78cfc76b7",
        "Expires raw": "1759749222",
        "Send for raw": "true",
        "HTTP only raw": "true"
    },
    {
        "Host raw": "https://www.walmart.com/",
        "Name raw": "_intlbu",
        "Path raw": "/",
        "Content raw": "false",
        "Expires raw": "1728216822",
        "Send for raw": "true",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "https://www.walmart.com/",
        "Name raw": "_shcc",
        "Path raw": "/",
        "Content raw": "US",
        "Expires raw": "1728216822",
        "Send for raw": "true",
        "HTTP only raw": "false"
    },
    {
        "Host raw": "https://www.walmart.com/",
        "Name raw": "assortmentStoreId",
        "Path raw": "/",
        "Content raw": "3081",
        "Expires raw": "1728216822",
        "Send for raw": "true",
        "HTTP only raw": "false"
    }
]



cookies = format_cookies(raw_cookies)  # Format cookies to be used with ChromeDriver

# Step 7: Scrape Reviews from All URLs
walmart_reviews = []

for url in urls:
    walmart_reviews.extend(fetch_all_reviews(url, cookies))

# Step 8: Convert Reviews to DataFrame and Save to CSV
df_walmart = pd.DataFrame(walmart_reviews)

# Post-processing and cleaning the DataFrame
df_walmart['Retailer'] = "Walmart"
df_walmart['scraping_date'] = pd.to_datetime('today').date()
df_walmart['Review date'] = pd.to_datetime(df_walmart['Review date']).dt.date
df_walmart['Review rating'] = df_walmart['Review rating'].str.replace(' out of 5 stars review', '').astype(float)
df_walmart.drop_duplicates(inplace=True)

# Save the DataFrame to a CSV file
df_walmart.to_csv('walmart_reviews.csv', index=False)

print("Reviews scraped and saved to 'walmart_reviews.csv'.")


Fetching page 1...
Attempt 1 of 50...
ChromeDriver options configured successfully.
ChromeDriver initialized successfully.


InvalidCookieDomainException: Message: invalid cookie domain: Cookie 'domain' mismatch
  (Session info: chrome=129.0.6668.89)
Stacktrace:
#0 0x555ec0de402a <unknown>
#1 0x555ec0aca5e0 <unknown>
#2 0x555ec0b6f9d9 <unknown>
#3 0x555ec0b3eb22 <unknown>
#4 0x555ec0b5dd7d <unknown>
#5 0x555ec0b3e8c3 <unknown>
#6 0x555ec0b0c6b3 <unknown>
#7 0x555ec0b0d68e <unknown>
#8 0x555ec0daea2b <unknown>
#9 0x555ec0db29b1 <unknown>
#10 0x555ec0d9b225 <unknown>
#11 0x555ec0db3532 <unknown>
#12 0x555ec0d8038f <unknown>
#13 0x555ec0dd2f28 <unknown>
#14 0x555ec0dd30f3 <unknown>
#15 0x555ec0de2e7c <unknown>
#16 0x7fa97c786732 <unknown>
