In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# HTTP headers sent with each request; the User-Agent value imitates a
# desktop Chrome browser on Windows.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

# Private helpers for scrape_amazon_page. Each one reads a single field from
# one search-result element (a BeautifulSoup Tag) and returns a plain value.
def _extract_brand(product):
    """Return the first word of the product title, or 'Unknown' if absent."""
    title = product.find('span', {'class': 'a-size-base-plus a-color-base a-text-normal'})
    words = title.text.split() if title is not None else []
    # An empty title span would otherwise raise IndexError on words[0].
    return words[0] if words else 'Unknown'


def _extract_price(product):
    """Return the displayed price as a float, or 0.0 when no price is present.

    Raises:
        ValueError: if the price text is present but not numeric.
    """
    whole_el = product.find('span', {'class': 'a-price-whole'})
    if whole_el is None:
        return 0.0

    # Defensive: drop thousands separators and any trailing decimal point in
    # the whole part so that joining with the fraction cannot yield "1699..00".
    whole = whole_el.text.replace(',', '').strip().rstrip('.')

    fraction_el = product.find('span', {'class': 'a-price-fraction'})
    if fraction_el is not None:
        return float(whole + '.' + fraction_el.text.strip())
    return float(whole)


def _extract_rating(product):
    """Return the leading number of the rating text as a float, or 0.0.

    Raises:
        ValueError: if the first word of the rating text is not numeric.
    """
    rating_el = product.find('span', {'class': 'a-icon-alt'})
    words = rating_el.text.split() if rating_el is not None else []
    return float(words[0]) if words else 0.0


def _extract_rating_count(product):
    """Return the rating count as an int, or 0 if it is missing/non-numeric."""
    count_el = product.find('span', {'class': 'a-size-base'})
    if count_el is None:
        return 0
    digits = count_el.text.replace(',', '').strip()
    return int(digits) if digits.isdigit() else 0


def _extract_product_url(product):
    """Return the absolute product URL, or None when the card has no link."""
    from urllib.parse import urljoin

    heading = product.find('h2')
    link = heading.find('a') if heading is not None else None
    href = link.get('href') if link is not None else None
    if not href:
        return None
    # urljoin copes with both site-relative and already-absolute hrefs.
    return urljoin('https://www.amazon.in', href)


# Function to scrape data from a single page of Amazon search results
def scrape_amazon_page(url):
    """Fetch one Amazon search-results page and extract its product listings.

    The page is requested with the module-level ``headers``. A 503 response
    is retried up to three times with a random 5-10 second pause. Products
    whose fields cannot be parsed are skipped with a printed message.

    Args:
        url: Full URL of the search-results page.

    Returns:
        A list of dicts with the keys 'Brand', 'Price', 'Rating',
        'Rating Count', 'Review Count', 'Rank' and 'URL'. The list is empty
        when the request fails or the final status code is not 200.
    """
    request_timeout = 30  # seconds; without a timeout a stalled connection hangs forever

    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
        print(f"Requesting {url} - Status Code: {response.status_code}")

        # Retry mechanism if 503 error is encountered
        retry_count = 0
        while response.status_code == 503 and retry_count < 3:
            print(f"503 Error encountered. Retrying... ({retry_count + 1}/3)")
            time.sleep(random.uniform(5, 10))  # Wait longer before retrying
            response = requests.get(url, headers=headers, timeout=request_timeout)
            retry_count += 1

        if response.status_code != 200:
            print(f"Failed to retrieve the page: {response.status_code}")
            return []  # Return empty list if the page could not be loaded

        soup = BeautifulSoup(response.content, 'html.parser')

        # Check if any products are found
        products = soup.find_all('div', {'data-component-type': 's-search-result'})
        print(f"Found {len(products)} products on the page.")

        products_data = []

        # Rank is the 1-based position on the page, so a skipped product
        # does not shift the rank of the products that follow it.
        for rank, product in enumerate(products, start=1):
            try:
                brand = _extract_brand(product)
                price = _extract_price(product)
                rating = _extract_rating(product)
                rating_count = _extract_rating_count(product)

                # Review count is often the same as the rating count
                review_count = rating_count

                product_url = _extract_product_url(product)
                if product_url is None:
                    print("Skipping a product due to missing data: product link not found")
                    continue

                print(f"Scraped product: {brand}, Price: {price}, Rating: {rating}, Rating Count: {rating_count}")

                products_data.append({
                    'Brand': brand,
                    'Price': price,
                    'Rating': rating,
                    'Rating Count': rating_count,
                    'Review Count': review_count,
                    'Rank': rank,
                    'URL': product_url
                })

            except AttributeError as e:
                print(f"Skipping a product due to missing data: {e}")
                continue
            except ValueError as e:
                print(f"Skipping a product due to ValueError: {e}")
                continue

        return products_data

    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts, including those raised
        # during a retry.
        print(f"Error requesting page: {e}")
        return []

# List of specific URLs to scrape
urls = [
    "https://www.amazon.in/s?k=smartlock&crid=3JY679FXAAQ2B&qid=1724948030&sprefix=%2Caps%2C238&ref=sr_pg_1",
    "https://www.amazon.in/s?k=smartlock&page=2&crid=3JY679FXAAQ2B&qid=1724948089&sprefix=%2Caps%2C238&ref=sr_pg_2",
    "https://www.amazon.in/s?k=smartlock&page=3&crid=3JY679FXAAQ2B&qid=1724948096&sprefix=%2Caps%2C238&ref=sr_pg_3",
    "https://www.amazon.in/s?k=smartlock&page=4&crid=3JY679FXAAQ2B&qid=1724948117&sprefix=%2Caps%2C238&ref=sr_pg_4",
    "https://www.amazon.in/s?k=smartlocks&crid=2BFH59N330DM1&sprefix=smartlock%2Caps%2C243&ref=nb_sb_ss_ts-doa-p_4_9"
]

# List to store data from all pages
all_products = []

# Scrape data from each URL
for position, url in enumerate(urls, start=1):
    print(f"Scraping URL: {url}")
    page_data = scrape_amazon_page(url)
    all_products.extend(page_data)
    # Longer random delay to avoid getting blocked. It is only needed between
    # requests, so it is skipped after the final URL.
    if position < len(urls):
        time.sleep(random.uniform(10, 15))

# Convert the data to a pandas DataFrame and save it to a CSV file
df = pd.DataFrame(all_products)
df.to_csv('amazon_smart_locks_data.csv', index=False)

print("Data scraping completed. Data saved to amazon_smart_locks_data.csv")


Scraping URL: https://www.amazon.in/s?k=smartlock&crid=3JY679FXAAQ2B&qid=1724948030&sprefix=%2Caps%2C238&ref=sr_pg_1
Requesting https://www.amazon.in/s?k=smartlock&crid=3JY679FXAAQ2B&qid=1724948030&sprefix=%2Caps%2C238&ref=sr_pg_1 - Status Code: 200
Found 24 products on the page.
Scraped product: LAVNA, Price: 1699.0, Rating: 4.1, Rating Count: 16
Scraped product: QUBO, Price: 8990.0, Rating: 4.4, Rating Count: 1305
Scraped product: LAVNA, Price: 3299.0, Rating: 4.2, Rating Count: 58
Scraped product: Godrej, Price: 14607.0, Rating: 4.4, Rating Count: 485
Scraped product: Yale, Price: 10999.0, Rating: 4.1, Rating Count: 350
Scraped product: LAVNA, Price: 5689.0, Rating: 4.3, Rating Count: 283
Scraped product: Plantex, Price: 2999.0, Rating: 4.3, Rating Count: 6
Scraped product: Escozor, Price: 1599.0, Rating: 3.9, Rating Count: 133
Scraped product: Escozor®, Price: 6990.0, Rating: 3.0, Rating Count: 9
Scraped product: LAVNA, Price: 7980.0, Rating: 4.3, Rating Count: 959
Scraped product: