In [None]:
# Import Required Libraries
# Import pandas for data manipulation, requests for HTTP requests,
# BeautifulSoup for HTML parsing, re for regex patterns, time for delays
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time

In [None]:
#  Set Amazon URL
# Amazon India search-results URL for "top 10 best performance laptops".
# The query string is written one parameter per line so each part is easy to
# read; adjacent string literals are joined into the same single URL.
URL = (
    "https://www.amazon.in/s"
    "?k=top+10+best+performance+laptops"
    "&adgrpid=60753480223"
    "&ext_vrnc=hi"
    "&hvadid=590712376475"
    "&hvdev=c"
    "&hvlocphy=9300476"
    "&hvnetw=g"
    "&hvqmt=b"
    "&hvrand=419610393738734445"
    "&hvtargid=kwd-393673962435"
    "&hydadcr=24510_2265449"
    "&mcid=8a64569d104f3e198d4d08465e1990a3"
    "&tag=googinhydr1-21"
    "&ref=pd_sl_7ssry6798d_b"
)

In [None]:
#  Set Request Headers
# Headers sent with every request. The only entry is a desktop Chrome
# User-Agent string, intended to make the request look like browser traffic.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/145.0.0.0 Safari/537.36"
    ),
}

In [None]:
# Make Initial HTTP Request
# Send an HTTP GET request to Amazon to retrieve the page content.
# A timeout (in seconds) is passed because requests otherwise waits
# indefinitely on a stalled connection, which would hang the notebook kernel.
response=requests.get(URL,headers=headers,timeout=30)
# Bare expression: the notebook renders the response object's repr.
response

In [None]:
#  Parse HTML Content
# Build a BeautifulSoup tree from the raw response bytes using the
# "html.parser" backend, then show the parsed document as the cell output.
soup = BeautifulSoup(markup=response.content, features="html.parser")
soup

In [None]:
#  Initialize Data List
# Start with an empty list; the scraping cell appends one dict per product to it.
data = list()

In [None]:
#  Check Response Status
# Send another request and verify the status code (200 = success).
# The timeout (in seconds) keeps a stalled connection from blocking the
# kernel forever; requests does not time out unless told to.
response=requests.get(URL,headers=headers,timeout=30)
response.status_code

In [None]:
#  Loop Through Multiple Pages
# Request result pages 1 and 2 with pagination parameters.
# `response` is overwritten on every iteration, so after the loop it holds
# only the response for the last page requested.
for page in range(1,3):
    # NOTE(review): URL already contains its own `k=` search term and these
    # params are added on top of it, so the request carries two `k` values -
    # confirm which one the site honours.
    params={'k':'laptop','page':page}
    # make the request to the website with the parameters; the timeout
    # (seconds) prevents an indefinite wait on a stalled connection
    response=requests.get(URL,headers=headers,params=params,timeout=30)

In [None]:
#  Display Response Object
# Show the current `response` object as the cell output. When the notebook is
# run top to bottom this is the response from the last iteration of the
# pagination loop, not from the initial request.
response

In [None]:
#  Display HTML Content
# Show the raw body bytes of the current `response` as the cell output.
# This is the unparsed page, so the output can be very long.
response.content

In [None]:
#  Display Parsed Soup Object
# Show the BeautifulSoup object as the cell output.
# NOTE: when cells are run top to bottom, `soup` is still the parse of the
# first request; the pagination loop reassigns only `response`, so this output
# does not reflect the paginated responses.
soup

In [None]:
#  Scrape and Extract Product Data (Main Scraping Logic)
# For every results page this cell fetches the HTML, finds each search-result
# card and appends one dict per product to `data` with the keys:
# title, brand, price, storage, color, rating, ram.
# Brand, storage, color and RAM are guessed from the title text with regexes;
# any field that cannot be found is stored as "N/A".

def _extract_storage(title):
    """Return the disk capacity mentioned in a product title, or "N/A".

    A capacity explicitly tagged SSD/HDD wins. Otherwise the first GB/TB
    figure that is not directly followed by a memory keyword is used, so the
    RAM size in a title such as "16GB RAM 512GB SSD" is not reported as
    storage.
    """
    tagged = re.search(r"\d+\s?(?:GB|TB)\s?(?:SSD|HDD)", title, re.IGNORECASE)
    if tagged:
        return tagged.group()
    untagged = re.search(
        r"\d+\s?(?:GB|TB)\b(?!\s*(?:RAM|DDR|LPDDR|Memory))", title, re.IGNORECASE
    )
    return untagged.group() if untagged else "N/A"


# Empty the shared list first so re-running this cell does not append duplicates.
data.clear()

for page in range(1,2):
    # NOTE(review): URL already contains its own `k=` search term and these
    # params are added on top of it, so the request carries two `k` values -
    # confirm which one the site honours.
    params={'k':'laptop','page':page}
    # make the request to the website with the parameters; without a timeout
    # requests would wait indefinitely on a stalled connection
    response=requests.get(URL,headers=headers,params=params,timeout=30)

    # A failed or blocked request would otherwise be parsed like a normal page
    # and silently contribute zero products.
    if not response.ok:
        print(f"page{page} request failed with status {response.status_code}")
        time.sleep(1)
        continue

    #beautifulsoup object
    soup=BeautifulSoup(response.content,"html.parser")
    # every search hit is a div tagged data-component-type="s-search-result"
    products=soup.find_all("div",{"data-component-type":"s-search-result"})
    #extract the data from the products
    for result in products:
        # extract title; cards without an <h2> are skipped
        title_tag=result.find("h2")
        if not title_tag:
            continue
        title_text=title_tag.get_text(strip=True)

        # extract brand: first run of letters in the title, upper-cased
        match=re.match(r'^\W*([A-Za-z]+)',title_text)
        brand=match.group(1).upper() if match else "UNKNOWN"

        # extract price
        price_tag=result.find("span",{"class":"a-price-whole"})
        price_text=price_tag.text if price_tag else "N/A"

        # extract storage (GB/TB, preferring a figure tagged SSD/HDD)
        storage=_extract_storage(title_text)

        # extract color; word boundaries stop "Red" matching inside words
        # such as "Redmi" or "Powered"
        color_match=re.search(r"\b(Space Gray|Black|Silver|Gold|White|Blue|Red|Gray|Grey)\b", title_text, re.IGNORECASE)
        color=color_match.group(1) if color_match else "N/A"

        # extract RAM (looking for GB RAM pattern)
        ram_match=re.search(r"(\d+)\s?GB\s?RAM", title_text, re.IGNORECASE)
        ram=ram_match.group(1)+"GB" if ram_match else "N/A"

        # extract rating
        # NOTE(review): only <span> tags whose class contains "a-icon-star"
        # are searched - confirm this matches the rating element on the live page.
        rating_tag=result.find("span",{"class":re.compile("a-icon-star")})
        rating_text=rating_tag.text if rating_tag else "N/A"

        data.append({"title": title_text,
                     "brand": brand,
                     "price": price_text,
                     "storage": storage,
                     "color": color,
                     "rating": rating_text,
                     "ram": ram
        })

    print(f"page{page}scrapped")
    # pause between pages to avoid hammering the site
    time.sleep(1)

In [None]:
#  Display All Product Data
# Print every scraped product as "Label: value" lines, one field per line,
# followed by a 50-dash separator. Missing keys are shown as "N/A".
field_labels = (
    ("Title", "title"),
    ("Brand", "brand"),
    ("Price", "price"),
    ("Storage", "storage"),
    ("Color", "color"),
    ("Rating", "rating"),
    ("RAM", "ram"),
)
for product in data:
    for label, key in field_labels:
        print(f"{label}:", product.get(key, "N/A"))
    print("-" * 50)

In [None]:
#  Extract and Display Storage Information
# Re-derive the storage capacity from each product title and print it next to
# the title, followed by a 50-dash separator.
for product in data:
    title = product.get("title", "")
    # Prefer a capacity explicitly tagged SSD/HDD. Otherwise take the first
    # GB/TB figure that is not directly followed by a memory keyword, so the
    # RAM size in "16GB RAM 512GB SSD" is not reported as storage.
    storage = (
        re.search(r"\d+\s?(?:GB|TB)\s?(?:SSD|HDD)", title, re.IGNORECASE)
        or re.search(r"\d+\s?(?:GB|TB)\b(?!\s*(?:RAM|DDR|LPDDR|Memory))", title, re.IGNORECASE)
    )
    storage_text = storage.group() if storage else "N/A"
    print(f"Title: {title}")
    print(f"Storage: {storage_text}")
    print("-"*50)