In [2]:
# Requires: requests, beautifulsoup4
import requests
from bs4 import BeautifulSoup
import time
import re


URL = "https://www.amazon.in/s?k=laptop&crid=31TPTM1NJVMEC&sprefix=laptop%2Caps%2C355&ref=nb_sb_noss_2"


# make header to mimic a browser visit
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36"
}
# Create a empty list to store laptop Details

data = []
print("Data list cleared")

for page in range(1, 2):  
    params = {"k": "laptop", "page": page}
    
    # Use verify=False to bypass SSL verification (for testing purposes)
    response = requests.get(URL, params=params, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")
    
    products = soup.find_all("div", {"data-component-type": "s-search-result"})

    
    
    for product in products:
        # Extract the product title
        title_tag = product.find("h2")
        if not title_tag:
            continue
        title_text = title_tag.get_text(strip=True)

        
        # Extract the product price
        price_tag = product.find("span", {"class": "a-price-whole"})
        price_text = price_tag.get_text(strip=True) if price_tag else "N/A"
        
        # Extract the product brand
        match = re.match(r'^\W*([A-Za-z]+)', title_text)
        brand = match.group(1).upper() if match else "UNKNOWN"

        # Extract RAM (improved version)
        ram_match = re.search(r'(\d+)\s*GB\s*(RAM|DDR\d+|LPDDR\d+)', title_text, re.IGNORECASE)
        ram = ram_match.group(1) + "GB" if ram_match else "N/A"
        
        # Extract the product rating 
        rating_tag = product.find("span", class_="a-icon-alt")
        rating = "N/A"
        if rating_tag:
            rating_match = re.search(r'(\d+\.?\d*)', rating_tag.get_text())
            rating = rating_match.group(1) if rating_match else "N/A"

        # Extract the Product SSD - Simplified
        ssd_match = re.search(r'(\d+)\s*(GB|TB)\s*(?:SSD|NVMe)', title_text, re.IGNORECASE) or re.search(r'(\d+)\s*(GB|TB)', title_text)
        ssd = (ssd_match.group(1) + ssd_match.group(2).upper()) if ssd_match else "N/A"
      

        # Extract the Product Windows Version - Improved version
        windows_version = "N/A"
        
        # Try: "Windows 11", "Windows 10", "Win11", "Win10", etc.
        windows_match = re.search(r'(?:Windows\s*|Win\s*)(\d+)', title_text, re.IGNORECASE)
        if windows_match:
            windows_version = "Windows " + windows_match.group(1)
        
        # Extract the Product Color
        color_match = re.search(r'\b(Black|Silver|Gray|Grey|White|Blue|Red|Gold|Green|Brown|Pink|Purple|Yellow|Orange|Champagne|Midnight|Space|Cosmic|Stardust|Graphite|Ash|Onyx|Platinum|Metallic)\b', title_text, re.IGNORECASE)
        color = color_match.group(1) if color_match else "N/A"

        # Extract the Processor (Intel, AMD, Apple M, etc)
        processor = "N/A"
        
        # Try Intel processors - more flexible patterns
        intel_match = re.search(r'Intel\s+(?:Core\s+)?(?:i[3579]|m\d|Pentium|Celeron|Atom|Xeon)[-\w]*', title_text, re.IGNORECASE)
        if intel_match:
            processor = intel_match.group(0).strip()
        
        # Try AMD Ryzen/Athlon processors
        elif re.search(r'AMD', title_text, re.IGNORECASE):
            amd_match = re.search(r'AMD\s+(?:Ryzen|Athlon)[\s\w]*', title_text, re.IGNORECASE)
            if amd_match:
                processor = amd_match.group(0).strip()
        
        # Try Apple M series
        elif re.search(r'Apple\s+M', title_text, re.IGNORECASE):
            apple_match = re.search(r'Apple\s+M\d+\w*', title_text, re.IGNORECASE)
            if apple_match:
                processor = apple_match.group(0).strip()
        
        # Try other processors
        else:
            other_match = re.search(r'Qualcomm\s+Snapdragon|MediaTek|ARM|Exynos', title_text, re.IGNORECASE)
            if other_match:
                processor = other_match.group(0).strip()


        # Store the extracted data in a dictionary and append to the list
        data.append({
            "Title": title_text,
            "Price": price_text,
            "Brand": brand,
            'RAM': ram,
            "Rating": rating,
            "Storage": ssd,
            "Windows": windows_version,
            "Color": color,
            "Processor": processor
            
        })

      
    print(f"Page {page} scraped")
    time.sleep(1)

for product in data:
    print("Title:", product["Title"])
    print("Price:", product["Price"])
    print("Brand:", product["Brand"])
    print("RAM:", product.get("RAM"))
    print("Processor:", product.get("Processor"))
    print("Rating:", product.get("Rating"))
    print("Storage:", product.get("Storage"))
    print("Windows:", product.get("Windows"))
    print("Color:", product.get("Color"))
    print("-" * 50)

# make data into a dataframe 
import pandas as pd

# Create a DataFrame from the data list
df = pd.DataFrame(data)
df.head()
print("First 3 product titles:")
for i in range(min(3, len(data))):
    print(f"\n{i+1}. Title: {data[i]['Title']}")
    print(f"   Processor: {data[i]['Processor']}")



Data list cleared
Page 1 scraped
Title: HP Omnibook 5 OLED, Snapdragon X Processor (16GB LPDDR5x,1TB SSD) 2K OLED, Micro-Edge, 16''/40.6cm, Win11, M365*Office24, Glacier Silver, 1.59kg, fb0001QU, FHD Camera, Backlit, Next-Gen AI Laptop
Price: 69,990
Brand: HP
RAM: 16GB
Processor: N/A
Rating: 4.1
Storage: 1TB
Windows: Windows 11
Color: Silver
--------------------------------------------------
Title: HP 15, AMD Ryzen 7 7735HS (16GB DDR5,512GB SSD) FHD, Anti-Glare, Micro-Edge, 15.6''/39.6cm, Win11, M365 Basic(1yr)* Office24, Silver, 1.59kg, fc1038AU, AMD Radeon FHD Camera w/Shutter, Backlit Laptop
Price: 56,990
Brand: HP
RAM: 16GB
Processor: AMD Ryzen 7 7735HS
Rating: 4.0
Storage: 512GB
Windows: Windows 11
Color: Silver
--------------------------------------------------
Title: EBook 11.6" HD Laptop | Best Student & Office Work Laptop | Celeron N4020 | 4GB DDR4 | 128GB eMMC + M.2 SSD Expandable Slot | Win 11 Home |31Wh Battery | UHD Graphics 600 | Black
Price: 10,990
Brand: EBOOK
RAM: 4GB
