In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

def get_product_links(base_url, pages=25, timeout=30):
    """Collect laptop product links from the paginated category listing.

    Args:
        base_url: Site root without a trailing slash; the listing path
            ``/laptop-notebook/laptop`` is appended to it.
        pages: Number of listing pages to visit, starting at page 1.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of ``href`` values, in page order, taken from the anchor inside
        every ``<h4 class="p-item-name">`` element. Pages that cannot be
        fetched are reported on stdout and skipped.
    """
    all_links = []

    for page in range(1, pages + 1):
        # The first page has no query string; later pages use ?page=N.
        if page == 1:
            url = f"{base_url}/laptop-notebook/laptop"
        else:
            url = f"{base_url}/laptop-notebook/laptop?page={page}"

        print(f"Fetching links from page {page}...")
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # One unreachable page must not discard links already collected.
            print(f"Failed to fetch page {page}. Error: {e}")
        else:
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                for item in soup.find_all('h4', class_='p-item-name'):
                    anchor = item.find('a')
                    # Skip headings with no anchor or no href instead of crashing.
                    href = anchor.get('href') if anchor is not None else None
                    if href:
                        all_links.append(href)
            else:
                print(f"Failed to fetch page {page}. Status code: {response.status_code}")

        # Add delay to avoid overwhelming the server; nothing follows the
        # last page, so no pause is needed there.
        if page < pages:
            time.sleep(1)

    print(f"Total {len(all_links)} product links found.")
    return all_links

def extract_specification_data(soup):
    """Flatten the specification tables of a product page into one dict.

    Every ``<table class="data-table">`` is scanned for section heading cells
    (``<td class="heading-row">``). For each heading, the rows of the next
    ``<tbody>`` are read and every name/value cell pair is stored under the
    key ``"<section>_<name>"``. Newlines inside a value become spaces.

    Args:
        soup: Parsed product page supporting ``find_all``.

    Returns:
        A dict mapping ``"<section>_<name>"`` to the value text; empty when
        the page has no specification table.
    """
    specs = {}

    for spec_table in soup.find_all('table', class_='data-table'):
        for heading in spec_table.find_all('td', class_='heading-row'):
            section = heading.text.strip()

            # Start two levels above the heading cell (presumably the element
            # wrapping its row) and take the first <tbody> that follows.
            body = heading.parent.parent.find_next('tbody')
            if not body:
                continue

            for tr in body.find_all('tr'):
                label = tr.find('td', class_='name')
                content = tr.find('td', class_='value')
                # Rows missing either cell carry no usable pair.
                if not (label and content):
                    continue
                field = f"{section}_{label.text.strip()}"
                specs[field] = content.text.strip().replace('\n', ' ')

    return specs

def extract_product_info(soup):
    """Read the product title and the summary info table from a product page.

    The title comes from ``<h1 class="product-name">`` and is stored under
    ``'product_name'``. Each ``<tr class="product-info-group">`` of the
    ``<table class="product-info-table">`` contributes one entry keyed by its
    label text. For the ``'Price'`` and ``'Regular Price'`` labels everything
    except digits and dots is stripped from the value.

    Args:
        soup: Parsed product page supporting ``find``.

    Returns:
        A dict of the fields found; elements missing from the page are simply
        absent from the result.
    """
    price_labels = ('Price', 'Regular Price')
    details = {}

    title = soup.find('h1', class_='product-name')
    if title:
        details['product_name'] = title.text.strip()

    info_table = soup.find('table', class_='product-info-table')
    if not info_table:
        return details

    for tr in info_table.find_all('tr', class_='product-info-group'):
        label_cell = tr.find('td', class_='product-info-label')
        data_cell = tr.find('td', class_='product-info-data')
        # A row needs both a label and a data cell to be recorded.
        if not (label_cell and data_cell):
            continue

        field = label_cell.text.strip()
        content = data_cell.text.strip()
        if field in price_labels:
            # Keep only digits and the decimal point (drops currency symbol
            # and thousands separators).
            content = re.sub(r'[^\d.]', '', content)
        details[field] = content

    return details

def scrape_product_details(url, timeout=30):
    """Download one product page and return its data as a flat dict.

    Args:
        url: Address of the product page.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The basic product info merged with the specification entries, plus a
        ``'product_url'`` key holding ``url``. An empty dict is returned when
        the request fails, the status code is not 200, or parsing raises;
        the failure is reported on stdout.
    """
    try:
        # Without a timeout a single unresponsive page would stall the run.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch {url}. Status code: {response.status_code}")
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract product info and specifications
        product_info = extract_product_info(soup)
        specifications = extract_specification_data(soup)

        # Later entries win on key clashes: specifications override basic
        # info, and 'product_url' is always the requested url.
        return {**product_info, **specifications, 'product_url': url}

    except Exception as e:
        # Deliberate per-product catch-all: one bad page (network error,
        # timeout, unexpected markup) must not abort the whole crawl.
        print(f"Error scraping {url}: {str(e)}")
        return {}

def main():
    """Crawl the laptop listing, scrape each product, and write a CSV.

    Collects the product links, scrapes every product page with a two-second
    pause after each one, and saves the non-empty results to
    ``startech_laptops_data.csv``. Prints a notice instead when nothing was
    collected.
    """
    base_url = "https://www.startech.com.bd"

    links = get_product_links(base_url)
    total = len(links)

    collected = []
    for position, link in enumerate(links, start=1):
        print(f"Scraping product {position} of {total}: {link}")
        record = scrape_product_details(link)
        # An empty dict signals a failed scrape; leave it out of the table.
        if record:
            collected.append(record)

        # Pause between requests to avoid overwhelming the server.
        time.sleep(2)

    if not collected:
        print("No data was collected.")
        return

    # One row per product; columns are the union of all keys seen.
    csv_filename = 'startech_laptops_data.csv'
    df = pd.DataFrame(collected)
    df.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}. Total items: {len(df)}")

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Fetching links from page 1...
Fetching links from page 2...
Fetching links from page 3...
Fetching links from page 4...
Fetching links from page 5...
Fetching links from page 6...
Fetching links from page 7...
Fetching links from page 8...
Fetching links from page 9...
Fetching links from page 10...
Fetching links from page 11...
Fetching links from page 12...
Fetching links from page 13...
Fetching links from page 14...
Fetching links from page 15...
Fetching links from page 16...
Fetching links from page 17...
Fetching links from page 18...
Fetching links from page 19...
Fetching links from page 20...
Fetching links from page 21...
Fetching links from page 22...
Fetching links from page 23...
Fetching links from page 24...
Fetching links from page 25...
Total 485 product links found.
Scraping product 1 of 485: https://www.startech.com.bd/hp-15-fc0296au-ryzen-3-laptop
Scraping product 2 of 485: https://www.startech.com.bd/apple-macbook-pro-16-inch-m4-pro-48gb-ram-512gb-ssd-space-black