In [1]:
import csv
import hashlib
import math
import os
import re
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:


# -------------------------------
# STEP 1: READ LINKS FROM EXCEL
# -------------------------------
input_file = "C:/Users/Asus/Pictures/Lenskart/Lenskart_Links.xlsx"
df_links = pd.read_excel(input_file)

possible_cols = [c for c in df_links.columns if "link" in c.lower() or "url" in c.lower()]
if not possible_cols:
    raise ValueError("‚ùå No column found with 'link' or 'url' in name.")

LINKS = df_links[possible_cols[0]].dropna().tolist()
print(f"‚úÖ Loaded {len(LINKS)} links from Excel.\n")

# -------------------------------
# STEP 2: SETUP SELENIUM DRIVER
# -------------------------------
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# -------------------------------
# STEP 3: EXTRACT PRODUCTS (FIXED)
# -------------------------------
def extract_products(driver, base_url):
    print(f"\nüåê Opening page: {base_url}")
    driver.get(base_url)
    time.sleep(6)

    # Try to detect total products (optional)
    total_products = 300  # fallback default
    try:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        total_text = soup.find("div", {"data-cy": "showing-result-desktop"})
        if total_text:
            spans = total_text.find_all("span")
            total_products = int(spans[-2].get_text(strip=True))
            print(f"üî¢ Total products found: {total_products}")
    except Exception:
        print("‚ö†Ô∏è Could not detect total products. Using default.")

    # Folder and CSV setup
    category_name = re.sub(r'[\\/*?:"<>|]', "_", base_url.split("/")[-1].replace(".html", ""))
    images_dir = os.path.join("lenskart_images", category_name)
    os.makedirs(images_dir, exist_ok=True)
    output_csv = f"lenskart_{category_name}.csv"

    seen_links = set()
    data = []

    # Progressive scroll + extract loop
    approx_scrolls = math.ceil(total_products / 15) + 10
    print(f"üßÆ Estimated scrolls needed: {approx_scrolls}")

    for scroll in range(approx_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(3)
        driver.execute_script("window.scrollBy(0, -550);")
        time.sleep(2)

        soup = BeautifulSoup(driver.page_source, "html.parser")
        product_anchors = soup.find_all("a", {"class": "sc-710ae5be-7 ipZuLh"})

        for a in product_anchors:
            try:
                link = "https://www.lenskart.com" + a.get("href", "")
                if link in seen_links:
                    continue
                seen_links.add(link)

                brand = a.find("p", {"data-cy": "plpProductTitle"})
                brand = brand.get_text(strip=True) if brand else ""

                image = a.find("img")["src"] if a.find("img") else ""

                price = a.find("div", {"data-cy": "plpOfferPrice"})
                price = price.get_text(strip=True) if price else ""

                original_price_tag = a.find("span", {"class": "sc-ddbf73a-0 cWCazb"})
                original_price = original_price_tag.get_text(strip=True) if original_price_tag else ""

                discount_tag = a.find("h5", {"data-cy": "plpTitleTypography"})
                discount = discount_tag.get_text(strip=True) if discount_tag else ""

                rating_tag = a.find("span", {"class": "sc-10fbc79f-1 kSWBHN"})
                rating = rating_tag.get_text(strip=True) if rating_tag else ""

                reviews_tag = a.find("span", {"class": "sc-10fbc79f-1 dFjgXS"})
                reviews = reviews_tag.get_text(strip=True) if reviews_tag else ""

                # Optional image download
                if image:
                    clean_name = re.sub(r'[\\/*?:"<>|]', "_", os.path.basename(image.split("?")[0]))
                    img_name = os.path.join(images_dir, clean_name)
                    if not os.path.exists(img_name):
                        try:
                            img_data = requests.get(image, timeout=10).content
                            with open(img_name, "wb") as f:
                                f.write(img_data)
                        except Exception as e:
                            print(f"‚ö†Ô∏è Error downloading {clean_name}: {e}")

                data.append({
                    "Brand": brand,
                    "Price": price,
                    "Original Price": original_price,
                    "Discount": discount,
                    "Rating": rating,
                    "Reviews": reviews,
                    "Product Link": link,
                    "Image URL": image
                })

            except Exception as e:
                print(f"‚ö†Ô∏è Error parsing product: {e}")

        print(f"‚¨áÔ∏è Scroll {scroll+1}/{approx_scrolls}: {len(seen_links)} products collected...")
        if scroll >= 5 and len(product_anchors) < 10:
            print("üõë Page seems fully loaded. Stopping early.")
            break

    # Save to CSV
    if data:
        pd.DataFrame(data).to_csv(output_csv, index=False, encoding="utf-8-sig")
        print(f"‚úÖ Saved {len(data)} products to: {output_csv}")
    else:
        print("‚ö†Ô∏è No data found for this page.")

# -------------------------------
# STEP 4: MAIN EXECUTION
# -------------------------------
for link in LINKS:
    try:
        extract_products(driver, link)
    except Exception as e:
        print(f"‚ùå Error while processing {link}: {e}")
        continue

driver.quit()
print("\nüéØ All pages scraped successfully!")


‚úÖ Loaded 4 links from Excel.


üåê Opening page: https://www.lenskart.com/eyeglasses/collections/all-computer-glasses.html
üî¢ Total products found: 408
üßÆ Estimated scrolls needed: 38
‚ö†Ô∏è Error downloading blue-block-phone-&-computer-glasses_-blue-full-rim-rectangle-lenskart-blu-lb-e13737-c2_lenskart-blu-lb-e13737-c2-eyeglasses_lenskart-blu-lb-e13737-c2-eyeglasses_eyeglasses_g_101023_02_2022.jpg: [Errno 2] No such file or directory: 'lenskart_images\\all-computer-glasses\\blue-block-phone-&-computer-glasses_-blue-full-rim-rectangle-lenskart-blu-lb-e13737-c2_lenskart-blu-lb-e13737-c2-eyeglasses_lenskart-blu-lb-e13737-c2-eyeglasses_eyeglasses_g_101023_02_2022.jpg'
‚ö†Ô∏è Error downloading blue-block-phone-&-computer-glasses_-light-blue-transparent-full-rim-round-lenskart-blu-lb-e14061-c1_lenskart-blu-lb-e14061-c1-eyeglasses_lenskart-blu-lb-e14061-c1-eyeglasses_eyeglasses_g_9196_325_02_2022.jpg: [Errno 2] No such file or directory: 'lenskart_images\\all-computer-glasses\\blue-bl