In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import re

# -------------------------------
# 1. Setup Selenium WebDriver with Anti-Detection
# -------------------------------
def get_driver():
    chrome_options = Options()
    # Use a standard User-Agent to look like a real browser
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
    
    # Disable automation flags
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)

    # Optional: Run headless (without opening window) if you want speed
    # chrome_options.add_argument("--headless") 

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    
    # Script to prevent Selenium detection variables
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    
    return driver

driver = get_driver()

# -------------------------------
# 2. Categories
# -------------------------------
categories = {
    # Note: Ensure these URLs are valid category pages that list products
    "Electronics": "https://www.banggood.com/Wholesale-Electronics-ca-2001.html",
    "Smartphones": "https://www.banggood.com/Wholesale-Phones-and-Accessories-ca-1001.html",
    "Home & Garden": "https://www.banggood.com/Wholesale-Home,Garden-and-Furniture-ca-12001.html"
}

all_products = []

# -------------------------------
# 3. Scrape Each Category
# -------------------------------
try:
    for cat_name, cat_url in categories.items():
        print(f"\n--- Scraping Category: {cat_name} ---")

        # Scrape 2 pages per category for testing
        for page in range(1, 3):
            target_url = f"{cat_url}?page={page}"
            print(f"Loading {target_url}...")
            
            driver.get(target_url)
            
            # Wait for random time to act human
            time.sleep(random.uniform(3, 6))

            # Scroll down slowly to trigger image/price loading
            last_height = driver.execute_script("return document.body.scrollHeight")
            for i in range(3):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight * (arguments[0] / 3));", i + 1)
                time.sleep(2)
            
            # Parse content
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # UPDATED SELECTORS:
            # Banggood usually puts products in a wrapper class 'p-wrap' 
            # or inside list items 'li' with specific data attributes.
            # We try to find the product container:
            products = soup.select('.p-wrap') 
            
            if not products:
                # Fallback selector if layout changes
                products = soup.select('.product-item')

            print(f"Found {len(products)} products on page {page}")

            if not products:
                print("Warning: No products found. Site might be blocking or selectors changed.")
                continue

            for p in products:
                try:
                    # 1. Title
                    # Usually in <a class="title"> or <div class="name">
                    name_tag = p.select_one('.title') or p.select_one('a[title]')
                    name = name_tag.get_text(strip=True) if name_tag else "N/A"

                    # 2. Price
                    # Usually in <span class="price"> or <div class="price-box">
                    price_tag = p.select_one('.price') or p.select_one('.price-box')
                    price = price_tag.get_text(strip=True) if price_tag else "N/A"

                    # 3. Link
                    link_tag = p.select_one('a')
                    link = link_tag['href'] if link_tag else "N/A"
                    # Fix relative links
                    if link != "N/A" and not link.startswith('http'):
                        link = "https:" + link if link.startswith('//') else "https://www.banggood.com" + link

                    # 4. Reviews/Stars (Often loaded dynamically, might be missing)
                    review_tag = p.select_one('.review-text') or p.select_one('.review')
                    reviews = review_tag.get_text(strip=True) if review_tag else "0"
                    
                    all_products.append({
                        "Category": cat_name,
                        "Name": name,
                        "Price": price,
                        "Reviews": reviews,
                        "URL": link
                    })
                except Exception as e:
                    continue

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()

# -------------------------------
# 4. Save to CSV
# -------------------------------
if all_products:
    df = pd.DataFrame(all_products)
    # Clean up price (remove currency symbols if needed)
    df.to_csv("banggood_scraped_data.csv", index=False, encoding='utf-8-sig')
    print(f"\nSuccess! Scraped {len(all_products)} items. Saved to 'banggood_scraped_data.csv'")
    print(df.head())
else:
    print("\nNo data was scraped. Check your internet or selectors.")


--- Scraping Category: Electronics ---
Loading https://www.banggood.com/Wholesale-Electronics-ca-2001.html?page=1...
Found 60 products on page 1
Loading https://www.banggood.com/Wholesale-Electronics-ca-2001.html?page=2...
Found 60 products on page 2

--- Scraping Category: Smartphones ---
Loading https://www.banggood.com/Wholesale-Phones-and-Accessories-ca-1001.html?page=1...
Found 60 products on page 1
Loading https://www.banggood.com/Wholesale-Phones-and-Accessories-ca-1001.html?page=2...
Found 60 products on page 2

--- Scraping Category: Home & Garden ---
Loading https://www.banggood.com/Wholesale-Home,Garden-and-Furniture-ca-12001.html?page=1...
Found 60 products on page 1
Loading https://www.banggood.com/Wholesale-Home,Garden-and-Furniture-ca-12001.html?page=2...
Found 60 products on page 2

No data was scraped. Check your internet or selectors.


In [None]:
link_tag = p.select_one('a[title]')  # anchor tag with title attribute
name = link_tag['title'].strip() if link_tag and 'title' in link_tag.attrs else "N/A"

In [20]:
#1.Load scraped data into pandas DataFrames.
import pandas as pd
# Step 1: Load CSV into DataFrame
df = pd.read_csv("banggood_scraped_data.csv")
# Display first 5 rows
print(df.head())
# Show summary info
print(df.info())

      Category                                               Name      Price  \
0  Electronics  AMNVOLT V4 ATS Mini SI4732 Radio ALL Band DSP ...   US$34.99   
1  Electronics  2Pcs BAOFENG UV-5R Mini 5W Walkie Talkie Bluet...   US$39.99   
2  Electronics  [EU/US Direct]ATOMSTACK Maker A30 PRO/S30 PRO ...  US$569.99   
3  Electronics  Hiseeu C90B 4MP Solar WIFI IP Camera Full Colo...   US$32.99   
4  Electronics  GA800 Radio Shortwave Antenna Small Active Loo...   US$94.99   

   Reviews                                                URL  
0        0  https://www.banggood.com/AMNVOLT-V4-ATS-Mini-S...  
1        0  https://www.banggood.com/2Pcs-BAOFENG-UV-5R-Mi...  
2        0  https://www.banggood.com/EU-or-US-DirectATOMST...  
3        0  https://www.banggood.com/Hiseeu-C90B-4MP-Solar...  
4        0  https://www.banggood.com/GA800-Radio-Shortwave...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 5 columns):
 #   Column    Non-Null Count  

In [None]:
2. Clean price, rating, review counts, and handle missing values.

# ---------- STEP 2: CLEANING DATA ----------

import pandas as pd

# Example: assume df already loaded from CSV
# df = pd.read_csv("scraped_data.csv")

# --- 1. Clean Price ---
df['Price'] = (
    df['Price']
    .astype(str)
    .str.replace("Rs.", "", regex=False)
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .str.strip()
)

df['Price'] = pd.to_numeric(df['Price'], errors='coerce')


# --- 2. Clean Reviews ---
df['Reviews'] = (
    df['Reviews']
    .astype(str)
    .str.extract(r'(\d+)')      # extract only digits
)

df['Reviews'] = pd.to_numeric(df['Reviews'], errors='coerce')


# --- 3. Handle Missing Values ---
df['Price'] = df['Price'].fillna(0)
df['Reviews'] = df['Reviews'].fillna(0)


# --- 4. Show cleaned data ---
print("\nCleaned DataFrame:")
print(df.head())

In [None]:
# 3. Create at least two additional derived features.
# ---------- STEP 3: CREATE DERIVED FEATURES ----------

import numpy as np

# Agar Rating column nahi hai, to ek dummy rating create kar lo
if 'Rating' not in df.columns:
    df['Rating'] = 4.0

# 1. Price Category Feature
def price_category(p):
    if p <= 2000:
        return "Low"
    elif p <= 10000:
        return "Medium"
    else:
        return "High"

df['Price_Category'] = df['Price'].apply(price_category)

# 2. Popularity Score Feature
df['Popularity_Score'] = df['Reviews'] * (1 + df['Rating'] / 5.0)

# --- Show final dataframe ---
print("\nDerived Features Added:")
print(df[['Name', 'Price', 'Reviews', 'Rating', 'Price_Category', 'Popularity_Score']].head())

In [None]:
# Analysis 1: Price Distribution per Category
print("\n--- Analysis 1: Price Distribution per Category ---")
print(df.groupby("Category")["Price"].describe())

# Analysis 2: Rating vs Price Correlation
print("\n--- Analysis 2: Rating vs Price Correlation ---")
if df["Price"].std() == 0 or df["Rating"].std() == 0:
    print("Correlation cannot be computed â€” zero variance in a column.")
else:
    correlation = df["Price"].corr(df["Rating"])
    print("Correlation:", correlation)

# Analysis 3: Top Reviewed Products
print("\n--- Analysis 3: Top 10 Most Reviewed Products ---")
print(df.sort_values(by="Reviews", ascending=False).head(10)[["Name","Category","Price","Reviews","URL"]])

# Analysis 4: Best Value Products (Price per Review)
df['Price_per_Review'] = df['Price'] / (df['Reviews'] + 1)
print("\n--- Analysis 4: Best Value Products ---")
print(df.sort_values(by="Price_per_Review").head(10)[["Name","Category","Price","Reviews","Price_per_Review"]])

# Analysis 5: Average Popularity Score per Category
print("\n--- Analysis 5: Average Popularity Score per Category ---")
print(df.groupby("Category")["Popularity_Score"].mean())