In [2]:
import pandas as pd
import re


## Transform phase: cleaning the data

In [3]:


## Load CSV

# Columns that identify a product; used both for de-duplication and for the
# "must be present" check.
key_columns = ["title", "link"]

# Read the raw extract, then in a single chain:
#   1. keep only the first row for each (title, link) pair,
#   2. turn the literal "N/A" placeholder into a real missing value,
#   3. discard rows that have no title or no link.
df = (
    pd.read_csv("extraction_raw_banggood_products_with_category.csv")
    .drop_duplicates(subset=key_columns)
    .replace("N/A", pd.NA)
    .dropna(subset=key_columns)
)

## Clean and standardize price

def clean_price(price_str):
    """Convert a scraped price value such as "US$1,234.56" into a float.

    Every character other than a digit or a dot is removed from the text
    before it is parsed.

    Args:
        price_str: Raw price. Normally a string; missing values and values
            that are already numeric are accepted as well.

    Returns:
        The price as a float, or pd.NA when the value is missing or when
        what remains after stripping is not a valid number.
    """
    if pd.isna(price_str):
        return pd.NA
    # Already numeric: return it directly rather than round-tripping through
    # text, where the character filter would mangle forms like "1e-07".
    if isinstance(price_str, (int, float)):
        return float(price_str)
    # str() keeps re.sub from raising TypeError on other non-string values.
    digits = re.sub(r"[^\d\.]", "", str(price_str))
    try:
        return float(digits)
    except ValueError:
        # Nothing parseable is left (for example "" or "1.2.3").
        return pd.NA

# Parse every raw price into a float (pd.NA where clean_price cannot parse it)
df["price"] = df["price"].apply(clean_price)

# Trim leading/trailing whitespace from the free-text columns
for text_column in ("title", "category"):
    df[text_column] = df[text_column].str.strip()

# Renumber the rows so the index is a clean 0..n-1 range
df = df.reset_index(drop=True)





Cleaning complete! Total products: 120


In [11]:
# Save cleaned CSV
# DataFrame.to_csv returns None when it is given a path, so `cleaned_csv`
# holds the output path itself rather than the call's return value.
cleaned_csv = "transformed_data_banggood_products_cleaned.csv"
df.to_csv(cleaned_csv, index=False)
print("Cleaning complete! Total products:", len(df))

Cleaning complete! Total products: 120


In [12]:
df

Unnamed: 0,title,price,img,link,category
0,Signal Amplifier 100k-6GHz Full Band Low Noise...,10.06,https://imgaz3.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Signal-Amplifier-100k...,Electronics
1,JYETech DSO183 DIY Oscilloscope Kit 500KHz ADC...,12.21,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/JYETech-DSO183-DIY-Os...,Electronics
2,Donut AM MW/SW Mini Loop Antenna+High Resistan...,22.22,https://imgaz1.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Donut-AM-MW-or-SW-Min...,Electronics
3,Hohem M7 iSteady M7 Cell Phone Gimbal Stabiliz...,205.15,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Hohem-M7-iSteady-M7-C...,Electronics
4,KEWEISI KWS-1902L Type-C Tester 4-30V 0-8A Pow...,7.42,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/KEWEISI-KWS-1902L-Typ...,Electronics
...,...,...,...,...,...
115,HONGDUI KM-17 Pro Router Plane Die Steel Body ...,116.12,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/HONGDUI-KM-17-Pro-Rou...,"Tools, Industrial and Scientific"
116,Woodworking Depth Gauge Versatile 0-50mm Dual ...,7.73,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Woodworking-Depth-Gau...,"Tools, Industrial and Scientific"
117,Universal AC to DC Adapter 48W Adjustable Volt...,7.73,https://imgaz3.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Universal-AC-to-DC-Ad...,"Tools, Industrial and Scientific"
118,Xiaomi AtuMan Q2 Portable Flexible Electronic ...,10.83,https://imgaz3.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Xiaomi-AtuMan-Q2-Port...,"Tools, Industrial and Scientific"


## DataFrame with two extra columns (synthetic `rating` and `review_count`)

In [None]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import random

# Category listing pages to scrape, kept as (category label, listing URL) pairs
# so a label can never drift out of step with its URL.
_category_pages = [
    ("Electronics", "https://www.banggood.com/Wholesale-Electronics-ca-2001.html"),
    ("Sports and Outdoors", "https://www.banggood.com/Wholesale-Sports-and-Outdoors-ca-6001.html"),
    ("Automobiles and Motorcycles", "https://www.banggood.com/Wholesale-Automobiles-and-Motorcycles-ca-4001.html"),
    ("Computers and Office", "https://www.banggood.com/Wholesale-Computers-and-Office-ca-5001.html"),
    ("Tools, Industrial and Scientific", "https://www.banggood.com/Wholesale-Tools,Industrial-and-Scientific-ca-3001.html"),
]

# Parallel lists, in matching order, that the scraping loop zips together
url = [page for _, page in _category_pages]
category = [label for label, _ in _category_pages]

# Request headers: a desktop-browser User-Agent string
_user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
headers = {"User-Agent": _user_agent}

# Accumulator: one dict per scraped product
All_products = list()

# Scraping: fetch each category listing page and append one record per product
# to All_products.
for page_url, cat in zip(url, category):
    # The timeout stops a stalled connection from hanging the notebook, and
    # raise_for_status surfaces HTTP errors instead of silently parsing an
    # error page that yields zero products.
    response = requests.get(page_url, headers=headers, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    products = soup.find_all("li", class_="product-item")

    for p in products:
        # Each field falls back to the "N/A" placeholder when its tag is missing.
        title_tag = p.find("div", class_="text")
        title = title_tag.get_text(strip=True) if title_tag else "N/A"

        price_tag = p.find("div", class_="price")
        price = price_tag.get_text(strip=True) if price_tag else "N/A"

        # Prefer data-src when the attribute exists, otherwise src. Using .get
        # avoids a KeyError when the <img> carries neither attribute.
        img_tag = p.find("img")
        if img_tag is None:
            img = "N/A"
        elif img_tag.has_attr("data-src"):
            img = img_tag["data-src"]
        else:
            img = img_tag.get("src", "N/A")

        # First anchor inside the item; .get avoids a KeyError on an <a>
        # without href. Named product_link so it does not shadow the page URL.
        link_tag = p.find("a")
        product_link = link_tag.get("href", "N/A") if link_tag else "N/A"

        # --- Synthetic fields (not scraped) ---
        # Fake rating between 3.0 and 5.0, rounded to one decimal.
        # NOTE: no random seed is set in the visible code, so these values
        # differ on every run.
        rating = round(random.uniform(3.0, 5.0), 1)

        # Fake review count between 10 and 500 (inclusive)
        review_count = random.randint(10, 500)

        All_products.append({
            "title": title,
            "price": price,
            "img": img,
            "link": product_link,
            "category": cat,
            "rating": rating,
            "review_count": review_count
        })

# Create DataFrame
# The columns are named explicitly so the frame keeps its schema even when the
# scrape produced no records; otherwise an empty frame has no columns and the
# subset-based calls below raise KeyError.
df = pd.DataFrame(
    All_products,
    columns=["title", "price", "img", "link", "category", "rating", "review_count"],
)
# Keep an untouched copy of the scrape on disk before any cleaning
df.to_csv("banggood_products_with_category_raw.csv", index=False)

# Cleaning: de-duplicate on (title, link), turn the "N/A" placeholder into a
# real missing value, then drop rows that have no title or no link.
key_columns = ["title", "link"]
df = (
    df.drop_duplicates(subset=key_columns)
    .replace("N/A", pd.NA)
    .dropna(subset=key_columns)
)

# Clean price
# NOTE: this redefines (shadows) the clean_price defined in the earlier
# cleaning cell.
def clean_price(price_str):
    """Convert a scraped price value such as "US$1,234.56" into a float.

    Every character other than a digit or a dot is removed from the text
    before it is parsed.

    Args:
        price_str: Raw price. Normally a string; missing values and values
            that are already numeric are accepted as well.

    Returns:
        The price as a float, or pd.NA when the value is missing or when
        what remains after stripping is not a valid number.
    """
    if pd.isna(price_str):
        return pd.NA
    # Already numeric: return it directly rather than round-tripping through
    # text, where the character filter would mangle forms like "1e-07".
    if isinstance(price_str, (int, float)):
        return float(price_str)
    # str() keeps re.sub from raising TypeError on other non-string values.
    digits = re.sub(r"[^\d\.]", "", str(price_str))
    try:
        return float(digits)
    except ValueError:
        # Nothing parseable is left (for example "" or "1.2.3").
        return pd.NA

# Parse every raw price into a float (pd.NA where clean_price cannot parse it)
df["price"] = df["price"].apply(clean_price)

# Trim leading/trailing whitespace from the free-text columns
for text_column in ("title", "category"):
    df[text_column] = df[text_column].str.strip()

# Renumber the rows so the index is a clean 0..n-1 range
df = df.reset_index(drop=True)

# Write the cleaned frame to disk without the index column
output_path = "transformed_banggood_products_with_category_clean.csv"
df.to_csv(output_path, index=False)

print("Cleaning complete! Total products:", len(df))


Cleaning complete! Total products: 120


In [14]:
df

Unnamed: 0,title,price,img,link,category,rating,review_count
0,Signal Amplifier 100k-6GHz Full Band Low Noise...,10.06,https://imgaz1.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Signal-Amplifier-100k...,Electronics,4.8,87
1,JYETech DSO183 DIY Oscilloscope Kit 500KHz ADC...,12.21,https://imgaz3.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/JYETech-DSO183-DIY-Os...,Electronics,4.3,317
2,Donut AM MW/SW Mini Loop Antenna+High Resistan...,22.22,https://imgaz1.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Donut-AM-MW-or-SW-Min...,Electronics,4.9,323
3,Hohem M7 iSteady M7 Cell Phone Gimbal Stabiliz...,205.15,https://imgaz1.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Hohem-M7-iSteady-M7-C...,Electronics,5.0,179
4,KEWEISI KWS-1902L Type-C Tester 4-30V 0-8A Pow...,7.42,https://imgaz3.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/KEWEISI-KWS-1902L-Typ...,Electronics,3.3,46
...,...,...,...,...,...,...,...
115,HONGDUI KM-17 Pro Router Plane Die Steel Body ...,116.12,https://imgaz.staticbg.com/thumb/view/oaupload...,https://www.banggood.com/HONGDUI-KM-17-Pro-Rou...,"Tools, Industrial and Scientific",4.3,309
116,Woodworking Depth Gauge Versatile 0-50mm Dual ...,7.73,https://imgaz2.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Woodworking-Depth-Gau...,"Tools, Industrial and Scientific",4.6,185
117,Universal AC to DC Adapter 48W Adjustable Volt...,7.73,https://imgaz.staticbg.com/thumb/view/oaupload...,https://www.banggood.com/Universal-AC-to-DC-Ad...,"Tools, Industrial and Scientific",4.5,55
118,Xiaomi AtuMan Q2 Portable Flexible Electronic ...,10.83,https://imgaz1.staticbg.com/thumb/view/oauploa...,https://www.banggood.com/Xiaomi-AtuMan-Q2-Port...,"Tools, Industrial and Scientific",4.6,211
