In [None]:

import requests
from bs4 import BeautifulSoup
import csv
import time
import re

# Search-results URL for the "washing machine" query. The page number is
# appended to the end of this string when each page is requested.
base_url = (
    "https://www.flipkart.com/search"
    "?q=washing+machine"
    "&otracker=search"
    "&otracker1=search"
    "&marketplace=FLIPKART"
    "&as-show=on"
    "&as=off"
    "&page="
)

# HTTP headers sent with the requests made by this notebook.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def clean(t):
    """Normalise whitespace in ``t``.

    Non-breaking spaces are turned into ordinary spaces, every run of
    whitespace is collapsed to a single space, and the ends are trimmed.
    A falsy ``t`` (``None`` or an empty string) yields the placeholder "N/A".
    """
    if not t:
        return "N/A"
    without_nbsp = t.replace("\xa0", " ")
    single_spaced = re.sub(r"\s+", " ", without_nbsp)
    return single_spaced.strip()

def extract_title(item):
    """Return the cleaned product title of a result card, or "N/A".

    The title is read from the ``title`` attribute of the first
    ``div.KzDlHZ`` element inside ``item``. "N/A" is returned when that
    element is absent, or when the attribute is missing or blank.
    """
    tag = item.select_one('div.KzDlHZ')
    if not tag:
        return "N/A"
    txt = tag.get('title')
    # .get() yields None when the attribute is absent; calling .strip() on
    # None would raise AttributeError, so only real strings are accepted.
    if isinstance(txt, str) and txt.strip():
        return clean(txt)
    return "N/A"

def extract_rating_score(item):
    """Return the rating score shown on a result card as a string, or "N/A".

    When a ``div.XQDdHH`` element exists, the first integer or decimal number
    in its text is used. Otherwise the first decimal number found anywhere in
    the card's text is used.
    """
    badge = item.select_one("div.XQDdHH")
    if badge:
        haystack, pattern = badge.text, r"\d+(\.\d+)?"
    else:
        # No dedicated element: fall back to scanning the whole card text.
        haystack, pattern = item.get_text(), r"\d+\.\d+"

    found = re.search(pattern, haystack)
    if found is None:
        return "N/A"
    return found.group(0)

def extract_counts(item):
    """Return ``(ratings, reviews)`` counts parsed from a result card's text.

    Each count is the digit string that precedes the word "Ratings" or
    "Reviews" (case-insensitive) with thousands separators removed, or "N/A"
    when that label is not found.
    """
    flat_text = clean(item.get_text(" ", strip=True))

    def _count_before(label_pattern):
        # Capture group 1 holds the digits (and commas) before the label.
        hit = re.search(label_pattern, flat_text, re.I)
        if not hit:
            return "N/A"
        return hit.group(1).replace(",", "")

    ratings = _count_before(r"([\d,]+)\s*Ratings")
    reviews = _count_before(r"([\d,]+)\s*Reviews")
    return ratings, reviews

def extract_highlights(item):
    """Return the card's highlight bullets joined by " | ", or "N/A".

    Bullets are the ``li`` elements under the first ``ul.G4BRas`` element;
    each one is passed through ``clean`` before joining.
    """
    bullet_list = item.select_one("ul.G4BRas")
    if bullet_list:
        bullets = list(map(clean, (li.text for li in bullet_list.select("li"))))
        if bullets:
            return " | ".join(bullets)
    return "N/A"
# One dict per product card, accumulated across every results page.
products = []

for page in range(1, 100):
    print("Page", page)
    try:
        res = requests.get(base_url + str(page), headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as exc:
        # Only network/HTTP failures skip the page. A bare `except:` would
        # also swallow KeyboardInterrupt and real bugs. The failure is
        # reported and the delay is kept so retries do not hammer the server.
        print("  skipped:", exc)
        time.sleep(1)
        continue

    soup = BeautifulSoup(res.text, "html.parser")
    # Prefer cards carrying a data-id attribute; fall back to the class-based
    # selector when none are found.
    items = soup.select("div[data-id]") or soup.select("div._1AtVbE")

    for item in items:
        title = extract_title(item)
        # Skip cards without a usable title and "show results ..." banners.
        if title == "N/A" or title.lower().startswith("show results"):
            continue

        # First link in the card; the query string is dropped from the href.
        link_tag = item.select_one("a[href]")
        product_url = "https://www.flipkart.com" + link_tag.get("href").split("?")[0] if link_tag else "N/A"

        cp = item.select_one("div.Nx9bqj")  # current price element
        op = item.select_one("div.yRaY8j")  # original price element
        dc = item.select_one("div.UkUFwK")  # discount element

        rating = extract_rating_score(item)
        ratings, reviews = extract_counts(item)
        highlights = extract_highlights(item)

        products.append({
            "Page": page,
            "Product Name": title,
            "Current Price": clean(cp.text if cp else "N/A"),
            "Original Price": clean(op.text if op else "N/A"),
            "Discount": clean(dc.text if dc else "N/A"),
            "Rating Score": rating,
            "Ratings Count": ratings,
            "Reviews Count": reviews,
            "Highlights": highlights,
            "Product URL": product_url
        })

    # Pause between pages.
    time.sleep(1)


Page 1
Page 2
Page 3
Page 4
Page 5
Page 6
Page 7
Page 8
Page 9
Page 10
Page 11
Page 12
Page 13
Page 14
Page 15
Page 16
Page 17
Page 18
Page 19
Page 20
Page 21
Page 22
Page 23
Page 24
Page 25
Page 26
Page 27
Page 28
Page 29
Page 30
Page 31
Page 32
Page 33
Page 34
Page 35
Page 36
Page 37
Page 38
Page 39
Page 40
Page 41
Page 42
Page 43
Page 44
Page 45
Page 46
Page 47
Page 48
Page 49
Page 50
Page 51
Page 52
Page 53
Page 54
Page 55
Page 56
Page 57
Page 58
Page 59
Page 60
Page 61
Page 62
Page 63
Page 64
Page 65
Page 66
Page 67
Page 68
Page 69
Page 70
Page 71
Page 72
Page 73
Page 74
Page 75
Page 76
Page 77
Page 78
Page 79
Page 80
Page 81
Page 82
Page 83
Page 84
Page 85
Page 86
Page 87
Page 88
Page 89
Page 90
Page 91
Page 92
Page 93
Page 94
Page 95
Page 96
Page 97
Page 98
Page 99


In [4]:
# Column order of the output file. It matches the keys of each scraped
# product dict and is spelled out so the header can still be written when
# `products` is empty (indexing products[0] would raise IndexError).
fieldnames = [
    "Page",
    "Product Name",
    "Current Price",
    "Original Price",
    "Discount",
    "Rating Score",
    "Ratings Count",
    "Reviews Count",
    "Highlights",
    "Product URL",
]

with open("flipkart_WM.csv", "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames)
    w.writeheader()
    w.writerows(products)

print("Done. Total:", len(products))

Done. Total: 984


In [None]:
import pandas as pd


# Read every column as text and keep the literal "N/A" placeholders written
# by the scraper. With pandas' default NA parsing those placeholders become
# NaN and the count columns turn into floats, so the re-saved file would
# differ from this one in more than just the product names.
df = pd.read_csv("flipkart_WM.csv", dtype=str, keep_default_na=False)


In [6]:
# Rows whose product name ends with a literal "..." are treated as truncated.
# na=False makes a missing name count as "not truncated", so the mask stays
# purely boolean and the row selection cannot fail on NaN values.
df_trunc = df[df["Product Name"].str.endswith("...", na=False)]
print("Truncated count:", len(df_trunc))


Truncated count: 356


In [7]:
# Request headers used by fetch_full_name below. This repeats the value of the
# `headers` dict defined for the search-page requests earlier in the notebook.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
}

def fetch_full_name(url):
    """Fetch a product page and return its full product name.

    Args:
        url: Absolute product-page URL. Anything that is not an http(s)
            string (for example a missing value or the "N/A" placeholder
            read back from the CSV) is rejected without a request.

    Returns:
        The text of the first ``span.VU-ZEz`` element with non-breaking
        spaces replaced by ordinary spaces, or ``None`` when the URL is
        unusable, the request fails, the server answers with an error
        status, or the element is absent or empty.
    """
    if not isinstance(url, str) or not url.startswith(("http://", "https://")):
        return None

    try:
        res = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        res.raise_for_status()
    except requests.RequestException:
        # Best-effort lookup: network/HTTP problems mean "no name". Other
        # exceptions (bugs, KeyboardInterrupt) are left to propagate.
        return None

    soup = BeautifulSoup(res.text, "html.parser")
    full = soup.select_one("span.VU-ZEz")
    if not full:
        return None
    return full.get_text(strip=True).replace("\xa0", " ") or None


In [8]:
# Replace each truncated name in `df` with the full name from the product
# page, pausing one second between requests.
for row_label, record in df_trunc.iterrows():
    product_page = record["Product URL"]
    print("Fetching:", product_page)

    resolved_name = fetch_full_name(product_page)
    if not resolved_name:
        print(" → Failed to fetch")
    else:
        # Write back by index label so the matching row in `df` is updated.
        df.loc[row_label, "Product Name"] = resolved_name
        print(" → Updated:", resolved_name)

    time.sleep(1)


Fetching: https://www.flipkart.com/whirlpool-7-kg-5-star-ace-wash-station-1400-rpm-speed-rust-proof-semi-automatic-top-load-washing-machine-grey/p/itm4a1a90b50cd6a
 → Updated: Whirlpool 7 kg 5 Star with Ace Wash Station, 1400 RPM Speed and Rust Proof Semi Automatic Top Load Washing Machine Grey(MAGIC CLEAN 7.0 GREY DAZZLE (5YR))
Fetching: https://www.flipkart.com/marq-flipkart-6-kg-5-star-rating-innowash-range-semi-automatic-top-load-washing-machine-white-maroon/p/itm11df0fd0f85df
 → Updated: MarQ by Flipkart 6 kg 5 Star Rating Innowash Range Semi Automatic Top Load Washing Machine White, Maroon(MQSA605NNNDM / MQSA60H5W)
Fetching: https://www.flipkart.com/lg-7-kg-5-star-wind-jet-dry-collar-scrubber-rust-free-plastic-base-semi-automatic-top-load-washing-machine-grey-white/p/itmc9d497e9b58bb
 → Updated: LG 7 kg 5 Star with Wind Jet Dry, Collar Scrubber and Rust Free Plastic Base Semi Automatic Top Load Washing Machine Grey, White(P7020NGAZ)
Fetching: https://www.flipkart.com/whirlpool-8-

In [9]:
# Persist the frame with the updated product names, without the index column.
cleaned_csv_path = "flipkart_washing_machine_fullname_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Saved updated file!")


Saved updated file!
