In [9]:
import os
import pandas as pd
from bs4 import BeautifulSoup



# Module-level accumulators, one empty list per extracted field.
# NOTE(review): nothing in the code visible in this cell reads or appends to these
# names; they are kept because other cells may still rely on them -- confirm before removing.
product_names, prices, max_powers = [], [], []
displacements, brake_fronts, tire_types = [], [], []
ratings, num_ratings_list, num_reviews_list = [], [], []

# Column names of the per-page result, in the order they appear in the output.
_COLUMNS = (
    "Product Name",
    "Price",
    "Maximum Power",
    "Displacement",
    "Brake Front",
    "Tire Type",
    "Rating",
    "Number of Ratings",
    "Number of Reviews",
)


def _text_or_na(tag):
    """Return the whitespace-stripped text of ``tag``, or "N/A" when ``tag`` is None."""
    return tag.text.strip() if tag is not None else "N/A"


def _spec_by_label(spec_texts, label):
    """Return the first spec line that starts with ``label``, or "N/A" if none does."""
    for text in spec_texts:
        if text.startswith(label):
            return text
    return "N/A"


def _parse_product_card(product_card):
    """Build the nine-field record (keys as in ``_COLUMNS``) for one product card element."""
    name_div = product_card.find("div", class_="KzDlHZ")

    # A CSS selector matches the two classes in any order and alongside extra
    # classes; find(class_="Nx9bqj _4b5DiR") only matches that exact attribute string.
    price_div = product_card.select_one("div.Nx9bqj._4b5DiR")

    # Spec bullet points. Each line is looked up by its label prefix instead of by
    # position, so a card with a missing or reordered bullet yields "N/A" for that
    # field rather than shifting another field's value into the wrong column.
    # NOTE(review): assumes every bullet starts with its label (e.g. "Displacement: ...")
    # spelled exactly as below -- confirm against the saved pages.
    details_div = product_card.find("div", class_="_6NESgJ")
    spec_items = details_div.find_all("li", class_="J+igdf") if details_div is not None else []
    spec_texts = [item.text.strip() for item in spec_items]

    # Rating score plus the "<n> Ratings & <m> Reviews" summary text.
    rating = num_ratings = num_reviews = "N/A"
    ratings_div = product_card.find("div", class_="_5OesEi")
    if ratings_div is not None:
        rating = _text_or_na(ratings_div.find("div", class_="XQDdHH"))
        counts_text = _text_or_na(ratings_div.find("span", class_="Wphh3N"))
        # The two counts are parsed independently so that a summary carrying
        # only one of them still contributes the one it has.
        if "Ratings" in counts_text:
            num_ratings = counts_text.split("Ratings")[0].strip()
        if "Reviews" in counts_text:
            num_reviews = counts_text.split("Reviews")[0].split("&")[-1].strip()

    return {
        "Product Name": _text_or_na(name_div),
        "Price": _text_or_na(price_div),
        "Maximum Power": _spec_by_label(spec_texts, "Maximum Power"),
        "Displacement": _spec_by_label(spec_texts, "Displacement"),
        "Brake Front": _spec_by_label(spec_texts, "Brake Front"),
        "Tire Type": _spec_by_label(spec_texts, "Tire Type"),
        "Rating": rating,
        "Number of Ratings": num_ratings,
        "Number of Reviews": num_reviews,
    }


def parse_html_page(html_file):
    """Parse one saved HTML listing page into column-oriented product data.

    Args:
        html_file: Path of the HTML file to read; it is opened as UTF-8 text.

    Returns:
        A dict mapping each column name ("Product Name", "Price",
        "Maximum Power", "Displacement", "Brake Front", "Tire Type", "Rating",
        "Number of Ratings", "Number of Reviews") to a list of strings with one
        entry per product card found, in document order. A field that cannot
        be located on a card is recorded as "N/A". All lists have the same
        length, and are empty when the page contains no product cards.

    Raises:
        Any error raised by ``open()`` (for example when the file is missing)
        is propagated unchanged.
    """
    with open(html_file, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    # One <div class="_75nlfW"> per product card.
    # NOTE(review): the class names used here and in _parse_product_card are opaque,
    # site-generated identifiers -- replace them if the page markup changes.
    product_cards = soup.find_all("div", class_="_75nlfW")

    page_columns = {column: [] for column in _COLUMNS}
    for product_card in product_cards:
        record = _parse_product_card(product_card)
        for column in _COLUMNS:
            page_columns[column].append(record[column])

    return page_columns

# Parse every saved listing page that exists on disk
all_data = []
for i in range(1, 3):  # Change this range based on the number of pages you want to scrape
    html_file = f"flipkart_page_{i}.html"
    if not os.path.exists(html_file):
        # Missing pages are still skipped, but report it so an empty or short
        # result is not mistaken for a parsing problem.
        print(f"Skipping {html_file}: file not found")
        continue
    all_data.append(parse_html_page(html_file))

# Build one frame per page and concatenate once, instead of growing a DataFrame
# inside a loop (which re-copies all accumulated rows on every iteration).
page_frames = [pd.DataFrame(page_data) for page_data in all_data]
# pd.concat raises on an empty list, so keep the empty-DataFrame result when no page was found.
df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

# Save the combined DataFrame to a CSV file
df.to_csv("flipkart_product_data_all_pages.csv", index=False)

# Print the DataFrame
print(df)

                                         Product Name      Price  \
0   Hero Splendor+ (Non i3S) Booking for Ex-Showro...    ₹76,526   
1   BAJAJ PULSAR 220 F (UG) Booking for Ex-Showroo...  ₹1,40,954   
2   Hero Super Splendor (Disc) Booking for Ex-Show...    ₹84,198   
3   Hero Glamour XTEC (Drum) Booking for Ex-Showro...    ₹90,794   
4   BAJAJ Pulsar 125 (CARBON FIBRE SINGLE SEAT UG)...    ₹89,350   
5   Hero Super Splendor Xtec (Drum) Booking for Ex...    ₹83,150   
6   Hero XTREME 125R (ABS) Booking for Ex-Showroom...  ₹1,03,035   
7   BAJAJ Pulsar 150 (Single Disc UG) Booking for ...  ₹1,14,438   
8   BAJAJ Avenger (220 CRUISE) Booking for Ex-Show...  ₹1,47,507   
9   Hero Glamour XTEC (Disc) Booking for Ex-Showro...    ₹94,998   
10  Hero XTREME 125R (IBS) Booking for Ex-Showroom...    ₹97,107   
11  Hero Super Splendor (Drum) Booking for Ex-Show...    ₹81,498   
12  BAJAJ Pulsar N 150 (Twin Disc UG) Booking for ...  ₹1,25,728   
13  Hero Xpulse 200 4V (ABS Disc) Booking for Ex

In [10]:
# Bare expression: as the last statement of the cell, `df` is rendered as the cell's output.
df

Unnamed: 0,Product Name,Price,Maximum Power,Displacement,Brake Front,Tire Type,Rating,Number of Ratings,Number of Reviews
0,Hero Splendor+ (Non i3S) Booking for Ex-Showro...,"₹76,526",Maximum Power: 5.9 kW @ 8000 rpm,Displacement: 97.2 cc,Brake Front: Drum,Tire Type: Tubeless,4.5,5695.0,397.0
1,BAJAJ PULSAR 220 F (UG) Booking for Ex-Showroo...,"₹1,40,954",Maximum Power: 15 kW (20.4 PS) @ 8500 rpm,Displacement: 220 cc,Brake Front: Disc,Tire Type: Tubeless,4.3,21.0,2.0
2,Hero Super Splendor (Disc) Booking for Ex-Show...,"₹84,198",Maximum Power: 10.7,Displacement: 124.7 cc,Brake Front: Disc,Tire Type: Tubeless,4.5,513.0,46.0
3,Hero Glamour XTEC (Drum) Booking for Ex-Showro...,"₹90,794",Maximum Power: 10.7,Displacement: 124.7 cc,Brake Front: Drum,Tire Type: Tubeless,4.5,1860.0,197.0
4,BAJAJ Pulsar 125 (CARBON FIBRE SINGLE SEAT UG)...,"₹89,350",Maximum Power: 8.68 kW @ 8500 rpm,Displacement: 124.38 cc,Brake Front: Disc,Tire Type: Tubeless,4.5,456.0,54.0
5,Hero Super Splendor Xtec (Drum) Booking for Ex...,"₹83,150",Maximum Power: 10.7 BHP (at 7500 RPM),Displacement: 125 cc,Brake Front: Drum,Tire Type: Tubeless,4.5,3550.0,340.0
6,Hero XTREME 125R (ABS) Booking for Ex-Showroom...,"₹1,03,035",Maximum Power: 8.5 kW @ 8250 rpm,Displacement: 124.7 cc,Brake Front: Disc,Tire Type: Tubeless,4.6,3436.0,344.0
7,BAJAJ Pulsar 150 (Single Disc UG) Booking for ...,"₹1,14,438",Maximum Power: 10.3 kW (14 PS) @ 8500 rpm,Displacement: 149.5 cc,Brake Front: Disc,Tire Type: Tubeless,4.5,95.0,10.0
8,BAJAJ Avenger (220 CRUISE) Booking for Ex-Show...,"₹1,47,507",Maximum Power: 14 kW @ 8400 rpm,Displacement: 220 cc,Brake Front: Disc,Tire Type: Tubed,4.3,11.0,0.0
9,Hero Glamour XTEC (Disc) Booking for Ex-Showro...,"₹94,998",Maximum Power: 10.7,Displacement: 124.7 cc,Brake Front: Disc,Tire Type: Tubeless,4.5,1860.0,197.0
