## Web Scraping

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Function to extract Product Title
def get_title(soup):
    try:
        title = soup.find('span', class_='B_NuCI')
        title_value = title.text
        title_string = title_value.strip()
    except AttributeError:
        title_string = ""
    return title_string

# Function to extract Deal Price
def get_deal_price(soup):
    deal_price = soup.find("div", class_='_30jeq3')
    deal_price_value = deal_price.text.strip() if deal_price else ""
    return deal_price_value

# Function to extract Screen Type
def get_screen_type(soup):
    screen_type_td = soup.find("td", text="Screen Type")
    if screen_type_td:
        li_element = screen_type_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract HDMI count
def get_hdmi_count(soup):
    hdmi_td = soup.find("td", text="HDMI")
    if hdmi_td:
        li_element = hdmi_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract USB count
def get_usb_count(soup):
    usb_td = soup.find("td", text="USB")
    if usb_td:
        li_element = usb_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract Operating System
def get_os(soup):
    os_td = soup.find("td", text="Operating System")
    if os_td:
        li_element = os_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract Smart TV information
def get_smart_tv(soup):
    smart_tv_td = soup.find("td", text="Smart Tv")
    if smart_tv_td:
        li_element = smart_tv_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract Display Size information
def get_display_size(soup):
    display_size_td = soup.find("td", text="Display Size")
    if display_size_td:
        li_element = display_size_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract HD Technology & Resolution
def get_hd_technology_resolution(soup):
    hd_technology_resolution_td = soup.find("td", text="HD Technology & Resolution")
    if hd_technology_resolution_td:
        ul_element = hd_technology_resolution_td.find_next("ul")
        li_element = ul_element.find("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract Launch Year
def get_launch_year(soup):
    launch_year_td = soup.find("td", text="Launch Year")
    if launch_year_td:
        li_element = launch_year_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

# Function to extract Model Name
def get_model_name(soup):
    model_name_td = soup.find("td", text="Model Name")
    if model_name_td:
        li_element = model_name_td.find_next("li", class_="_21lJbe")
        if li_element:
            return li_element.text
    return ""

if __name__ == '__main__':
    d = {
        "title": [],
        "deal_price": [],
        "screen_type": [],
        "hdmi_count": [],
        "usb_count": [],
        "os": [],
        "smart_tv": [],
        "display_size": [],
        "hd_technology_resolution": [],
        "launch_year": [],
        "model_name": [],  # Add the model_name key
    }

    # Define the number of pages to scrape
    num_pages = 41
    items_scraped = 0  # Initialize a counter for items scraped

    # Iterate through the pages
    for page_number in range(1, num_pages + 1):
        URL = f"https://www.flipkart.com/search?q=TV&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&page={page_number}"

        retry = 3  # Number of retries in case of request failure
        while retry > 0:
            try:
                r = requests.get(URL)
                soup = BeautifulSoup(r.content, "html.parser")
                links = soup.find_all("a", class_='_1fQZEK')

                for link in links:
                    new_webpage = requests.get("https://www.flipkart.com" + link.get('href'))
                    new_soup = BeautifulSoup(new_webpage.content, "html.parser")

                    title = get_title(new_soup)
                    deal_price = get_deal_price(new_soup)
                    screen_type = get_screen_type(new_soup)
                    hdmi_count = get_hdmi_count(new_soup)
                    usb_count = get_usb_count(new_soup)
                    os = get_os(new_soup)
                    smart_tv = get_smart_tv(new_soup)
                    display_size = get_display_size(new_soup)
                    hd_technology_resolution = get_hd_technology_resolution(new_soup)
                    launch_year = get_launch_year(new_soup)
                    model_name = get_model_name(new_soup)  # Get the model name

                    # Add the extracted data to the dictionary
                    d['title'].append(title)
                    d['deal_price'].append(deal_price)
                    d['screen_type'].append(screen_type)
                    d['hdmi_count'].append(hdmi_count)
                    d['usb_count'].append(usb_count)
                    d['os'].append(os)
                    d['smart_tv'].append(smart_tv)
                    d['display_size'].append(display_size)
                    d['hd_technology_resolution'].append(hd_technology_resolution)
                    d['launch_year'].append(launch_year)
                    d['model_name'].append(model_name)  # Add the model name

                    items_scraped += 1

                # Print the current page number for tracking progress
                print(f"Scraped data from page {page_number} (Items scraped: {items_scraped})")

                # Break out of retry loop if the page was successfully scraped
                break
            except Exception as e:
                # Handle request errors
                print(f"Error while scraping page {page_number}: {str(e)}")
                retry -= 1
                if retry == 0:
                    print(f"Failed to scrape page {page_number}.")
                    break
                else:
                    print(f"Retrying page {page_number}...")
                    time.sleep(5)  # Wait for a few seconds before retrying

    df = pd.DataFrame.from_dict(d)

    df.to_csv("Dataset.csv", header=True, index=False)


Scraped data from page 1 (Items scraped: 24)
Scraped data from page 2 (Items scraped: 48)
Scraped data from page 3 (Items scraped: 72)
Scraped data from page 4 (Items scraped: 96)
Scraped data from page 5 (Items scraped: 120)
Scraped data from page 6 (Items scraped: 144)
Scraped data from page 7 (Items scraped: 168)
Scraped data from page 8 (Items scraped: 192)
Scraped data from page 9 (Items scraped: 216)
Scraped data from page 10 (Items scraped: 216)
Scraped data from page 11 (Items scraped: 240)
Scraped data from page 12 (Items scraped: 264)
Scraped data from page 13 (Items scraped: 288)
Scraped data from page 14 (Items scraped: 312)
Scraped data from page 15 (Items scraped: 336)
Scraped data from page 16 (Items scraped: 360)
Scraped data from page 17 (Items scraped: 384)
Scraped data from page 18 (Items scraped: 384)
Scraped data from page 19 (Items scraped: 408)
Scraped data from page 20 (Items scraped: 432)
Scraped data from page 21 (Items scraped: 456)
Scraped data from page 22 