# Importing necessary modules for web scraping and data manipulation

In [1]:
# Importing random module to generate random numbers (used to vary delays and mimic more natural browsing behavior)
import random

# Importing time module to introduce delays in the script (useful for mimicking human behavior and avoiding detection)
import time

# Importing pandas for handling and manipulating tabular data
import pandas as pd

# Importing Selenium's webdriver for browser automation
from selenium import webdriver

# Importing the exception raised when find_element cannot locate an element (used to handle optional page elements)
from selenium.common.exceptions import NoSuchElementException

# Importing By class to locate elements in the web page using different strategies (e.g., ID, class name, etc.)
from selenium.webdriver.common.by import By

# Importing the WebElement class from Selenium's remote module
from selenium.webdriver.remote.webelement import WebElement

In [2]:
# Start one Chrome session controlled through Selenium; the scraping loop
# further down reuses this single `driver` for every page it loads.
# NOTE(review): no driver.quit() is visible in this part of the notebook --
# confirm the browser session is closed once scraping has finished.
driver = webdriver.Chrome()


In [6]:
# Category label -> Myntra listing URL to scrape.
# Each URL deliberately ends with "?" or "&" so that a "sort=<value>" query
# parameter can be appended to it without any further string handling.
links_to_track = {
    "kurtas-and-suits": "https://www.myntra.com/women-kurtas-kurtis-suits?",
    "kurtis-tunics-tops": "https://www.myntra.com/ethnic-tops?",
    "Sarees": "https://www.myntra.com/saree?",
    "ethnic-wear": "https://www.myntra.com/women-ethnic-wear?",
    "dresses": "https://www.myntra.com/dresses?f=Gender%3Amen%20women%2Cwomen&",
    "tops": "https://www.myntra.com/tops?f=Gender%3Amen%20women%2Cwomen&",
    "tshirts": "https://www.myntra.com/myntra-fashion-store?f=Categories%3ATshirts%3A%3AGender%3Amen%20women%2Cwomen&",
    "jeans": "https://www.myntra.com/women-jeans?",
}

# Values used for the "sort" query parameter, one listing page per value.
# They are already URL-encoded ("%20" is an encoded space).
sort_by = [
    'popularity',
    'new',
    'discount',
    'Customer%20Rating',
]

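# Background on the two HTTP request methods most relevant to scraping:
# GET request: parameters travel in the URL query string (used for retrieving data from the server).
#   The listing pages here are fetched with GET, which is why a sort option can simply be appended to the URL.
# POST request: parameters are sent in the request body instead of the URL. This keeps them out of the
#   address bar and browser history, but it is not secure by itself without HTTPS. POST is typically used
#   to submit data and often requires authentication tokens for access.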


In [12]:
def extract_product_info(li_element: WebElement) -> dict:
    """Extract the listing details of a single product tile.

    Args:
        li_element: The element wrapping one product on a listing page.
            Its ``id`` attribute is used as the product id, and its
            descendants are looked up by class name / tag name.

    Returns:
        A dict with the keys ``id``, ``product_brand``, ``product_name``,
        ``discounted_price``, ``original_price``, ``product_rating``,
        ``product_rating_count``, ``image_url`` and ``product_url``.
        If a mandatory element is missing, or any other error occurs, the
        error is printed and an empty dict is returned, so callers should
        check the result for truthiness.
    """
    data = {}  # Stays empty if extraction fails part-way through

    try:
        # Reading this property asks the browser to scroll the element into
        # view; only that side effect is wanted, the coordinates are unused.
        _ = li_element.location_once_scrolled_into_view

        # The product id is stored in the element's own "id" attribute
        product_id = li_element.get_attribute("id")
        print("Getting the product info for id", product_id)

        # Brand and name are mandatory: if either is missing the whole
        # product is abandoned via the outer handler.
        product_brand = li_element.find_element(By.CLASS_NAME, "product-brand").text
        product_name = li_element.find_element(By.CLASS_NAME, "product-product").text

        # Ratings are optional. Only "element not found" is treated as
        # "no rating"; anything else (stale element, lost session, Ctrl-C)
        # must not be silently mistaken for a missing rating.
        try:
            rating_text = li_element.find_element(By.CLASS_NAME, "product-ratingsContainer").text
        except NoSuchElementException:
            rating_text = ""

        # The container text is expected to look like "<rating> | <count>".
        # Without a "|" there is no count, so do not reuse the rating for it.
        rating_parts = [part.strip() for part in rating_text.split('|')]
        product_rating = rating_parts[0] or "No Rating"
        if len(rating_parts) > 1 and rating_parts[-1]:
            product_rating_count = rating_parts[-1]
        else:
            product_rating_count = "0 reviews"

        # Image URL and product page URL come from the first <img> and <a> tags
        image_url = li_element.find_element(By.TAG_NAME, "img").get_attribute("src")
        product_url = li_element.find_element(By.TAG_NAME, "a").get_attribute("href")

        # A discounted product exposes both a discounted and a struck-through
        # price. If either is absent, fall back to the plain price element.
        try:
            discounted_price = li_element.find_element(By.CLASS_NAME, "product-discountedPrice").text
            original_price = li_element.find_element(By.CLASS_NAME, "product-strike").text
        except NoSuchElementException:
            original_price = li_element.find_element(By.CLASS_NAME, "product-price").text
            # 0 (an int, not a string) marks "no discount"; kept as-is so
            # previously exported CSV files stay comparable.
            discounted_price = 0

        data = {
            "id": product_id,
            "product_brand": product_brand,
            "product_name": product_name,
            "discounted_price": discounted_price,
            "original_price": original_price,
            "product_rating": product_rating,
            "product_rating_count": product_rating_count,
            "image_url": image_url,
            "product_url": product_url
        }

    except Exception as e:
        # Best effort: report the problem and let the caller carry on with
        # the next product instead of aborting the whole page.
        print(f"Error: {e}")

    return data


In [13]:
# Scrape every tracked category once per sort order and write one CSV file
# per (category, sort order) pair into the current working directory.
for key, link in links_to_track.items():
    for sort_method in sort_by:
        # Each base link already ends with "?" or "&", so the sort parameter
        # can be appended directly.
        full_link = f"{link}sort={sort_method}"
        print(f"Fetching: {key}. Link: {full_link}")

        try:
            driver.get(full_link)
            container = driver.find_element(By.CLASS_NAME, "results-base")
            li_elements = container.find_elements(By.CLASS_NAME, "product-base")

            products = []

            for li in li_elements:
                product_info = extract_product_info(li_element=li)

                # extract_product_info returns an empty dict when it fails;
                # appending it would add an all-NaN row to the CSV.
                if product_info:
                    products.append(product_info)

                # Random delay to avoid being flagged as a bot
                time.sleep(random.uniform(0.3, 2))

            # A DataFrame built from an empty list has no columns, and the
            # resulting file cannot be read back with pd.read_csv. Skipping
            # the export also avoids overwriting an earlier, good file.
            if not products:
                print(f"No products extracted for {key} with sort method {sort_method}; skipping CSV export")
                continue

            # Save the results into a DataFrame and export to CSV
            df = pd.DataFrame(products)
            df.to_csv(f"{key}_{sort_method}.csv", index=False)

        except Exception as e:
            # Keep going with the next category/sort combination on failure
            print(f"Error occurred while fetching data for {key} with sort method {sort_method}: {e}")

Fetching: kurtas-and-suits. Link: https://www.myntra.com/women-kurtas-kurtis-suits?sort=popularity
Getting the product info for id 29726958
Getting the product info for id 27950524
Getting the product info for id 29629130
Getting the product info for id 31141456
Getting the product info for id 19674622
Getting the product info for id 23158876
Getting the product info for id 30938093
Getting the product info for id 30271855
Getting the product info for id 20953150
Getting the product info for id 24838938
Getting the product info for id 20695836
Getting the product info for id 30679173
Getting the product info for id 26975612
Getting the product info for id 25234310
Getting the product info for id 30829353
Getting the product info for id 24275388
Getting the product info for id 28748086
Getting the product info for id 28175930
Getting the product info for id 30259206
Getting the product info for id 22290378
Getting the product info for id 31764827
Getting the product info for id 30086165