# Importing necessary modules for web scraping and data manipulation

In [1]:
# Importing Selenium's webdriver for browser automation
from selenium import webdriver

# Importing By class to locate elements in the web page using different strategies (e.g., ID, class name, etc.)
from selenium.webdriver.common.by import By

# Importing the WebElement class from Selenium's remote module
from selenium.webdriver.remote.webelement import WebElement

# Importing pandas for handling and manipulating tabular data
import pandas as pd

# Importing time module to introduce delays in the script (useful for mimicking human behavior and avoiding detection)
import time

# Importing random module to generate random numbers (used to vary delays and mimic more natural browsing behavior)
import random


In [65]:
# Start a Chrome browser session controlled by Selenium. Every later cell that
# calls `driver.find_element(...)` operates on this one shared instance.
driver = webdriver.Chrome()

# The browser is left open here because the scraping cells below still need it.
# Un-comment and run the line below once scraping is finished to close the
# window and release the browser process.
#driver.quit()

In [66]:
# Category name -> base listing URL on Myntra.
# Every URL ends in '?' or '&' so that a "sort=<value>" query parameter can be
# appended to it with plain string concatenation (see the loops further down).
links_to_track = {
   "Kurtas & Suits": "https://www.myntra.com/women-kurtas-kurtis-suits?",
   "Kurtis, Tunics & Tops": "https://www.myntra.com/ethnic-tops?",
   "Sarees": "https://www.myntra.com/saree?",
   "Ethnic Wear": "https://www.myntra.com/women-ethnic-wear?",
   "Dresses": "https://www.myntra.com/dresses?f=Gender%3Amen%20women%2Cwomen&",
   "Tops": "https://www.myntra.com/tops?f=Gender%3Amen%20women%2Cwomen&",
   "Tshirts": "https://www.myntra.com/myntra-fashion-store?f=Categories%3ATshirts%3A%3AGender%3Amen%20women%2Cwomen&",
   "Jeans": "https://www.myntra.com/women-jeans?",
}

# Values for the "sort" query parameter that get appended to the base URLs above.
# 'Customer%20Rating' is already percent-encoded (%20 is an encoded space).
sort_by = ['popularity', 'new', 'discount', 'Customer%20Rating']

# Why the sort order can be selected just by editing the URL:
# GET request: parameters travel in the URL's query string, so they are visible and can be changed by rewriting the URL (as done with "sort=" here).
# POST request: parameters travel in the request body, so they do not appear in the URL; this hides them from the address bar but does not by itself make the request secure.


In [68]:
# Preview the URL that will be requested for every tracked category
for key, link in links_to_track.items():

    # Walk the sort options by position; the unconditional break below means
    # only position 0 ('popularity') is ever used
    for sort_method, sort_value in enumerate(sort_by):

        # The base URL already ends in '?' or '&', so the parameter is glued straight on
        link += f"sort={sort_value}"

        # Show which category / URL pair was built
        print(f"Fetching: {key}. Link: {link}. ")

        # Stop after the first sort option
        break


Fetching: Kurtas & Suits. Link: https://www.myntra.com/women-kurtas-kurtis-suits?sort=popularity. 
Fetching: Kurtis, Tunics & Tops. Link: https://www.myntra.com/ethnic-tops?sort=popularity. 
Fetching: Sarees. Link: https://www.myntra.com/saree?sort=popularity. 
Fetching: Ethnic Wear. Link: https://www.myntra.com/women-ethnic-wear?sort=popularity. 
Fetching: Dresses. Link: https://www.myntra.com/dresses?f=Gender%3Amen%20women%2Cwomen&sort=popularity. 
Fetching: Tops. Link: https://www.myntra.com/tops?f=Gender%3Amen%20women%2Cwomen&sort=popularity. 
Fetching: Tshirts. Link: https://www.myntra.com/myntra-fashion-store?f=Categories%3ATshirts%3A%3AGender%3Amen%20women%2Cwomen&sort=popularity. 
Fetching: Jeans. Link: https://www.myntra.com/women-jeans?sort=popularity. 


In [60]:
# Function to extract product details from a given li_element (WebElement)
def extract_product_info(li_element: WebElement):
    """Scrape one product card into a flat dictionary.

    Parameters
    ----------
    li_element : WebElement
        Element representing a single product card.

    Returns
    -------
    dict
        On success, a dict with the keys ``id``, ``product_brand``,
        ``product_name``, ``discounted_price``, ``original_price``,
        ``product_rating``, ``product_rating_count``, ``image_url`` and
        ``product_url``. All scraped values are the raw strings shown on the
        page. Optional fields degrade gracefully instead of discarding the card:

        * ``discounted_price`` is ``0`` when the card shows no discounted price.
        * ``original_price`` is ``None`` when no price element is found.
        * ``product_rating`` / ``product_rating_count`` are ``None`` when the
          card has no ratings container.

        An empty dict is returned (and the error printed) when a required
        field (brand, name, image, link) cannot be read.
    """

    def _optional_text(class_name):
        # find_elements returns an empty list instead of raising when nothing
        # matches, which makes "field is absent" an ordinary case.
        matches = li_element.find_elements(By.CLASS_NAME, class_name)
        return matches[0].text if matches else None

    data = {}  # Stays empty if a required field cannot be extracted

    try:
        # Reading this property makes Selenium scroll the element into view;
        # the returned coordinates are not needed.
        li_element.location_once_scrolled_into_view

        # The card's 'id' attribute is used as the product identifier
        product_id = li_element.get_attribute("id")
        print("Getting the product info for id", product_id)

        # Required fields: a missing one raises and is handled by the outer except
        product_brand = li_element.find_element(By.CLASS_NAME, "product-brand").text
        product_name = li_element.find_element(By.CLASS_NAME, "product-product").text
        image_url = li_element.find_element(By.TAG_NAME, "img").get_attribute("src")
        product_url = li_element.find_element(By.TAG_NAME, "a").get_attribute("href")

        # Ratings are optional. The container text is split on '|': the first
        # piece is the rating and the last piece is the rating count.
        product_rating = None
        product_rating_count = None
        rating_text = _optional_text("product-ratingsContainer")
        if rating_text:
            rating_parts = [part.strip() for part in rating_text.split('|')]
            product_rating = rating_parts[0]
            if len(rating_parts) > 1:
                product_rating_count = rating_parts[-1]

        # Prices are optional too. Keep the original sentinel of 0 for a
        # missing discounted price so existing consumers of the CSV still work.
        discounted_text = _optional_text("product-discountedPrice")
        discounted_price = discounted_text if discounted_text is not None else 0

        original_price = _optional_text("product-strike")
        if original_price is None:
            # NOTE(review): assumes cards without a strike-through price render
            # their price inside an element with class "product-price" —
            # TODO confirm against the live markup. If that element is absent
            # too, original_price simply stays None.
            original_price = _optional_text("product-price")

        # Organize extracted data into a dictionary for easy access and further use
        data = {
            "id": product_id,
            "product_brand": product_brand,
            "product_name": product_name,
            "discounted_price": discounted_price,
            "original_price": original_price,
            "product_rating": product_rating,
            "product_rating_count": product_rating_count,
            "image_url": image_url,
            "product_url": product_url
        }

    except Exception as e:
        # Best-effort scraping: report the problem and return the empty dict
        print(f"Error: {e}")

    return data


In [None]:
# Load the listing page whose URL was built by the preview loop, so the scraped
# content matches the `key` / `sort_method` used to name the CSV afterwards and
# the cell also works on a freshly started browser.
driver.get(link)

# Give the page a short, randomized moment to render before querying it
time.sleep(random.uniform(2, 4))

# Fetch the results container and every product card inside it
container = driver.find_element(By.CLASS_NAME, "results-base")
li_elements = container.find_elements(By.CLASS_NAME, "product-base")

products = []

for li in li_elements:
    product_info = extract_product_info(li_element=li)

    # extract_product_info returns {} when extraction failed; skipping those
    # avoids all-empty rows in the resulting table
    if product_info:
        products.append(product_info)

    # Small pause between cards while they are scrolled into view
    time.sleep(0.5)

# Summarize instead of dumping every record into the cell output
print(f"Collected {len(products)} of {len(li_elements)} products from {link}")

In [58]:
# Turn the scraped records into a table and save it as "<category>_<sort option>.csv"
df = pd.DataFrame(products)
csv_name = f"{key}_{sort_by[sort_method]}.csv"
df.to_csv(csv_name)

In [64]:
#main code

# Visit every (category, sort option) listing page, scrape its product cards
# and write one CSV per page named "<category>_<sort option>.csv".
for key, link in links_to_track.items():
    for sort_method, sort_value in enumerate(sort_by):
        # Build the URL from the untouched base link on every iteration.
        # Re-assigning `link` here would pile the parameters up
        # ("...sort=popularitysort=new...") from the second sort option on.
        page_url = f"{link}sort={sort_value}"
        print(f"fetching: {key}. Link: {page_url}. ")

        # Navigate to the page; without this every iteration would scrape
        # whatever page the browser happened to be showing.
        driver.get(page_url)

        # Randomized pause so the page can render and requests are spaced out
        time.sleep(random.uniform(2, 4))

        # find_elements returns [] instead of raising, so one page without a
        # results container does not abort the remaining pages.
        containers = driver.find_elements(By.CLASS_NAME, "results-base")
        if not containers:
            print(f"No results container found for {page_url}; skipping.")
            continue

        container = containers[0]
        li_elements = container.find_elements(By.CLASS_NAME, "product-base")

        products = []

        for li in li_elements:
            product_info = extract_product_info(li_element=li)

            # extract_product_info returns {} when extraction failed; skip
            # those so the CSV has no all-empty rows
            if product_info:
                products.append(product_info)

            # Small pause between cards while they are scrolled into view
            time.sleep(0.5)

        # Summarize instead of dumping every record into the cell output
        print(f"Collected {len(products)} of {len(li_elements)} products.")

        df = pd.DataFrame(products)
        df.to_csv(f"{key}_{sort_value}.csv")

    




fetching: Kurtas & Suits. Link: https://www.myntra.com/women-kurtas-kurtis-suits?sort=popularity. 
getting the product info for id 29726958
getting the product info for id 27950524
getting the product info for id 29629130
getting the product info for id 19674622
getting the product info for id 31141456
getting the product info for id 30938093
getting the product info for id 23158876
getting the product info for id 30271855
getting the product info for id 20953150
getting the product info for id 28175930
getting the product info for id 20695836
getting the product info for id 24838938
getting the product info for id 30679173
getting the product info for id 26975612
getting the product info for id 25234310
getting the product info for id 30829353
getting the product info for id 28748086
getting the product info for id 30259206
getting the product info for id 31764827
getting the product info for id 22290378
getting the product info for id 30086165
getting the product info for id 25405732