In [1]:
from concurrent.futures import ThreadPoolExecutor
import threading

import pandas as pd
from requests_html import HTMLSession

In [2]:
# ---------------------------------------------------------------------------- #
# scraping starts here # 

In [3]:
# Site root; product hrefs found on listing pages are appended to this.
base_url = 'https://www.noon.com'

# One accumulator list per output column. Each list must be its own object,
# so they are created by a generator rather than by chained assignment.
(
    women_shoes_brand,
    women_shoes_name,
    women_shoes_mod_no,
    women_shoes_price,
    women_shoes_link,
) = ([] for _ in range(5))

In [4]:
# Request headers sent with every call: a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

In [5]:
# One HTMLSession shared by the product scraper and the listing-page loop.
session = HTMLSession()

# Copy each configured header onto the session so every request sends it.
for header_name, header_value in headers.items():
    session.headers[header_name] = header_value

# Guards the five result lists. A product must land at the same index in every
# list, so one product's appends must not interleave with another thread's.
_results_lock = threading.Lock()


def scrape_product_info(link):
    """Fetch one product page and record its details in the ``women_shoes_*`` lists.

    Parameters
    ----------
    link : str
        Absolute URL of the product page to scrape.

    Returns
    -------
    None
        On success, one entry is appended to each of ``women_shoes_brand``,
        ``women_shoes_name``, ``women_shoes_mod_no``, ``women_shoes_price`` and
        ``women_shoes_link``. On failure a message is printed and nothing is
        appended, so the five lists always stay the same length.
    """
    try:
        response = session.get(link)
        info_cont = response.html.find('div.sc-6f72a2a1-4.ewCbcV', first=True)

        brand_name = info_cont.find('div.sc-6f72a2a1-16.bUsDae', first=True).text
        shoe_name = info_cont.find('h1.sc-6f72a2a1-17.xcZgf', first=True).text
        # Text before the first ':' is the label; the part after it is the value.
        model_number_info = info_cont.find('div.modelNumber', first=True).text
        model_number = model_number_info.split(':')[1].strip()
        # The price text is split on a non-breaking space; the second piece is kept.
        price_info = info_cont.find('div.priceNow', first=True).text
        price = price_info.split('\xa0')[1]
    except (OSError, AttributeError, IndexError) as e:
        # OSError: requests' RequestException derives from IOError/OSError, so
        #   network failures are caught without needing the `requests` name,
        #   which this notebook never imports.
        # AttributeError: an expected element was missing (find(...) gave None).
        # IndexError: the model-number or price text had no separator.
        print(f"Error while processing {link}: {e}")
        return

    # Append all five fields as one unit so rows cannot get misaligned when
    # several worker threads finish at the same time.
    with _results_lock:
        women_shoes_brand.append(brand_name)
        women_shoes_name.append(shoe_name)
        women_shoes_mod_no.append(model_number)
        women_shoes_price.append(price)
        women_shoes_link.append(link)

# Crawl listing pages 1-31 and scrape every product they link to.
# The shared session is used for the listing requests as well; leaving the
# outer ``with`` closes it, which only happens after the executor has
# finished every queued product scrape.
with session:
    with ThreadPoolExecutor(max_workers=10) as executor:
        # future -> product link, kept so that failures inside worker threads
        # can be reported instead of being silently discarded.
        pending = {}

        for x in range(1, 32):
            response = session.get(f'https://www.noon.com/egypt-en/fashion/women-31229/shoes-16238/fashion-sneakers-24738/?limit=50&originalQuery=shoes&page={x}')
            response.raise_for_status()  # Raise HTTPError for bad requests
            list_grid = response.html.find('div.sc-926ab76d-7.eCDCTP.grid', first=True)
            if list_grid is None:
                # find(..., first=True) gives None when nothing matches.
                print(f"No product grid found on listing page {x}; skipping it.")
                continue
            span_links = list_grid.find('span.sc-deebe925-0.fEembb.wrapper.productContainer.show')

            # Getting the links (containers without a usable anchor are skipped)
            links_to_scrape = []
            for span in span_links:
                anchor = span.find('a', first=True)
                if anchor is not None and 'href' in anchor.attrs:
                    links_to_scrape.append(base_url + anchor.attrs['href'])

            # Scraping product information in parallel: one task per product link.
            for link in links_to_scrape:
                pending[executor.submit(scrape_product_info, link)] = link

        # Future.exception() waits for the task to finish and returns whatever
        # it raised (or None), so worker failures become visible here.
        for future, link in pending.items():
            error = future.exception()
            if error is not None:
                print(f"Unhandled error while processing {link}: {error!r}")

# Now, women_shoes_brand, women_shoes_name, women_shoes_mod_no, women_shoes_price, and women_shoes_link contain the scraped data.

In [6]:
# Sanity check: print the size of each result list, one per line, in column order.
for scraped_column in (
    women_shoes_brand,
    women_shoes_name,
    women_shoes_mod_no,
    women_shoes_price,
    women_shoes_link,
):
    print(len(scraped_column))

1488
1488
1488
1488
1488


In [7]:
# scraping ends here # 
# ---------------------------------------------------------------------------- #
# CSV creation through pandas starts here. #

In [9]:
# Pair each output column name with its list of scraped values; this mapping
# is the input for the DataFrame constructor.
women_shoe_data = dict([
    ('Shoe_Brand', women_shoes_brand),
    ('Shoe_Name', women_shoes_name),
    ('Shoe_Model_Number', women_shoes_mod_no),
    ('Shoe_Price', women_shoes_price),
    ('Shoe_Link', women_shoes_link),
])

In [10]:
# Create the DataFrame from the women's-shoe dictionary built above.
# (The previous name, men_shoe_data, is not defined anywhere in this notebook.)
shoe_pd = pd.DataFrame(women_shoe_data)

In [11]:
# Show the frame as this cell's output to eyeball the scraped rows.
shoe_pd

Unnamed: 0,Shoe_Brand,Shoe_Name,Shoe_Model_Number,Shoe_Price,Shoe_Link
0,Nike,Tanjun Running Shoes Wolf Grey/White,812655-010,4269.00,https://www.noon.com/egypt-en/tanjun-running-s...
1,Nike,Womens Tanjun Sneaker,NK812655-110,2999.00,https://www.noon.com/egypt-en/womens-tanjun-sn...
2,CONVERSE,Chuck 70 Sneakers Yellow,162063C,3289.00,https://www.noon.com/egypt-en/chuck-70-sneaker...
3,Desert,Canvas Slip On Sock Sneakers Black,WR302/4,169.00,https://www.noon.com/egypt-en/canvas-slip-on-s...
4,Desert,Canvas Slip On Sock Sneakers Kashmeer,WR302/4,169.00,https://www.noon.com/egypt-en/canvas-slip-on-s...
...,...,...,...,...,...
1483,SKECHERS,Skech-Air Dynamight,149755-NVBL,2654.00,https://www.noon.com/egypt-en/skech-air-dynami...
1484,VANS,Era Low Top Sneakers Black,ERA-BLACK,2834.00,https://www.noon.com/egypt-en/era-low-top-snea...
1485,Adidas,Ultraboost Light Running Shoes,GZ5159,10349.00,https://www.noon.com/egypt-en/ultraboost-light...
1486,Desert,Sportive Lace-Up Sneakers For Women - Black,WF18,249.00,https://www.noon.com/egypt-en/sportive-lace-up...


In [14]:
# Write the DataFrame to women_shoes.csv in the working directory.
# No index option is passed, so pandas' default applies and the row index is
# written as the first column.
shoe_pd.to_csv(path_or_buf='women_shoes.csv')

In [15]:
# converting to csv ends here # 
# ---------------------------------------------------------------------------- #