In [1]:
import threading
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# ---------------------------------------------------------------------------- #
# scraping starts here # 

In [3]:
# Men's shoes:

In [4]:
# Site root; relative product hrefs taken from the listing pages are appended to it.
base_url = 'https://www.noon.com'

# One accumulator list per output column. Each successfully scraped product
# contributes one entry to every list.
(
    men_shoes_brand,
    men_shoes_name,
    men_shoes_mod_no,
    men_shoes_price,
    men_shoes_link,
) = ([] for _ in range(5))

In [5]:
# Request headers sent with every HTTP call in this notebook. The User-Agent
# value is a desktop Chrome string, split over several literals for readability.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

In [6]:
# Product pages are scraped by this function so that they can be fetched by
# worker threads in parallel with the collection of links from listing pages.

# Serialises writes to the five men_shoes_* lists. Without it, two worker
# threads could interleave their appends and leave the lists misaligned
# (e.g. product A's brand next to product B's price at the same index).
_men_shoes_lock = threading.Lock()


def scrape_product_info(link):
    """Fetch one product page and record its brand, name, model number and price.

    On success exactly one entry is appended to each of the module-level lists
    men_shoes_brand, men_shoes_name, men_shoes_mod_no, men_shoes_price and
    men_shoes_link. On any failure a message is printed and nothing is
    appended, so the lists always stay the same length.

    Args:
        link: Absolute URL of the product page to scrape.

    Returns:
        None. Results are delivered through the module-level lists.
    """
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        src_prod = requests.get(link, headers=headers, timeout=30)
        # Turn HTTP error statuses into a RequestException instead of trying
        # to parse an error page.
        src_prod.raise_for_status()
        soup2 = BeautifulSoup(src_prod.content, 'lxml')
        info_cont = soup2.find('div', {'class': 'sc-6f72a2a1-4 ewCbcV'})
        brand_name = info_cont.find('div', class_='sc-6f72a2a1-16 bUsDae').text
        shoe_name = info_cont.find('h1', class_='sc-6f72a2a1-17 xcZgf').text
        model_number_info = info_cont.find('div', class_='modelNumber').text
        # Keep the text that follows the first ':' in the model-number label.
        model_number = model_number_info.split(':')[1].strip()
        price_info = info_cont.find('div', class_='priceNow').text
        # The price text is split on a non-breaking space; the second piece is kept.
        price = price_info.split('\xa0')[1]
    except requests.RequestException as e:
        print(f"Error while processing {link}: {e}")
        return
    except (AttributeError, IndexError) as e:
        # AttributeError: an expected element was not found (find() gave None).
        # IndexError: the text did not contain the expected separator.
        print(f"Could not parse product page {link}: {e!r}")
        return

    # Append all five fields as one unit so every index describes one product.
    with _men_shoes_lock:
        men_shoes_brand.append(brand_name)
        men_shoes_name.append(shoe_name)
        men_shoes_mod_no.append(model_number)
        men_shoes_price.append(price)
        men_shoes_link.append(link)

# Walk the sneaker listing pages 1..21, collect the product links found on each
# page, and hand every link to a pool of worker threads that scrape the
# corresponding product page with scrape_product_info.
with requests.Session() as session:
    with ThreadPoolExecutor(max_workers=10) as executor:
        submitted = []  # (link, future) pairs, checked for worker errors below
        for page in range(1, 22):
            # A timeout keeps a stalled listing request from hanging the crawl.
            src_cont = session.get(f'https://www.noon.com/egypt-en/fashion/men-31225/shoes-17421/fashion-sneakers-20082/?limit=50&originalQuery=shoes&page={page}', headers=headers, timeout=30)
            src_cont.raise_for_status()  # Raise HTTPError for bad requests
            soup = BeautifulSoup(src_cont.content, 'lxml')
            list_grid = soup.find('div', {'class':'sc-926ab76d-7 eCDCTP grid'})
            if list_grid is None:
                # The product grid was not found; skip the page rather than
                # failing with AttributeError on None.
                print(f"No product grid found on listing page {page}; skipping")
                continue
            span_links = list_grid.find_all('span', class_='sc-deebe925-0 fEembb wrapper productContainer show')

            # Getting the links (cards without an <a href=...> are ignored)
            links_to_scrape = []
            for card in span_links:
                anchor = card.find('a')
                if anchor is not None and anchor.get('href'):
                    links_to_scrape.append(base_url + anchor['href'])

            # Scraping product information in parallel: each link is submitted
            # to the pool as a call to scrape_product_info(link).
            for link in links_to_scrape:
                submitted.append((link, executor.submit(scrape_product_info, link)))

        # Exceptions raised inside a worker are stored on its future and would
        # otherwise go unnoticed; report them without aborting the crawl.
        for link, future in submitted:
            error = future.exception()
            if error is not None:
                print(f"Unhandled error while scraping {link}: {error!r}")

# Now men_shoes_brand, men_shoes_name, men_shoes_mod_no, men_shoes_price, and men_shoes_link contain the scraped data.



In [7]:
# Sanity check: print the number of entries in each column list, one per line,
# in the order brand, name, model number, price, link.
for scraped_column in (
    men_shoes_brand,
    men_shoes_name,
    men_shoes_mod_no,
    men_shoes_price,
    men_shoes_link,
):
    print(len(scraped_column))


1050
1050
1050
1050
1050


In [8]:
# scraping ends here # 
# ---------------------------------------------------------------------------- #
# CSV creation through pandas starts here. #

In [9]:
# Gather the scraped column lists into a single mapping of
# column name -> values, ready to be turned into a DataFrame.
men_shoe_data = dict(
    Shoe_Brand=men_shoes_brand,
    Shoe_Name=men_shoes_name,
    Shoe_Model_Number=men_shoes_mod_no,
    Shoe_Price=men_shoes_price,
    Shoe_Link=men_shoes_link,
)

In [10]:
# Build the DataFrame: one column per key of men_shoe_data, one row per product.
shoe_pd = pd.DataFrame(data=men_shoe_data)

In [11]:
# Show the assembled DataFrame as this cell's output.
shoe_pd

Unnamed: 0,Shoe_Brand,Shoe_Name,Shoe_Model_Number,Shoe_Price,Shoe_Link
0,Nike,React Live Sneakers Black/White-Dk Smoke Grey,CV1772-003,5149.00,https://www.noon.com/egypt-en/react-live-sneak...
1,Adidas,Ultimashow Running Shoes,fx3633,2494.00,https://www.noon.com/egypt-en/ultimashow-runni...
2,Activ,Special Sports Shoes,RU23089,534.00,https://www.noon.com/egypt-en/special-sports-s...
3,Adidas,Runfalcon 3.0 Running Shoes,ID2291,3374.00,https://www.noon.com/egypt-en/runfalcon-3-0-ru...
4,Activ,Special Sports Shoes,RU23092,534.00,https://www.noon.com/egypt-en/special-sports-s...
...,...,...,...,...,...
1045,Desert,SPORTIVE canvas lace-up sneakers for men - COR...,F18,265.00,https://www.noon.com/egypt-en/sportive-canvas-...
1046,ZONTA,Zonta Men's Microfibre Fabric Lace Up Shoes,10102Nav,350.00,https://www.noon.com/egypt-en/zonta-men-s-micr...
1047,anta,EASY RUN RUNNING SHOES FOR MEN IN BLACK,812215571-6,1604.00,https://www.noon.com/egypt-en/easy-run-running...
1048,Desert,Basic Fashion Leather Flat Sneakers For Men,2057/6,239.00,https://www.noon.com/egypt-en/basic-fashion-le...


In [12]:
# Inspect the dtype of each column. In the recorded run every column,
# including Shoe_Price, is reported as object.
shoe_pd.dtypes

Shoe_Brand           object
Shoe_Name            object
Shoe_Model_Number    object
Shoe_Price           object
Shoe_Link            object
dtype: object

In [13]:
# Persist the DataFrame to men_shoes.csv in the current working directory.
# pandas' default settings are used, so the row index is written as well.
shoe_pd.to_csv(path_or_buf='men_shoes.csv')

In [14]:
# converting to csv ends here # 
# ---------------------------------------------------------------------------- #