In [40]:
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
import pandas as pd

def fetch_product_page(session, url, timeout=10):
    """Download one product page and return it parsed with BeautifulSoup.

    Args:
        session: Object exposing a requests-style ``get(url, timeout=...)``.
        url: Address of the product page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a single stalled connection blocks its worker forever.

    Returns:
        The parsed page, or ``None`` when the download fails or the server
        answers with an HTTP error status.
    """
    try:
        product_page = session.get(url, timeout=timeout)
        # An error page (4xx/5xx) is not a product page; parsing it would
        # only yield a row full of "N/A" placeholders.
        product_page.raise_for_status()
        return BeautifulSoup(product_page.text, "html.parser")
    except Exception as e:
        # Deliberately broad: this function is submitted to a thread pool
        # and one bad page must not abort the rest of the crawl. The
        # failure is reported and signalled to the caller with None.
        print(f"Error fetching {url}: {e}")
        return None

def parse_product_details(product_soup):
    """Extract name, price, colour and material from a parsed product page.

    Args:
        product_soup: Parsed product page (anything exposing a
            BeautifulSoup-style ``find``), or ``None`` when the page could
            not be fetched.

    Returns:
        ``None`` when ``product_soup`` is ``None``; otherwise a dict that
        always has the keys ``"name"``, ``"price"``, ``"color"`` and
        ``"materiale"``. A field that cannot be located is ``"N/A"``.
    """
    if product_soup is None:
        return None

    def _text_or_na(tag):
        # Stripped text of a tag, or the placeholder when the tag is missing.
        return tag.text.strip() if tag else "N/A"

    def _property_value(label_text):
        # Value shown next to the properties label containing `label_text`.
        label = product_soup.find(
            "div",
            class_="detail__properties-label",
            # Guard against None: a tag without a single string child has
            # no string to test, and `label_text in None` raises TypeError.
            string=lambda t: t is not None and label_text in t,
        )
        if not label:
            return "N/A"
        return _text_or_na(
            label.find_next_sibling("div", class_="detail__properties-link")
        )

    # Every key is always present so all rows share the same columns.
    return {
        "name": _text_or_na(product_soup.find("h1", class_="product__name")),
        "price": _text_or_na(
            product_soup.find("span", class_="product__prices-sale")
        ),
        "color": _property_value("Colore"),
        "materiale": _property_value("Materiale"),
    }

def fetch_product_details(links, max_workers=5, timeout=10):
    """Collect product details for every product linked from the given pages.

    Each page in ``links`` is downloaded sequentially and scanned for
    ``div.product__name`` elements wrapped in an ``<a href=...>``. The linked
    product pages are then fetched through a thread pool and parsed.

    Args:
        links: Iterable of listing-page URLs to scan for product links.
        max_workers: Size of the thread pool used for product pages.
        timeout: Seconds to wait for each listing page before giving up.

    Returns:
        A list with one details dict per product page that could be fetched
        and parsed, in completion order (not submission order).
    """
    # Local import: keeps this function usable without touching the
    # file-level import block.
    from urllib.parse import urljoin

    base_url = "https://www.lumberjack.com"
    category_data = []

    with requests.Session() as session, \
            ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []

        for link in links:
            try:
                page = session.get(link, timeout=timeout)
                page.raise_for_status()
            except requests.RequestException as e:
                # Same best-effort policy as for product pages: report the
                # broken listing page and carry on with the remaining ones.
                print(f"Error fetching {link}: {e}")
                continue

            soup = BeautifulSoup(page.text, "html.parser")
            for product_div in soup.find_all("div", class_="product__name"):
                product_link = product_div.find_parent("a")
                if product_link and 'href' in product_link.attrs:
                    # urljoin resolves site-relative hrefs against the base
                    # and leaves already-absolute hrefs untouched.
                    product_url = urljoin(base_url, product_link['href'])
                    futures.append(
                        executor.submit(fetch_product_page, session, product_url)
                    )

        for future in as_completed(futures):
            product_details = parse_product_details(future.result())
            if product_details is not None:
                category_data.append(product_details)

    return category_data

# Listing pages handed to the crawler; each one is scanned for product links.
links = [
    "https://www.lumberjack.com/it/sport_shoes?gender=man",
    "https://www.lumberjack.com/it/boat-shoes?gender=man",
    "https://www.lumberjack.com/it/slip_on?gender=woman",
    "https://www.lumberjack.com/it/sandals?gender=woman",
    "https://www.lumberjack.com/it/ankle-boots?gender=woman",
    "https://www.lumberjack.com/it/beatles?gender=woman",
    "https://www.lumberjack.com/it/outdoor?gender=woman",
]

# Crawl the pages and build one DataFrame row per scraped product.
lumberjack_details = fetch_product_details(links)
scraped_rows = list(filter(lambda row: row is not None, lumberjack_details))
lumberjack_df = pd.DataFrame(scraped_rows)

# Price text -> number: drop the "EUR" suffix, switch the decimal comma to a
# dot, and let anything still unparsable become NaN.
price_text = lumberjack_df['price'].astype(str)
price_text = price_text.str.replace(r'\s*EUR', '', regex=True)
price_text = price_text.str.replace(',', '.')
lumberjack_df['price'] = pd.to_numeric(price_text, errors='coerce')

# Break the product name into words once: the last word becomes Category,
# the first becomes Name, and whatever lies in between becomes Type
# ('N/A' when the name has at most two words).
name_words = lumberjack_df['name'].apply(lambda full_name: full_name.split())
lumberjack_df['Category'] = name_words.apply(lambda words: words[-1])
lumberjack_df['Name'] = name_words.apply(lambda words: words[0])
lumberjack_df['Type'] = name_words.apply(
    lambda words: ' '.join(words[1:-1]) if len(words) > 2 else 'N/A'
)

# Title-case every text column.
for text_column in ('Category', 'Name', 'Type', 'color', 'materiale'):
    lumberjack_df[text_column] = lumberjack_df[text_column].str.title()

# Keep only the presentation columns, in display order.
lumberjack_df = lumberjack_df[['Category', 'Type', 'Name', 'price', 'color', 'materiale']]

lumberjack_df




Unnamed: 0,Category,Type,Name,price,color,materiale
0,Uomo,Scarpe Da Barca,Navigator,89.99,Navy Blue,Nubuck
1,Uomo,Sneakers,Warner,79.99,Navy Blue/Bluette,Suede-Mesh
2,Uomo,Scarpe Da Barca,Navigator,89.99,Navy Blue/White,Pull-Up Leather
3,Uomo,Sneaker,Lee,59.99,Black/Yellow,Suede-Mesh
4,Uomo,Sneaker,Lee,59.99,Grey/Orange,Suede-Mesh
...,...,...,...,...,...,...
127,Donna,Outdoor,Brontes,49.99,Black/Grey,Synthetic Smooth-Mesh
128,Donna,Scarponcino Outdoor,Stowe,34.99,Black,Synthetic Smooth
129,Donna,Outdoor,Vabory,59.99,Black,Mesh-Synthetic Smooth
130,Donna,Outdoor,Modesta,39.99,Mid Grey,Softshell-Synthetic
