In [34]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

def _find_property_value(product_soup, label_text):
    """Return the stripped text of the property whose label contains ``label_text``, or "N/A"."""
    label = product_soup.find(
        "div",
        class_="detail__properties-label",
        # The matcher receives the tag's `.string`, which can be None (e.g. a
        # label with nested markup); guard so `in` never sees None.
        string=lambda text: text is not None and label_text in text,
    )
    if label is None:
        return "N/A"
    value = label.find_next_sibling("div", class_="detail__properties-link")
    return value.text.strip() if value else "N/A"


def fetch_product_details(links, base_url="https://www.lumberjack.com", timeout=30):
    """Scrape name, price, colour and material for every product on the given listing pages.

    Each listing page is downloaded, every ``div.product__name`` that sits
    inside an ``<a href=...>`` is followed, and the linked product page is
    parsed into one dict.

    Args:
        links: Iterable of listing-page URLs.
        base_url: Base used to resolve the product hrefs found on the listing
            pages. Absolute hrefs are used unchanged.
        timeout: Seconds passed to ``requests.get`` for every request, so a
            stalled connection cannot block the scrape indefinitely.

    Returns:
        A list of dicts, one per product, always carrying the keys ``name``,
        ``price``, ``color`` and ``materiale``. A field that cannot be located
        on the product page is stored as the string ``"N/A"``.
    """
    category_data = []

    for link in links:
        listing_page = requests.get(link, timeout=timeout)
        listing_soup = BeautifulSoup(listing_page.text, "html.parser")

        for product_div in listing_soup.find_all("div", class_="product__name"):
            anchor = product_div.find_parent("a")
            if not anchor or "href" not in anchor.attrs:
                # Product tile without a link: nothing to follow.
                continue

            product_url = urljoin(base_url, anchor["href"])
            product_page = requests.get(product_url, timeout=timeout)
            product_soup = BeautifulSoup(product_page.text, "html.parser")

            name_tag = product_soup.find("h1", class_="product__name")
            price_tag = product_soup.find("span", class_="product__prices-sale")

            # Every key is always present so the rows form a rectangular table.
            category_data.append({
                "name": name_tag.text.strip() if name_tag else "N/A",
                "price": price_tag.text.strip() if price_tag else "N/A",
                "color": _find_property_value(product_soup, "Colore"),
                "materiale": _find_property_value(product_soup, "Materiale"),
            })

    return category_data

# Category listing pages to scrape.
links = [
    "https://www.lumberjack.com/it/sport_shoes?gender=man",
    "https://www.lumberjack.com/it/boat-shoes?gender=man",
    "https://www.lumberjack.com/it/slip_on?gender=woman",
    "https://www.lumberjack.com/it/sandals?gender=woman",
    "https://www.lumberjack.com/it/ankle-boots?gender=woman",
    "https://www.lumberjack.com/it/beatles?gender=woman",
    "https://www.lumberjack.com/it/outdoor?gender=woman"
]

product_details = fetch_product_details(links)
product_df = pd.DataFrame(product_details)

# Strip the "EUR" suffix (and any whitespace before it), switch the decimal
# comma to a dot, then convert to float; unparseable values become NaN.
price_numeric = pd.to_numeric(
    product_df['price'].astype(str)
    .str.replace(r'\s*EUR', '', regex=True)
    .str.replace(',', '.', regex=False),
    errors='coerce',
)

# Title-case the name and drop the gender suffix BEFORE deriving Category/Type:
# taking the last word while 'Uomo'/'Donna' is still present would yield the
# gender instead of the product category.
name_clean = (
    product_df['name'].str.title()
    .str.replace(' Uomo', '', regex=False)
    .str.replace(' Donna', '', regex=False)
)
name_tokens = name_clean.str.split()

# Build the cleaned frame in one step instead of mutating columns in place.
product_df = product_df.assign(
    name=name_clean,
    price=price_numeric,
    color=product_df['color'].str.title(),
    materiale=product_df['materiale'].str.title(),
    # Category is the last word of the cleaned name.
    Category=name_tokens.str[-1],
    # Type is the second word of the cleaned name, 'N/A' when there is none.
    Type=name_tokens.str[1].fillna('N/A'),
)

# Put 'Category' first, followed by 'name' and 'Type'; keep the rest in order.
leading_cols = ['Category', 'name', 'Type']
product_df = product_df[leading_cols + [col for col in product_df.columns if col not in leading_cols]]

product_df



Unnamed: 0,name,price,color,materiale
0,MARVIN SNEAKERS UOMO,"64,99 EUR",White,SUEDE-SYNTHETIC SMOOTH
1,MARVIN SNEAKERS UOMO,"64,99 EUR",Navy blue,SUEDE-SYNTHETIC SMOOTH
2,MARVIN SNEAKERS UOMO,"69,99 EUR",White,LEATHER
3,MARVIN SNEAKERS UOMO,"69,99 EUR",Navy blue,LEATHER
4,WARNER SNEAKERS UOMO,"79,99 EUR",Navy blue/bluette,SUEDE-MESH
...,...,...,...,...
127,ELECTRIC SCARPE OUTDOOR DONNA,"34,99 EUR",Black,NYLON-NOBESTO LEATHER
128,VABORY OUTDOOR DONNA,"59,99 EUR",Black,MESH-SYNTHETIC SMOOTH
129,STOWE Scarponcino outdoor Donna,"34,99 EUR",Black,SYNTHETIC SMOOTH
130,MODESTA OUTDOOR DONNA,"39,99 EUR",Mid grey,SOFTSHELL-SYNTHETIC


In [36]:

# Cleaning step for product_df. It works on raw scraped values and is also safe
# to re-run on an already cleaned frame, because every column is recomputed
# from the current frame and the result is rebound instead of mutated in place.

# Strip the "EUR" suffix (and any whitespace before it), switch the decimal
# comma to a dot, then convert to float; unparseable values become NaN.
price_as_text = product_df['price'].astype(str)
price_as_text = price_as_text.str.replace(r'\s*EUR', '', regex=True)
price_as_text = price_as_text.str.replace(',', '.', regex=False)

# Title-case the name and remove the gender suffix first, so that the words
# extracted below describe the product rather than 'Uomo'/'Donna'.
titled_name = product_df['name'].str.title()
titled_name = titled_name.str.replace(' Uomo', '', regex=False)
titled_name = titled_name.str.replace(' Donna', '', regex=False)
words = titled_name.str.split()

product_df = product_df.assign(
    price=pd.to_numeric(price_as_text, errors='coerce'),
    name=titled_name,
    color=product_df['color'].str.title(),
    materiale=product_df['materiale'].str.title(),
    # Category: last word of the cleaned name.
    Category=words.str[-1],
    # Type: second word of the cleaned name, 'N/A' for single-word names.
    Type=words.str[1].fillna('N/A'),
)

# Column order: 'Category', 'name', 'Type', then everything else as it was.
front = ['Category', 'name', 'Type']
product_df = product_df[front + [col for col in product_df.columns if col not in front]]

product_df



Unnamed: 0,Category,name,Type,price,color,materiale
0,Sneakers,Marvin Sneakers,Sneakers,64.99,White,Suede-Synthetic Smooth
1,Sneakers,Marvin Sneakers,Sneakers,64.99,Navy Blue,Suede-Synthetic Smooth
2,Sneakers,Marvin Sneakers,Sneakers,69.99,White,Leather
3,Sneakers,Marvin Sneakers,Sneakers,69.99,Navy Blue,Leather
4,Sneakers,Warner Sneakers,Sneakers,79.99,Navy Blue/Bluette,Suede-Mesh
...,...,...,...,...,...,...
127,Outdoor,Electric Scarpe Outdoor,Scarpe,34.99,Black,Nylon-Nobesto Leather
128,Outdoor,Vabory Outdoor,Outdoor,59.99,Black,Mesh-Synthetic Smooth
129,Outdoor,Stowe Scarponcino Outdoor,Scarponcino,34.99,Black,Synthetic Smooth
130,Outdoor,Modesta Outdoor,Outdoor,39.99,Mid Grey,Softshell-Synthetic
