In [5]:
pip install pandas requests tqdm

[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try apt install
[31m   [0m python3-xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian-packaged Python package,
[31m   [0m create a virtual environment using python3 -m venv path/to/venv.
[31m   [0m Then use path/to/venv/bin/python and path/to/venv/bin/pip. Make
[31m   [0m sure you have python3-full installed.
[31m   [0m 
[31m   [0m If you wish to install a non-Debian packaged Python application,
[31m   [0m it may be easiest to use pipx install xyz, which will manage a
[31m   [0m virtual environment for you. Make sure you have pipx installed.
[31m   [0m 
[31m   [0m See /usr/share/doc/python3.12/README.venv for more information.

[1;35mnote[0m: If you believe this is a mistake, please contact your Python installation or OS

In [1]:
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed

import pandas as pd
import requests
from tqdm import tqdm

# --- 1. PATH AND PARAMETER CONFIGURATION ---

# Size of the thread pool used for the parallel downloads
MAX_WORKERS = 20

# Absolute path of the project root: the parent of the current working directory
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Input dataset and destination folder for the downloaded images
CSV_PATH = os.path.join(PROJECT_ROOT, "data", "raw", "product_dataset.csv")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data", "images", "products")

# Make sure the destination folder is present (no error if it already exists)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fail early, with a message pointing at the expected location, if the CSV is missing
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(
        f"CSV non trovato in: {CSV_PATH}\n"
        f"Directory corrente: {os.getcwd()}\n"
        f"Assicurati che 'product_dataset.csv' sia in '{os.path.join(PROJECT_ROOT, 'data', 'raw')}'"
    )

# --- 2. SINGLE-IMAGE DOWNLOAD FUNCTION ---
def download_image(row, timeout=10):
    """Download one product image into OUTPUT_DIR as ``<asset_id>.jpg``.

    The response body is first written to a temporary ``.part`` file and then
    renamed onto the final path, so an interrupted or failed write never
    leaves a truncated ``.jpg`` that a later run would treat as complete.

    Args:
        row: Record indexable by 'product_asset_id' and 'product_image_url'
            (main() passes the rows yielded by DataFrame.iterrows()).
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        True if the target file already exists or was downloaded and saved;
        False if the request, the HTTP status check, or the write failed.
    """
    asset_id = row['product_asset_id']
    url = row['product_image_url']

    save_path = os.path.join(OUTPUT_DIR, f"{asset_id}.jpg")

    # Already on disk: nothing to do.
    if os.path.exists(save_path):
        return True

    # Per-thread temp name so two workers handling the same asset id never
    # write into the same partial file.
    tmp_path = f"{save_path}.{threading.get_ident()}.part"

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        with open(tmp_path, 'wb') as f:
            f.write(response.content)
        # Publish the finished file in one step (atomic on POSIX).
        os.replace(tmp_path, save_path)
        return True
    except Exception:
        # Best-effort download: report failure to the caller instead of
        # raising, but do not leave a partial file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False

# --- 3. MULTITHREADED EXECUTION ---
def main():
    """Load the product CSV and download every listed image in parallel.

    Reads CSV_PATH, drops the rows without a 'product_image_url', submits one
    download_image call per remaining row to a pool of MAX_WORKERS threads,
    and prints how many of those calls returned a truthy result.
    """
    print(f"ðŸ“‚ Project root: {PROJECT_ROOT}")
    print(f"Caricamento dataset da {CSV_PATH}...")

    products = pd.read_csv(CSV_PATH).dropna(subset=['product_image_url'])
    print(f"Trovate {len(products)} immagini da scaricare.")

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # One task per row; results are consumed as they finish
        pending = [pool.submit(download_image, record) for _, record in products.iterrows()]
        progress = tqdm(as_completed(pending), total=len(pending), desc="Download in corso")
        saved = sum(1 for finished in progress if finished.result())

    print(f"\nðŸš€ Download completato! {saved}/{len(products)} immagini salvate in: {OUTPUT_DIR}")


if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'pandas'