In [1]:
import json
import os
from scraping_product import scraping_product
import logging
import random
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load JSON data
# A context manager closes the input file as soon as it has been parsed
# instead of leaving the handle open for the whole (long-running) scrape.
with open("glamira.summary_3.json") as file:
    data = json.load(file)

# Load already processed data if it exists
output_file = "processed_glamira_data.json"
processed_ids = set()  # product_id values that already have a saved record
new_data = []  # every record that will be written to output_file

if os.path.exists(output_file):
    with open(output_file, "r") as infile:
        existing_data = json.load(infile)
    # Resume from the previous run: keep its records and remember their ids
    # so process_item can skip them.
    processed_ids = {item['product_id'] for item in existing_data}
    new_data = existing_data

# Function to scrape a single URL
def scrape_url(url):
    """Scrape one URL after a short randomized pause.

    Sleeps for a random 1-2.3 seconds, then calls ``scraping_product(url)``
    and unpacks its result into name, category, gender and image URLs.

    Returns:
        ``(url, name, category, gender, image_urls)`` when all four scraped
        values are truthy; otherwise ``(url, None, None, None, None)``. Any
        exception raised while scraping is logged and yields the same
        all-``None`` result.
    """
    pause_seconds = random.uniform(1, 2.3)
    time.sleep(pause_seconds)

    outcome = (url, None, None, None, None)
    try:
        scraped = scraping_product(url)
        name, category, gender, image_urls = scraped
        # Only accept the page when every field came back non-empty.
        if all((name, category, gender, image_urls)):
            outcome = (url, name, category, gender, image_urls)
    except Exception as e:
        logging.error(f'Error processing URL {url}: {e}')
    return outcome

# Function to process each item
def process_item(item):
    """Build the output record for one input item.

    Args:
        item: mapping with a ``product_id`` and an iterable of candidate
            ``urls``.

    Returns:
        ``None`` when the product id is already in ``processed_ids``.
        Otherwise a dict with the keys ``product_id``, ``name``,
        ``category``, ``gender``, ``current_url``, ``image_urls`` and
        ``image_downloaded``. The scraped fields come from the first URL
        that yields complete data, or are all ``None`` when no URL does.
    """
    product_id = item['product_id']
    if product_id in processed_ids:
        print(f"Skipping already processed product_id: {product_id}")
        return None

    print(f"Processing product_id: {product_id}")

    # Start from the "nothing found" record; it is filled in on success.
    record = {
        "product_id": product_id,
        "name": None,
        "category": None,
        "gender": None,
        "current_url": None,
        "image_urls": None,
        "image_downloaded": False
    }

    for candidate in item["urls"]:
        print(f"Trying URL: {candidate}")
        scraped_url, name, category, gender, image_urls = scrape_url(candidate)
        if not (name and category and gender and image_urls):
            continue  # incomplete data, try the next candidate URL

        print(f"Successfully processed product_id: {product_id} with URL: {scraped_url}")
        record.update(
            name=name,
            category=category,
            gender=gender,
            current_url=scraped_url,
            image_urls=image_urls,
        )
        return record

    print(f"Failed to find valid data for product_id: {product_id}")
    return record

# Use ThreadPoolExecutor to process items concurrently
processed_count = 0
total_items = len(data)
save_interval = 200  # Checkpoint after every 200 newly collected results
processed_urls = 0  # Results collected since the last checkpoint


def save_progress():
    """Write ``new_data`` to ``output_file`` via a temporary file.

    The JSON is dumped to a sibling ``.tmp`` file and then moved over the
    real output with ``os.replace``. If the run is interrupted while the
    dump is in progress, the previous checkpoint is left untouched instead
    of being truncated, so the next run can still resume from it.
    """
    temp_file = f"{output_file}.tmp"
    with open(temp_file, "w") as outfile:
        json.dump(new_data, outfile, indent=4)
    os.replace(temp_file, output_file)


try:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its item so failures can be reported.
        futures = {executor.submit(process_item, item): item for item in data}
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                # One malformed item must not abort the whole run. It is not
                # added to processed_ids, so a later run will retry it.
                logging.error(f"Error processing item {futures[future]!r}: {e}")
                continue

            # None means the item was skipped as already processed.
            if result:
                new_data.append(result)
                processed_ids.add(result["product_id"])
                processed_count += 1
                processed_urls += 1
                print(f"Processed {processed_count}/{total_items} product_id(s)")

                # Save data every 200 URLs processed
                if processed_urls >= save_interval:
                    print(f"Saving data after processing {processed_urls} URLs...")
                    save_progress()
                    processed_urls = 0
finally:
    # Final save; also runs when the loop is interrupted or fails so that
    # results collected since the last checkpoint are not lost.
    save_progress()

print(f"Data has been saved to {output_file}")


Skipping already processed product_id: 100000
Skipping already processed product_id: 100001
Skipping already processed product_id: 100002
Skipping already processed product_id: 100003
Skipping already processed product_id: 100004
Skipping already processed product_id: 100005
Skipping already processed product_id: 100006
Skipping already processed product_id: 100007
Skipping already processed product_id: 100008
Skipping already processed product_id: 100009
Skipping already processed product_id: 100010
Skipping already processed product_id: 100011
Skipping already processed product_id: 100012
Skipping already processed product_id: 100013
Skipping already processed product_id: 100014
Skipping already processed product_id: 100015
Skipping already processed product_id: 100016
Skipping already processed product_id: 100017
Skipping already processed product_id: 100018
Skipping already processed product_id: 100019
Skipping already processed product_id: 100020
Skipping already processed product