## Import necessary libraries

In [63]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

## Define functions for scraping and cleaning watch data from Bob's Watches


In [64]:
def extract_json_ld_from_div(div_element):
    """Return the JSON-LD ``Product`` object embedded in a product div.

    Only the first ``<script type="application/ld+json">`` tag inside
    ``div_element`` is inspected. Its payload may be a single object or a
    list of objects; in the list case the first ``Product`` entry wins.

    Args:
        div_element: An element exposing a BeautifulSoup-style ``find``.

    Returns:
        The ``Product`` dict, or ``None`` when there is no script tag, the
        payload cannot be parsed, or it contains no ``Product`` entry.
    """
    script_tag = div_element.find('script', type='application/ld+json')
    if not script_tag:
        return None

    try:
        payload = json.loads(script_tag.string)
    except (json.JSONDecodeError, TypeError) as e:
        # TypeError covers a script tag whose .string is None.
        print(f"  Warn: Error parsing JSON-LD in a script tag: {e}")
        return None

    # Normalise "single object" and "list of objects" into one candidate list.
    if isinstance(payload, dict):
        candidates = [payload]
    elif isinstance(payload, list):
        candidates = payload
    else:
        candidates = []

    return next(
        (
            entry
            for entry in candidates
            if isinstance(entry, dict) and entry.get('@type') == 'Product'
        ),
        None,
    )

def extract_additional_properties(product_json):
    """Pull the basic fields and ``additionalProperty`` list from a product.

    Args:
        product_json: A JSON-LD product object (dict).

    Returns:
        ``None`` when ``product_json`` is empty or not a dict. Otherwise a dict
        with two keys:
          * ``'basic_info'``: ``name``, ``url`` and ``price`` (in that order).
            ``price`` is read from ``offers`` only when ``offers`` is a dict;
            otherwise it is ``None``.
          * ``'properties'``: the ``additionalProperty`` value when it is a
            list, else an empty list.
    """
    if not (product_json and isinstance(product_json, dict)):
        return None

    offers = product_json.get('offers', {})
    listed_properties = product_json.get('additionalProperty', [])

    return {
        'basic_info': {
            'name': product_json.get('name', 'Unknown Product'),
            'url': product_json.get('url', ''),  # kept for reference
            'price': offers.get('price', None) if isinstance(offers, dict) else None,
        },
        'properties': listed_properties if isinstance(listed_properties, list) else [],
    }

def _flatten_product_record(record):
    """Merge one product's basic info and property list into a flat dict.

    Args:
        record: A dict with ``'basic_info'`` (dict) and ``'properties'``
            (list), in the shape returned by ``extract_additional_properties``.

    Returns:
        A new dict containing the basic info plus one key per property.
        Property names are lower-cased, spaces become underscores and ``&``
        becomes ``and``. Entries that are not dicts or lack ``name``/``value``
        are skipped.
    """
    flat_item = dict(record['basic_info'])
    for prop in record['properties']:
        if isinstance(prop, dict) and 'name' in prop and 'value' in prop:
            # str() guards against a non-string property name, which would
            # otherwise raise AttributeError on .lower().
            col_name = str(prop['name']).lower().replace(' ', '_').replace('&', 'and')
            flat_item[col_name] = prop['value']
    return flat_item


# Main execution function
def scrape_watches(target_url=None):
    """Scrape one listing page and return its products as a DataFrame.

    The page is fetched with a browser-like User-Agent and a 20 second
    timeout. Product data is read from JSON-LD: first from the script tag
    inside each product wrapper div, and, when no wrapper divs are found,
    from every JSON-LD script on the page (de-duplicated by product URL).

    Args:
        target_url: Page to scrape. Defaults to the Rolex listing page.

    Returns:
        A ``pandas.DataFrame`` with one row per product (``name``, ``url``,
        ``price`` plus one column per additional property), or ``None`` when
        the request fails or no product data could be extracted.
    """
    if target_url is None:
        target_url = "https://www.bobswatches.com/rolex/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(target_url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        return None  # Nothing to parse if the page can't be fetched

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): a multi-class string passed to class_ is presumably matched
    # against the exact class attribute value - confirm against the live markup.
    product_divs = soup.find_all('div', class_='seocart_ProductWrapper px-xl-3')

    all_additional_properties_data = []  # One entry per extracted product

    if product_divs:
        print(f"Found {len(product_divs)} product divs. Processing...")

        for div in product_divs:
            product_json = extract_json_ld_from_div(div)
            if not product_json:
                continue
            properties_data = extract_additional_properties(product_json)
            if properties_data:
                all_additional_properties_data.append(properties_data)
    else:
        # Fallback: no wrapper divs matched, so scan every JSON-LD script.
        processed_urls = set()  # Product URLs already seen, to avoid duplicates

        for script in soup.find_all('script', type='application/ld+json'):
            try:
                data = json.loads(script.string)
                # JSON-LD may be a single object or a list of objects.
                items_to_process = data if isinstance(data, list) else [data]

                for item in items_to_process:
                    if not (isinstance(item, dict) and item.get('@type') == 'Product'):
                        continue
                    product_url = item.get('url')
                    # Only products with a URL we have not seen yet.
                    if not product_url or product_url in processed_urls:
                        continue
                    properties_data = extract_additional_properties(item)
                    if properties_data:
                        all_additional_properties_data.append(properties_data)
                        processed_urls.add(product_url)
            except (json.JSONDecodeError, TypeError):
                # Ignore scripts that are not valid JSON or not products.
                continue

    if not all_additional_properties_data:
        print("\nNo 'additionalProperty' data was extracted. Check selectors and website structure.")
        return None

    print(f"\nSuccessfully extracted 'additionalProperty' data for {len(all_additional_properties_data)} products.")

    # One flat dict per product -> one DataFrame row per product.
    return pd.DataFrame(
        [_flatten_product_record(record) for record in all_additional_properties_data]
    )

def preprocess_data(df, manufacturer_name):
    """Clean a scraped listings frame into the final export layout.

    The input frame is left untouched; all work is done on a copy.

    Steps:
      1. Overwrite ``name`` with ``manufacturer_name`` (it becomes the
         ``Manufacturer`` column).
      2. Drop the unused ``dial_color``, ``gender``, ``warranty`` and
         ``crystal_material`` columns when present.
      3. Replace missing values with ``'Not listed'``.
      4. Rename the remaining columns to their display names and return them
         in a fixed order.

    Args:
        df: DataFrame of scraped listings, in the shape returned by
            ``scrape_watches``.
        manufacturer_name: Brand label written into every row.

    Returns:
        A new DataFrame with exactly the columns ``Manufacturer``, ``Model``,
        ``Price ($)``, ``Link``, ``Movement``, ``Metal``, ``Hour Markers``,
        ``Bezel Type``, ``Year`` and ``Discontinued``. A column whose source
        property was absent from ``df`` is filled with ``'Not listed'``.
    """
    unused_columns = ['dial_color', 'gender', 'warranty', 'crystal_material']
    display_names = {
        'name': 'Manufacturer', 'model_name': 'Model', 'price': 'Price ($)',
        'movement_type': 'Movement', 'hour_markers': 'Hour Markers',
        'bezel_type': 'Bezel Type', 'year': 'Year',
        'discontinued': 'Discontinued', 'metal_type': 'Metal', 'url': 'Link',
    }
    column_order = [
        'Manufacturer', 'Model', 'Price ($)', 'Link', 'Movement',
        'Metal', 'Hour Markers', 'Bezel Type', 'Year', 'Discontinued',
    ]

    cleaned = df.copy()  # never mutate the caller's frame
    cleaned['name'] = manufacturer_name
    # errors='ignore': not every scraped page exposes every property.
    cleaned = cleaned.drop(columns=unused_columns, errors='ignore')
    cleaned = cleaned.fillna('Not listed')
    cleaned = cleaned.rename(columns=display_names)

    # reindex (rather than plain selection) so an expected column that was
    # never scraped is created and filled instead of raising KeyError.
    return cleaned.reindex(columns=column_order, fill_value='Not listed')


## Scrape data for watches

In [65]:
# Brand slugs as they appear in the listing URLs.
brands = ['rolex', 'patek-philippe', 'audemars-piguet', 'breitling']
all_preprocessed_watches = []  # One preprocessed DataFrame per brand

for brand in brands:
    pages = []
    for i in range(1, 10):  # Scrape pages 1 to 9
        url = f"https://www.bobswatches.com/{brand}/?page={i}"
        df = scrape_watches(url)  # DataFrame, or None when nothing was extracted
        if df is None or df.empty:
            print(f"  No data found for {brand} page {i}, stopping for this brand.")
            # Actually stop, as the message says, instead of requesting the
            # remaining pages for this brand.
            break
        pages.append(df)

    if not pages:
        print(f"  No data collected for brand: {brand}")
        continue  # Skip to the next brand if no pages were successfully scraped

    brand_watches_df = pd.concat(pages, ignore_index=True)
    print(f"  Collected {len(brand_watches_df)} listings for {brand}.")

    # 'patek-philippe' -> 'Patek Philippe'
    formatted_brand_name = brand.replace('-', ' ').title()
    preprocessed_df = preprocess_data(brand_watches_df, formatted_brand_name)

    all_preprocessed_watches.append(preprocessed_df)
    print(f"  Finished preprocessing for {brand}.")

# After processing all brands, concatenate all preprocessed DFs into one
if all_preprocessed_watches:
    aggregate_df = pd.concat(all_preprocessed_watches, ignore_index=True)
else:
    print("No data collected for any brand.")
    # Define the name anyway so later cells neither hit a NameError nor pick
    # up a stale frame left over from an earlier run of this cell.
    aggregate_df = pd.DataFrame()

Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.
  Collected 405 listings for rolex.
  Finished preprocessing for

## Export the final DataFrame to a CSV file for use in the Streamlit app

In [66]:
# Write the combined listings without the index column. Skip the write when
# the frame is empty so an existing export is not replaced by a data-less file.
if aggregate_df.empty:
    print("No watch data to export; 'final_watches.csv' was not written.")
else:
    aggregate_df.to_csv('final_watches.csv', index=False)