## Import necessary libraries

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json

## Define functions for scraping and cleaning watch data from Bob's Watches


In [14]:
def extract_json_ld_from_div(div_element):
    """Return the JSON-LD ``Product`` object embedded in a product div, if any.

    The div is queried with ``find('script', type='application/ld+json')`` and
    the text of the returned script tag is parsed as JSON.

    Args:
        div_element: Object exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The parsed dict when the payload is a dict whose ``@type`` is
        ``'Product'``, or the first such dict when the payload is a list.
        ``None`` when there is no script tag, no product object, or the
        payload cannot be parsed (a warning is printed in that last case).
    """
    script_tag = div_element.find('script', type='application/ld+json')
    if not script_tag:
        return None

    try:
        payload = json.loads(script_tag.string)

        # A lone object is its own single candidate; a list is searched in order.
        if isinstance(payload, dict):
            candidates = [payload]
        elif isinstance(payload, list):
            candidates = payload
        else:
            candidates = []

        return next(
            (
                entry
                for entry in candidates
                if isinstance(entry, dict) and entry.get('@type') == 'Product'
            ),
            None,
        )
    except (json.JSONDecodeError, TypeError) as err:
        # TypeError covers a script tag whose .string is None (nothing to parse).
        print(f"  Warn: Error parsing JSON-LD in a script tag: {err}")
        return None

def extract_additional_properties(product_json):
    """Split a JSON-LD product dict into basic info and its property list.

    Args:
        product_json: Parsed JSON-LD product object.

    Returns:
        ``None`` when ``product_json`` is empty or not a dict. Otherwise a dict
        with two keys:

        * ``'basic_info'`` - ``name`` (default ``'Unknown Product'``), ``url``
          (default ``''``) and ``price`` (taken from ``offers['price']`` when
          ``offers`` is a dict, else ``None``).
        * ``'properties'`` - the ``additionalProperty`` value when it is a
          list, else an empty list.
    """
    if not (product_json and isinstance(product_json, dict)):
        return None

    # Price is only read when 'offers' is a single dict; any other shape gives None.
    offers = product_json.get('offers', {})
    price = offers.get('price', None) if isinstance(offers, dict) else None

    props = product_json.get('additionalProperty', [])
    if not isinstance(props, list):
        props = []

    return {
        'basic_info': {
            'name': product_json.get('name', 'Unknown Product'),
            'url': product_json.get('url', ''),  # kept for reference
            'price': price,
        },
        'properties': props,
    }

def _flatten_product_record(record):
    """Merge a product's basic info and its name/value properties into one flat dict."""
    flat = dict(record['basic_info'])
    for prop in record['properties']:
        if not isinstance(prop, dict) or 'value' not in prop:
            continue
        prop_name = prop.get('name')
        # Skip malformed entries whose name is missing or not text, rather than
        # letting .lower() raise and abort the whole scrape.
        if not isinstance(prop_name, str):
            continue
        column = prop_name.lower().replace(' ', '_').replace('&', 'and')
        flat[column] = prop['value']
    return flat


def _products_from_page_scripts(soup):
    """Collect Product records from every JSON-LD script on the page, one per unique URL."""
    records = []
    seen_urls = set()  # product URLs already recorded, to avoid duplicates
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.string)
            # JSON-LD may be a single object or a list of objects.
            items = data if isinstance(data, list) else [data]
            for item in items:
                if not (isinstance(item, dict) and item.get('@type') == 'Product'):
                    continue
                product_url = item.get('url')
                if not product_url or product_url in seen_urls:
                    continue
                record = extract_additional_properties(item)
                if record:
                    records.append(record)
                    seen_urls.add(product_url)
        except (json.JSONDecodeError, TypeError):
            # Script is not valid JSON (or has no text): move on to the next one.
            continue
    return records


# Main execution function
def scrape_watches(target_url=None):
    """Scrape one listing page into a DataFrame with one row per product.

    The page is fetched with ``requests`` (browser-like User-Agent, 20 second
    timeout) and parsed with BeautifulSoup. Product data comes from JSON-LD
    objects whose ``@type`` is ``'Product'``:

    * If product wrapper divs are found, the JSON-LD script inside each div is
      used.
    * Otherwise every JSON-LD script on the page is scanned and products are
      de-duplicated by their ``url``.

    Args:
        target_url: Listing page to scrape. Defaults to
            ``https://www.bobswatches.com/rolex/`` when ``None``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``name``, ``url``, ``price``
        plus one column per ``additionalProperty`` entry (property names are
        lower-cased, spaces become ``_`` and ``&`` becomes ``and``).
        ``None`` if the request fails or no product data is extracted; a
        message is printed in both cases, so callers should check for ``None``
        before using the result.
    """
    if target_url is None:
        target_url = "https://www.bobswatches.com/rolex/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(target_url, headers=headers, timeout=20)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to retrieve the page. Error: {e}")
        return None  # nothing to parse if the page can't be fetched

    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE: a class_ string containing a space is matched against the exact
    # value of the tag's class attribute, so this selector depends on the site
    # emitting these two classes in exactly this order.
    product_divs = soup.find_all('div', class_='seocart_ProductWrapper px-xl-3')

    if product_divs:
        print(f"Found {len(product_divs)} product divs. Processing...")
        records = []
        for div in product_divs:
            product_json = extract_json_ld_from_div(div)
            record = extract_additional_properties(product_json) if product_json else None
            if record:
                records.append(record)
    else:
        # No wrapper divs matched: fall back to scanning the whole page.
        records = _products_from_page_scripts(soup)

    if not records:
        print("\nNo 'additionalProperty' data was extracted. Check selectors and website structure.")
        return None

    print(f"\nSuccessfully extracted 'additionalProperty' data for {len(records)} products.")

    # Build the frame once from a list of flat dicts (one per product).
    return pd.DataFrame([_flatten_product_record(record) for record in records])

def preprocess_data(df, manufacturer_name):
    """Return a cleaned copy of a scraped product frame with a fixed column layout.

    Steps:

    1. The scraped ``name`` column is replaced by a ``manufacturer`` column
       holding ``manufacturer_name`` for every row.
    2. The ``dial_color``, ``gender``, ``warranty`` and ``crystal_material``
       columns are dropped if present.
    3. Missing values are replaced with the string ``'Not listed'``. This
       applies to every column, so a missing ``price`` also becomes
       ``'Not listed'``.
    4. The result is restricted to, and ordered by, the fixed output columns.
       An expected column that was not scraped at all is created and filled
       with ``'Not listed'`` instead of raising ``KeyError``.

    The input frame is not modified.

    Args:
        df: DataFrame of scraped products.
        manufacturer_name: Brand name to store in the ``manufacturer`` column.

    Returns:
        A new DataFrame with the columns ``manufacturer``, ``model_name``,
        ``price``, ``url``, ``movement_type``, ``metal_type``,
        ``hour_markers``, ``bezel_type``, ``year``, ``discontinued``.

    Raises:
        TypeError: If ``df`` is ``None`` (for example, when scraping failed).
    """
    if df is None:
        raise TypeError(
            "preprocess_data received None instead of a DataFrame; "
            "the scrape probably failed or returned no products."
        )

    column_order = [
        'manufacturer', 'model_name', 'price', 'url', 'movement_type',
        'metal_type', 'hour_markers', 'bezel_type', 'year', 'discontinued'
    ]
    unused_columns = ['dial_color', 'gender', 'warranty', 'crystal_material']

    return (
        df
        # errors='ignore': not every brand page exposes every property.
        .drop(columns=['name'] + unused_columns, errors='ignore')
        .assign(manufacturer=manufacturer_name)
        .fillna('Not listed')
        # reindex (rather than df[column_order]) tolerates absent columns.
        .reindex(columns=column_order, fill_value='Not listed')
    )


## Scrape data for watches

In [15]:
# Rolex: called without a URL, scrape_watches() uses its default listing page.
rolex_watches = preprocess_data(scrape_watches(), 'Rolex')
rolex_watches.head()

Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.


Unnamed: 0,manufacturer,model_name,price,url,movement_type,metal_type,hour_markers,bezel_type,year,discontinued
0,Rolex,Datejust 41,11795.0,https://www.bobswatches.com/rolex-datejust-41-...,Automatic,Stainless Steel,Index,Fluted,2010 - Present,No
1,Rolex,Submariner,36995.0,https://www.bobswatches.com/rolex-submariner-1...,Automatic,White Gold,Luminous,Timing,2010 - Present,Yes
2,Rolex,Daytona,44495.0,https://www.bobswatches.com/rolex-daytona-1165...,Automatic,Rose Gold,Index,Tachymetric,2010 - Present,Not listed
3,Rolex,Datejust 41,13995.0,https://www.bobswatches.com/pre-owned-rolex-da...,Automatic,Stainless Steel,Diamond,Fluted,2010 - Present,No
4,Rolex,OysterDate,3995.0,https://www.bobswatches.com/rolex-oysterdate-r...,Manual,Stainless Steel,Index,Smooth,1970s,Yes


In [16]:
# Patek Philippe: scrape the brand's listing page, then tidy the columns.
patek_watches = preprocess_data(
    scrape_watches('https://www.bobswatches.com/patek-philippe/'),
    'Patek Philippe',
)
patek_watches.head()

Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.


Unnamed: 0,manufacturer,model_name,price,url,movement_type,metal_type,hour_markers,bezel_type,year,discontinued
0,Patek Philippe,Nautilus,139995.0,https://www.bobswatches.com/patek-philippe/pat...,Automatic,Rose Gold,Index,Not listed,2010 - Present,Not listed
1,Patek Philippe,Gondolo,8495.0,https://www.bobswatches.com/patek-philippe/pat...,Quartz,White Gold,Diamond,Diamond,Not listed,Yes
2,Patek Philippe,Calatrava,14495.0,https://www.bobswatches.com/patek-philippe/pat...,Manual,Yellow Gold,Roman,Not listed,Not listed,Yes
3,Patek Philippe,Golden Ellipse,13995.0,https://www.bobswatches.com/patek-philippe/pat...,Quartz,Yellow Gold,Arabic,Diamond,1990s,Yes
4,Patek Philippe,Neptune,20495.0,https://www.bobswatches.com/patek-philippe/pat...,Automatic,Stainless Steel,Roman,Notched,2000s,Yes


In [17]:
# Audemars Piguet: scrape the brand's listing page, then tidy the columns.
audemars_watches = preprocess_data(
    scrape_watches('https://www.bobswatches.com/audemars-piguet/'),
    'Audemars Piguet',
)
audemars_watches.head()

Found 33 product divs. Processing...

Successfully extracted 'additionalProperty' data for 33 products.


Unnamed: 0,manufacturer,model_name,price,url,movement_type,metal_type,hour_markers,bezel_type,year,discontinued
0,Audemars Piguet,Jules Audemars,19995.0,https://www.bobswatches.com/audemars-piguet/au...,Automatic,Stainless Steel,Arabic,Smooth,2000s,Yes
1,Audemars Piguet,Royal Oak Offshore,24995.0,https://www.bobswatches.com/audemars-piguet/au...,Automatic,Stainless Steel,Arabic,Smooth,2010 - Present,Yes
2,Audemars Piguet,Royal Oak Offshore,25995.0,https://www.bobswatches.com/audemars-piguet/au...,Automatic,Stainless Steel,Index,Not listed,2010 - Present,Not listed
3,Audemars Piguet,Royal Oak,81995.0,https://www.bobswatches.com/audemars-piguet/au...,Quartz,Yellow Gold,No Numerals,Diamond,Not listed,Yes
4,Audemars Piguet,Royal Oak Offshore,25995.0,https://www.bobswatches.com/audemars-piguet/au...,Automatic,Stainless Steel,Arabic,Smooth,2010 - Present,Yes


In [18]:
# Breitling: scrape the brand's listing page, then tidy the columns.
breitling_watches = preprocess_data(
    scrape_watches('https://www.bobswatches.com/breitling/'),
    'Breitling',
)
breitling_watches.head()

Found 45 product divs. Processing...

Successfully extracted 'additionalProperty' data for 45 products.


Unnamed: 0,manufacturer,model_name,price,url,movement_type,metal_type,hour_markers,bezel_type,year,discontinued
0,Breitling,Superocean,2695.0,https://www.bobswatches.com/breitling/pre-owne...,Automatic,Stainless Steel,Arabic,Timing,2010 - Present,Yes
1,Breitling,Navitimer,3695.0,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Index,Slide Rule,2010 - Present,No
2,Breitling,Classic AVI,4195.0,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Arabic,12-hour,2010 - Present,No
3,Breitling,Superocean,2895.0,https://www.bobswatches.com/breitling/pre-owne...,Automatic,Stainless Steel,Index,Timing,2010 - Present,Yes
4,Breitling,Navitimer,6595.0,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Index,Slide Rule,2010 - Present,No


## Stack all the data into a final dataframe

In [19]:

# Stack the per-brand frames with a fresh 0..n-1 index and rename 'year'
# to 'years_available' in the combined frame.
dfs = [
    rolex_watches,
    patek_watches,
    audemars_watches,
    breitling_watches,
]
final = pd.concat(dfs, ignore_index=True).rename(columns={'year': 'years_available'})
final

Unnamed: 0,manufacturer,model_name,price,url,movement_type,metal_type,hour_markers,bezel_type,years_available,discontinued
0,Rolex,Datejust 41,11795.00,https://www.bobswatches.com/rolex-datejust-41-...,Automatic,Stainless Steel,Index,Fluted,2010 - Present,No
1,Rolex,Submariner,36995.00,https://www.bobswatches.com/rolex-submariner-1...,Automatic,White Gold,Luminous,Timing,2010 - Present,Yes
2,Rolex,Daytona,44495.00,https://www.bobswatches.com/rolex-daytona-1165...,Automatic,Rose Gold,Index,Tachymetric,2010 - Present,Not listed
3,Rolex,Datejust 41,13995.00,https://www.bobswatches.com/pre-owned-rolex-da...,Automatic,Stainless Steel,Diamond,Fluted,2010 - Present,No
4,Rolex,OysterDate,3995.00,https://www.bobswatches.com/rolex-oysterdate-r...,Manual,Stainless Steel,Index,Smooth,1970s,Yes
...,...,...,...,...,...,...,...,...,...,...
163,Breitling,Navitimer 8,2695.00,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Arabic,Slide Rule,2010 - Present,Yes
164,Breitling,Super Avenger,7295.00,https://www.bobswatches.com/breitling/breitlin...,Automatic,Ceramic,Arabic,Timing,2010 - Present,No
165,Breitling,Superocean,2595.00,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Arabic,Timing,2010 - Present,Yes
166,Breitling,Bentley,4395.00,https://www.bobswatches.com/breitling/breitlin...,Automatic,Stainless Steel,Index,Slide Rule,2010 - Present,Yes


## Export the final DataFrame to a CSV file for use in the Streamlit app

In [20]:
# Write the combined frame to 'final_watches.csv' (relative to the working
# directory); index=False leaves the row index out of the file.
final.to_csv('final_watches.csv', index=False)