## WEBSCRAPING WATCH PRODUCTS IN PYTHON USING BEAUTIFULSOUP

#### Import libraries we will utilise

In [None]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import numpy as np
import requests

In [29]:
# URL of the category listing page ("watch bookstore") whose products we scrape
url = 'https://www.misterchrono.com/en/126-watch-bookstore'

In [30]:
# Request headers sent with every call: a desktop-browser User-Agent string
# and the preferred response language
Header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/121.0.0.0 Safari/537.36'
    ),
    'Accept-language': 'en-US, en;q=0.5',
}

In [31]:
# HTTPS request: download the listing page, sending the custom headers
page = requests.get(url, headers=Header)

In [32]:
type(page.content)

bytes

In [33]:
# Parse the raw response bytes into a BeautifulSoup document
# using the 'html.parser' backend
soup = bs(page.content,'html.parser')

In [35]:
soup

<!DOCTYPE HTML>

<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en-us"><![endif]-->
<!--[if IE 7]><html class="no-js lt-ie9 lt-ie8 ie7" lang="en-us"><![endif]-->
<!--[if IE 8]><html class="no-js lt-ie9 ie8" lang="en-us"><![endif]-->
<!--[if gt IE 8]> <html class="no-js ie9" lang="en-us"><![endif]-->
<html class="default" lang="en-us">
<head>
<meta charset="utf-8"/>
<title>Watch Bookstore - MisterChrono</title>
<meta content="Watch Bookstore" name="description"/>
<meta content="watch, book" name="keywords"/>
<meta content="PrestaShop" name="generator"/>
<meta content="noindex,nofollow" name="robots"/>
<meta content="width=device-width, minimum-scale=0.25, maximum-scale=1.6, initial-scale=1.0" name="viewport"/> <meta content="yes" name="apple-mobile-web-app-capable"/>
<link href="/img/favicon.ico?1697125177" rel="icon" type="image/vnd.microsoft.icon"/>
<link href="/img/favicon.ico?1697125177" rel="shortcut icon" type="image/x-icon"/>
<link href="/themes/leo_mayshop/css/

In [36]:
# Scrape links first: collect every <a> tag carrying the "product-name" class
# (the href of each of these anchors is read in the next step)
Links = soup.find_all('a', attrs={'class':"product-name"})

In [39]:
# To go into one link and extract all the details we require about a watch,
# use the first anchor found on the listing page as a sample
product_link = Links[0].get('href')  # extracts the actual link to product
product_link

'https://www.misterchrono.com/en/watch-bookstore/4171-rolex-philosophy-3050262344227.html'

In [40]:
# HTTPS request for the sample product page, sending the same custom headers
new_page = requests.get(product_link, headers=Header)

In [41]:
# Parse the product page's response bytes into a BeautifulSoup document
new_soup = bs(new_page.content,'html.parser')

In [42]:
new_soup

<!DOCTYPE HTML>

<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en-us"><![endif]-->
<!--[if IE 7]><html class="no-js lt-ie9 lt-ie8 ie7" lang="en-us"><![endif]-->
<!--[if IE 8]><html class="no-js lt-ie9 ie8" lang="en-us"><![endif]-->
<!--[if gt IE 8]> <html class="no-js ie9" lang="en-us"><![endif]-->
<html class="default" lang="en-us">
<head>
<meta charset="utf-8"/>
<title>Rolex  History, Icons and Record-Breaking Models</title>
<meta content="Rolex

History, Icons and Record-Breaking Models" name="description"/>
<meta content="PrestaShop" name="generator"/>
<meta content="index,follow" name="robots"/>
<meta content="width=device-width, minimum-scale=0.25, maximum-scale=1.6, initial-scale=1.0" name="viewport"/> <meta content="yes" name="apple-mobile-web-app-capable"/>
<link href="/img/favicon.ico?1697125177" rel="icon" type="image/vnd.microsoft.icon"/>
<link href="/img/favicon.ico?1697125177" rel="shortcut icon" type="image/x-icon"/>
<link href="/themes/leo_mayshop/css

In [45]:
# Price: text of the first <span> whose itemprop attribute is "price"
price = new_soup.find('span', attrs={"itemprop":"price"}).text

In [47]:
price

'38,00 €'

In [55]:
# Product name: text of the first <h1> whose itemprop attribute is "name"
product_name = new_soup.find('h1', attrs={"itemprop":"name"}).text

In [56]:
product_name

'Rolex  History, Icons and Record-Breaking Models'

In [57]:
# Availability: text of the <span id="availability_value"> element
# (it can be an empty string, which the next cell handles)
product_availability = new_soup.find('span', attrs={"id":"availability_value"}).text

In [60]:
# An empty availability string carries no information, so store 'na' in its
# place; any other value is kept exactly as scraped.
product_availability = 'na' if product_availability == "" else product_availability

In [70]:
product_availability

'na'

#### CREATING FUNCTIONS TO BE REUSED WHENEVER RUNNING THE PROGRAM

#### Define the functions which will help pull the Product Name, Price and Availability of watches in the www.misterchrono.com webstore

#### A field will not always be present on a product page, so whenever the lookup raises an AttributeError we return n/a instead

In [62]:
# Function to pull the Product Name
def get_product_name(soup):
    """Return the product name found in a parsed product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    str
        Text of the first ``<h1 itemprop="name">`` element, or ``'n/a'`` when
        the page has no such element.
    """
    try:
        # Search the document that was passed in. The previous version read the
        # notebook-level ``new_soup`` and ignored its argument, so it only
        # worked while a global of that name happened to exist.
        product_name = soup.find('h1', attrs={"itemprop": "name"}).text
    except AttributeError:
        # ``find`` returned None (element missing), so ``.text`` failed.
        product_name = 'n/a'

    return product_name
        
# Function to pull the Product Price
def get_product_price(soup):
    """Return the displayed price found in a parsed product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    str
        Text of the first ``<span itemprop="price">`` element exactly as it
        appears on the page, or ``'n/a'`` when the page has no such element.
    """
    try:
        # Search the document that was passed in. The previous version read the
        # notebook-level ``new_soup`` and ignored its argument, so it only
        # worked while a global of that name happened to exist.
        product_price = soup.find('span', attrs={"itemprop": "price"}).text
    except AttributeError:
        # ``find`` returned None (element missing), so ``.text`` failed.
        product_price = 'n/a'

    return product_price

# Function to pull the Product Availability
def get_product_availability(soup):
    """Return the availability text found in a parsed product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single product page.

    Returns
    -------
    str
        Text of the ``<span id="availability_value">`` element (this may be an
        empty string), or ``'n/a'`` when the page has no such element.
    """
    try:
        # Search the document that was passed in. The previous version read the
        # notebook-level ``new_soup`` and ignored its argument, so it only
        # worked while a global of that name happened to exist.
        product_availability = soup.find('span', attrs={"id": "availability_value"}).text
    except AttributeError:
        # ``find`` returned None (element missing), so ``.text`` failed.
        product_availability = 'n/a'

    return product_availability

### Writing the universal script, in which:

#### We define our url
#### We request from the url access
#### Get the links in to a list
#### Iterate through the links using a for loop, utilising the created functions to pick up the Product Name, Price and Availability
#### Create a dictionary where the iterated link results get stored in
#### Create a dataframe from the dictionary and drop every row whose product name is n/a, since a row without a product name is of no relevance
#### Finally we store our data set in a CSV file which can later be utilised for analysis and research.


In [68]:
if __name__ == '__main__':
    # End-to-end scrape: fetch the listing page, visit every product link,
    # collect name / price / availability and write the result to a CSV file.

    # add your user agent
    HEADERS = ({'User-Agent':'', 'Accept-Language': 'en-US, en;q=0.5'})

    # Drop headers that are still blank (e.g. the User-Agent placeholder above)
    # so an empty value is never sent; requests then applies its own default.
    request_headers = {key: value for key, value in HEADERS.items() if value}

    # The webpage URL
    URL = "https://www.misterchrono.com/en/126-watch-bookstore"

    # Seconds to wait for the server before giving up; without a timeout a
    # stalled connection would block the script indefinitely.
    REQUEST_TIMEOUT = 30

    # HTTP Request (uses the URL / headers defined in this script rather than
    # the notebook-level ``url`` / ``Header`` variables)
    page = requests.get(URL, headers=request_headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly if the listing page could not be fetched, instead of
    # silently producing an empty CSV.
    page.raise_for_status()

    # Soup Object containing all data
    soup = bs(page.content, "html.parser")

    # Fetch links as List of Tag Objects
    links = soup.find_all('a', attrs={'class': "product-name"})

    # Store the links, skipping anchors that have no href attribute
    links_list = [link.get('href') for link in links if link.get('href')]

    d = {"Name": [], "Price": [], "Availability": []}

    # Loop for extracting product details from each link
    for link in links_list:
        new_page = requests.get(link, headers=request_headers, timeout=REQUEST_TIMEOUT)

        new_soup = bs(new_page.content, "html.parser")

        # Function calls to collect all necessary product information
        d['Name'].append(get_product_name(new_soup))
        d['Price'].append(get_product_price(new_soup))
        d['Availability'].append(get_product_availability(new_soup))

    watch_df = pd.DataFrame.from_dict(d)
    # Assign the result back instead of using a chained ``inplace=True`` call:
    # that form is deprecated and has no effect under pandas Copy-on-Write.
    watch_df['Name'] = watch_df['Name'].replace('n/a', np.nan)
    # Rows without a product name are of no use for later analysis
    watch_df = watch_df.dropna(subset=['Name'])
    watch_df.to_csv("watch_data.csv", header=True, index=False)

In [69]:
watch_df

Unnamed: 0,Name,Price,Availability
0,"Rolex History, Icons and Record-Breaking Models","38,00 €",
1,Investing in wristwatches: Rolex,"85,00 €",in stock
2,Patek Philippe: Investing in Wristwatches,"95,00 €",
3,Rolex Cosmograph Daytona Vol. 1: Manual Windi...,"175,00 €",in stock
4,Rolex Philosophy,"65,00 €",in stock
5,Lexique Rolex – Gaspard SANDERS (French),"29,90 €",in stock
6,Mondani - Vintage Patek,"480,00 €",in stock
7,I IS FOR INDIE - ABCAIRES DES HORLOGERS INDEPE...,"26,90 €",
8,R IS FOR ROLEX - ABCDAIRE POUR LES FUTURES COL...,"26,90 €",
9,THE WATCH COLLECTOR'S COLORING BOOK,"26,90 €",
