# Python Libraries

In [None]:
import csv
import json
import os
import re
import time
import urllib.parse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys

# Function used to scrape 'Online Shops' page

In [None]:
def clean_url(url):
    """Return ``url`` with leading and trailing whitespace removed."""
    return url.strip()

def _lookup_store_website(driver, page_url):
    """Return the "Open website" link found on ``page_url``.

    Falls back to the sentinel ``"URL Not Listed"`` when the page cannot be
    loaded, the link is absent, or the link has no usable ``href``.
    """
    script = 'return document.querySelector("a[data-tooltip=\\"Open website\\"]").getAttribute("href")'
    try:
        driver.get(page_url)
        href = driver.execute_script(script)
    except Exception:
        # Best-effort scrape: a missing link makes the script throw inside the
        # browser, and one bad page must not abort the whole run.
        return "URL Not Listed"

    # The href is already a plain URL string, so it is used as-is; feeding it
    # through an HTML parser could rewrite entity-like sequences in the query
    # string.
    if isinstance(href, str) and href.strip():
        return href
    return "URL Not Listed"


def giveMyJson(csvFile):
    """Build ``pet_stores.json`` from a scraped CSV of shop listings.

    The CSV must provide the columns 'Google Location', 'Name', 'Rating',
    'Purpose', 'Address', 'Contact No.' and 'Image'. Rows are cleaned, each
    shop's website link is scraped from its 'Google Location' page with
    Chrome, shops without a website are dropped, and the remaining records
    are written to ``pet_stores.json`` in the current working directory.

    Args:
        csvFile: Path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        None. The result is the JSON file written to disk.
    """
    # Data cleaning. Every step reassigns ``data`` instead of mutating a
    # column selection in place, so the fills are guaranteed to land on the
    # frame itself rather than on a temporary copy.
    data = pd.read_csv(csvFile, encoding='utf-8')
    data = data.drop_duplicates()
    data = data.dropna(subset=['Google Location', 'Name'])
    desired_purposes = ['Pet supply store', 'Pet Shop', 'Pet food and animal feeds']
    data = data[data['Purpose'].isin(desired_purposes)]
    data = data.fillna({
        'Rating': 0,
        'Address': 'Address not listed',
        'Contact No.': 'Contact not listed',
    })
    data = data.sort_values(by='Name').reset_index(drop=True)

    # Getting the website URL of every shop by web scraping.
    extracted_data = []
    driver = webdriver.Chrome()
    try:
        for _, row in data.iterrows():
            extracted_data.append({
                'Name': row['Name'],
                'URL': _lookup_store_website(driver, row['Google Location']),
                'Rating': row['Rating'],
                'Address': row['Address'],
                'Contact': row['Contact No.'],
                'Purpose': row['Purpose'],
                'Image Link': row['Image'],
            })
    finally:
        # Always close the browser, even if a row is malformed.
        driver.quit()

    # Explicit columns keep the frame well-formed when nothing was scraped.
    columns = ['Name', 'URL', 'Rating', 'Address', 'Contact', 'Purpose', 'Image Link']
    stores = pd.DataFrame(extracted_data, columns=columns)

    # Final cleaning of data and creating the JSON file.
    stores['URL'] = stores['URL'].apply(clean_url)
    stores = stores.drop_duplicates(subset=['URL'])
    stores = stores[stores['URL'] != 'URL Not Listed']
    stores = stores.sort_values(by='Name').reset_index(drop=True)

    json_data = [
        {
            "name": row["Name"],
            "url": row["URL"],
            "rating": row["Rating"],
            "address": row["Address"],
            "contact": row["Contact"],
            "purpose": row["Purpose"],
            "image_link": row["Image Link"],
        }
        for _, row in stores.iterrows()
    ]

    # Define the output JSON file name
    json_filename = "pet_stores.json"

    # Write the JSON data to the file
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(json_data, json_file, indent=4)
        
# Run the pipeline on the CSV file exported from Instant Data Scraper; the CSV's contents decide what ends up in the JSON file.
giveMyJson(csvFile='google.csv')

The function above takes a CSV file as its argument: the data scraped with the 'Instant Data Scraper' Chrome extension. The data is cleaned, and the website URL of each online store is then scraped from the Google search result that corresponds to that store. After the website URLs are extracted, the data is cleaned again and written to a JSON file.

# Function used to scrape 'Top Selling Products' page

In [None]:
def _first_text(parent, tag, css_class):
    """Return the stripped text of the first matching descendant, or None."""
    node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else None


def scrape_google_shopping(query):
    """Scrape Google Shopping results for ``query`` into ``products.json``.

    Opens the shopping search in Chrome, scrolls to load more results, and
    extracts name, price, URL, image and ratings for every product card.
    Fields that are missing on a card are stored as ``None``; cards without
    a name are skipped.

    Args:
        query: Free-text search query. It is URL-encoded here, so pass it
            unencoded.

    Returns:
        None. The result is the ``products.json`` file written to the
        current working directory.
    """
    # Encode the query so spaces, '&', '#', etc. cannot corrupt the URL.
    search_query = 'https://www.google.com/search?' + urllib.parse.urlencode(
        {'q': query, 'tbm': 'shop'}
    )

    driver = webdriver.Chrome()
    try:
        driver.get(search_query)

        # Wait for the page to load (you may need to adjust the waiting time)
        time.sleep(5)

        # Scroll down the page to load more results (you can adjust the number of scrolls)
        for _ in range(5):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

        page_source = driver.page_source
    finally:
        # The HTML is already captured as a string, so the browser can be
        # closed before parsing, and is closed even if loading fails.
        driver.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Product cards (the class names depend on Google's current markup).
    product_elements = soup.find_all('div', class_='sh-dgr__gr-auto sh-dgr__grid-result')

    products = []
    for product_element in product_elements:
        product_name = _first_text(product_element, 'h3', 'tAxDx')
        if product_name is None:
            # Not a regular product card; nothing useful to record.
            continue

        # Decode the href and resolve it against google.com. urljoin copes
        # with relative paths as well as hrefs that are already absolute.
        link = product_element.find('a', class_='shntl', href=True)
        if link is not None:
            product_url = urllib.parse.urljoin(
                "https://www.google.com", urllib.parse.unquote(link['href'])
            )
        else:
            product_url = None

        # The image sits inside a wrapper div that may be absent.
        image_wrapper = product_element.find('div', class_='ArOc1c')
        image_tag = image_wrapper.find('img', src=True) if image_wrapper is not None else None
        product_image = image_tag['src'] if image_tag is not None else None

        products.append({
            'name': product_name,
            'price': _first_text(product_element, 'span', 'a8Pemb'),
            'url': product_url,
            'image': product_image,
            'ratings': _first_text(product_element, 'span', 'Rsc7Yb'),
        })

    # Save the product data as JSON
    with open('products.json', 'w', encoding='utf-8') as json_file:
        json.dump(products, json_file, indent=4)

# Application: scrape Google Shopping for a sample query.
query = "top selling products for cats and dogs in Singapore"
scrape_google_shopping(query=query)

The function above takes a Google query as its input argument and scrapes the results of that query. The output is a JSON file containing the product information, which is scraped by identifying the correct HTML tags.

# Function used to scrape 'Pet Care Info' page

In [None]:
def clean_text(text):
    """Swap right single quotation marks (U+2019) for ``'`` and trim whitespace."""
    normalized = text.replace('\u2019', "'")
    return normalized.strip()

def scrape_website(url, selectors, timeout=10):
    """Scrape tips (and optional descriptions) from a single web page.

    Args:
        url: Address of the page to download.
        selectors: Dict of CSS selectors. ``"tip"`` is required and selects
            the tip elements; ``"desc"`` is optional and selects the elements
            that may serve as descriptions.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the run indefinitely.

    Returns:
        On success, a list of dicts with the keys ``"website_url"``,
        ``"grooming_tip"`` and ``"description"``. On any failure, a dict of
        the form ``{"error": "<message>"}``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for 4xx and 5xx HTTP status codes

        soup = BeautifulSoup(response.text, 'html.parser')
        tip_elements = soup.select(selectors["tip"])

        # Remember the selected description nodes by identity: membership is
        # then O(1) and cannot be fooled by a different node with identical
        # markup.
        desc_ids = set()
        if "desc" in selectors:
            desc_ids = {id(element) for element in soup.select(selectors["desc"])}

        data = []
        for tip_element in tip_elements:
            tip_text = re.sub(r'^\d+\.\s*', '', clean_text(tip_element.text))  # Remove numbering
            if not tip_text.strip():
                continue  # Skip empty tips

            # The description is the first following sibling that the
            # "desc" selector also matched.
            description = ""
            if desc_ids:
                for sibling in tip_element.find_next_siblings():
                    if id(sibling) in desc_ids:
                        description = clean_text(sibling.text)
                        break

            data.append({
                "website_url": url,
                "grooming_tip": tip_text,
                "description": description,
            })

        return data

    except Exception as e:
        # Network errors and parsing errors are reported the same way; the
        # caller receives an error dict instead of an exception.
        return {"error": str(e)}

def main():
    """Scrape grooming tips from the configured sites into ``grooming_tips.json``.

    Each site is scraped with ``scrape_website``. A site whose scrape fails is
    reported and skipped so the remaining sites are still processed. If no
    tips were collected at all, a message is printed and no file is written.
    """
    # Define rules for each website, including selectors
    website_rules = {
        "hastingsvet.com": {
            "url": "https://hastingsvet.com/six-helpful-grooming-tips-for-your-dog-or-cat/",
            "selectors": {
                "tip": "div.elementor-widget-container h2.wp-block-heading",
                "desc": "div.elementor-widget-container p",
            },
        },
        "briopets.com": {
            "url": "https://briopets.com/blogs/briopets-official-blog/top-9-pet-grooming-tips-for-dog-and-cat-owners",
            "selectors": {
                "tip": "div.rte h3",
                "desc": "div.rte p",
            },
        },
        "revivalanimal.com": {
            "url": "https://www.revivalanimal.com/learning-center/pet-grooming-tips-tricks",
            "selectors": {
                "tip": "h3 + ul li",
            },
        }
    }

    results = []

    for domain, rule in website_rules.items():
        print(f"Scraping from {domain}")
        data = scrape_website(rule["url"], rule["selectors"])
        if isinstance(data, dict):
            # scrape_website signals failure with {"error": ...}. Extending a
            # list with a dict would add its keys (plain strings) and break
            # the filtering below, so report the failure and move on.
            print(f"Failed to scrape {domain}: {data.get('error', 'unknown error')}")
            continue
        results.extend(data)

    # Remove entries with empty grooming tips
    results = [entry for entry in results if entry["grooming_tip"].strip()]

    if results:
        with open('grooming_tips.json', 'w', encoding='utf-8') as file:
            file.write(json.dumps(results, indent=4))
    else:
        print("No data scraped.")

if __name__ == "__main__":
    main()

The function above is robust in the sense that we only need to add the URLs of websites that provide pet care information, such as the ones listed under `website_rules`. We also need to pass the relevant tags (CSS selectors) needed for scraping. The function 'scrape_website' is the main function used for scraping data. The output is again a JSON file with our data stored.

# Function used to scrape data for 'Shops' page
###### Applicable for all the pages with maps 

In [None]:
# Convert the xlsx file to csv, then read the first five columns of every
# data row into one list per column.
input_excel = "vet.xlsx"
output_csv = "vet.csv"

# Read Excel file
df = pd.read_excel(input_excel)

# Save as CSV
df.to_csv(output_csv, index=False)

# Initialize empty lists for each column
s_n_list = []
type_list = []
name_list = []
address_list = []
tel_office_list = []

# Re-open the CSV that was just written (same path, not a second hard-coded copy)
with open(output_csv, newline='', encoding='utf-8') as csvfile:
    csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')

    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(csvreader, None)

    # Iterate through each row and extract data for each column
    for row in csvreader:
        if len(row) >= 5:  # Ensure that the row has at least 5 columns
            # ``vet_type`` rather than ``type`` so the builtin is not shadowed
            s_n, vet_type, name, address, tel_office = (
                field.strip() for field in row[:5]
            )

            # Append data to respective lists
            s_n_list.append(s_n)
            type_list.append(vet_type)
            name_list.append(name)
            address_list.append(address)
            tel_office_list.append(tel_office)

# Load vet.json and split each record's combined '  NAME    ;ADDRESS' field
# into separate NAME and ADDRESS values.
with open('vet.json', 'r', encoding='utf-8') as jsonfile:
    data = json.load(jsonfile)

# Initialize a new list to store the updated data
new_data_list = []

# Iterate through each entry in the existing data
for entry in data:
    # A record without the combined key yields [''], which fails the length check below
    name_address_pair = entry.get('  NAME    ;ADDRESS', '').split(';')
    # Only records with exactly one ';' (one name, one address) are kept; all others are skipped
    if len(name_address_pair) == 2:
        name, address = name_address_pair
        name = name.strip()
        address = address.strip()
        new_data_list.append({"NAME": name, "ADDRESS": address})

# Specify the name of the new JSON output file
# NOTE(review): this name is only assigned here; nothing in the visible code
# writes new_data_list to it - confirm whether a json.dump step is missing.
new_output_json_file = 'vet_new.json'

# Load vet_new.json and rebuild a clean name/address pair for every record.
# Load the JSON data
input_json = "vet_new.json"  
output_json = "output.json"  

with open(input_json, "r") as json_file:
    data = json.load(json_file)

# Process the data to split the "name" and "address" based on the delimiter ";"
# NOTE(review): processed_data is not written to output_json (or anywhere else)
# in the visible code - confirm whether a json.dump step is missing.
processed_data = []
for entry in data:
    # Direct indexing: a record without this key raises KeyError
    name_address = entry["  NAME    ;ADDRESS"].strip()
    # Extra values stored under the "null" key; an empty list when the key is absent
    null_values = entry.get("null", [])

    # Split the name and address based on the delimiter ";"
    parts = name_address.split(";")
    name = parts[0].strip()
    # Records without a ';' get an empty address; anything after a second ';' is ignored
    address = parts[1].strip() if len(parts) > 1 else ""

    # Merge the "null" values with the address
    # (presumably overflow pieces of the address - TODO confirm; join() requires them to be strings)
    if null_values:
        address += " " + " ".join(null_values)

    processed_data.append({"name": name, "address": address})

# Geocode every record's "address" with the Google Geocoding API and save the
# records, enriched with "latitude"/"longitude", to a new JSON file.
input_json = "vet_new.json"
output_json = "vet_geocode.json"
# Prefer a key supplied through the environment so it does not have to live in
# the source. NOTE(review): the literal fallback only keeps existing runs
# working; a key committed to source should be treated as exposed - rotate it
# and remove the fallback.
api_key = os.environ.get("GOOGLE_MAPS_API_KEY", "AIzaSyCUax1ygLocmDPvzRYPbfr1zVzo6M6YNKM")

# Load the JSON data
with open(input_json, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Define the base URL for Geocoding API
geocode_base_url = "https://maps.googleapis.com/maps/api/geocode/json"

# Update each entry with latitude and longitude
for entry in data:
    address = entry.get("address")
    if not address:
        # Nothing to geocode; keep the record unchanged instead of crashing.
        print(f"Skipping entry without an address: {entry}")
        continue

    params = {
        "address": address,
        "key": api_key
    }
    try:
        # The timeout keeps one stalled request from hanging the whole run.
        response = requests.get(geocode_base_url, params=params, timeout=10)
        response.raise_for_status()
        result = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # Only the exception type is printed: request error messages can
        # include the full URL, and with it the API key.
        print(f"Geocoding failed for address: {address} ({type(exc).__name__})")
        continue

    # Check if there's a valid, non-empty result
    if result.get("status") == "OK" and result.get("results"):
        location = result["results"][0]["geometry"]["location"]
        entry["latitude"] = location["lat"]
        entry["longitude"] = location["lng"]
    else:
        print(f"Geocoding failed for address: {address}")

# Save the data with latitude and longitude to a new JSON file
with open(output_json, "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=4)

print(f"Geocoding results have been added to '{output_json}'")