<a href="https://colab.research.google.com/github/arvindravulavaru/scraper/blob/main/Shopify_dev_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Create the folder structure in your Google Drive before running the code!

In [1]:
!pip install html2text


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting html2text
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Installing collected packages: html2text
Successfully installed html2text-2020.1.16


In [28]:
import requests
from bs4 import BeautifulSoup
import html2text
import datetime
import pytz
import os
import tarfile
import json
from urllib.parse import urlparse, parse_qs, urljoin

# Wall-clock timestamp in the US/Eastern zone, captured once when this cell runs.
# The same string is embedded in the header comment of every file written by
# scrape_links and in the tarball / metadata file names built further down.
now = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")

def remove_tags(soup, tag_names):
    """
    Strip every element whose tag name is listed in `tag_names`.

    The soup is modified in place: each matching element is destroyed
    together with its contents, and the remaining document is serialized.

    Parameters:
    soup (bs4.BeautifulSoup) : parsed document to clean (mutated in place)
    tag_names (list) : tag names (strings) whose elements are dropped

    Returns:
    str : the document serialized after the removals
    """

    for name in tag_names:
        # Collect the matches for this tag name first, then destroy each one.
        matches = soup.find_all(name)
        for element in matches:
            element.decompose()

    return str(soup)


# Function to scrape links recursively
def _is_shopify_host(hostname):
    """Return True if `hostname` is shopify.dev itself or one of its subdomains."""
    if not hostname:
        return False
    return hostname == 'shopify.dev' or hostname.endswith('.shopify.dev')


def _write_with_header(file_path, url, content):
    """Write `content` to `file_path`, preceded by the generated-on / source-URL header."""
    # Explicit UTF-8 so non-ASCII page content does not depend on the
    # platform's default encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(f"<!-- Generated on: {now} EST -->\n")
        f.write(f"<!-- Scraped URL: {url} -->\n")
        f.write(content)


def scrape_links(url, folder_path, max_files, scraped_links, tag_names, paths):
    """
    Recursively crawl shopify.dev pages starting at `url` (depth-first).

    For every visited page that contains a non-empty element with class
    'article--docs', three files (index.html, index.md, index.txt) are written
    to a sub-directory of `folder_path` named after the URL path, and a
    metadata entry is appended to `paths`. Every visited URL (with or without
    an article, and including URLs whose request failed) is appended to
    `scraped_links`, so it counts towards `max_files` and is not fetched again.

    Parameters:
    url (str) : absolute URL of the page to process
    folder_path (str) : directory under which per-page folders are created
    max_files (int) : stop once this many URLs have been visited
    scraped_links (list) : URLs visited so far (mutated in place)
    tag_names (list) : tag names removed from the article before saving
    paths (list) : metadata dicts with keys "link", "path", "title" (mutated in place)

    Returns:
    list : the same `scraped_links` list that was passed in
    """

    # Check if we have reached the maximum number of files
    if len(scraped_links) >= max_files:
        return scraped_links

    parsed_url = urlparse(url)

    # Only crawl shopify.dev and its subdomains. The hostname is compared
    # exactly / by suffix: a substring test would also accept hosts such as
    # "shopify.dev.evil.example" or "notshopify.dev".
    if not _is_shopify_host(parsed_url.hostname):
        return scraped_links

    print(f'Processing URL: [{len(scraped_links) + 1}] {url}')
    try:
        # A timeout keeps one unresponsive server from hanging the whole crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure, remember the URL so it is not
        # retried from every page that links to it, and carry on.
        print(f'Failed to fetch {url}: {exc}')
        scraped_links.append(url)
        return scraped_links

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find the element with class name 'article--docs'
    article_html = soup.find(class_='article--docs')

    if article_html is not None and len(str(article_html.text)) > 1:
        main_article_soup = BeautifulSoup(str(article_html), 'html.parser')

        # Remove specified tags from HTML
        page_html = remove_tags(main_article_soup, tag_names)

        # Convert HTML to Markdown (body_width = 0 turns off html2text's line wrapping)
        converter = html2text.HTML2Text()
        converter.body_width = 0
        page_markdown = converter.handle(page_html)

        # NOTE(review): the text version is taken from the whole page (`soup`),
        # not from the extracted article like the HTML/Markdown versions.
        # Kept as-is; confirm whether that is intended.
        page_text = soup.get_text()

        # Derive a flat folder name from the URL path
        file_name = parsed_url.path[1:].replace('/', '-')
        file_name = file_name.replace('https-', '').replace('www-', '').replace(':', '').replace('.', '-')

        # Create directory if it doesn't exist
        file_path = os.path.join(folder_path, file_name)
        os.makedirs(file_path, exist_ok=True)

        _write_with_header(os.path.join(file_path, 'index.html'), url, page_html)
        _write_with_header(os.path.join(file_path, 'index.md'), url, page_markdown)
        _write_with_header(os.path.join(file_path, 'index.txt'), url, page_text)

        # Pages without a <title> element get a null title instead of crashing.
        title = soup.title.string if soup.title is not None else None
        paths.append({"link": url, "path": file_name, "title": title})
    else:
        print("No matching element found with class 'article--docs'.")

    # Add link to list of scraped links
    scraped_links.append(url)

    # Recursively scrape links on the page
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is not None:
            href = urljoin(url, href)  # convert to absolute URL

            if href.startswith('http') and '#' not in href and href not in scraped_links:
                scrape_links(href, folder_path, max_files, scraped_links, tag_names, paths)

                # Check if we have reached the maximum number of files
                if len(scraped_links) >= max_files:
                    return scraped_links

    return scraped_links


# Crawl configuration for the Shopify Developer Documentation website
url = "https://shopify.dev/docs/apps"
# NOTE: this directory must already exist on Drive before the cell is run
folder_path = '/content/drive/MyDrive/Experiments/Data/shopify-dev/'
# Directory that receives the metadata JSON and the tarball
parent_dir = '/content/drive/MyDrive/Experiments/Data/'
max_files = 10
tag_names = ['script', 'style', 'link']
tar_filename = f'shopify-dev-files.{now}.tar.gz'
meta_data_filename = f'meta-data.{now}.json'
paths = []

# Run the recursive crawl; `paths` is filled in as pages are saved
scraped_links = scrape_links(url, folder_path, max_files, [], tag_names, paths)

# Both output artifacts are written to the same directory
output_dir = os.path.dirname(parent_dir)

# Dump the per-page metadata collected during the crawl
meta_data_path = os.path.join(output_dir, meta_data_filename)
with open(meta_data_path, "w") as meta_file:
    json.dump(paths, meta_file, indent=4)

# Bundle every entry of folder_path into a gzip-compressed tarball,
# stored under its bare name so the archive has no leading directories
tar_path = os.path.join(output_dir, tar_filename)
with tarfile.open(tar_path, 'w:gz') as tar:
    for entry in os.listdir(folder_path):
        tar.add(os.path.join(folder_path, entry), arcname=entry)

print(f'Processed {len(paths)} links!')
print(f'Successfully created: {tar_filename} & {meta_data_filename}')

Processing URL: [1] https://shopify.dev/docs/apps
No matching element found with class 'article--docs'.
Processing URL: [2] https://shopify.dev/docs
No matching element found with class 'article--docs'.
Processing URL: [3] https://shopify.dev/apps
No matching element found with class 'article--docs'.
Processing URL: [4] https://shopify.dev/themes
No matching element found with class 'article--docs'.
Processing URL: [5] https://shopify.dev/custom-storefronts
No matching element found with class 'article--docs'.
Processing URL: [6] https://shopify.dev/marketplaces
No matching element found with class 'article--docs'.
Processing URL: [7] https://shopify.dev/api/admin-graphql
No matching element found with class 'article--docs'.
Processing URL: [8] https://shopify.dev/api/admin-rest
No matching element found with class 'article--docs'.
Processing URL: [9] https://shopify.dev/api/liquid
No matching element found with class 'article--docs'.
Processing URL: [10] https://shopify.dev/api/ajax
P