<a href="https://colab.research.google.com/github/arvindravulavaru/scraper/blob/main/Shopify_dev_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install html2text


In [42]:
import requests
from bs4 import BeautifulSoup
import html2text
import datetime
import pytz
import os
import tarfile
from urllib.parse import urlparse, parse_qs, urljoin

# Strip every element whose tag name appears in tag_names
def remove_tags(soup, tag_names):
    """Delete all elements named in *tag_names* from *soup* and serialize it.

    The document is modified in place: each matching element is destroyed
    with ``decompose()``, so the caller's *soup* no longer contains it after
    this call.

    Args:
        soup: Parsed document exposing ``find_all(name)`` whose results
            expose ``decompose()``.
        tag_names: Iterable of tag names to remove; processed in order.

    Returns:
        ``str(soup)`` after the removals.
    """
    for unwanted in tag_names:
        matches = soup.find_all(unwanted)
        for element in matches:
            element.decompose()

    return str(soup)


# Write one scraped artifact, prefixed with provenance comment lines
def _write_scraped_file(path, generated_at, url, body):
    """Write *body* to *path* below the generation-time and source-URL headers."""
    # Explicit UTF-8 so non-ASCII page content does not depend on the
    # platform's default text encoding.
    with open(path, 'w', encoding='utf-8') as f:
        # NOTE: the label is the literal "EST" even though the timestamp is
        # taken in US/Eastern, which may be observing daylight time.
        f.write(f"<!-- Generated on: {generated_at} EST -->\n")
        f.write(f"<!-- Scraped URL: {url} -->\n")
        f.write(body)


# Function to scrape links recursively
def scrape_links(url, folder_path, max_files, scraped_links, tag_names):
    """Scrape *url* and, depth-first, the shopify.dev pages it links to.

    For each page fetched, a directory named after the URL path is created
    under *folder_path* and three files are written into it: ``index.html``
    (HTML with *tag_names* removed), ``index.md`` (Markdown conversion) and
    ``index.txt`` (plain text). Linked pages are written beneath the
    directory of the page that linked to them.

    Args:
        url: Absolute URL of the page to scrape.
        folder_path: Directory under which this page's directory is created.
        max_files: Stop once this many pages have been recorded.
        scraped_links: List of URLs already scraped; appended to in place.
        tag_names: Tag names stripped from the page before it is saved.

    Returns:
        The same *scraped_links* list that was passed in.
    """
    # Check if we have reached the maximum number of files
    if len(scraped_links) >= max_files:
        return scraped_links

    parsed_url = urlparse(url)

    # Only follow shopify.dev and its subdomains. Compare the hostname
    # exactly: a substring test on netloc would also accept unrelated hosts
    # such as "shopify.developers.example" or "notshopify.dev".
    host = parsed_url.hostname or ""
    if host != "shopify.dev" and not host.endswith(".shopify.dev"):
        return scraped_links

    # A timeout keeps one stalled connection from hanging the crawl, and a
    # failed request skips this page instead of aborting everything.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f'Skipping {url}: {exc}')
        return scraped_links

    # Parse HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove specified tags from HTML (this also mutates soup in place)
    page_html = remove_tags(soup, tag_names)

    # Convert HTML to Markdown; body_width = 0 disables line wrapping
    converter = html2text.HTML2Text()
    converter.body_width = 0
    page_markdown = converter.handle(page_html)

    # Extract text content from the already-cleaned document
    page_text = soup.get_text()

    # Directory name is derived from the URL path only; the query string is
    # ignored, so URLs differing only by query share a directory.
    file_name = parsed_url.path[1:].replace('/', '-')
    file_name = file_name.replace('https-', '').replace('www-', '').replace(':', '').replace('.', '-')

    now = datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S")

    # Create directory if it doesn't exist
    file_path = os.path.join(folder_path, file_name)
    os.makedirs(file_path, exist_ok=True)

    # Write HTML, Markdown and text renditions of the page
    _write_scraped_file(os.path.join(file_path, 'index.html'), now, url, page_html)
    _write_scraped_file(os.path.join(file_path, 'index.md'), now, url, page_markdown)
    _write_scraped_file(os.path.join(file_path, 'index.txt'), now, url, page_text)

    # Add link to list of scraped links
    scraped_links.append(url)

    # Recursively scrape links on the page
    for link in soup.find_all('a'):
        href = link.get('href')
        if href is None:
            continue

        href = urljoin(url, href)  # convert to absolute URL

        # Skip non-HTTP schemes, any URL carrying a fragment, and pages
        # that were already scraped.
        if not href.startswith('http') or '#' in href or href in scraped_links:
            continue

        scrape_links(href, file_path, max_files, scraped_links, tag_names)

        # Check if we have reached the maximum number of files
        if len(scraped_links) >= max_files:
            return scraped_links

    return scraped_links


# Scrape Shopify Developer Documentation website
url = "https://shopify.dev/docs/apps"
folder_path = '/content/drive/MyDrive/Experiments/Data/shopify-dev/'
# Directory that contains folder_path; the tarball is written here so the
# archive does not end up inside the directory it is archiving.
parent_dir = os.path.abspath(os.path.join(folder_path, os.pardir))
max_files = 10
tag_names = ['script', 'style', 'link']
# Full archive file name, extension included
tar_filename = 'shopify-dev-files.tar.gz'

# Scrape links recursively
scraped_links = scrape_links(url, folder_path, max_files, [], tag_names)

# Print success message
print(f'Successfully wrote {len(scraped_links)} files to Google Drive')
# # Print site hierarchy
# print('Site hierarchy:')
# for root, dirs, files in os.walk(folder_path):
#     level = root.replace(folder_path, '').count(os.sep)
#     indent = ' ' * 2 * (level)
#     print(f'{indent}{os.path.basename(root)}/')
#     subindent = ' ' * 2 * (level + 1)
#     for f in files:
#         print(f'{subindent}{f}')

# create a tarball of all files in the folder_path directory
# tar_filename already ends in ".tar.gz"; do not append the extension again,
# otherwise the file on disk would not match the name reported below.
with tarfile.open(os.path.join(parent_dir, tar_filename), 'w:gz') as tar:
    for file in os.listdir(folder_path):
        tar.add(os.path.join(folder_path, file), arcname=file)

print(f'Successfully created tarball: {tar_filename}')

Successfully wrote 10 files to Google Drive
Successfully created tarball: shopify-dev-files.tar.gz
