# Mount drive

In [1]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


# Scraping web pages with documentation

In [2]:
import requests
from bs4 import BeautifulSoup
import re

# cgwiki Houdini documentation pages to scrape, in scrape order.
urls = [
    'https://tokeru.com/cgwiki/' + page + '.html'
    for page in (
        'HoudiniGettingStarted',
        'Houdini',
        'HoudiniChops',
        'HoudiniCops',
        'HoudiniCrowd',
        'HoudiniDops',
        'HoudiniFAQ',
        'HoudiniHair',
        'HoudiniHDA',
        'HoudiniKinefx',
        'Houdini_Lighting_Shading',
        'HoudiniLops',
        'HoudiniPython',
        'HoudiniTops',
        'HoudiniUserInterfaceTips',
        'HoudiniVellum',
        'HoudiniVex1',
        'HoudiniVex2',
        'HoudiniVex3',
        'HoudiniVolumes',
        'Houdini_Vops',
    )
]


def scrape_webpage(url, timeout=30):
    """Split a web page into text sections, one section per heading.

    The page is fetched and parsed, the first h1-h4 tag is located, and the
    siblings that follow it are walked in order: each further h1-h4 sibling
    starts a new section, and the stripped text of every other named element
    is appended to the current section. Siblings without a tag name
    (presumably bare text nodes) are skipped.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block the notebook indefinitely.

    Returns:
        A list of strings. Each string is a heading's text followed by the
        text of the elements up to the next heading, joined with single
        spaces. If the page contains no h1-h4 tag, the list is
        ["No h1-4 tags found"].

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: The server did not respond within `timeout`.
    """
    heading_tags = ['h1', 'h2', 'h3', 'h4']

    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses so an error page is never scraped as content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    first_heading = soup.find(heading_tags)
    if first_heading is None:
        return ["No h1-4 tags found"]

    parts = []
    current_part = [first_heading.text]  # every section starts with its heading text

    # Only siblings of the first heading are visited; headings nested inside
    # other containers do not open a new section.
    for sibling in first_heading.next_siblings:
        if sibling.name in heading_tags:
            # A new heading closes the running section and opens the next one.
            parts.append(' '.join(current_part))
            current_part = [sibling.text]
        elif sibling.name:
            current_part.append(sibling.get_text(strip=True))

    # current_part always holds at least a heading here, so flush it unconditionally.
    parts.append(' '.join(current_part))
    return parts

# Scrape every page and gather all sections into one flat list
parts = []

for url in urls:
    parts.extend(scrape_webpage(url))




# Cleaning: remove non-ASCII characters from the scraped text
def clean_text(text):
    """Remove non-ASCII characters from text while leaving URLs untouched.

    The text is split on http(s) URLs (a URL runs up to the next whitespace
    character). Runs of non-ASCII characters are deleted from the pieces
    between URLs; the URLs themselves are copied through unchanged. No
    placeholder substitution is involved, so URLs are always present in the
    result and text that happens to look like a placeholder is not altered.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    url_pattern = re.compile(r'(https?://[^\s]+)')
    non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')

    # Splitting on a pattern with one capture group yields alternating pieces:
    # even indices are ordinary text, odd indices are the captured URLs.
    pieces = url_pattern.split(text)
    return ''.join(
        piece if index % 2 else non_ascii_pattern.sub('', piece)
        for index, piece in enumerate(pieces)
    )

# Clean every scraped section in place
for i, section in enumerate(parts):
    parts[i] = clean_text(section)

# Saving scraped corpus

In [3]:
# Pin the encoding so the corpus file does not depend on the platform's
# default text encoding.
with open('drive/MyDrive/Colab/docs/houdini_dataset.txt', 'w', encoding='utf-8') as file:
    # Write each section followed by a newline
    for item in parts:
        file.write(item + '\n')