In [3]:
import os
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Data Extraction

This notebook extracts, cleans, and stores data from the Rockfish website, documentation, news articles, research PDFs, and YouTube videos.  
It outputs cleaned `.txt` files in the `Data` folder for downstream processing (like RAG-based chatbots).  

**Key Features:**  
- Scrapes and saves HTML pages  
- Extracts PDF content while skipping math-heavy sections  
- Retrieves YouTube auto-generated transcripts  
- Organizes everything in `Data/`  


## Website

In [None]:
BASE_URL = 'https://www.rockfish.ai/'

# Create the Data directory if it doesn't exist
if not os.path.exists('Data'):
    os.makedirs('Data')

# To avoid re-visiting the same pages
visited = set()

def html_to_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove scripts and styles
    for s in soup(['script', 'style']):
        s.decompose()

    # If the page has a <main> section, prioritize it
    main = soup.find('main')
    if main:
        text = main.get_text(separator='\n', strip=True)
    else:
        text = soup.get_text(separator='\n', strip=True)

    return text

def crawl_and_save(url, depth=0):
    if url in visited or depth > 2:  # limit depth to avoid endless crawl
        return

    try:
        print(f'Crawling: {url}')
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
    except Exception as e:
        print(f'Failed to fetch {url}: {e}')
        return

    visited.add(url)

    # Clean the HTML to plain text
    clean_text = html_to_text(response.text)

    # Save the clean text to a .txt file
    parsed_url = urlparse(url)
    file_name = parsed_url.path.strip('/').replace('/', '_') or 'index'
    file_path = os.path.join('Data', f'{file_name}.txt')

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(clean_text)

    # Parse and find internal links
    soup = BeautifulSoup(response.text, 'html.parser')
    for link_tag in soup.find_all('a', href=True):
        href = link_tag['href']
        next_url = urljoin(url, href)
        # Only follow internal links
        if urlparse(next_url).netloc == urlparse(BASE_URL).netloc:
            crawl_and_save(next_url, depth + 1)

    time.sleep(1)  # be nice!

if __name__ == '__main__':
    crawl_and_save(BASE_URL)



Crawling: https://www.rockfish.ai/
Crawling: https://www.rockfish.ai/why-rockfish/about-us
Crawling: https://www.rockfish.ai/why-rockfish/partners
Crawling: https://www.rockfish.ai/platform/the-science
Crawling: https://www.rockfish.ai/use-cases
Crawling: https://www.rockfish.ai/news
Crawling: https://www.rockfish.ai/contact-us
Crawling: https://www.rockfish.ai/privacy-policy


## Documentation

In [None]:
BASE_URL = 'https://docs142.rockfish.ai/index.html'

# Create the Data directory if it doesn't exist
if not os.path.exists('Data'):
    os.makedirs('Data')

# To avoid revisiting the same pages
visited = set()

# Common words to exclude (navigation, repeated on all pages)
REPEATED_WORDS = set([
    "Rockfish Documentation", "Home", "Getting Started", "Quick Start", "Installation",
    "SDK Installation", "CLI Installation", "Use Case Tutorials", "Summary",
    "Generic Rockfish Integration", "Central Global Model Accuracy", "Integration",
    "User Guide", "Overview", "Onboarding", "Import Data", "Data Models",
    "Dataset Properties", "Recommendation Engine", "Pre-Processing", "Privacy",
    "Training", "Models", "Train", "Model Store", "Generation", "Basic Generation",
    "Storytelling", "Evaluation", "Metrics", "Improving Data Quality", "Deployment",
    "Deployment Types", "Hybrid Deploy", "Table of contents", "Cuttlefish",
    "Self hosted worker", "Deploy", "Enterprise", "Resources", "Hardware",
    "Deployment Checklist", "Workers", "Docker", "Docker Registry",
    "Docker Compose", "Operations", "Troubleshooting", "Administration",
    "API", "Reference", "Welcome to Rockfish Data!", "Get started",
    "rockfish", "rockfish.events", "rockfish.metrics", "rockfish.models",
    "rockfish.remote", "rockfish.streams", "rockfish.labs.recommender",
    "rockfish.labs.metrics", "rockfish.labs.steps", "rockfish.labs.vis",
    "rockfish.actions", "rockfish.actions.transformer", "rockfish.actions.tab_gan",
    "rockfish.actions.dg", "rockfish.actions.dataset", "rockfish.actions.models",
    "rockfish.actions.amplify", "rockfish.actions.apply_transform",
    "rockfish.actions.replace", "Developer", "Support"
])


def html_to_text(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get the <article> element with actual documentation content
    article = soup.find('article', class_='md-content__inner')
    if not article:
        # fallback to entire page
        article = soup

    # Extract content respecting the document structure
    lines = []
    for el in article.descendants:
        if el.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
            lines.append('\n' + el.get_text(strip=True) + '\n')
        elif el.name == 'p':
            text = el.get_text(separator=' ', strip=True)
            if text:
                lines.append(text)
        elif el.name == 'pre':
            code_text = el.get_text(separator=' ', strip=True)
            lines.append('\n' + code_text + '\n')
        elif el.name == 'li':
            li_text = el.get_text(separator=' ', strip=True)
            if li_text:
                lines.append('- ' + li_text)

    # Final cleanup: remove redundant empty lines
    final_text = '\n'.join(line for line in lines if line.strip())

    return final_text



def crawl_and_save(url, depth=0):
    if url in visited or depth > 5:  # Adjust depth limit as needed
        return

    try:
        print(f'Crawling: {url}')
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        response.raise_for_status()
    except Exception as e:
        print(f'Failed to fetch {url}: {e}')
        return

    visited.add(url)

    # Clean text
    clean_text = html_to_text(response.text)

    # Generate file name based on URL path
    parsed_url = urlparse(url)
    page_name = parsed_url.path.strip('/').replace('/', '_') or 'index'
    file_name = f'documentation-{page_name}.txt'
    file_path = os.path.join('Data', file_name)

    # Save clean text
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(clean_text)

    # Parse and find internal links
    soup = BeautifulSoup(response.text, 'html.parser')
    for link_tag in soup.find_all('a', href=True):
        href = link_tag['href']
        next_url = urljoin(url, href)

        # Only follow internal documentation links
        if urlparse(next_url).netloc == urlparse(BASE_URL).netloc:
            crawl_and_save(next_url, depth + 1)

    time.sleep(1)  # Be polite

if __name__ == '__main__':
    crawl_and_save(BASE_URL)

Crawling: https://docs142.rockfish.ai/index.html
Crawling: https://docs142.rockfish.ai/index.html#welcome-to-rockfish-data
Crawling: https://docs142.rockfish.ai/index.html#get-started
Crawling: https://docs142.rockfish.ai/quick-start.html
Crawling: https://docs142.rockfish.ai/quick-start.html#connect-to-the-rockfish-platform
Crawling: https://docs142.rockfish.ai/quick-start.html#get-an-api-key
Crawling: https://docs142.rockfish.ai/quick-start.html#set-up-the-rockfish-platform
Crawling: https://docs142.rockfish.ai/quick-start.html#optional-rockfish-integration
Crawling: https://docs142.rockfish.ai/quick-start.html#load-a-dataset
Crawling: https://docs142.rockfish.ai/quick-start.html#train-a-synthetic-data-model
Crawling: https://docs142.rockfish.ai/quick-start.html#generate-data
Crawling: https://docs142.rockfish.ai/quick-start.html#evaluate-data-quality
Crawling: https://docs142.rockfish.ai/quick-start.html#sql-queries
Crawling: https://docs142.rockfish.ai/quick-start.html#whats-next
C

## Research Papers

In [29]:
import fitz  # PyMuPDF

# Input and output folders
RESEARCH_FOLDER = 'research'
OUTPUT_FOLDER = 'Data'

# Create the output folder if it doesn't exist
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

# Loop through each PDF in 'research' folder
for pdf_file in os.listdir(RESEARCH_FOLDER):
    if not pdf_file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(RESEARCH_FOLDER, pdf_file)
    doc = fitz.open(pdf_path)

    clean_text = ''
    for page in doc:
        # Extract text from the page
        text = page.get_text()
        lines = text.splitlines()

        # Filter out likely math-heavy lines
        for line in lines:
            # Heuristic 1: skip lines with lots of math symbols
            if any(sym in line for sym in ['=', '+', '-', '/', '*', '(', ')', '\\', '∫', 'Σ', '√', '∆', 'π']):
                continue
            # Heuristic 2: skip very short lines (often math labels)
            if len(line.strip()) < 10:
                continue
            # Keep the line
            clean_text += line + '\n'

    # Save the cleaned text
    doc_name = os.path.splitext(pdf_file)[0].replace(' ', '_')
    output_filename = f'research-{doc_name}.txt'
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(clean_text)

    print(f"Saved: {output_filename}")

Saved: research-onprivacy.txt
Saved: research-spectralnorm.txt
Saved: research-practicalgan.txt
Saved: research-raregan.txt
Saved: research-imc20_doppelganger.txt


## News

In [21]:
# Load the raw HTML file you shared
with open('news.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

# Parse the HTML
soup = BeautifulSoup(html_content, 'html.parser')

# Create 'Data' folder
if not os.path.exists('Data'):
    os.makedirs('Data')

# Find all article links
article_links = soup.find_all('a', class_='col-grid-link-block w-inline-block')

# Helper: extract main clean text from an external page
def extract_clean_text(url):
    try:
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)  # Timeout after 10 seconds
        response.raise_for_status()
    except requests.exceptions.Timeout:
        print(f"Timeout: Skipping {url}")
        return None
    except Exception as e:
        print(f"Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    main = soup.find('main') or soup.body
    if not main:
        return None

    lines = []
    for el in main.descendants:
        if el.name in ['h1', 'h2', 'h3', 'h4', 'h5']:
            lines.append('\n' + el.get_text(strip=True) + '\n')
        elif el.name == 'p':
            text = el.get_text(separator=' ', strip=True)
            if text:
                lines.append(text)
        elif el.name == 'pre':
            code_text = el.get_text(separator=' ', strip=True)
            lines.append('\n' + code_text + '\n')
        elif el.name == 'li':
            li_text = el.get_text(separator=' ', strip=True)
            if li_text:
                lines.append('- ' + li_text)

    cleaned_text = '\n'.join(line for line in lines if line.strip())
    return cleaned_text

# Visit each link and save its content
for link_tag in article_links:
    href = link_tag.get('href')
    if not href or not href.startswith('http'):
        continue

    # Get link text for a fallback filename
    text_elem = link_tag.find('div', class_='research-caption')
    if text_elem:
        article_title = text_elem.get_text(separator=' ', strip=True)
    else:
        article_title = 'untitled-article'

    parsed_url = urlparse(href)
    slug = parsed_url.path.strip('/').split('/')[-1] or article_title.replace(' ', '_')[:30]
    filename = f'news-{slug}.txt'
    filepath = os.path.join('Data', filename)

    # Visit and extract
    print(f"🔍 Visiting: {href}")
    cleaned_text = extract_clean_text(href)
    if cleaned_text:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
        print(f"Saved: {filename}")
    else:
        print(f"No clean text found for: {href}")

print("All done!")

🔍 Visiting: https://www.linkedin.com/posts/t-labs-telekom-laboratories_tchallenge-startups-activity-7290004012416811010-J9rX?utm_source=share&utm_medium=member_desktop
Saved: news-t-labs-telekom-laboratories_tchallenge-startups-activity-7290004012416811010-J9rX.txt
🔍 Visiting: https://techcrunch.com/2025/01/15/rockfish-is-helping-enterprises-leverage-synthetic-data/
Saved: news-rockfish-is-helping-enterprises-leverage-synthetic-data.txt
🔍 Visiting: https://www.businesswire.com/news/home/20250115547526/en/Rockfish-Data-Secures-Seed-Funding-to-Help-Enterprises-Leverage-Synthetic-Data-for-Operational-Workflows
Timeout: Skipping https://www.businesswire.com/news/home/20250115547526/en/Rockfish-Data-Secures-Seed-Funding-to-Help-Enterprises-Leverage-Synthetic-Data-for-Operational-Workflows
No clean text found for: https://www.businesswire.com/news/home/20250115547526/en/Rockfish-Data-Secures-Seed-Funding-to-Help-Enterprises-Leverage-Synthetic-Data-for-Operational-Workflows
🔍 Visiting: https:

## Youtube Videos

In [30]:
%pip install youtube_transcript_api

Collecting youtube_transcript_api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Collecting defusedxml<0.8.0,>=0.7.1 (from youtube_transcript_api)
  Using cached defusedxml-0.7.1-py2.py3-none-any.whl.metadata (32 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached defusedxml-0.7.1-py2.py3-none-any.whl (25 kB)
Installing collected packages: defusedxml, youtube_transcript_api
Successfully installed defusedxml-0.7.1 youtube_transcript_api-1.0.3
Note: you may need to restart the kernel to use updated packages.


In [32]:
from youtube_transcript_api import YouTubeTranscriptApi

OUTPUT_FOLDER = 'Data'
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

# List of YouTube video URLs
video_urls = [
    'https://www.youtube.com/watch?v=TrF1CU4Y2sc&t=43s',
    'https://www.youtube.com/watch?v=5kT05Hv8QzE&t=1814s'
]

for url in video_urls:
    # Extract video ID
    if 'v=' in url:
        video_id = url.split('v=')[1].split('&')[0]
    else:
        print(f"Could not extract video ID for {url}")
        continue

    try:
        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)

        # Merge all text into one plain text
        transcript_text = '\n'.join([entry['text'] for entry in transcript])

        # Save it to a file
        output_filename = f'yt-{video_id}.txt'
        output_path = os.path.join(OUTPUT_FOLDER, output_filename)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(transcript_text)

        print(f"Saved: {output_filename}")
    except Exception as e:
        print(f"Could not get transcript for {video_id}: {e}")

print("🎉 Done!")


Could not get transcript for TrF1CU4Y2sc: no element found: line 1, column 0
Saved: yt-5kT05Hv8QzE.txt
🎉 Done!


#### Sources
- Research Papers
- New articles
- Web Pages
- Documentation