In [2]:
import requests
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import os

# Function to generate keywords using TF-IDF
def generate_keywords_tfidf(content, top_n=5):
    """Return up to ``top_n`` keywords extracted from ``content``.

    A ``TfidfVectorizer`` with English stop words removed is fitted on
    ``content`` as a single-document corpus, with ``max_features=top_n``
    capping the vocabulary size. The kept terms are returned in the order
    given by ``get_feature_names_out()``.

    Args:
        content: Text to extract keywords from.
        top_n: Maximum number of keywords to keep.

    Returns:
        A list of keyword strings. The list is empty when ``content`` is
        empty, blank, or has no usable terms (e.g. only stop words).
    """
    # Nothing to analyse: avoid the vectorizer's "empty vocabulary" error.
    if not content or not content.strip():
        return []

    vectorizer = TfidfVectorizer(stop_words='english', max_features=top_n)
    try:
        vectorizer.fit([content])
    except ValueError as err:
        # Only the "no usable terms" case is treated as "no keywords";
        # any other ValueError (e.g. an invalid top_n) still propagates.
        if "empty vocabulary" not in str(err):
            raise
        return []

    # Convert to plain str so the result serialises cleanly.
    return [str(term) for term in vectorizer.get_feature_names_out()]

# Function to scrape a web page
def scrape_web_page(url, topic="General", timeout=10):
    """Scrape one page and return a document dict per ``<h2>`` section.

    Each ``<h2>`` heading starts a section. The section's content is the
    heading text followed by the text of every following sibling
    ``<p>``, ``<ul>``, ``<ol>`` or ``<div>`` up to the next ``<h2>``.

    Args:
        url: Address of the page to fetch.
        topic: Label stored in the ``"topic"`` field of every document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``"title"``, ``"content"``,
        ``"topic"``, ``"keywords"`` and ``"source"``, or ``None`` when the
        page could not be fetched (network error or non-200 status).
    """
    try:
        # Without a timeout a stalled server would block the run forever.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # One unreachable page must not abort the remaining pages.
        print(f"Failed to fetch page: {url} ({err})")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch page: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    documents = []

    # Find all <h2> tags (adjust this tag based on the page structure)
    for title in soup.find_all('h2'):
        question = title.text.strip()
        parts = [question]  # Content starts with the question text

        # Collect content until the next <h2> or the end of the section
        sibling = title.find_next_sibling()
        while sibling and sibling.name != 'h2':
            if sibling.name in ('p', 'ul', 'ol', 'div'):  # Include other relevant tags as needed
                parts.append(sibling.get_text(separator=" ", strip=True))
            sibling = sibling.find_next_sibling()
        content = " ".join(parts).strip()

        # Use the heading's 'id' as a URL fragment when it has one,
        # otherwise fall back to the page URL itself.
        section_id = title.get('id')
        section_url = f"{url}#{section_id}" if section_id else url

        # Generate keywords using TF-IDF. The vectorizer raises ValueError
        # when a section has no usable terms (e.g. only stop words); keep
        # the section without keywords rather than losing the whole page.
        try:
            keywords = generate_keywords_tfidf(content)
        except ValueError:
            keywords = []

        documents.append({
            "title": question,
            "content": content,
            "topic": topic,
            "keywords": keywords,
            "source": section_url,  # Section URL for this title
        })

    return documents

# Pages to scrape, as (path under the site root, topic label) pairs
_SITE_ROOT = "https://wehi-researchcomputing.github.io"
_PAGES = [
    ("students", "Unpaid Student Internship Program"),
    ("complex-projects", "complex ambiguous projects"),
    ("software_maturity_model", "software maturity model"),
    ("explanation_about_ohs", "explanation about ohs"),
    ("top-5-mistakes", "top 5 mistakes"),
    ("project-wikis", "project wikis"),
    ("student-loxcoder", "loxcoder"),
    ("student-data-commons", "student data commons"),
    ("student-cryoem", "cryoem"),
    ("student-genomics-qc", "genomics quantum computing"),
    ("student-schex", "schex"),
    ("student-mixOmics.html", "mixOmics"),
    ("student-capacity-planning.html", "capacity planning"),
    ("student-haemosphere", "haemosphere"),
    ("student-imaging", "imaging"),
    ("student-quantum", "quantum"),
    ("student-genomics-metadata.html", "genomics metadata"),
    ("student-aive", "aive"),
    ("student-bionix", "bionix"),
    ("student-clinical-dashboards", "clinical dashboards"),
    ("email_acknowledgement", "email acknowledgement"),
    ("code-of-conduct", "code of conduct"),
    ("faq", "FAQ"),
]

# One {"url", "topic"} entry per page, in the order listed above
links = [
    {"url": f"{_SITE_ROOT}/{path}", "topic": label}
    for path, label in _PAGES
]

# Gather the section documents of every listed page into one flat list.
# Pages that yield nothing (None or an empty list) are skipped.
web_data = []
for link in links:
    data = scrape_web_page(topic=link["topic"], url=link["url"])
    if not data:
        continue
    web_data += data


# Show every aggregated document, one field per line, followed by a rule.
# The content is printed in full (it is not truncated).
for doc in web_data:
    print(
        f"Title: {doc['title']}",
        f"Content: {doc['content']}",
        f"Topic: {doc['topic']}",
        f"Keywords: {', '.join(doc['keywords'])}",
        f"Source: {doc['source']}",
        "-" * 40,
        sep="\n",
    )

# Save all documents as indented JSON in the working directory
with open("aggregated_data.json", "w") as f:
    f.write(json.dumps(web_data, indent=4))

Failed to fetch page: 503
Title: Learn real world skills
Content: Learn real world skills We prepare students for the real-world by teaching them: how understanding the domain problem and the users is more important than technical skills, and how to work on a complex, ambiguous project , showing them how to become as independent as possible , show them how to document and share knowledge to others in a professional manner, explain how a software maturity model can help clarify expectations , and teaching them how to work productively in a remote environment. We even tell students how to try to avoid the top 5 mistakes that students make . In our Welcome Session, we talk about ways you can better learn real world skills .
Topic: Unpaid Student Internship Program
Keywords: learn, real, skills, students, world
Source: https://wehi-researchcomputing.github.io/students#learn-real-world-skills
----------------------------------------
Title: Types of projects
Content: Types of projects Many o