In [13]:
!pip install requests beautifulsoup4 pandas




In [14]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin


In [15]:
base_url = "https://www.th-bingen.de/" 
relevant_keywords = ["admissions", "students", "courses", "programs"]  # Define keywords


In [16]:
def fetch_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error for bad responses (4xx/5xx)
        return BeautifulSoup(response.text, "html.parser")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None


In [17]:
def extract_links(soup, base_url, relevant_keywords):
    links = []
    for a_tag in soup.find_all("a", href=True):  # Find all <a> tags with href attributes
        full_url = urljoin(base_url, a_tag["href"])
        if any(keyword in full_url for keyword in relevant_keywords):  # Check relevance
            links.append(full_url)
    return list(set(links))  # Remove duplicates


In [18]:
def crawl_website(start_url, relevant_keywords, max_depth=2):
    visited = set()  # Track visited URLs to avoid loops
    to_visit = [(start_url, 0)]  # URL and depth
    crawled_data = []  # Store data

    while to_visit:
        current_url, depth = to_visit.pop()
        if current_url in visited or depth > max_depth:
            continue

        visited.add(current_url)
        print(f"Crawling: {current_url} (Depth: {depth})")
        soup = fetch_page(current_url)
        if not soup:
            continue

        # Extract links
        links = extract_links(soup, start_url, relevant_keywords)
        to_visit.extend((link, depth + 1) for link in links)

        # Extract content
        title = soup.title.string if soup.title else "No title"
        content = " ".join(p.get_text() for p in soup.find_all("p"))
        crawled_data.append({"url": current_url, "title": title, "content": content.strip()})

    return pd.DataFrame(crawled_data)  


In [19]:
data = crawl_website(base_url, relevant_keywords, max_depth=2)
data.head()  # Display the first few rows


Crawling: https://www.th-bingen.de/ (Depth: 0)
Crawling: https://www.th-bingen.de/en/studies/courses/informatik-computer-science/overview (Depth: 1)
Crawling: https://www.th-bingen.de/en/studies/courses/environmental-sustainability/requirements (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/environmental-sustainability/team (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/informatik-computer-science/requirements (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/environmental-sustainability/documents (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/environmental-sustainability/course-structure (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/environmental-sustainability/overview (Depth: 2)
Crawling: https://www.th-bingen.de/en/studies/courses/informatik-computer-science/course-structure (Depth: 2)
Crawling: https://www.th-bingen.de/en/how-to-a

Unnamed: 0,url,title,content
0,https://www.th-bingen.de/,Technische Hochschule (TH) Bingen - Willkommen,Seit 1897 bildet die Technische Hochschule Bin...
1,https://www.th-bingen.de/en/studies/courses/in...,Technische Hochschule Bingen: Überblick Inform...,"Informatik / Computer Science, M. Sc.\n Comput..."
2,https://www.th-bingen.de/en/studies/courses/en...,Technische Hochschule Bingen: Voraussetzungen ...,Expecting an affinity for environmental protec...
3,https://www.th-bingen.de/en/studies/courses/en...,Technische Hochschule Bingen: Team Umweltschutz,We stand for excellent education and applied r...
4,https://www.th-bingen.de/en/studies/courses/in...,Technische Hochschule Bingen: Voraussetzungen ...,Degree:\n The admission requirements for the M...


In [20]:
data.to_csv("crawled_data.csv", index=False)
