##　最終課題

In [4]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import time

In [21]:
def crawl_same_domain(start_url: str) -> dict:

    parsed_uri = urlparse(start_url)
    base_domain = parsed_uri.netloc
    
    exclusion = (
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.svg',
        '.bmp', '.webp', '.tiff', '.css', '.js'
    )
    
    urls_to_visit = {start_url} 
    visited_urls = set()
    scraped_pages = {}

    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    })

    print(f"クロール開始")
    print(f"対象ドメイン: {base_domain}")

    

    while urls_to_visit:
        current_url = urls_to_visit.pop()
        
        if current_url in visited_urls:
            continue
            
        parsed_current = urlparse(current_url)
        if parsed_current.path.lower().endswith(exclusion):
            visited_urls.add(current_url)
            continue
        
        print(f"処理中...{current_url} ")
        
        try:
            response = session.get(current_url, timeout=10)
            
            if response.status_code != 200:
                visited_urls.add(current_url)
                continue
            
            content_type = response.headers.get('Content-Type', '').lower()

            if 'text/html' not in content_type:
                visited_urls.add(current_url)
                continue

            response.encoding = response.apparent_encoding
            html_content = response.text
            html_content_cleaned = re.sub(r'', '', html_content, flags=re.DOTALL)
            
            soup = BeautifulSoup(html_content_cleaned, 'html.parser')

            title_tag = soup.find('title')
            title_text = title_tag.string.strip() if title_tag and title_tag.string else "タイトルなし (Title Not Found)"
            
            scraped_pages[current_url] = title_text


            for a_tag in soup.find_all('a', href=True):
                href = a_tag['href'].strip()
                
                full_url = urljoin(current_url, href)
                full_url_no_fragment = full_url.split('#')[0]

                parsed_full_url = urlparse(full_url_no_fragment)
                
                path_lower = parsed_full_url.path.lower()
                if path_lower.endswith(exclusion):
                    continue

                if parsed_full_url.scheme in ['http', 'https'] and parsed_full_url.netloc == base_domain:
                    normalized_url = full_url_no_fragment.rstrip('/')
                    
                    if normalized_url not in visited_urls and normalized_url not in urls_to_visit:
                        urls_to_visit.add(normalized_url)
            
        except requests.exceptions.RequestException as e:
            print(f"    [エラー] リクエスト失敗: {e}")
        
        visited_urls.add(current_url)
        
        time.sleep(1.0) 

    print("\n完了")
    return scraped_pages

# 実行
if __name__ == "__main__":
    start_url = "https://www.musashino-u.ac.jp/"
    result_dict = crawl_same_domain(start_url)
    
    print("\n収集結果")
    print(result_dict)

クロール開始
対象ドメイン: www.musashino-u.ac.jp
処理中...https://www.musashino-u.ac.jp/ 
処理中...https://www.musashino-u.ac.jp/happiness_creators/no026.html 
処理中...https://www.musashino-u.ac.jp/guide/activities 
処理中...https://www.musashino-u.ac.jp/academics/basic 
処理中...https://www.musashino-u.ac.jp/guide/information/taishin.html 
処理中...https://www.musashino-u.ac.jp/guide/facility/laungage_center/index.html 
処理中...https://www.musashino-u.ac.jp/guide/profile/media/media.html 
処理中...https://www.musashino-u.ac.jp/basic/learning_cycle.html 
処理中...https://www.musashino-u.ac.jp/musashino 
処理中...https://www.musashino-u.ac.jp/guide/information/support_policy.html 
処理中...https://www.musashino-u.ac.jp/basic/ai_submajor/student_voice/index.html 
処理中...https://www.musashino-u.ac.jp/basic/policies 
処理中...https://www.musashino-u.ac.jp/admission/graduate_school 
処理中...https://www.musashino-u.ac.jp/basic/generation_ai/index.html 
処理中...https://www.musashino-u.ac.jp/admission/faculty/detail 
処理中...https://www.musashin