In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd

# 初始化

visited_urls = set()
to_visit_urls = []
domain = ""
external_domains = set()
unique_documents = set()


# 用于存储统计数据
stats = {
    "total_pages": 0,
    "all_links": 0,
    "internal_links": 0,
    "external_links": 0,
    "broken_links": 0,
    "internal_subdomains": 0,
    "external_resources_links": 0,
    "unique_external_resources": 0,
    "unique_documents_links": 0
}

# 用户代理，模仿浏览器访问
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
}

# 增加内部子域名集合
internal_subdomains = set()

def is_internal(url):
    return urlparse(url).netloc == domain or urlparse(url).netloc in internal_subdomains

def add_subdomain(url):
    subdomain = urlparse(url).netloc
    if subdomain != domain:
        internal_subdomains.add(subdomain)

def is_document(url):
    return url.split('?')[0].split('#')[0].lower().endswith(('.doc', '.docx', '.pdf'))


def crawl(url, depth):
    global domain
    if url in visited_urls or not url.startswith("http") or depth > 1:
        return
    print(f"Crawling: {url} at depth {depth}")
    try:
        response = safe_request(url, headers)
        if response and response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')  # 适配更通用的HTML解析器

            visited_urls.add(url)
            stats["total_pages"] += 1
            
            links = soup.find_all('a', href=True)
            stats["all_links"] += len(links)
            for link in links:
                absolute_link = urljoin(url, link['href'])
                stats["all_links"] += 1
                if is_internal(absolute_link):
                    stats["internal_links"] += 1
                    add_subdomain(absolute_link)
                    if absolute_link not in visited_urls:
                        to_visit_urls.append((absolute_link, depth + 1))
                else:
                    stats["external_links"] += 1
                    external_domains.add(urlparse(absolute_link).netloc)
                    if is_document(absolute_link) and absolute_link not in unique_documents:
                        unique_documents.add(absolute_link)
                        stats["unique_documents_links"] += 1
        else:
            stats["broken_links"] += 1
    except Exception as e:
        print(f"Error crawling {url}: {e}")
        stats["broken_links"] += 1

def start_crawl(start_url):
    global domain
    domain = urlparse(start_url).netloc
    to_visit_urls.append((start_url, 1))  # 开始URL和初始深度
    while to_visit_urls:
        current_url, depth = to_visit_urls.pop(0)  # FIFO队列
        crawl(current_url, depth)
    
    # 更新统计数据
    stats["internal_subdomains"] = len(internal_subdomains)
    stats["external_resources_links"] = stats["external_links"]
    stats["unique_external_resources"] = len(external_domains)

start_url = "https://spbu.ru/"  # 更改为你的目标网站。
start_crawl(start_url)

print(f"Total pages crawled: {stats['total_pages']}")
print(f"All links found: {stats['all_links']}")
print(f"Internal links found: {stats['internal_links']}")
print(f"External links found: {stats['external_links']}")
print(f"Broken links found: {stats['broken_links']}")
print(f"Internal subdomains found: {stats['internal_subdomains']}")
print(f"External resources links: {stats['external_resources_links']}")
print(f"Unique external resources: {stats['unique_external_resources']}")
print(f"Unique documents (doc, docx, pdf) links: {stats['unique_documents_links']}")

df = pd.DataFrame([stats])
df.to_csv("website_stats.csv1", index=False)

Crawling: https://spbu.ru/ at depth 1
Total pages crawled: 1
All links found: 652
Internal links found: 274
External links found: 52
Broken links found: 0
Internal subdomains found: 0
External resources links: 52
Unique external resources: 36
Unique documents (doc, docx, pdf) links: 0


In [None]:
def safe_request(url, headers, max_retries=3, timeout=5):
    """尝试重试请求"""
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            return response
        except requests.exceptions.RequestException as e:
            print(f"Attempt {attempt+1} of {url} failed: {e}")
            if attempt == max_retries - 1:
                raise  # 最后一次尝试失败，抛出异常

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import pandas as pd

class SimpleCrawler:
    def __init__(self, start_url):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.visited_urls = set()
        self.to_visit_urls = [start_url]
        self.stats = {"total_pages": 0, "internal_links": 0, "external_links": 0, "unique_documents": 0}
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.unique_documents = set()
        
    def is_internal(self, url):
        return urlparse(url).netloc == self.domain
    
    def is_document(self, url):
        return url.lower().endswith(('.doc', '.docx', '.pdf'))
    
    def crawl(self, url):
        if not url.startswith("http"):
            return
        try:
            response = requests.get(url, headers=self.headers)
            if response.status_code == 200:
                self.visited_urls.add(url)
                self.stats["total_pages"] += 1
                soup = BeautifulSoup(response.text, 'html.parser')
                for link in soup.find_all('a', href=True):
                    absolute_link = urljoin(url, link['href'])
                    if self.is_internal(absolute_link):
                        self.stats["internal_links"] += 1
                        if absolute_link not in self.visited_urls:
                            self.to_visit_urls.append(absolute_link)
                    else:
                        self.stats["external_links"] += 1
                    if self.is_document(absolute_link):
                        if absolute_link not in self.unique_documents:
                            self.unique_documents.add(absolute_link)
                            self.stats["unique_documents"] += 1
        except requests.RequestException as e:
            print(f"Error: {e}")
    
    def start(self):
        while self.to_visit_urls:
            current_url = self.to_visit_urls.pop(0)
            if current_url not in self.visited_urls:
                self.crawl(current_url)
    
    def report(self):
        print(f"Total pages crawled: {self.stats['total_pages']}")
        print(f"Internal links: {self.stats['internal_links']}")
        print(f"External links: {self.stats['external_links']}")
        print(f"Unique documents: {self.stats['unique_documents']}")
        df = pd.DataFrame([self.stats])
        df.to_csv("website_stats_simplified.csv", index=False)

# 使用
crawler = SimpleCrawler("https://spbu.ru/")
crawler.start()
crawler.report()


KeyboardInterrupt: 