In [None]:
# 本地运行环境为kf_bf

# 数据文件已上传到google drive，地址见readme.md

In [None]:
# 第一个测试的爬虫脚本，爬取指定网站的图片信息并下载图片
import os
import csv
import requests
from lxml import html
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import time
import re

# Configuration for the single-category scraper.
# BASE_URL is the first listing page; later pages are BASE_URL + "index_<n>.html".
BASE_URL = "https://a5c425.com/pic/toupai/"
# CSV file that receives one (title, image URL) row per image found.
OUTPUT_CSV = "image_data.csv"
# Directory (relative path) where downloaded images are stored.
IMAGE_DIR = "../downloaded_images"
PAGES = 100  # number of listing pages to crawl
# Request headers sent with every page and image request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://a5c425.com/'
}

# Create the image output directory up front (no error if it already exists).
os.makedirs(IMAGE_DIR, exist_ok=True)

def decode_title(script_content):
    """Extract the encoded title payload from an inline title script.

    The page writes each title with ``document.write(d('<payload>'))``, where
    ``d`` is the site's own JavaScript decoding function. That function is not
    available here, so the still-encoded payload is returned inside a
    placeholder instead of the readable title.

    Args:
        script_content: Text of the ``<script>`` element. May be ``None`` or
            empty (the caller passes ``script_tag.string``).

    Returns:
        ``"[EncodedTitle:<payload>]"`` when the pattern is found, otherwise
        ``"[NoTitle]"``.
    """
    # re.search raises TypeError on None; treat a missing script body as
    # "no title" instead of letting the caller drop the whole image entry.
    if not script_content:
        return "[NoTitle]"

    pattern = r"document\.write\(d\('([^']+)'\)\)"
    match = re.search(pattern, script_content)
    if match:
        encoded_str = match.group(1)
        # The real title would require the site's d() function; keep the
        # encoded string as a placeholder.
        return f"[EncodedTitle:{encoded_str}]"
    return "[NoTitle]"

def extract_image_info(page_content, page_url):
    """Collect the title and image URL of every gallery entry on a page.

    Every ``<dl>`` element that contains an ``<img class="nature">`` with a
    ``data-original`` attribute yields one record.

    Args:
        page_content: HTML text of the listing page.
        page_url: URL the page was fetched from; used as the base for
            resolving relative image URLs.

    Returns:
        A list of ``{'title': str, 'url': str}`` dicts in document order.
        Entries that raise while being parsed are reported on stdout and
        skipped.
    """
    soup = BeautifulSoup(page_content, 'lxml')
    image_data = []

    for dl in soup.find_all('dl'):
        try:
            img_tag = dl.find('img', {'class': 'nature'})
            if not img_tag or 'data-original' not in img_tag.attrs:
                continue

            # data-original may be relative or protocol-relative; resolve it
            # against the page URL so it can be requested directly. Absolute
            # URLs pass through unchanged.
            img_url = urljoin(page_url, img_tag['data-original'])

            h3_tag = dl.find('h3')
            if h3_tag:
                script_tag = h3_tag.find('script')
                if script_tag:
                    # .string can be None; pass "" so the entry is kept with
                    # a "[NoTitle]" placeholder instead of being dropped.
                    title = decode_title(script_tag.string or "")
                else:
                    # Take the text between </script> and </h3> if present.
                    # NOTE(review): with no <script> child found, the
                    # serialized <h3> is unlikely to contain '</script>', so
                    # this usually falls through to get_text() - confirm.
                    h3_markup = str(h3_tag)
                    end_script = h3_markup.find('</script>')
                    if end_script != -1:
                        title_content = h3_markup[end_script + 9:]
                        title = title_content.replace('</h3>', '').strip()
                    else:
                        title = h3_tag.get_text().strip()
            else:
                title = "Untitled"

            image_data.append({
                'title': title,
                'url': img_url
            })
        except Exception as e:
            # Best effort: one malformed entry must not abort the page.
            print(f"Error processing element: {e}")

    return image_data

def download_image(url, save_path):
    """Fetch ``url`` and write the response body to ``save_path``.

    Returns True when the server answered 200 and the file was written.
    Returns False on any other status code, or when the request or the write
    raises (the error is printed).
    """
    try:
        reply = requests.get(url, headers=HEADERS, timeout=10)
        if reply.status_code != 200:
            return False
        with open(save_path, 'wb') as out_file:
            out_file.write(reply.content)
    except Exception as err:
        print(f"Failed to download {url}: {err}")
        return False
    return True

def _image_extension(url):
    """Return the file extension of the URL's path, or '.jpg' if it has none."""
    from urllib.parse import urlparse

    try:
        # Use only the path component so a query string or fragment
        # ("pic.jpg?v=1") does not end up inside the file name.
        path = urlparse(url).path
    except ValueError:
        return '.jpg'
    return os.path.splitext(path)[1] or '.jpg'


def scrape_website():
    """Crawl the listing pages, record every image in the CSV and download it.

    Pages 1..PAGES of BASE_URL are fetched in order. For each image found,
    one row (title, URL) is written to OUTPUT_CSV and the image is saved to
    IMAGE_DIR as ``<page>_<index><ext>``. Errors on a page are printed and
    the crawl continues with the next page.
    """
    total_images = 0

    # The CSV is rewritten from scratch on every run.
    with open(OUTPUT_CSV, 'w', newline='', encoding='utf-8-sig') as csvfile:
        fieldnames = ['名称', '地址']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for page in range(1, PAGES + 1):
            # The first page is the bare listing URL; later pages use index_<n>.html.
            if page == 1:
                page_url = BASE_URL
            else:
                page_url = f"{BASE_URL}index_{page}.html"

            print(f"Processing page {page}: {page_url}")

            try:
                response = requests.get(page_url, headers=HEADERS, timeout=15)
                response.encoding = 'utf-8'

                if response.status_code != 200:
                    # No `continue` here: the delay below must also run for
                    # failed pages, otherwise a blocked crawler retries
                    # without any pause.
                    print(f"Failed to fetch page {page}. Status code: {response.status_code}")
                else:
                    page_images = extract_image_info(response.text, page_url)

                    for idx, img_info in enumerate(page_images):
                        writer.writerow({
                            '名称': img_info['title'],
                            '地址': img_info['url']
                        })

                        img_ext = _image_extension(img_info['url'])
                        img_name = f"{page}_{idx}{img_ext}"
                        img_path = os.path.join(IMAGE_DIR, img_name)

                        if download_image(img_info['url'], img_path):
                            print(f"  Downloaded: {img_name}")
                        else:
                            print(f"  Failed to download: {img_info['url']}")

                    total_images += len(page_images)
                    print(f"Found {len(page_images)} images on page {page}")

            except Exception as e:
                # Best effort: a failing page must not stop the whole crawl.
                print(f"Error processing page {page}: {e}")

            # Pause between pages to avoid getting blocked.
            time.sleep(1.5)

    print(f"\nFinished! Total images: {total_images}")
    print(f"CSV file saved to: {OUTPUT_CSV}")
    print(f"Images saved to: {IMAGE_DIR}")

# Start the crawl when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_website()

In [None]:
# 这个版本与上面的类似，只是把网站的多个栏目地址放进了一个列表中；各栏目的后续页面格式一致，都是在栏目地址后追加 index_页码.html。
import os
import csv
import requests
from bs4 import BeautifulSoup
import time
import re
import hashlib

# Configuration - a list of category listing URLs instead of a single one.
# Later pages of each category are "<base_url>index_<n>.html".
BASE_URLS = [
    "https://594b43.com/pic/meitui/",
    "https://594b43.com/pic/oumei/",  # add more URLs here
    "https://594b43.com/pic/katong/"
]
# CSV file that receives one (title, image URL) row per image found.
OUTPUT_CSV = "image_data.csv"
# Directory (relative path) where downloaded images are stored.
IMAGE_DIR = "../downloaded_images"
PAGES_PER_SITE = 100  # number of listing pages to crawl per base URL
# Request headers sent with every page and image request.
# NOTE(review): Referer points at a5c425.com while BASE_URLS use 594b43.com -
# confirm this mismatch is intended.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://a5c425.com/'
}

# Create the image output directory up front (no error if it already exists).
os.makedirs(IMAGE_DIR, exist_ok=True)

def decode_title(script_content):
    """Extract the encoded title payload from an inline title script.

    The page writes each title with ``document.write(d('<payload>'))``. The
    site's own decoding function ``d`` is not reproduced here, so the
    still-encoded payload is returned inside a placeholder.

    Args:
        script_content: Text of the ``<script>`` element. May be ``None`` or
            empty (the caller passes ``script_tag.string``).

    Returns:
        ``"[EncodedTitle:<payload>]"`` when the pattern is found, otherwise
        ``"[NoTitle]"``.
    """
    # re.search raises TypeError on None; treat a missing script body as
    # "no title" instead of letting the caller drop the whole image entry.
    if not script_content:
        return "[NoTitle]"

    pattern = r"document\.write\(d\('([^']+)'\)\)"
    match = re.search(pattern, script_content)
    if match:
        encoded_str = match.group(1)
        # The site's actual decoding function should be applied here.
        return f"[EncodedTitle:{encoded_str}]"
    return "[NoTitle]"

def extract_image_info(page_content, page_url):
    """Collect the title and image URL of every gallery entry on a page.

    Every ``<dl>`` element that contains an ``<img class="nature">`` with a
    ``data-original`` attribute yields one record. Titles are stripped of
    characters that are illegal in file names and cut to 100 characters.

    Args:
        page_content: HTML text of the listing page.
        page_url: URL the page was fetched from; used as the base for
            resolving relative image URLs.

    Returns:
        A list of ``{'title': str, 'url': str}`` dicts in document order.
        Entries that raise while being parsed are reported on stdout and
        skipped.
    """
    # Imported locally because this cell's import block does not provide it.
    from urllib.parse import urljoin

    soup = BeautifulSoup(page_content, 'html.parser')
    image_data = []

    for dl in soup.find_all('dl'):
        try:
            img_tag = dl.find('img', {'class': 'nature'})
            if not img_tag or 'data-original' not in img_tag.attrs:
                continue

            # data-original may be relative or protocol-relative; resolve it
            # against the page URL so it can be requested directly. Absolute
            # URLs pass through unchanged.
            img_url = urljoin(page_url, img_tag['data-original'])

            h3_tag = dl.find('h3')
            if h3_tag:
                script_tag = h3_tag.find('script')
                if script_tag:
                    # .string can be None; pass "" so the entry is kept with
                    # a "[NoTitle]" placeholder instead of being dropped.
                    title = decode_title(script_tag.string or "")
                else:
                    # Take the text between </script> and </h3> if present.
                    # NOTE(review): with no <script> child found, the
                    # serialized <h3> is unlikely to contain '</script>', so
                    # this usually falls through to get_text() - confirm.
                    h3_markup = str(h3_tag)
                    end_script = h3_markup.find('</script>')
                    if end_script != -1:
                        title_content = h3_markup[end_script + 9:]
                        title = title_content.replace('</h3>', '').strip()
                    else:
                        title = h3_tag.get_text().strip()
            else:
                title = "Untitled"

            # Remove characters that are not allowed in file names.
            safe_title = re.sub(r'[\\/*?:"<>|]', '', title)[:100]

            image_data.append({
                'title': safe_title,
                'url': img_url
            })
        except Exception as e:
            # Best effort: one malformed entry must not abort the page.
            print(f"Error processing element: {e}")

    return image_data

def download_image(url, save_path):
    """Download ``url`` to ``save_path`` unless that file already exists.

    The body is first written to ``<save_path>.part`` and renamed into place
    only after the write has completed. An interrupted write therefore never
    leaves a truncated file under the final name, which a later run would
    skip as "already exists".

    Args:
        url: Image URL to fetch.
        save_path: Destination file path.

    Returns:
        True if the file was already present or was downloaded successfully;
        False on a non-200 response or on any error (reported on stdout).
    """
    temp_path = f"{save_path}.part"
    try:
        # Skip files that were fetched by an earlier run.
        if os.path.exists(save_path):
            print(f"  Image already exists: {os.path.basename(save_path)}")
            return True

        response = requests.get(url, headers=HEADERS, timeout=15)
        if response.status_code != 200:
            print(f"  Failed to download: HTTP {response.status_code}")
            return False

        with open(temp_path, 'wb') as f:
            f.write(response.content)
        # Publish the finished file under its final name in one step.
        os.replace(temp_path, save_path)
        return True
    except Exception as e:
        print(f"  Failed to download {url}: {e}")
        # Best-effort cleanup of a partially written temporary file.
        try:
            if os.path.exists(temp_path):
                os.remove(temp_path)
        except OSError:
            pass
    return False

def get_url_hash(url):
    """Return the first 8 hex characters of the URL's MD5 digest, for use in file names."""
    digest = hashlib.md5(url.encode())
    hex_digest = digest.hexdigest()
    return hex_digest[:8]

def _image_extension(url):
    """Return the extension of the URL's path, or '.jpg' if missing or longer than 5 characters."""
    from urllib.parse import urlparse

    try:
        # Use only the path component so a query string or fragment
        # ("pic.jpg?v=1") does not end up inside the file name.
        ext = os.path.splitext(urlparse(url).path)[1]
    except ValueError:
        return '.jpg'
    if not ext or len(ext) > 5:
        return '.jpg'
    return ext


def scrape_website():
    """Crawl every base URL, record each image in the CSV and download it.

    For each entry of BASE_URLS, pages 1..PAGES_PER_SITE are fetched in
    order. For each image found, one row (title, URL) is appended to
    OUTPUT_CSV and the image is saved to IMAGE_DIR as
    ``site<i>_page<p>_<index>_<urlhash><ext>``. Errors on a page are printed
    and the crawl continues with the next page.
    """
    # Write the header only when the CSV does not exist yet, because the
    # file is opened in append mode and may hold rows from earlier runs.
    write_header = not os.path.exists(OUTPUT_CSV)

    with open(OUTPUT_CSV, 'a', newline='', encoding='utf-8-sig') as csvfile:
        fieldnames = ['名称', '地址']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        if write_header:
            writer.writeheader()

        for url_idx, base_url in enumerate(BASE_URLS):
            print(f"\n{'='*40}")
            print(f"Processing site #{url_idx+1}: {base_url}")
            print(f"{'='*40}")

            for page in range(1, PAGES_PER_SITE + 1):
                # The first page is the bare listing URL; later pages use index_<n>.html.
                if page == 1:
                    page_url = base_url
                else:
                    page_url = f"{base_url}index_{page}.html"

                print(f"\nProcessing page {page}: {page_url}")

                try:
                    response = requests.get(page_url, headers=HEADERS, timeout=20)
                    response.encoding = 'utf-8'

                    if response.status_code != 200:
                        # No `continue` here: the delay below must also run
                        # for failed pages, otherwise a blocked crawler
                        # retries without any pause.
                        print(f"  Failed to fetch page. Status code: {response.status_code}")
                    else:
                        page_images = extract_image_info(response.text, page_url)

                        for idx, img_info in enumerate(page_images):
                            writer.writerow({
                                '名称': img_info['title'],
                                '地址': img_info['url']
                            })

                            # Build a unique, stable file name for this image.
                            url_hash = get_url_hash(img_info['url'])
                            img_ext = _image_extension(img_info['url'])
                            img_name = f"site{url_idx}_page{page}_{idx}_{url_hash}{img_ext}"
                            img_path = os.path.join(IMAGE_DIR, img_name)

                            if download_image(img_info['url'], img_path):
                                print(f"  ✓ Downloaded: {img_name}")
                            else:
                                print(f"  ✕ Failed to download: {img_info['url']}")

                        print(f"  Found {len(page_images)} images on page {page}")

                except Exception as e:
                    # Best effort: a failing page must not stop the whole crawl.
                    print(f"  Error processing page: {str(e)[:100]}")

                # Pause between pages to avoid getting blocked.
                time.sleep(2.0)

    print(f"\n{'='*40}")
    print("Finished! All sites processed.")
    print(f"CSV file: {OUTPUT_CSV}")
    print(f"Images folder: {IMAGE_DIR}")
    print(f"{'='*40}")

# Start the crawl when this code is executed directly rather than imported.
if __name__ == "__main__":
    scrape_website()