In [1]:
import json
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Start a visible (non-headless) Edge session that the exploratory cells share.
service = Service(r"D:\TerrierClub\edgedriver_win64\msedgedriver.exe")

opts = Options()
# Headless mode is deliberately left disabled here:
# opts.add_argument("--headless")
for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
    opts.add_argument(flag)

driver = webdriver.Edge(service=service, options=opts)

In [3]:
# Build the organizations listing URL and open it in the shared browser session.
base_url = "https://terriercentral.bu.edu"
orgs_url = f"{base_url}/organizations"

print("Opening TerrierCentral...")
driver.get(orgs_url)

# Fixed 5-second pause after navigation (presumably to let the page finish rendering).
time.sleep(5)

Opening TerrierCentral...


In [5]:
import json

In [44]:
# # Click "Load More" repeatedly until no more clubs can be loaded
# print("Loading all clubs...")
# load_more_clicked = 0
#
# for click_attempt in range(50):
#     try:
#         button = WebDriverWait(driver, 5).until(
#             EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Load More')]"))
#         )
#
#         driver.execute_script("arguments[0].scrollIntoView(true);", button)
#         time.sleep(1)
#         driver.execute_script("arguments[0].click();", button)
#         load_more_clicked += 1
#         print(f"Clicked 'Load More' {load_more_clicked} times")
#         time.sleep(3)
#
#     except Exception as e:
#         print("No more Load More buttons found")
#         break
#
# print(f"Finished loading after {load_more_clicked} clicks")
#
#
# print("Collecting all organization links...")
# all_links = driver.find_elements(By.TAG_NAME, "a")
# org_urls = []
#
# for link in all_links:
#     try:
#         href = link.get_attribute("href")
#         if href and '/organization/' in href and href not in org_urls:
#             org_urls.append(href)
#     except:
#         continue
#
# # Deduplicate and sort the collected links
# org_urls = sorted(list(set(org_urls)))
# print(f"Found {len(org_urls)} unique organization links")

Loading all clubs...
Clicked 'Load More' 1 times
Clicked 'Load More' 2 times
Clicked 'Load More' 3 times
Clicked 'Load More' 4 times


KeyboardInterrupt: 

In [11]:
# Read the saved list of organization page URLs back from disk.
with open('club_links.json', 'r') as f:
    org_urls = json.loads(f.read())
print(f"Loaded {len(org_urls)} club links")

def setup_driver(driver_path=r"D:\TerrierClub\edgedriver_win64\msedgedriver.exe", headless=True):
    """Create a Microsoft Edge WebDriver session.

    Args:
        driver_path: Filesystem path of the ``msedgedriver`` executable. The
            default is the location this notebook was originally run with.
        headless: When true (the default), pass ``--headless`` to Edge.

    Returns:
        The ``webdriver.Edge`` instance. The caller is responsible for calling
        ``quit()`` on it.
    """
    opts = Options()
    if headless:
        opts.add_argument("--headless")
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    service = Service(driver_path)
    return webdriver.Edge(service=service, options=opts)

def extract_contact_name(org_data):
    """Return the primary contact's full name for an organization record.

    Args:
        org_data: Organization dict; its ``primaryContact`` entry (if any) is
            read for ``firstName`` and ``lastName``.

    Returns:
        ``"<first> <last>"`` with surrounding whitespace stripped, or ``""``
        when neither name part is available.
    """
    # `or {}` / `or ''` also cover keys that are present but null, which
    # `.get(key, default)` alone would pass through as None.
    primary_contact = org_data.get('primaryContact') or {}
    first_name = primary_contact.get('firstName') or ''
    last_name = primary_contact.get('lastName') or ''

    return f"{first_name} {last_name}".strip()

def scrape_club_info(url, max_attempts=3):
    """Scrape one organization page and return its name, email and contact name.

    Each attempt launches a fresh browser via ``setup_driver()``, loads ``url``
    and reads the ``window.initialAppState`` JSON object embedded in the page
    source.

    Args:
        url: Address of the organization page to load.
        max_attempts: Maximum number of attempts before giving up.

    Returns:
        dict with keys ``name``, ``url``, ``email``, ``contact_name``,
        ``attempts`` and ``status``. ``status`` is ``"success"`` when the
        embedded state was parsed, ``"no_data"`` when the page contains no
        ``window.initialAppState`` object, and ``"error"`` when every attempt
        raised an exception (or no attempt was made).
    """
    attempts_made = 0

    for attempt in range(max_attempts):
        attempts_made = attempt + 1
        try:
            driver = None
            try:
                driver = setup_driver()
                driver.get(url)
                time.sleep(1)
                page_source = driver.page_source
            finally:
                # Release the browser on every exit path, including
                # KeyboardInterrupt, so interrupted runs do not leak processes.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception:
                        # Best effort: a failed shutdown must not discard
                        # a page that was already fetched.
                        pass

            # Locate the assignment, then let the JSON decoder find the end of
            # the object. A regex such as `{.*?};` stops at the first "};",
            # which may sit inside a string value and truncate the JSON.
            marker = re.search(r'window\.initialAppState\s*=\s*(?=\{)', page_source)
            if marker is None:
                return {
                    "name": "No data found",
                    "url": url,
                    "email": "",
                    "contact_name": "",
                    "attempts": attempts_made,
                    "status": "no_data"
                }

            app_state, _ = json.JSONDecoder().raw_decode(page_source[marker.end():])

            # `or {}` tolerates keys that are present but null.
            org_data = (app_state.get('preFetchedData') or {}).get('organization') or {}

            return {
                "name": org_data.get('name', ''),
                "url": url,
                "email": org_data.get('email', ''),
                "contact_name": extract_contact_name(org_data),
                "attempts": attempts_made,
                "status": "success"
            }

        except Exception:
            if attempts_made < max_attempts:
                print(f"  尝试 {attempt + 1} 失败，重试中...")
                time.sleep(2)

    # Every attempt failed (or max_attempts < 1): still return a record so
    # callers can index the result.
    return {
        "name": "ERROR",
        "url": url,
        "email": "",
        "contact_name": "",
        "attempts": attempts_made,
        "status": "error"
    }

Loaded 448 club links


In [12]:
# Scrape every club page, collecting one result record per URL.
data = []
total_clubs = len(org_urls)

for i, url in enumerate(org_urls):
    print(f"Processing {i+1}/{total_clubs}: {url}")

    club_data = scrape_club_info(url, max_attempts=3)
    data.append(club_data)

    # Mention clubs that needed more than one attempt.
    if club_data['attempts'] > 1:
        print(f"  经过 {club_data['attempts']} 次尝试，状态: {club_data['status']}")

    # Nothing more to report unless a multiple of 100 clubs is done.
    if (i + 1) % 100 != 0:
        continue

    print(f"=== 已完成 {i+1}/{total_clubs} 个俱乐部 ===")
    success_count = sum(d['status'] == 'success' for d in data)
    contact_count = sum(d['contact_name'] != '' for d in data)
    print(f"成功: {success_count}, 有联系人: {contact_count}")

# Write all collected records to CSV.
df = pd.DataFrame(data)
df.to_csv("terrier_clubs_contacts.csv", index=False, encoding='utf-8')
print(f"\n数据已保存到 terrier_clubs_contacts.csv")

# Final tallies by status, plus how many records carry a contact name.
success_count = sum(d['status'] == 'success' for d in data)
error_count = sum(d['status'] == 'error' for d in data)
no_data_count = sum(d['status'] == 'no_data' for d in data)
contact_count = sum(d['contact_name'] != '' for d in data)

print(f"\n=== 最终统计 ===")
print(f"总俱乐部: {total_clubs}")
print(f"成功爬取: {success_count}")
print(f"有联系人姓名: {contact_count}")
print(f"无数据: {no_data_count}")
print(f"错误: {error_count}")

Processing 1/448: https://terriercentral.bu.edu/organization/180-degrees-consulting
Processing 2/448: https://terriercentral.bu.edu/organization/3hclub
Processing 3/448: https://terriercentral.bu.edu/organization/54-3-47-570-3-550-470
Processing 4/448: https://terriercentral.bu.edu/organization/above-the-haze-a-chi-alpha-campus-ministry
Processing 5/448: https://terriercentral.bu.edu/organization/accounting-association
Processing 6/448: https://terriercentral.bu.edu/organization/adclub
Processing 7/448: https://terriercentral.bu.edu/organization/adoptedandmulticultralstudentalliance
Processing 8/448: https://terriercentral.bu.edu/organization/african-students-organization
Processing 9/448: https://terriercentral.bu.edu/organization/afrithms
Processing 10/448: https://terriercentral.bu.edu/organization/aisociety
Processing 11/448: https://terriercentral.bu.edu/organization/alc
Processing 12/448: https://terriercentral.bu.edu/organization/alexander-hamilton-society
Processing 13/448: htt

In [9]:
import re
# Read the saved list of organization page URLs back from disk.
with open('club_links.json', 'r') as f:
    org_urls = json.loads(f.read())

print(f"Loaded {len(org_urls)} club links")

def setup_driver(driver_path=r"D:\TerrierClub\edgedriver_win64\msedgedriver.exe", headless=True):
    """Create a Microsoft Edge WebDriver session.

    Args:
        driver_path: Filesystem path of the ``msedgedriver`` executable. The
            default is the location this notebook was originally run with.
        headless: When true (the default), pass ``--headless`` to Edge.

    Returns:
        The ``webdriver.Edge`` instance. The caller is responsible for calling
        ``quit()`` on it.
    """
    opts = Options()
    if headless:
        opts.add_argument("--headless")
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    service = Service(driver_path)
    return webdriver.Edge(service=service, options=opts)

def extract_social_media(social_media):
    """Collect the non-empty social media links of an organization.

    Args:
        social_media: Dict that may contain ``externalWebsite``,
            ``instagramUrl``, ``facebookUrl``, ``twitterUrl`` and
            ``youtubeUrl``. ``None`` is treated as an empty dict.

    Returns:
        dict mapping ``website`` / ``instagram`` / ``facebook`` / ``twitter`` /
        ``youtube`` to the link, containing only platforms whose value is
        truthy.
    """
    # Tolerate a missing or null social-media record instead of raising.
    social_media = social_media or {}

    field_to_platform = (
        ('externalWebsite', 'website'),
        ('instagramUrl', 'instagram'),
        ('facebookUrl', 'facebook'),
        ('twitterUrl', 'twitter'),
        ('youtubeUrl', 'youtube'),
    )
    return {
        platform: social_media[field]
        for field, platform in field_to_platform
        if social_media.get(field)
    }

def clean_html_description(html_text):
    """Strip HTML tags and decode entities to get a plain-text description.

    Args:
        html_text: HTML fragment; ``None`` or an empty string yields ``""``.

    Returns:
        The text with tags removed, HTML entities decoded and every run of
        whitespace collapsed to a single space.
    """
    # Local import so this function does not depend on a module-level name
    # `html` that another cell could rebind.
    from html import unescape

    if not html_text:
        return ""

    clean_text = re.sub(r'<[^<]+?>', '', html_text)
    # Keep mapping &rsquo; to an ASCII apostrophe, as this function always has.
    clean_text = clean_text.replace('&rsquo;', "'")
    # Decode every remaining entity (&amp;, &quot;, &#39;, ...). &nbsp; becomes
    # U+00A0, which str.split() treats as whitespace below.
    clean_text = unescape(clean_text)
    return ' '.join(clean_text.split())


Loaded 448 club links


In [38]:
def scrape_club_info(url, max_attempts=3):
    """Scrape one organization page, retrying on failure.

    Each attempt launches a fresh browser via ``setup_driver()``, loads ``url``
    and reads the ``window.initialAppState`` JSON object embedded in the page
    source.

    Args:
        url: Address of the organization page to load.
        max_attempts: Maximum number of attempts before giving up.

    Returns:
        dict with keys ``name``, ``url``, ``email``, ``description``,
        ``website``, ``instagram``, ``facebook``, ``twitter``, ``youtube``,
        ``attempts`` and ``status``. ``status`` is ``"success"`` when the
        embedded state was parsed, ``"no_data"`` when the page contains no
        ``window.initialAppState`` object, and ``"error"`` when every attempt
        raised an exception (or no attempt was made); in the error case
        ``description`` carries the last error message.
    """
    attempts_made = 0
    last_error = "no attempt was made (max_attempts < 1)"

    for attempt in range(max_attempts):
        attempts_made = attempt + 1
        try:
            driver = None
            try:
                driver = setup_driver()
                driver.get(url)
                time.sleep(1)
                page_source = driver.page_source
            finally:
                # Release the browser on every exit path, including
                # KeyboardInterrupt, so interrupted runs do not leak processes.
                if driver is not None:
                    try:
                        driver.quit()
                    except Exception:
                        # Best effort: a failed shutdown must not discard
                        # a page that was already fetched.
                        pass

            # Locate the assignment, then let the JSON decoder find the end of
            # the object. A regex such as `{.*?};` stops at the first "};",
            # which may sit inside a string value and truncate the JSON.
            marker = re.search(r'window\.initialAppState\s*=\s*(?=\{)', page_source)
            if marker is None:
                return {
                    "name": "No data found",
                    "url": url,
                    "email": "",
                    "description": "",
                    "website": "",
                    "instagram": "",
                    "facebook": "",
                    "twitter": "",
                    "youtube": "",
                    "attempts": attempts_made,
                    "status": "no_data"
                }

            app_state, _ = json.JSONDecoder().raw_decode(page_source[marker.end():])

            # `or {}` tolerates keys that are present but null.
            org_data = (app_state.get('preFetchedData') or {}).get('organization') or {}
            socials = extract_social_media(org_data.get('socialMedia') or {})

            return {
                "name": org_data.get('name', ''),
                "url": url,
                "email": org_data.get('email', ''),
                "description": clean_html_description(org_data.get('description', '')),
                "website": socials.get('website', ''),
                "instagram": socials.get('instagram', ''),
                "facebook": socials.get('facebook', ''),
                "twitter": socials.get('twitter', ''),
                "youtube": socials.get('youtube', ''),
                "attempts": attempts_made,
                "status": "success"
            }

        except Exception as e:
            # Keep the error: `e` is unbound once this except block ends.
            last_error = e
            if attempts_made < max_attempts:
                print(f"  尝试 {attempt + 1} 失败，重试中...")
                time.sleep(2)  # pause before retrying

    # Every attempt failed (or max_attempts < 1): still return a record so
    # callers can index the result.
    return {
        "name": "ERROR",
        "url": url,
        "email": "",
        "description": f"Error: {str(last_error)}",
        "website": "",
        "instagram": "",
        "facebook": "",
        "twitter": "",
        "youtube": "",
        "attempts": attempts_made,
        "status": "error"
    }

In [39]:
# Scrape every club page, collecting one result record per URL.
data = []
total_clubs = len(org_urls)

for i, url in enumerate(org_urls):
    print(f"Processing {i+1}/{total_clubs}: {url}")

    club_data = scrape_club_info(url, max_attempts=3)
    data.append(club_data)

    # Mention clubs that needed more than one attempt.
    if club_data['attempts'] > 1:
        print(f"  经过 {club_data['attempts']} 次尝试，状态: {club_data['status']}")

    # Nothing more to report unless a multiple of 100 clubs is done.
    if (i + 1) % 100 != 0:
        continue

    print(f"=== 已完成 {i+1}/{total_clubs} 个俱乐部 ===")
    success_count = sum(d['status'] == 'success' for d in data)
    error_count = sum(d['status'] == 'error' for d in data)
    no_data_count = sum(d['status'] == 'no_data' for d in data)
    print(f"成功: {success_count}, 无数据: {no_data_count}, 错误: {error_count}")

# Write all collected records to CSV.
df = pd.DataFrame(data)
df.to_csv("all_terrier_clubs.csv", index=False, encoding='utf-8')
print(f"\nSave to all_terrier_clubs.csv")

# Final tallies by status.
success_count = sum(d['status'] == 'success' for d in data)
error_count = sum(d['status'] == 'error' for d in data)
no_data_count = sum(d['status'] == 'no_data' for d in data)

print(f"\n=== 最终统计 ===")
print(f"总俱乐部: {total_clubs}")
print(f"成功爬取: {success_count}")
print(f"无数据: {no_data_count}")
print(f"错误: {error_count}")

# Histogram: number of attempts used -> number of clubs.
retry_stats = {}
for d in data:
    attempts = d.get('attempts', 1)
    retry_stats.setdefault(attempts, 0)
    retry_stats[attempts] += 1

print(f"\n=== 重试统计 ===")
for attempts, count in sorted(retry_stats.items()):
    print(f"{attempts} 次尝试: {count} 个俱乐部")

# Per-platform coverage among the successfully scraped clubs.
if success_count > 0:
    success_data = [d for d in data if d['status'] == 'success']
    success_df = pd.DataFrame(success_data)

    print(f"\n=== 社交媒体统计 ===")
    social_platforms = ['website', 'instagram', 'facebook', 'twitter', 'youtube']
    for platform in social_platforms:
        count = (success_df[platform].str.len() > 0).sum()
        print(f"{platform}: {count} 个俱乐部")

Processing 1/448: https://terriercentral.bu.edu/organization/180-degrees-consulting
Processing 2/448: https://terriercentral.bu.edu/organization/3hclub
Processing 3/448: https://terriercentral.bu.edu/organization/54-3-47-570-3-550-470
Processing 4/448: https://terriercentral.bu.edu/organization/above-the-haze-a-chi-alpha-campus-ministry
Processing 5/448: https://terriercentral.bu.edu/organization/accounting-association
Processing 6/448: https://terriercentral.bu.edu/organization/adclub
Processing 7/448: https://terriercentral.bu.edu/organization/adoptedandmulticultralstudentalliance
Processing 8/448: https://terriercentral.bu.edu/organization/african-students-organization
Processing 9/448: https://terriercentral.bu.edu/organization/afrithms
Processing 10/448: https://terriercentral.bu.edu/organization/aisociety
Processing 11/448: https://terriercentral.bu.edu/organization/alc
Processing 12/448: https://terriercentral.bu.edu/organization/alexander-hamilton-society
Processing 13/448: htt