In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random

# --------------------------
# 1. 核心函数：用逗号分割提取核心大学名
# --------------------------
def clean_by_comma(raw_name):
    """Return the core institution name from a comma-separated affiliation.

    The raw string is split on commas, blank segments are discarded, and the
    last remaining segment is returned (earlier segments are treated as
    department/faculty prefixes).

    Args:
        raw_name: Affiliation text, e.g.
            "Department of Economics, Harvard University".

    Returns:
        The last non-empty comma-separated segment, stripped of surrounding
        whitespace. If no segment has any content (e.g. "" or ","), the
        stripped input is returned unchanged.
    """
    raw_name = raw_name.strip()
    comma_parts = [part.strip() for part in raw_name.split(",") if part.strip()]
    # Taking the last non-empty segment also covers the single-segment case,
    # so a stray leading/trailing comma ("Harvard University,") is dropped
    # instead of being returned as part of the name.
    if comma_parts:
        return comma_parts[-1]
    return raw_name

# --------------------------
# 2. 抓取网页并提取数据
# --------------------------
url = "https://ideas.repec.org/top/top.econdept.html"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
}

# Download the page; any HTTP error status aborts the cell immediately.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# The main ranking table is the first one whose <thead> has a header cell
# containing the word "rank" (case-insensitive).
target_table = None
for candidate in soup.find_all("table"):
    header = candidate.find("thead")
    if not header:
        continue
    header_texts = (th.get_text(strip=True).lower() for th in header.find_all("th"))
    if any("rank" in text for text in header_texts):
        target_table = candidate
        break
if not target_table:
    raise Exception("未找到主排名表格，可能网站结构已更新")

# Collect the first 90 institutions: take the link text in the second cell of
# each data row and reduce it to the core name with clean_by_comma().
universities = []
rows = target_table.find_all("tr")
for row in rows:
    cells = row.find_all("td")
    if len(cells) < 3:
        # Not a data row (needs at least three cells).
        continue
    link = cells[1].find("a")
    if not link:
        continue
    universities.append(clean_by_comma(link.get_text(strip=True)))
    if len(universities) >= 90:
        break

print(f"✅ 成功提取并清理 {len(universities)} 所大学名称")

# --------------------------
# 3. 按要求抽样（前30抽10、中30抽10、后30抽10）
# --------------------------
def safe_sample(data, sample_size):
    """Randomly pick sample_size items from data without replacement.

    When data holds fewer than sample_size items, a warning is printed and
    the input list itself is returned unchanged (no copy is made).
    """
    if len(data) < sample_size:
        print(f"⚠️ 某区间仅{len(data)}所大学，已返回全部")
        return data
    return random.sample(data, sample_size)

# Split the ranked list into three consecutive tiers of 30 names each.
top30 = universities[:30]
middle30 = universities[30:60]
bottom30 = universities[60:90]

# Draw 10 names from every tier (top, then middle, then bottom).
selected_top = safe_sample(top30, 10)
selected_middle = safe_sample(middle30, 10)
selected_bottom = safe_sample(bottom30, 10)

# --------------------------
# 4. Write the Excel file (all columns must have the same length)
# --------------------------
max_len = max(map(len, (selected_top, selected_middle, selected_bottom)))


def fill_empty(lst):
    """Return lst padded with empty strings up to max_len entries."""
    return lst + [""] * (max_len - len(lst))


# One column per tier.
result_data = {
    "Top 30 Selected": fill_empty(selected_top),
    "Middle 30 Selected": fill_empty(selected_middle),
    "Bottom 30 Selected": fill_empty(selected_bottom)
}

# Save as Excel without the index column.
df = pd.DataFrame(result_data)
df.to_excel("universiti.xlsx", index=False)
print("✅ 文件已保存为 universiti.xlsx")

# Show the first five picks of each tier.
print("\n📌 清理后结果示例（前5个）：")
print("Top 30 Selected:", selected_top[:5])
print("Middle 30 Selected:", selected_middle[:5])
print("Bottom 30 Selected:", selected_bottom[:5])

✅ 成功提取并清理 90 所大学名称
✅ 文件已保存为 universiti.xlsx

📌 清理后结果示例（前5个）：
Top 30 Selected: ['Brown University', 'Stanford University', 'University College London (UCL)', 'Boston University', 'Paris School of Economics']
Middle 30 Selected: ['Centro de Estudios Monetarios y Financieros (CEMFI)', 'Vrije Universiteit Amsterdam', 'University of California-Davis', 'Johns Hopkins University', 'Erasmus Universiteit Rotterdam']
Bottom 30 Selected: ['New York University (NYU)', 'Universitat de Barcelona', 'Washington University in St. Louis', 'UNSW Sydney', 'University of California-Santa Barbara (UCSB)']


In [13]:
url = "https://www.scmor.com/view/10554"
# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() reports HTTP errors here instead of letting an error
# page reach the parser below.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "lxml")
# The headings are nested, so match on the full text of each <h3>.
title = soup.find(lambda t: t.name == "h3" and "ABS4*" in t.get_text(strip=True))
if title is None:
    raise Exception("未找到包含 ABS4* 的 h3 标题，可能网站结构已更新")
area_table = title.find_next("table")
if area_table is None:
    raise Exception("未找到 ABS4* 标题之后的表格，可能网站结构已更新")
rows = []
for tr in area_table.select("tr"):
    tds = tr.select("td")
    # Only rows with at least three cells carry data; the second cell is
    # stored as the area and the third as the journal name.
    if len(tds) >= 3:
        area_text = tds[1].get_text(strip=True)
        journal_text = tds[2].get_text(strip=True)
        rows.append({"Area": area_text, "Journal": journal_text})
if not rows:
    # Without this guard sort_values("Area") fails with a KeyError on an
    # empty, column-less DataFrame.
    raise Exception("ABS4* 表格中未解析到任何期刊数据，可能网站结构已更新")

# Assemble the rows into a DataFrame ordered by area.
area_journal_df = pd.DataFrame(rows).sort_values("Area")
area_journal_df
# Make sure the dependencies are available (if not installed, run: pip install pandas openpyxl)
import pandas as pd

# --------------------------
# 1. Rebuild area_journal_df from the earlier run's output
#    (can be skipped if the DataFrame already exists in the environment)
# --------------------------
# Core data copied from the earlier output (Area = English area abbreviation, Journal = journal name)
data = [
    {"Area": "ACCOUNT", "Journal": "Accounting Review"},
    {"Area": "ACCOUNT", "Journal": "Accounting, Organizations and Society"},
    {"Area": "ACCOUNT", "Journal": "Journal of Accounting and Economics"},
    {"Area": "ACCOUNT", "Journal": "Journal of Accounting Research"},
    {"Area": "ECON", "Journal": "American Economic Review"},
    {"Area": "ECON", "Journal": "Annals of Statistics"},
    {"Area": "ECON", "Journal": "Econometrica"},
    {"Area": "ECON", "Journal": "Journal of Political Economy"},
    {"Area": "ECON", "Journal": "Quarterly Journal of Economics"},
    {"Area": "ECON", "Journal": "Review of Economic Studies"},
    {"Area": "ENT-SBM", "Journal": "Entrepreneurship Theory and Practice"},
    {"Area": "ENT-SBM", "Journal": "Journal of Business Venturing"},
    {"Area": "ETHICS-CSR-MAN", "Journal": "Academy of Management Annals"},
    {"Area": "ETHICS-CSR-MAN", "Journal": "Academy of Management Journal"},
    {"Area": "ETHICS-CSR-MAN", "Journal": "Academy of Management Review"},
    {"Area": "ETHICS-CSR-MAN", "Journal": "Administrative Science Quarterly"},
    {"Area": "ETHICS-CSR-MAN", "Journal": "Journal of Management"},
    {"Area": "FINANCE", "Journal": "Journal of Finance"},
    {"Area": "FINANCE", "Journal": "Journal of Financial Economics"},
    {"Area": "FINANCE", "Journal": "Review of Financial Studies"},
    {"Area": "HRM&EMP", "Journal": "Human Resource Management Journal (UK)"},
    {"Area": "IB&AREA", "Journal": "Journal of International Business Studies"},
    {"Area": "INFO MAN", "Journal": "Information Systems Research"},
    {"Area": "INFO MAN", "Journal": "Journal of the Association for Information Systems"},
    {"Area": "INFO MAN", "Journal": "MIS Quarterly: Management Information Systems"},
    {"Area": "INNOV", "Journal": "Research Policy"},
    {"Area": "MDEV&EDU", "Journal": "Academy of Management Learning and Education"},
    {"Area": "MKT", "Journal": "Journal of Consumer Psychology"},
    {"Area": "MKT", "Journal": "Journal of Consumer Research"},
    {"Area": "MKT", "Journal": "Journal of Marketing"},
    {"Area": "MKT", "Journal": "Journal of Marketing Research"},
    {"Area": "MKT", "Journal": "Journal of the Academy of Marketing Science"},
    {"Area": "MKT", "Journal": "Marketing Science"},
    {"Area": "OPS&TECH", "Journal": "Journal of Operations Management"},
    {"Area": "OR&MANSCI", "Journal": "Management Science"},
    {"Area": "OR&MANSCI", "Journal": "Operations Research"},
    {"Area": "ORG STUD", "Journal": "Organization Science"},
    {"Area": "PSYCH (GENERAL)", "Journal": "Psychological Science"},
    {"Area": "PSYCH (WOP-OB)", "Journal": "Journal of Applied Psychology"},
    {"Area": "PSYCH (WOP-OB)", "Journal": "Personnel Psychology"},
    {"Area": "PUB SEC", "Journal": "Public Administration Review"},
    {"Area": "SOC SCI", "Journal": "American Journal of Political Science"},
    {"Area": "SOC SCI", "Journal": "American Journal of Sociology"},
    {"Area": "SOC SCI", "Journal": "American Political Science Review"},
    {"Area": "SOC SCI", "Journal": "American Sociological Review"},
    {"Area": "SOC SCI", "Journal": "Annual Review of Sociology"},
    {"Area": "STRAT", "Journal": "Strategic Management Journal"}
]
# Sort by area and renumber the index from 0; note this overwrites any
# area_journal_df already present in the session.
area_journal_df = pd.DataFrame(data).sort_values("Area").reset_index(drop=True)
print("✅ 重建area_journal_df成功，数据预览：")
print(area_journal_df.head(10))


# --------------------------
# 2. Target research fields (Chinese label -> area abbreviation used on the site)
# --------------------------
# Fields of interest: economics, finance, information management.
target_fields = {
    "经济学": "ECON",
    "金融学": "FINANCE",
    "信息管理学": "INFO MAN"
}

# For every target field keep at most the first three journals of that area,
# producing one record per (field, journal) pair with both labels attached.
final_data = [
    {
        "中文研究领域": field_label,
        "英文领域缩写": area_code,
        "ABS 4*顶级期刊": journal_name
    }
    for field_label, area_code in target_fields.items()
    for journal_name in area_journal_df.loc[
        area_journal_df["Area"] == area_code, "Journal"
    ].head(3)
]

# Convert the records into a DataFrame and show it.
final_df = pd.DataFrame(final_data)
print("\n✅ 筛选后的数据（目标领域+前3本期刊）：")
print(final_df)


# --------------------------
# 3. Write the Excel file
# --------------------------
excel_path = "research_fields_top_journals_final.xlsx"
# No index column; openpyxl is the engine used for the .xlsx output.
final_df.to_excel(excel_path, index=False, engine="openpyxl")

print(f"\n✅ Excel文件已生成：{excel_path}")
print("\n📊 Excel文件结构说明：")
for description_line in (
    "- 中文研究领域：经济学、金融学、信息管理学（用户关注的领域）",
    "- 英文领域缩写：对应网站的领域标识（ECON/FINANCE/INFO MAN）",
    "- ABS 4*顶级期刊：每个领域的前3本超四星级期刊",
):
    print(description_line)

✅ 重建area_journal_df成功，数据预览：
      Area                                Journal
0  ACCOUNT                      Accounting Review
1  ACCOUNT  Accounting, Organizations and Society
2  ACCOUNT    Journal of Accounting and Economics
3  ACCOUNT         Journal of Accounting Research
4     ECON               American Economic Review
5     ECON                   Annals of Statistics
6     ECON                           Econometrica
7     ECON           Journal of Political Economy
8     ECON         Quarterly Journal of Economics
9     ECON             Review of Economic Studies

✅ 筛选后的数据（目标领域+前3本期刊）：
  中文研究领域    英文领域缩写                                         ABS 4*顶级期刊
0    经济学      ECON                           American Economic Review
1    经济学      ECON                               Annals of Statistics
2    经济学      ECON                                       Econometrica
3    金融学   FINANCE                                 Journal of Finance
4    金融学   FINANCE                     Journal of

In [17]:
# 确保安装依赖（若未安装：pip install pandas openpyxl）
import pandas as pd

# --------------------------
# 1. File paths (adjust to your own files)
# --------------------------
# Excel produced by step 4 (input).
step4_excel_path = "research_fields_top_journals_final.xlsx"
# Final output of step 6 (same data plus the skills column).
output_excel_path = "research_fields_journals_skills_final.xlsx"

# --------------------------
# 2. Hand-written field -> skills mapping
# --------------------------
# Skills drawn from: Python, C++, SQL, R, PyTorch, Math
field_skills_map = {
    "经济学": "R, SQL, Python, Math",          # economics: statistics tools + data handling + maths
    "金融学": "SQL, Python, Math, C++",        # finance: data queries + quantitative analysis + low-level development
    "信息管理学": "Python, SQL, PyTorch, R"    # information management: programming + machine learning + databases
}

# --------------------------
# 3. Read the Excel file and add the skills column
# --------------------------
try:
    # Load the step-4 workbook.
    df = pd.read_excel(step4_excel_path, engine="openpyxl")

    # The field column must be present, otherwise the file layout is wrong.
    required_col = "中文研究领域"
    column_missing = required_col not in df.columns
    if column_missing:
        raise Exception(f"步骤4的Excel缺少必要列：{required_col}，请确认文件包含该列")

    # Map each field to its skills; fields without a mapping get the default
    # skill set. The result is assigned back instead of using inplace=True.
    skill_col = "岗位核心技能（手动添加）"
    df[skill_col] = df[required_col].map(field_skills_map).fillna("Python, SQL, Math")

    # Write the final workbook.
    df.to_excel(output_excel_path, index=False, engine="openpyxl")

    # Preview the first six rows of the key columns.
    print(f"✅ 技能添加完成！文件保存至：{output_excel_path}")
    print("\n📊 数据预览（前6行，关键列）：")
    preview_cols = [required_col, "ABS 4*顶级期刊", skill_col]
    print(df[preview_cols].head(6).to_string(index=False))

except FileNotFoundError:
    print(f"❌ 未找到步骤4的Excel文件！当前路径：{step4_excel_path}")
    print("💡 提示：请修改代码中`step4_excel_path`为你的实际文件路径（例：'D:/我的文件/step4.xlsx'）")

except Exception as e:
    print(f"❌ 执行失败：{str(e)}")

✅ 技能添加完成！文件保存至：research_fields_journals_skills_final.xlsx

📊 数据预览（前6行，关键列）：
中文研究领域                     ABS 4*顶级期刊           岗位核心技能（手动添加）
   经济学       American Economic Review   R, SQL, Python, Math
   经济学           Annals of Statistics   R, SQL, Python, Math
   经济学                   Econometrica   R, SQL, Python, Math
   金融学             Journal of Finance SQL, Python, Math, C++
   金融学 Journal of Financial Economics SQL, Python, Math, C++
   金融学    Review of Financial Studies SQL, Python, Math, C++


In [19]:
# 1. 导入依赖库
import os
import pandas as pd
from docxtpl import DocxTemplate
from docx2pdf import convert
from datetime import datetime
import shutil

# 2. Key settings (⚠️ adjust the paths to your local environment!)
CONFIG = {
    # File paths
    "uni_excel_path": "universiti.xlsx",  # Excel file with the 30 selected universities
    "research_excel_path": "research_fields_journals_skills_final.xlsx",  # field / top-journal / skills Excel
    "template_path": "application_template.docx",  # application letter template
    # Target research field (⚠️ pick the field you apply for, e.g. "经济学"/"金融学"/"信息管理学")
    "target_research_field": "经济学",
    # Example research topic per field (⚠️ customise per field to make the letter more specific)
    "research_topic_map": {
        "经济学": "收入不平等对长期经济增长的传导机制",
        "金融学": "高频金融数据下的风险对冲策略优化",
        "信息管理学": "机器学习在企业信息系统效率提升中的应用"
    },
    # Output directory (HW_School_Application under the user's home directory)
    "output_dir": os.path.join(os.path.expanduser("~"), "HW_School_Application")
}

# 3. 工具函数：读取Excel数据
def load_excel_data():
    """Load the university list and the target field's journal/skill data.

    File paths and the target field are read from the module-level CONFIG.

    The university workbook may either contain a single
    "Selected_Universities" column or, as written by the sampling step of
    this notebook, several tier columns ("Top 30 Selected", ...). In the
    latter case the names of all columns are concatenated column by column.

    Returns:
        tuple: (universities, research_info). universities is a list of
        school names; research_info is a dict with the keys
        "research_field", "top_journals", "skills" and
        "research_topic_example".

    Raises:
        FileNotFoundError: If either Excel file does not exist.
        Exception: If a workbook cannot be read, a required column is
            missing, or the configured field has no matching rows or no
            entry in the topic map. The original error is chained as the
            cause.
    """
    # 3.1 University list
    try:
        uni_df = pd.read_excel(CONFIG["uni_excel_path"], engine="openpyxl")
        if "Selected_Universities" in uni_df.columns:
            universities = uni_df["Selected_Universities"].dropna().tolist()
        elif len(uni_df.columns) == 0:
            raise Exception(f"大学Excel缺少必填列：Selected_Universities，请检查列名")
        else:
            # universiti.xlsx is written with one column per tier, so fall
            # back to collecting the non-empty cells of every column.
            print(f"⚠️  未找到列 Selected_Universities，改为合并所有列：{list(uni_df.columns)}")
            universities = [
                name
                for col in uni_df.columns
                for name in uni_df[col].dropna().tolist()
            ]
        if len(universities) != 30:
            print(f"⚠️  大学数量为{len(universities)}（建议30所），将继续生成")
        print(f"✅ 成功读取{len(universities)}所大学")
    except FileNotFoundError as e:
        raise FileNotFoundError(f"大学Excel文件未找到：{CONFIG['uni_excel_path']}") from e
    except Exception as e:
        raise Exception(f"读取大学Excel失败：{str(e)}") from e

    # 3.2 Research-field data, restricted to the configured target field
    try:
        research_df = pd.read_excel(CONFIG["research_excel_path"], engine="openpyxl")
        required_cols = ["中文研究领域", "ABS 4*顶级期刊", "岗位核心技能（手动添加）"]
        for col in required_cols:
            if col not in research_df.columns:
                raise Exception(f"领域Excel缺少必填列：{col}，请检查列名")

        target_field_data = research_df[research_df["中文研究领域"] == CONFIG["target_research_field"]]
        if target_field_data.empty:
            raise Exception(f"领域Excel中未找到'{CONFIG['target_research_field']}'，请检查领域名称")

        # De-duplicate journals and skills and join each into one
        # comma-separated string; astype(str) keeps join() from failing on
        # non-text cells.
        top_journals = ", ".join(target_field_data["ABS 4*顶级期刊"].dropna().astype(str).unique())
        skills = ", ".join(target_field_data["岗位核心技能（手动添加）"].dropna().astype(str).unique())
        research_info = {
            "research_field": CONFIG["target_research_field"],
            "top_journals": top_journals,
            "skills": skills,
            "research_topic_example": CONFIG["research_topic_map"][CONFIG["target_research_field"]]
        }
        print(f"✅ 成功读取{CONFIG['target_research_field']}领域数据：{top_journals[:50]}...")
    except FileNotFoundError as e:
        raise FileNotFoundError(f"领域Excel文件未找到：{CONFIG['research_excel_path']}") from e
    except Exception as e:
        raise Exception(f"读取领域Excel失败：{str(e)}") from e

    return universities, research_info

# 4. 步骤7-8：循环填充模板+生成Word文档（docxtpl）
def generate_word_documents(universities, research_info):
    """Render one Word application letter per university with docxtpl.

    Args:
        universities: Iterable of school names; each becomes the "school"
            placeholder of one document.
        research_info: Dict providing "research_field", "top_journals",
            "research_topic_example" and "skills" for the template.

    Returns:
        tuple: (generated_word_paths, temp_word_dir) - the list of saved
        .docx paths and the directory under CONFIG["output_dir"] that
        contains them.

    Raises:
        FileNotFoundError: If the template file does not exist.
        Exception: If the template cannot be loaded for another reason.
    """
    # Checked explicitly so a missing template is reported up front.
    # NOTE(review): depending on the docxtpl version the file may only be
    # opened at render() time, in which case the try below would not catch it.
    if not os.path.isfile(CONFIG["template_path"]):
        raise FileNotFoundError(f"模板文件未找到：{CONFIG['template_path']}")
    try:
        tpl = DocxTemplate(CONFIG["template_path"])
    except FileNotFoundError as e:
        raise FileNotFoundError(f"模板文件未找到：{CONFIG['template_path']}") from e
    except Exception as e:
        raise Exception(f"加载模板失败：{str(e)}（确保模板为.docx格式）") from e

    # All generated documents go into a dedicated sub-folder.
    temp_word_dir = os.path.join(CONFIG["output_dir"], "temp_word_files")
    os.makedirs(temp_word_dir, exist_ok=True)

    # Characters that are not allowed in Windows file names: path and drive
    # separators become "_", the remaining ones are removed.
    unsafe_chars = str.maketrans({
        "/": "_", "\\": "_", ":": "_",
        "*": "", "?": "", '"': "", "<": "", ">": "", "|": "",
    })

    total = len(universities)
    generated_word_paths = []
    print(f"\n📝 开始生成Word文档（共{total}所大学）：")
    for idx, school in enumerate(universities, 1):
        # Values for the template placeholders.
        context = {
            "school": school,
            "research_field": research_info["research_field"],
            "top_journals": research_info["top_journals"],
            "research_topic_example": research_info["research_topic_example"],
            "skills": research_info["skills"]
        }

        # NOTE(review): the same DocxTemplate object is rendered repeatedly;
        # confirm the installed docxtpl reloads the template on each render().
        tpl.render(context)

        # str() guards against non-text cells read from Excel.
        safe_school_name = str(school).translate(unsafe_chars)
        word_filename = f"Application_{safe_school_name}.docx"
        word_path = os.path.join(temp_word_dir, word_filename)

        tpl.save(word_path)
        generated_word_paths.append(word_path)
        # Report progress against the real total rather than a fixed 30.
        print(f"  {idx:2d}/{total} 生成：{word_filename}")

    print(f"\n✅ Word生成完成，共{len(generated_word_paths)}个文件，存放于：{temp_word_dir}")
    return generated_word_paths, temp_word_dir

# 5. 步骤9：转换PDF（docx2pdf，仅限Windows）
def convert_word_to_pdf(generated_word_paths, temp_word_dir):
    """Convert generated Word files to PDF with docx2pdf.

    Only the first five paths are converted. A failed conversion is reported
    and skipped, so one bad file does not stop the rest.

    Args:
        generated_word_paths: Paths of the .docx files to convert.
        temp_word_dir: Directory holding the Word files (not used here).

    Returns:
        None when there is nothing to convert; otherwise a tuple
        (generated_pdf_paths, temp_pdf_dir).
    """
    if not generated_word_paths:
        print("❌ 无Word文件可转换，跳过PDF生成")
        return None

    # PDFs are collected in their own sub-folder of the output directory.
    temp_pdf_dir = os.path.join(CONFIG["output_dir"], "temp_pdf_files")
    os.makedirs(temp_pdf_dir, exist_ok=True)

    print(f"\n📄 开始转换PDF（需安装Microsoft Word，请勿关闭Word进程）：")
    generated_pdf_paths = []
    # Limited to the first five documents to keep the run short.
    for source_docx in generated_word_paths[:5]:
        stem, _ = os.path.splitext(os.path.basename(source_docx))
        pdf_filename = stem + ".pdf"
        pdf_path = os.path.join(temp_pdf_dir, pdf_filename)

        try:
            convert(source_docx, pdf_path)
            generated_pdf_paths.append(pdf_path)
            print(f"  ✅ 转换成功：{pdf_filename}")
        except Exception as e:
            print(f"  ❌ 转换失败{pdf_filename}：{str(e)}（建议手动用Word转PDF）")

    return generated_pdf_paths, temp_pdf_dir



In [7]:
import pandas as pd
from jinja2 import Template
from docx import Document
import win32com.client
import os
import re

# 1. Load both workbooks
field_df = pd.read_excel("research_fields_journals_skills_final.xlsx")
univ_df = pd.read_excel("universiti.xlsx")

# 2. Reshape the school sheet from three tier columns into one long column,
#    dropping the empty padding cells.
univ_long = (
    univ_df.melt(var_name='rank', value_name='school')
    .dropna(subset=['school'])
    .reset_index(drop=True)
)


# 3. Collapse the field sheet to one row per area abbreviation, joining the
#    distinct journals and the distinct skill strings with commas.
def _join_unique(values):
    """Join the distinct values of a column with ', '."""
    return ', '.join(values.unique())


field_grouped = (
    field_df.groupby('英文领域缩写')
    .agg({
        'ABS 4*顶级期刊': _join_unique,
        '岗位核心技能（手动添加）': _join_unique,
    })
    .reset_index()
)

# 4. Cartesian product (every school x every field) via a constant join key.
univ_long['key'] = 1
field_grouped['key'] = 1
merged = univ_long.merge(field_grouped, on='key').drop(columns='key')

# 5. Application letter template (English only), written with Jinja2
#    placeholders: school, research_field, top_journals,
#    research_topic_example and skills.
# NOTE(review): the salutation and the program name are hard-coded to
# "Economics" while {{ research_field }} varies per row - confirm this is
# intended.
template_str = """Dear Admissions Committee of the Economics Department at {{ school }},

Greetings!

I am Zejiang Wang. Driven by a deep passion for {{ research_field }} and a profound admiration for {{ school }}'s academic reputation, I am submitting my application to your graduate program in Economics.

Academically, I have consistently followed cutting-edge research in {{ research_field }}, particularly tracking publications in top-tier journals such as {{ top_journals }}. These publications have not only deepened my understanding of core {{ research_field }} theories but also inspired my exploration of specific issues like "{{ research_topic_example }}." This has solidified my resolve to pursue advanced studies at your institution and further develop related research.

Regarding my career path, I aim to specialize in {{ research_field }} during graduate studies and pursue quant research (e.g., research at academic institutions, quantitative analysis in financial institutions, or policy research in government economic departments) after graduation. {{ school }}'s faculty expertise and research resources in {{ research_field }} will provide crucial support for achieving this goal.

Regarding skill sets, I am proficient in {{ skills }} and other tools, enabling me to independently complete tasks such as data cleaning, statistical modeling, and empirical analysis for academic papers. These competencies will facilitate my swift adaptation to the pace of graduate-level study and research.

Thank you for taking the time to review my application. I look forward to the opportunity to discuss this further with you!
"""

# Compile the letter text once; it is rendered once per school/field pair.
template = Template(template_str)

# 6. Make sure both output folders exist
for output_folder in ("output_word", "output_pdf"):
    os.makedirs(output_folder, exist_ok=True)

# 7. Write one Word document per (school, field) row
docx_files = []
for _, record in merged.iterrows():
    letter_text = template.render(
        school=record["school"],
        research_field=record["英文领域缩写"],
        top_journals=record["ABS 4*顶级期刊"],
        research_topic_example="Impact of AI on Labor Markets",
        skills=record["岗位核心技能（手动添加）"]
    )

    # Build a file-system friendly name: drop everything except word
    # characters, "-", "_", "." and spaces, then turn spaces into "_".
    school_slug = re.sub(r'[^\w\-_. ]', '', record['school']).replace(' ', '_')
    filename = f"application_{school_slug}_{record['英文领域缩写']}.docx"
    # Stored as an absolute path.
    docx_path = os.path.abspath(os.path.join("output_word", filename))

    # The whole letter is added as a single paragraph.
    letter = Document()
    letter.add_paragraph(letter_text)
    letter.save(docx_path)
    docx_files.append(docx_path)

# 8. Convert Word to PDF (requires Windows with Microsoft Word installed)
word = win32com.client.Dispatch("Word.Application")
pdf_count = 0
try:
    word.Visible = False

    for docx_path in docx_files:
        # splitext swaps only the extension; str.replace would also rewrite
        # a ".docx" occurring earlier in the file name.
        pdf_name = os.path.splitext(os.path.basename(docx_path))[0] + ".pdf"
        pdf_path = os.path.abspath(os.path.join("output_pdf", pdf_name))

        try:
            doc = word.Documents.Open(docx_path)
            try:
                doc.SaveAs(pdf_path, FileFormat=17)  # 17 is the PDF format
                pdf_count += 1
            finally:
                # Close the document even when SaveAs fails so it does not
                # stay open inside the hidden Word instance.
                doc.Close()
        except Exception as e:
            print(f"转换失败: {docx_path}, 错误: {e}")
finally:
    # Always shut Word down; otherwise an invisible Word process is left
    # running when the cell is interrupted or raises.
    word.Quit()

# Report what was actually produced instead of assuming every conversion
# succeeded.
print(f"共生成 {len(docx_files)} 封 Word 申请信，成功转换 {pdf_count} 份 PDF")

共生成 90 封 Word 和 PDF 申请信
