In [17]:
import platform
import re
from pathlib import Path

import pandas as pd
from docxtpl import DocxTemplate
from unidecode import unidecode
import os
# NOTE(review): hard-coded, machine-specific working directory. os.chdir raises
# if this path does not exist, and the relative paths used further down
# (e.g. generate_sops(".")) are resolved against it.
os.chdir(r"C:/Users/86135/HW_School_Application")

# -----------------------------
# Applicant information
# -----------------------------
# Static values that generate_sops copies into the context of every document.
# NOTE(review): "program_name" carries a literal "{...}" list of options and is
# used exactly as written here -- confirm whether a single program was intended.
CONFIG = dict(
    applicant_name="Zhang Ziyue",
    program_name="Master of Science in {Economics, Management, Information Systems}",
    # Maps a research area name to the sample topics quoted for that area.
    sample_topics_by_area={
        "Economics": "industrial organization, applied econometrics, digital economy",
        "Management": "organization design, innovation strategy, operations analytics",
        "Information Systems": "platform economics, algorithmic governance, causal ML in IS",
    },
)

# -----------------------------
# File name sanitizer
# -----------------------------
def sanitize_filename(name: str) -> str:
    """Turn an arbitrary value into a string usable as part of a file name.

    The value is converted with ``str``, passed through ``unidecode``, stripped
    of every character that is not a word character, hyphen, dot or
    whitespace, and then its whitespace-separated pieces are joined with
    underscores. The result is cut to at most 120 characters.
    """
    transliterated = unidecode(str(name))
    kept = re.sub(r"[^\w\-\.\s]", "", transliterated)
    pieces = kept.split()
    joined = "_".join(pieces)
    return joined[:120]

# -----------------------------
# Main entry point
# -----------------------------
def generate_sops(base_dir=".", convert_pdf=False):
    """Render one SOP ``.docx`` for every (university, research area) pair.

    Reads ``sop_template.docx``, ``universities.xlsx``, ``research_areas.xlsx``
    and ``skills.xlsx`` from *base_dir* and writes the rendered documents to
    ``<base_dir>/output`` (created if necessary).

    Args:
        base_dir: Directory holding the template and the three workbooks. It
            is used as given and is not converted to an absolute path.
        convert_pdf: Kept for backward compatibility. PDF conversion is not
            implemented in this function; a ``RuntimeWarning`` is issued when
            a truthy value is passed and only DOCX files are produced.

    Returns:
        List of ``Path`` objects for the generated DOCX files, in generation
        order (universities outermost, research areas innermost).

    Raises:
        FileNotFoundError: If the template or one of the workbooks is missing.
        ValueError: If a workbook lacks one of its required columns.
    """
    if convert_pdf:
        import warnings

        # Make the ignored flag visible instead of silently dropping it.
        warnings.warn(
            "convert_pdf=True was requested, but PDF conversion is not "
            "implemented; only DOCX files are generated.",
            RuntimeWarning,
            stacklevel=2,
        )

    base = Path(base_dir)   # keep the path relative; do not force it absolute
    output_dir = base / "output"
    output_dir.mkdir(parents=True, exist_ok=True)

    tmpl_path = base / "sop_template.docx"
    univ_xlsx = base / "universities.xlsx"
    areas_xlsx = base / "research_areas.xlsx"
    skills_xlsx = base / "skills.xlsx"

    for required in (tmpl_path, univ_xlsx, areas_xlsx, skills_xlsx):
        if not required.exists():
            raise FileNotFoundError(f"Required file missing: {required}")

    # Load the input tables.
    df_univ = pd.read_excel(univ_xlsx)
    df_areas = pd.read_excel(areas_xlsx)
    df_skills = pd.read_excel(skills_xlsx)

    journal_cols = ["top_journal_1", "top_journal_2", "top_journal_3"]

    if "university" not in df_univ.columns:
        raise ValueError("universities.xlsx must contain a 'university' column.")
    for col in ["area", *journal_cols]:
        if col not in df_areas.columns:
            raise ValueError(f"research_areas.xlsx must contain column: {col}")
    if "skill" not in df_skills.columns:
        raise ValueError("skills.xlsx must contain a 'skill' column.")

    applicant_name = CONFIG["applicant_name"]
    program_name = CONFIG["program_name"]
    topics_map = CONFIG["sample_topics_by_area"]

    # Everything below is independent of the university, so build it once
    # instead of once per (university, area) pair.
    skills_list = df_skills["skill"].dropna().astype(str).tolist()

    # Blank spreadsheet rows are read as NaN; skip them so no "SOP_nan_*"
    # documents are produced.
    universities = df_univ["university"].dropna().tolist()

    area_records = []
    for _, arow in df_areas.dropna(subset=["area"]).iterrows():
        # Empty journal cells are dropped rather than handed to the template
        # as NaN. NOTE(review): this shortens the list; confirm the template
        # does not rely on fixed journal positions.
        journals = [arow[col] for col in journal_cols if pd.notna(arow[col])]
        area_records.append((arow["area"], journals))

    generated_files = []

    for university in universities:
        university_part = sanitize_filename(university)
        for area, journals in area_records:
            ctx = {
                "applicant_name": applicant_name,
                "target_school": university,
                "program_name": program_name,
                "research_area": area,
                # Fresh copies so each render gets its own list objects.
                "journals": list(journals),
                "skills": list(skills_list),
                "sample_topics": topics_map.get(area, "applied analytics and policy evaluation"),
            }

            # A new template object is loaded for every document.
            tpl = DocxTemplate(str(tmpl_path))
            tpl.render(ctx)
            out_docx = output_dir / f"SOP_{university_part}_{sanitize_filename(area)}.docx"
            tpl.save(out_docx)
            generated_files.append(out_docx)

    print(f"[OK] Generated {len(generated_files)} DOCX file(s) in: {output_dir}")

    return generated_files

# -----------------------------
# Run directly inside the notebook
# -----------------------------
files = generate_sops(".")
files[:5]  # show the first 5 generated file paths


[OK] Generated 90 DOCX file(s) in: output


[WindowsPath('output/SOP_Harvard_University_-_Department_of_Economics_Economics.docx'),
 WindowsPath('output/SOP_Harvard_University_-_Department_of_Economics_Management.docx'),
 WindowsPath('output/SOP_Harvard_University_-_Department_of_Economics_Information_Systems.docx'),
 WindowsPath('output/SOP_Massachusetts_Institute_of_Technology_MIT_-_Economics_Department_Economics.docx'),
 WindowsPath('output/SOP_Massachusetts_Institute_of_Technology_MIT_-_Economics_Department_Management.docx')]