In [1]:
# Upgrade pip, then install the three packages that generate_sops.py lists as
# its dependencies. %pip targets the environment of the running kernel.
%pip install -U pip
%pip install pandas openpyxl docxtpl

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
%%writefile generate_sops.py
# -*- coding: utf-8 -*-
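"""
Batch-generate DOCX statements of purpose from a Word template: up to 30 schools
read from an Excel sheet, each combined with 3 research areas.
Requirements:
  - The folder that contains this script must also hold:
      - Dear Admissions Committee.docx  (the Word template)
      - sheet.xlsx  (first worksheet; must contain at least a school_name column)
Run:
  In Jupyter: %run generate_sops.py  or  !python generate_sops.py
Dependencies:
  pip install pandas openpyxl docxtpl
Output:
  output/docx/SOP_<school>_<research area>.docx, next to this script
"""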
import re
from pathlib import Path
import pandas as pd

# Every path is anchored at the folder holding this script, so the caller's
# working directory does not matter.
ROOT = Path(__file__).resolve().parent
EXCEL_PATH = ROOT.joinpath("sheet.xlsx")
TEMPLATE_PATH = ROOT.joinpath("Dear Admissions Committee.docx")
OUT_DIR = ROOT.joinpath("output", "docx")
# Created eagerly when the module is loaded; a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Default research areas with their journals and career goal. They fill in
# whatever the Excel sheet leaves out. Insertion order matters: areas are taken
# from this mapping front to back when the sheet provides fewer than three.
FALLBACKS = {
    "finance": dict(
        journal_1="Review of Financial Studies (RFS)",
        journal_2="Journal of Finance (JF)",
        journal_3="Journal of Financial Economics (JFE)",
        career_goal="quant researcher",
    ),
    "economics": dict(
        journal_1="American Economic Review (AER)",
        journal_2="Quarterly Journal of Economics (QJE)",
        journal_3="Econometrica",
        career_goal="academic economist",
    ),
    "information management": dict(
        journal_1="MIS Quarterly (MISQ)",
        journal_2="Information Systems Research (ISR)",
        journal_3="Management Science (MS)",
        career_goal="data scientist",
    ),
    "marketing": dict(
        journal_1="Journal of Consumer Psychology",
        journal_2="Journal of Marketing Research",
        journal_3="Journal of the Academy of Marketing Science",
        career_goal="marketing analyst",
    ),
}

def sanitize_filename(s: str) -> str:
    """Turn arbitrary text into something usable as part of a file name.

    Each of the characters \\ / : * ? " < > | is replaced by an underscore,
    every run of whitespace is collapsed to a single space, and leading and
    trailing whitespace is removed.
    """
    without_reserved = re.sub(r'[\\/:*?"<>|]', "_", s)
    return re.sub(r"\s+", " ", without_reserved).strip()

def _cell_text(value) -> str:
    """Return the trimmed text of a spreadsheet cell, or "" when it holds nothing usable.

    None, NaN/NA, falsy values (empty string, 0) and whitespace-only strings
    all count as empty.
    """
    # The missing-value checks come first so that `not value` is never
    # evaluated on pd.NA, whose truth value is undefined.
    if value is None or pd.isna(value) or not value:
        return ""
    return str(value).strip()


def build_areas(df: pd.DataFrame):
    """Pick three research areas and give each one three journals and a career goal.

    Areas are read from the optional ``research_area`` column: values are
    trimmed and lower-cased, and the first three distinct non-empty ones are
    kept in order of appearance. If fewer than three are found, keys of
    FALLBACKS are appended in order until there are three.

    For every area, ``journal_1``..``journal_3`` and ``career_goal`` come from
    the first row whose normalized ``research_area`` equals the area. A field
    that is absent, missing or blank (whitespace only) in that row is filled
    from FALLBACKS for the area, and otherwise from a generic placeholder.

    Args:
        df: the sheet contents; every column used here is optional.

    Returns:
        dict mapping area name -> dict with the keys ``journal_1``,
        ``journal_2``, ``journal_3`` and ``career_goal`` (all ``str``).
    """
    # Last-resort text for areas that are not listed in FALLBACKS either.
    generic_defaults = {
        "journal_1": "Leading Journal 1",
        "journal_2": "Leading Journal 2",
        "journal_3": "Leading Journal 3",
        "career_goal": "researcher",
    }

    has_area_column = "research_area" in df.columns
    areas = []
    if has_area_column:
        column = df["research_area"]
        # Normalize once; the same series serves both the area discovery and
        # the per-area row lookup below.
        present = column.notna()
        normalized = column.astype(str).str.strip().str.lower()
        for key in normalized[present]:
            if key and key not in areas:
                areas.append(key)
            if len(areas) == 3:
                break

    # Fewer than three areas in the sheet: top up from FALLBACKS.
    for fb_area in FALLBACKS:
        if len(areas) >= 3:
            break
        if fb_area not in areas:
            areas.append(fb_area)

    area_map = {}
    for area in areas[:3]:
        row = None
        if has_area_column:
            # `present` keeps rows with a missing area from matching through
            # the "nan" text that astype(str) produces for them.
            matches = df[present & (normalized == area)]
            if len(matches):
                row = matches.iloc[0]

        fallback = FALLBACKS.get(area, {})
        entry = {}
        for field, placeholder in generic_defaults.items():
            text = _cell_text(row.get(field)) if row is not None else ""
            entry[field] = text or fallback.get(field, placeholder)
        area_map[area] = entry
    return area_map

def main():
    """Render one DOCX per (school, research area) pair from the Word template.

    Reads the first worksheet of EXCEL_PATH, keeps at most 30 distinct,
    non-blank school names, combines each with the research areas returned by
    build_areas(), renders TEMPLATE_PATH once per pair and saves the result as
    ``SOP_<school>_<area>.docx`` under OUT_DIR. Prints the number of files
    written.

    Raises:
        ValueError: the sheet has no ``school_name`` column.
        SystemExit: docxtpl is not installed.
        FileNotFoundError: the Word template is missing.
    """
    df = pd.read_excel(EXCEL_PATH, sheet_name=0)
    if "school_name" not in df.columns:
        raise ValueError("Excel 必须包含 'school_name' 列（在 Sheet1）。")

    names = df["school_name"].dropna().astype(str).str.strip()
    # dropna() runs before strip(), so a cell holding only whitespace survives
    # as "". Drop those here; otherwise a document would be generated for a
    # school without a name and take up one of the 30 slots.
    schools = names[names != ""].drop_duplicates().head(30).tolist()
    if len(schools) < 30:
        print(f"[warning] 仅找到 {len(schools)} 所学校，将按 {len(schools)}×3 份生成。")

    # The same area list is used for every school, so build it once.
    area_items = list(build_areas(df).items())[:3]

    # Imported lazily so a missing package yields an install hint instead of a
    # bare traceback.
    try:
        from docxtpl import DocxTemplate
    except ImportError as e:
        raise SystemExit("请先安装 docxtpl：pip install docxtpl") from e
    if not TEMPLATE_PATH.exists():
        raise FileNotFoundError(f"找不到模板：{TEMPLATE_PATH}")

    total = 0
    for school in schools:
        for area, meta in area_items:
            context = {
                "school_name": school,
                "research_area": area,
                "journal_1": meta["journal_1"],
                "journal_2": meta["journal_2"],
                "journal_3": meta["journal_3"],
                "career_goal": meta["career_goal"],
                "email": "email@example.com",
            }
            # Load a fresh template for every document rather than reusing a
            # rendered one.
            tpl = DocxTemplate(str(TEMPLATE_PATH))
            tpl.render(context)
            fname = f"SOP_{sanitize_filename(school)}_{sanitize_filename(area)}.docx"
            outpath = OUT_DIR / fname
            tpl.save(str(outpath))
            total += 1
    print(f"已生成 {total} 份 DOCX，目录：{OUT_DIR}")

# Run the generator only when this file is executed as a script
# (python generate_sops.py or %run generate_sops.py), not when it is imported.
if __name__ == "__main__":
    main()


Overwriting generate_sops.py


In [7]:
# Execute generate_sops.py as a script; it prints how many DOCX files it wrote
# and the folder they were saved in.
%run generate_sops.py

已生成 90 份 DOCX，目录：/Users/cristiano/homework2/output/docx
