In [1]:
%pip install -U pip
%pip install pandas openpyxl docxtpl

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-25.2
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:


import os
import random
import requests
import pandas as pd
from bs4 import BeautifulSoup
from pathlib import Path
from docxtpl import DocxTemplate
from datetime import datetime

# ---------- 配置 ----------
random.seed(42)

HOME = Path.home()
OUT_DIR = HOME / "HW_School_Application"
OUT_DIR.mkdir(parents=True, exist_ok=True)


TEMPLATE_PATH = OUT_DIR / "Dear Admissions Committee.docx"

# RePEc 排行页（抓学校）
REPEC_URL = "https://ideas.repec.org/top/top.econdept.html"

# 申请人信息（你可以自行修改）
APPLICANT = {
    "applicant_name": "Cristiano Lu",
    "program": "Master of Finance",
    "career_goal": "quant researcher",
    "email": "cristiano@example.com"
}

# 研究方向、期刊与技能（可以按需修改）
RESEARCH_AREAS = ["Economics", "Finance", "Information Management"]
JOURNALS = {
    "Economics": ["American Economic Review", "Econometrica", "Quarterly Journal of Economics"],
    "Finance": ["Journal of Finance", "Journal of Financial Economics", "Review of Financial Studies"],
    "Information Management": ["MIS Quarterly", "Information Systems Research", "Journal of Management Information Systems"]
}
SKILLS = ["Python", "SQL", "R", "Stata", "Excel", "PowerBI", "Tableau", "Mathematics", "Statistics", "Machine Learning"]

# 输出文件名
SCHOOLS_XLSX = OUT_DIR / "schools.xlsx"
AREAS_XLSX = OUT_DIR / "research_areas.xlsx"
JOURNALS_XLSX = OUT_DIR / "top_journals_by_area.xlsx"
SKILLS_XLSX = OUT_DIR / "skills.xlsx"
COMBOS_XLSX = OUT_DIR / "school_area_combinations.xlsx"

# ---------- 函数：抓取 RePEc 院系列表 ----------
def fetch_repec_departments(url=REPEC_URL, timeout=20):
    """
    尝试从 IDEAS/RePEc 抓取按序列出的 economics departments 名称。
    如果网络或解析失败，返回空列表，调用端会使用备用名单。
    """
    try:
        resp = requests.get(url, headers={"User-Agent":"Mozilla/5.0"}, timeout=timeout)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        names = []
        # 优先根据链接文本挑选包含 'Econom' 的条目
        for a in soup.find_all("a"):
            txt = (a.get_text() or "").strip()
            if not txt:
                continue
            low = txt.lower()
            if "econom" in low and len(txt) > 5:
                if txt not in names:
                    names.append(" ".join(txt.split()))
        # 如果解析的太少，再尝试从表格行抓取
        if len(names) < 90:
            for tr in soup.find_all("tr"):
                txt = (tr.get_text(" ", strip=True) or "").strip()
                if "Econom" in txt or "Department" in txt:
                    if len(txt) > 8 and txt not in names:
                        names.append(" ".join(txt.split()))
        # 返回去重且按出现顺序的列表
        dedup = []
        for n in names:
            if n not in dedup:
                dedup.append(n)
        return dedup
    except Exception as e:
        print(" fetch_repec_departments failed:", e)
        return []

# 备用学校名单（当抓取失败时回退）
FALLBACK_SCHOOLS = [
    "Harvard University", "Massachusetts Institute of Technology (MIT)", "University of California-Berkeley",
    "University of Chicago", "Princeton University", "Stanford University", "Yale University",
    "Oxford University", "London School of Economics (LSE)", "Columbia University",
    "New York University (NYU)", "University of Pennsylvania", "University of Michigan",
    "Duke University", "Cornell University", "University of Toronto", "UCLA", "University of Warwick",
    "University of Cambridge", "Toulouse School of Economics (TSE)", "Paris School of Economics (PSE)",
    "Johns Hopkins University", "Brown University", "Tilburg University", "University of British Columbia",
    "University of Wisconsin-Madison", "UCL", "KU Leuven", "Monash University", "Purdue University",
    # （可自行补充到 >=90）
]

# ---------- 函数：按排名区间抽样 10/10/10 ----------
def pick_30_schools_by_buckets(depts):
    """
    按页面顺序（近似排名）从 1-30、31-60、61-90 各随机取 10 所。如果某段数据不足，则从可用中尽量补齐。
    返回选中的 30 所学校列表（尽量长度为 30）。
    """
    n = len(depts)
    b1 = depts[0:30] if n >= 1 else []
    b2 = depts[30:60] if n >= 31 else depts[30:min(60,n)]
    b3 = depts[60:90] if n >= 61 else depts[60:min(90,n)]
    # fallback if bucket empty
    if not b1 and n>0: b1 = depts[:min(30,n)]
    if not b2 and n>30: b2 = depts[30:min(60,n)]
    if not b3 and n>60: b3 = depts[60:min(90,n)]

    s1 = random.sample(b1, min(10, len(b1))) if b1 else []
    s2 = random.sample(b2, min(10, len(b2))) if b2 else []
    s3 = random.sample(b3, min(10, len(b3))) if b3 else []

    selected = s1 + s2 + s3
    # 如果总数不足 30，从整个列表补齐（去重）
    if len(selected) < 30:
        pool = [d for d in depts if d not in selected]
        need = 30 - len(selected)
        if pool:
            add = random.sample(pool, min(need, len(pool)))
            selected.extend(add)
    # 若仍然少（例如 depts 很小），使用 fallback 补齐
    if len(selected) < 30:
        for s in FALLBACK_SCHOOLS:
            if s not in selected:
                selected.append(s)
            if len(selected) >= 30:
                break
    return selected[:30]

# ---------- 函数：保存各 Excel 列表 ----------
def save_excel_lists(schools, areas, journals_dict, skills_list):
    pd.DataFrame({"university": schools}).to_excel(SCHOOLS_XLSX, index=False)
    pd.DataFrame({"research_area": areas}).to_excel(AREAS_XLSX, index=False)
    rows = []
    for area, js in journals_dict.items():
        for j in js:
            rows.append({"area": area, "journal": j})
    pd.DataFrame(rows).to_excel(JOURNALS_XLSX, index=False)
    pd.DataFrame({"skill": skills_list}).to_excel(SKILLS_XLSX, index=False)
    print(f" 已保存 Excel 列表到: {OUT_DIR}")

# ---------- 函数：生成 strict 30 x 3 (90) 组合并保存 ----------
def generate_90_combinations(schools, areas):
    combos = []
    for s in schools:
        # 若 areas 有 3 个或更多则取 3 个，不足的话循环填满
        if len(areas) >= 3:
            picks = random.sample(areas, 3)
        else:
            picks = (areas * (3 // max(1,len(areas)))) + areas[:(3 % max(1,len(areas)))]
        for a in picks:
            combos.append({"school": s, "research_area": a})
    # 确保长度正好 90（若学校不足 30 或其他情况则截断或重复）
    if len(combos) > 90:
        combos = combos[:90]
    while len(combos) < 90:
        combos.append(random.choice(combos).copy())
    df = pd.DataFrame(combos)
    df.to_excel(COMBOS_XLSX, index=False)
    print(f" 已生成并保存 90 条组合到: {COMBOS_XLSX}")
    return combos

# ---------- 函数：渲染 docx（不进行 PDF 转换） ----------
def render_docs_from_template(template_path, combos, journals_dict, skills_list, applicant_info):
    if not Path(template_path).exists():
        raise FileNotFoundError(f"模板文件未找到: {template_path}\n请把模板放到该路径或修改 TEMPLATE_PATH 变量。")
    generated_paths = []
    for idx, rec in enumerate(combos, start=1):
        school = rec["school"]
        area = rec["research_area"]
        top_js = journals_dict.get(area, [])[:3]
        j1 = top_js[0] if len(top_js) > 0 else ""
        j2 = top_js[1] if len(top_js) > 1 else ""
        j3 = top_js[2] if len(top_js) > 2 else ""
        chosen_skills = ", ".join(random.sample(skills_list, min(6, len(skills_list))))

        context = {
            "school_name": school,
            "research_area": area,
            "journal_1": j1,
            "journal_2": j2,
            "journal_3": j3,
            "skills": chosen_skills,
            **applicant_info
        }
        # 渲染并保存
        tpl = DocxTemplate(str(template_path))
        tpl.render(context)
        safe_school = "".join(c for c in school if c.isalnum() or c in (" ", "_")).strip().replace(" ", "_")[:60]
        safe_area = "".join(c for c in area if c.isalnum() or c in (" ", "_")).strip().replace(" ", "_")[:30]
        fname = f"SOP_{idx:03d}_{safe_school}_{safe_area}.docx"
        out_file = OUT_DIR / fname
        tpl.save(out_file)
        generated_paths.append(out_file)
    print(f" 渲染完成：已生成 {len(generated_paths)} 个 .docx 文件，保存目录：{OUT_DIR}")
    return generated_paths

# ---------- 主流程 ----------
def main_pipeline():
    start_time = datetime.now()
    print("SOP generation pipeline started at", start_time.isoformat())

    # 1) 抓取 RePEc 列表
    depts = fetch_repec_departments()
    if not depts or len(depts) < 90:
        print(" RePEc 抓取数据不足或失败（len=%s），使用回退名单与组合填充。" % len(depts))
        # 合并抓取结果与 fallback，去重，确保有足够项
        merged = []
        for x in (depts + FALLBACK_SCHOOLS):
            if x not in merged:
                merged.append(x)
        depts = merged

    print(f"Total departments available: {len(depts)}")

    # 2) 按区间抽样 30 所学校
    schools_30 = pick_30_schools_by_buckets(depts)
    print("Selected 30 schools (sample):")
    for s in schools_30[:10]:
        print(" -", s)
    # 3) 保存 Excel 列表（schools, areas, journals, skills）
    save_excel_lists(schools_30, RESEARCH_AREAS, JOURNALS, SKILLS)

    # 4) 生成 90 条组合并保存
    combos = generate_90_combinations(schools_30, RESEARCH_AREAS)

    # 5) 渲染 90 份 Word（使用你提供的模板）
    generated = render_docs_from_template(TEMPLATE_PATH, combos, JOURNALS, SKILLS, APPLICANT)

    end_time = datetime.now()
    print("Pipeline finished at", end_time.isoformat(), " | elapsed:", end_time - start_time)
    return {
        "start": start_time,
        "end": end_time,
        "generated_files": generated,
        "combos_xlsx": COMBOS_XLSX,
        "lists": {
            "schools": SCHOOLS_XLSX,
            "areas": AREAS_XLSX,
            "journals": JOURNALS_XLSX,
            "skills": SKILLS_XLSX
        }
    }

# ---------- Run ----------
if __name__ == "__main__":
    result = main_pipeline()
    # 在 notebook 中可直接查看 result 内容
    print("Result summary:")
    print("Combos Excel:", result["combos_xlsx"])
    print("Example generated file:", result["generated_files"][0] if result["generated_files"] else "None")




SOP generation pipeline started at 2025-10-09T17:28:04.427491
Total departments available: 319
Selected 30 schools (sample):
 - Department of Economics, University College London (UCL)
 - Department of Economics, Harvard University
 - Economic literature:
 - Department of Economics, Northwestern University
 - Department of Economics, Princeton University
 - Paris School of Economics
 - Economics Department, Dartmouth College
 - Economics Department, Massachusetts Institute of Technology (MIT)
 - Department of Economics, University of California-Los Angeles (UCLA)
 - Department of Economics, University of Pennsylvania
✅ 已保存 Excel 列表到: /Users/cristiano/HW_School_Application
✅ 已生成并保存 90 条组合到: /Users/cristiano/HW_School_Application/school_area_combinations.xlsx
✅ 渲染完成：已生成 90 个 .docx 文件，保存目录：/Users/cristiano/HW_School_Application
Pipeline finished at 2025-10-09T17:28:06.891685  | elapsed: 0:00:02.464194
Result summary:
Combos Excel: /Users/cristiano/HW_School_Application/school_area_combina

In [7]:
%run generate_sops.py

已生成 90 份 DOCX，目录：/Users/cristiano/homework2/output/docx
