## 1、安装环境

In [2]:
# Install or upgrade the third-party packages used by this notebook into the kernel's
# environment. NOTE(review): openpyxl is never imported directly; presumably it is the
# engine pandas uses to read the .xlsx inputs — confirm. Versions are not pinned.
%pip install -U pandas openpyxl docxtpl docx2pdf


Collecting pandas
  Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting docxtpl
  Downloading docxtpl-0.20.1-py3-none-any.whl.metadata (9.4 kB)
Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Collecting python-docx>=1.1.1 (from docxtpl)
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting docxcompose (from docxtpl)
  Downloading docxcompose-1.4.0.tar.gz (20 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Downloading pandas-2.3.3-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
    --------------------------------------- 0.3/11.0 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.0 MB 3.4 MB/s eta 0:00:03
   ----------- ---------------------------- 3.1/11.0 MB 6.0 MB/s eta 0:00:02
   --------------- --

  You can safely remove it manually.


## 2、导入库和设置路径

In [64]:
import os, sys, json, platform
from datetime import datetime

import pandas as pd
from docxtpl import DocxTemplate

In [79]:
# Folder (relative to the working directory) that holds every input file and the output.
BASE_DIR = "HW_School_Application"

# Input files and the output folder, each resolved inside BASE_DIR.
uni_path, areas_path, config_path, template_path, output_dir = (
    os.path.join(BASE_DIR, leaf)
    for leaf in (
        "universities.xlsx",    # list of target universities
        "research_areas.xlsx",  # research areas with journals and skills
        "config.json",          # applicant settings
        "sop_template.docx",    # Word template to render
        "output",               # destination folder for generated documents
    )
)

In [105]:
# Create the output folder before anything is written to it; exist_ok=True makes
# re-running this cell harmless when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

In [107]:
# Echo the resolved locations so a wrong working directory is obvious before any file is read.
print("工作目录：", os.getcwd())
print("工程目录：", os.path.abspath(BASE_DIR))
# output_dir is the lowercase name assigned in the path-setup cell; the upper-case
# spelling is not defined anywhere and raises NameError on a fresh kernel.
print("输出目录：", os.path.abspath(output_dir))

工作目录： D:\周方健\Desktop\大学\计算机课\人工智能与Python 程序设计\作业\第三次作业
工程目录： D:\周方健\Desktop\大学\计算机课\人工智能与Python 程序设计\作业\第三次作业\HW_School_Application
输出目录： D:\周方健\Desktop\大学\计算机课\人工智能与Python 程序设计\作业\第三次作业\HW_School_Application\output


## 3、读取文件

In [110]:
# Load the workbook at uni_path; the generation loop later reads its "University" column.
df_uni = pd.read_excel(uni_path)

In [112]:
# Load the workbook at areas_path; each row is later passed to build_context as area_row.
df_area = pd.read_excel(areas_path)

In [114]:
# Parse the applicant settings file (JSON, read as UTF-8) into `config`.
with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)

In [116]:
# Preview the first rows of the university table as a quick sanity check of the load.
display(df_uni.head())

Unnamed: 0,University
0,"Department of Economics, Harvard University"
1,"Economics Department, Massachusetts Institute ..."
2,"Department of Economics, University of Califor..."
3,"Department of Economics, University of Chicago"
4,Paris School of Economics


In [118]:
# Preview the first rows of the research-area table as a quick sanity check of the load.
display(df_area.head())

Unnamed: 0,Area,TopJournal1,TopJournal2,TopJournal3,Skills
0,Economics,American Economic Review,Econometrica,Quarterly Journal of Economics,"Python, R, SQL, Statistics, Econometrics, Stat..."
1,Finance,Journal of Finance,Journal of Financial Economics,Review of Financial Studies,"Python, SQL, Risk Modeling, Time Series, Stoch..."
2,Information Management,MIS Quarterly,Information Systems Research,Journal of the Association for Information Sys...,"Python, SQL, Data Warehousing (BigQuery), API/..."


In [120]:
# Show only the identity fields from the config to confirm it was parsed correctly.
# NOTE(review): this writes name, email and phone into the saved cell output —
# clear the output before sharing the notebook.
profile_keys = ("applicant_name", "email", "phone", "undergrad_school", "undergrad_major")
profile = {key: config.get(key) for key in profile_keys}
print("个人信息：", profile)

个人信息： {'applicant_name': '周方健/James Zhou', 'email': '18807846189@163.com', 'phone': '+86-18807846189', 'undergrad_school': 'Renmin University of China', 'undergrad_major': 'Economics'}


## 4、准备“变量上下文”构造函数

In [164]:
def build_context(university: str, area_row: dict, cfg: dict):
    """Assemble the template variables for one (university, research area) document.

    Args:
        university: Name of the target university/department; passed through unchanged.
        area_row: Mapping-like row (a dict or a pandas Series) providing the keys
            ``Area``, ``TopJournal1``, ``TopJournal2``, ``TopJournal3`` and ``Skills``.
        cfg: Applicant configuration. Keys read here, all optional:
            ``program_name_map``, ``target_degree_default``, ``personal_strengths``,
            ``hobbies``, ``applicant_name``, ``email``, ``phone``,
            ``undergrad_school``, ``undergrad_major``.

    Returns:
        dict: The context handed to the template renderer. Missing applicant fields
        default to an empty string.

    Raises:
        KeyError: If ``area_row`` lacks one of the required keys.
    """

    def _is_missing(value) -> bool:
        """True for None, NaN (how an empty spreadsheet cell arrives) or blank text."""
        if value is None:
            return True
        if isinstance(value, float) and value != value:  # NaN is never equal to itself
            return True
        return isinstance(value, str) and not value.strip()

    # Skip empty journal cells so the sentence never contains a literal "nan" or ", ,".
    journals = [area_row[key] for key in ("TopJournal1", "TopJournal2", "TopJournal3")]
    journal_list = ", ".join(str(j) for j in journals if not _is_missing(j))

    skills_value = area_row["Skills"]
    skills_sentence = "" if _is_missing(skills_value) else skills_value

    # `or {}` also covers a key that is present but set to null in the JSON.
    program_map = cfg.get("program_name_map") or {}
    program_name = program_map.get(area_row["Area"], cfg.get("target_degree_default", "Master Program"))

    career_map = {
        "Economics": "economic research / Econometric",
        "Finance": "quant research or financial analytics",
        "Information Management": "data analytics and information systems"
    }
    # Unknown areas fall back to the area name itself.
    area_career = career_map.get(area_row["Area"], area_row["Area"])

    strengths_list = cfg.get("personal_strengths") or []
    if isinstance(strengths_list, str):
        # A single string would otherwise be iterated character by character.
        strengths_list = [strengths_list]
    personal_strengths_bullets = "\n".join(f"• {s}" for s in strengths_list)

    hobbies = cfg.get("hobbies", "")

    return {
        "today": datetime.now().strftime("%B %d, %Y"),
        "university": university,
        "area": area_row["Area"],

        "journal_list": journal_list,
        "skills_sentence": skills_sentence,
        "area_career": area_career,

        "program_name": program_name,

        "applicant_name": cfg.get("applicant_name", ""),
        "email": cfg.get("email", ""),
        "phone": cfg.get("phone", ""),
        "undergrad_school": cfg.get("undergrad_school", ""),
        "undergrad_major": cfg.get("undergrad_major", ""),

        "personal_strengths_bullets": personal_strengths_bullets,
        "hobbies": hobbies,
    }

## 5、用 docxtpl 渲染模板，生成 90 份 Word

In [167]:
from docxtpl import DocxTemplate
import os

In [169]:
# Open the Word template at template_path; this object is rendered and saved by the generation loop.
tpl = DocxTemplate(template_path)

In [171]:
def safe_name(s: str) -> str:
    """Reduce ``s`` to a token suitable for use inside a file name.

    Only alphanumeric characters plus space, underscore, hyphen and ampersand are
    kept; every other character is dropped. The result is then trimmed and each
    remaining space becomes an underscore.
    """
    extra_allowed = {" ", "_", "-", "&"}
    kept = "".join(c for c in s if c.isalnum() or c in extra_allowed)
    return kept.strip().replace(" ", "_")

In [173]:
# Render and save one document for every (university, research area) combination.
# NOTE(review): a single DocxTemplate object is re-rendered on every pass; confirm the
# installed docxtpl version supports rendering the same template object repeatedly.
count = 0  # number of documents written
index = 1  # 1-based sequence number embedded in each file name
for _, uni_row in df_uni.iterrows():
    univ = str(uni_row["University"]).strip()
    for _, area_row in df_area.iterrows():
        tpl.render(build_context(univ, area_row, config))

        # File name: sequence number, sanitised university, area with spaces removed.
        area_tag = area_row["Area"].replace(" ", "")
        out_path = os.path.join(output_dir, f"SOP_{index:02d}_{safe_name(univ)}_{area_tag}.docx")
        tpl.save(out_path)

        index += 1
        count += 1

## 6、把其中 1 份转成 PDF

In [175]:
import glob, os
from docx2pdf import convert

In [176]:
# Collect the generated documents in sorted order so the choice below is deterministic.
docx_files = sorted(glob.glob(os.path.join(output_dir, "*.docx")))
if not docx_files:
    # Fail with an actionable message instead of a bare IndexError on docx_files[0].
    raise FileNotFoundError(f"No .docx files found in {output_dir!r}; generate the documents first.")

# Convert the first document; the PDF gets the same path with a .pdf extension.
in_docx = docx_files[0]
out_pdf = os.path.splitext(in_docx)[0] + ".pdf"


In [177]:
# Convert the selected document to PDF at out_pdf.
# NOTE(review): docx2pdf presumably drives a local Microsoft Word installation —
# confirm Word is available on the machine running this cell.
convert(in_docx, out_pdf)

  0%|          | 0/1 [00:00<?, ?it/s]