In [1]:
import pandas as pd
from docxtpl import DocxTemplate
from docx2pdf import convert
import os

In [2]:
#创建输出目录
output_dir = os.path.expanduser("D:/Year3/QuantEconomy/Homework/HW2_Grad_Application/SOP1")
os.makedirs(output_dir, exist_ok=True)
print(f"输出文件将保存到: {output_dir}")

输出文件将保存到: D:/Year3/QuantEconomy/Homework/HW2_Grad_Application/SOP1


In [3]:
#读取大学列表universities.xlsx
universities_df = pd.read_excel("universities.xlsx")
print(f"已读取 {len(universities_df)} 所大学")

已读取 30 所大学


In [4]:
universities_df

Unnamed: 0,university_name
0,Harvard University
1,Massachusetts Institute of Technology (MIT)
2,University of California-Berkeley
3,University of Chicago
4,Princeton University
5,Stanford University
6,Yale University
7,Oxford University
8,Columbia University
9,Brown University


In [5]:
#读取研究领域信息
research_df = pd.read_excel("research_info.xlsx")
print(f"已读取 {len(research_df)} 个研究领域")

已读取 3 个研究领域


In [6]:
research_df

Unnamed: 0,research_area,journal1,journal2,journal3,job,skills
0,finance,Journal of Finance,Review of Financial Studies,Journal of Financial Economics,Quant Researcher,"Python, SQL, Math, PowerBI, Tableau"
1,economics,American Economic Review,Econometrica,Journal of Political Economy,Economic Analyst,"R, Stata, Econometrics, Data Analysis"
2,account,Accounting Review,Journal of Accounting Research,Journal of Accounting and Economics,Financial Accountant,"Excel, Financial Reporting, Tax, Auditing, GAAP"


In [7]:
#定义生成SOP的函数
def generate_sop(university, research_area, job, skills, journal1, journal2, journal3, template_path="template.docx"):
    """
    生成个人陈述文档
    - university: 大学名称
    - research_area: 研究领域
    - job: 目标职位
    - skills: 技能列表
    - journal1, journal2, journal3: 相关期刊
    - template_path: 模板文件路径
    返回:
    - 生成的Word文档路径
    """
    #加载模板
    doc = DocxTemplate(template_path)
    
    #准备替换数据
    context = {
        'university': university,
        'research_area': research_area,
        'job': job,
        'skills': skills,
        'journal1': journal1,
        'journal2': journal2,
        'journal3': journal3
    }
    
    #替换模板中的变量
    doc.render(context)
    
    #生成文件名
    #safe_university = university.replace(" ", "_").replace("/", "_")
    filename = f"SOP_{university}_{research_area}.docx"
    output_path = os.path.join(output_dir, filename)
    
    #保存生成的文档
    doc.save(output_path)
    return output_path

In [8]:
#循环生成所有组合的SOP
total_documents = 0
pdf_files = []
university_count = len(universities_df)

#循环处理每所大学
for idx, university_row in universities_df.iterrows():
    university = university_row['university_name']
    #显示当前处理的大学
    print(f"处理第 {idx+1}/{university_count} 所大学: {university}")
    
    #循环处理每个研究领域
    for _, research_row in research_df.iterrows():
        #生成Word文档
        word_path = generate_sop(
            university=university,
            research_area=research_row['research_area'],
            job=research_row['job'],
            skills=research_row['skills'],
            journal1=research_row['journal1'],
            journal2=research_row['journal2'],
            journal3=research_row['journal3']
        )

        #仅前3个文档执行PDF转换
        if total_documents < 3:  
            try:
                pdf_path = word_path.replace(".docx", ".pdf")
                convert(word_path, pdf_path)
                pdf_files.append(pdf_path)
                print(f"  已生成Word并转换为PDF: {os.path.basename(pdf_path)} ")
            except Exception as e:
                print(f"  转换PDF时出错 {os.path.basename(word_path)}: {str(e)}")
                
        total_documents += 1


print(f"\n生成完成！共生成 {total_documents} 份文档")
print(f"其中 {len(pdf_files)} 份已转换为PDF")
print(f"所有文件已保存到: {output_dir}")

处理第 1/30 所大学: Harvard University


  0%|          | 0/1 [00:00<?, ?it/s]

  已生成Word并转换为PDF: SOP_Harvard University_finance.pdf 


  0%|          | 0/1 [00:00<?, ?it/s]

  已生成Word并转换为PDF: SOP_Harvard University_economics.pdf 


  0%|          | 0/1 [00:00<?, ?it/s]

  已生成Word并转换为PDF: SOP_Harvard University_account.pdf 
处理第 2/30 所大学: Massachusetts Institute of Technology (MIT)
处理第 3/30 所大学: University of California-Berkeley
处理第 4/30 所大学: University of Chicago
处理第 5/30 所大学: Princeton University
处理第 6/30 所大学: Stanford University
处理第 7/30 所大学: Yale University
处理第 8/30 所大学: Oxford University
处理第 9/30 所大学: Columbia University
处理第 10/30 所大学: Brown University
处理第 11/30 所大学: Boston University
处理第 12/30 所大学: University College London (UCL)
处理第 13/30 所大学: University of California-San Diego (UCSD)
处理第 14/30 所大学: University of British Columbia
处理第 15/30 所大学: Dartmouth College
处理第 16/30 所大学: Paris School of Economics
处理第 17/30 所大学: Barcelona School of Economics (BSE)
处理第 18/30 所大学: New York University (NYU)
处理第 19/30 所大学: University of Pennsylvania
处理第 20/30 所大学: London School of Economics (LSE)
处理第 21/30 所大学: University of Michigan
处理第 22/30 所大学: University of British Columbia
处理第 23/30 所大学: University of California-Los Angeles (UCLA)
处理第 24/30 所大学: University