In [None]:
pip install docxtpl python-docx pandas openpyxl

In [None]:
pip install docxtpl python-docx docx2pdf pandas requests beautifulsoup4 openpyxl

In [None]:
import pandas as pd
from docxtpl import DocxTemplate
import os
from pathlib import Path
from docx import Document
import random

def create_custom_template():
    """Write the statement-of-purpose letter template to a Word file.

    The letter text carries ``{{ ... }}`` placeholders (``program``,
    ``university``, ``name``, ``field``, ``journals``, ``career_goal`` and
    ``skills``) that are filled in later when the template is rendered.

    Returns:
        str: Path of the saved template, ``"sop_template_custom.docx"``
        (relative to the current working directory).
    """
    # Letter body, one entry per paragraph; empty strings are spacer lines.
    letter_lines = (
        "To the Graduate Admissions Committee,",
        "",
        "I am writing with great enthusiasm to express my interest in the {{ program }} at {{ university }}. My name is {{ name }}, and I believe my academic background and research interests align perfectly with the objectives of your distinguished program.",
        "",
        "Throughout my academic journey, I have developed a strong passion for {{ field }} research. I regularly engage with cutting-edge scholarship by reading articles from premier academic journals in this field, particularly {{ journals }}. This consistent exposure to high-quality research has equipped me with robust analytical capabilities and a nuanced understanding of contemporary methodologies.",
        "",
        "My career aspiration is to become a {{ career_goal }}, leveraging quantitative techniques to solve complex financial problems. To prepare for this career path, I have acquired proficiency in various technical tools and methodologies, including {{ skills }}.",
        "",
        "What particularly attracts me to {{ university }} is its reputation for academic excellence and its commitment to fostering innovative research in quantitative finance. The interdisciplinary approach and research opportunities available at your institution would provide an ideal environment for me to further develop my skills and contribute meaningfully to the field.",
        "",
        "Thank you for considering my application. I am excited about the possibility of joining your academic community and am confident that I would thrive in the challenging yet supportive environment at {{ university }}.",
        "",
        "Best regards,",
        "",
        "{{ name }}",
    )

    document = Document()
    for line in letter_lines:
        if not line.strip():
            # Blank entry: emit an empty paragraph as vertical spacing.
            document.add_paragraph()
            continue
        document.add_paragraph(line)

    saved_to = "sop_template_custom.docx"
    document.save(saved_to)
    return saved_to

def read_excel_files(data_dir=r"D:\Desktop\HW_School_Application"):
    """Load the university, journal and skill workbooks.

    Args:
        data_dir: Folder that contains ``University.xlsx``, ``Journal.xlsx``
            and ``Skill.xlsx``. Defaults to the location that used to be
            hard-coded, so calling with no arguments behaves as before.

    Returns:
        tuple: ``(university_df, journal_df, skill_df)`` on success, or
        ``(None, None, None)`` if any of the three workbooks cannot be read.
    """
    base_dir = Path(data_dir)
    university_path = base_dir / "University.xlsx"
    journal_path = base_dir / "Journal.xlsx"
    skill_path = base_dir / "Skill.xlsx"

    # Only the reads live inside the try so unrelated errors are not masked.
    try:
        university_df = pd.read_excel(university_path)
        journal_df = pd.read_excel(journal_path)
        skill_df = pd.read_excel(skill_path)
    except Exception as e:
        # Best-effort boundary: missing files, missing Excel engine or corrupt
        # workbooks are all reported the same way and signalled with Nones.
        print(f"读取Excel文件时出错: {e}")
        return None, None, None

    print("成功读取所有Excel文件")

    # Echo the column names so a mismatch with the expected headers is visible.
    print(f"大学文件列名: {list(university_df.columns)}")
    print(f"期刊文件列名: {list(journal_df.columns)}")
    print(f"技能文件列名: {list(skill_df.columns)}")

    return university_df, journal_df, skill_df

def organize_journals_by_area(journal_df):
    """Group journal names by research area.

    Rows whose ``Area`` or ``Journal`` cell is blank are ignored; otherwise a
    blank area would become a bogus key with an empty list and a blank journal
    would put a NaN into a list that callers join as text.

    Args:
        journal_df: DataFrame expected to provide ``Area`` and ``Journal``
            columns.

    Returns:
        dict: Maps each area to the list of its journals. Areas and journals
        keep their order of first appearance. An empty dict is returned when
        a required column is missing.
    """
    if 'Area' not in journal_df.columns or 'Journal' not in journal_df.columns:
        print("期刊文件缺少必要的列(Area或Journal)")
        return {}

    # Keep only rows where both the area and the journal are present.
    complete_rows = journal_df.dropna(subset=['Area', 'Journal'])

    field_journals = {}
    for area, journal in zip(complete_rows['Area'], complete_rows['Journal']):
        field_journals.setdefault(area, []).append(journal)

    return field_journals

def get_skills_list(skill_df):
    """Extract the list of skills from the skill worksheet.

    The skill column is the first match among a set of known header names;
    if none matches, the first column of the sheet is used. A sheet without
    any columns yields a built-in default list.

    Args:
        skill_df: DataFrame loaded from the skill workbook.

    Returns:
        list: Non-missing values of the chosen column, or the default skills
        when the DataFrame has no columns.
    """
    print(f"技能文件列名: {list(skill_df.columns)}")

    known_headers = ('Skill', 'Skills', '技术', '能力', '技能')
    chosen = next(
        (header for header in known_headers if header in skill_df.columns),
        None,
    )

    if chosen is None:
        if len(skill_df.columns) == 0:
            print("无法找到技能列，使用默认技能列表")
            return ['Python', 'SQL', 'R', 'MATLAB', 'Stata', 'Excel', 'PowerBI', 'Tableau']
        # No recognised header: fall back to whatever the first column is.
        chosen = skill_df.columns[0]
        print(f"使用第一列作为技能列: {chosen}")

    # Blank cells are dropped before converting to a plain list.
    found = skill_df[chosen].dropna().tolist()
    print(f"找到 {len(found)} 个技能")
    return found

def format_skills_text(selected_skills):
    """Join the chosen skills into the phrase inserted into the letter.

    Args:
        selected_skills: Sequence of skill names (strings).

    Returns:
        str: ``""`` for an empty sequence, otherwise the skills separated by
        ``", "`` and followed by ``" and etc"``, e.g.
        ``"Python, SQL, R and etc"``.
    """
    if not selected_skills:
        return ""
    # Join every skill with the separator; previously the last skill was
    # concatenated directly onto the one before it ("SQLR").
    return ", ".join(selected_skills) + " and etc"

def select_skills(skill_list, num_skills=4):
    """Pick ``num_skills`` skills at random from ``skill_list``.

    Args:
        skill_list: Available skills. It is not modified.
        num_skills: How many skills to return.

    Returns:
        list: ``num_skills`` distinct skills when enough are available. When
        the list is too short, every skill is kept and random repeats are
        appended to reach ``num_skills``. An empty ``skill_list`` yields an
        empty list instead of raising.
    """
    available = len(skill_list)
    if available >= num_skills:
        # Enough skills: sample without replacement.
        return random.sample(skill_list, num_skills)

    print(f"警告: 技能列表只有 {available} 个技能，但需要选择 {num_skills} 个")

    if not skill_list:
        # Nothing to repeat; random.choice would raise on an empty sequence.
        return []

    # Too few skills: keep them all and pad with randomly repeated entries.
    selected = list(skill_list)
    selected.extend(
        random.choice(skill_list) for _ in range(num_skills - available)
    )
    return selected

def generate_sop_documents():
    """Render one application letter per (university, research area) pair.

    Reads the three workbooks, builds the Word template, and for every
    university row writes one ``SOP_<university>_<area>.docx`` per area that
    has journal data, each with a freshly drawn set of skills.

    Returns:
        pathlib.Path | None: The output directory, or ``None`` when the input
        data cannot be used (unreadable workbooks, missing ``University``
        column, or no skills).
    """
    university_df, journal_df, skill_df = read_excel_files()

    if university_df is None or journal_df is None or skill_df is None:
        print("无法读取必要的Excel文件，程序终止")
        return None

    # Fail with a clear message instead of a KeyError on the first row.
    if 'University' not in university_df.columns:
        print("大学文件缺少必要的列(University)")
        return None

    field_journals = organize_journals_by_area(journal_df)

    all_skills = get_skills_list(skill_df)
    if not all_skills:
        # Without skills the letter would contain an empty "including ." clause.
        print("技能列表为空，无法生成申请信")
        return None

    # Area code in the journal sheet -> wording used in the letter.
    # The dict order also defines the order in which areas are processed.
    field_names = {
        'ECON': 'economics',
        'ACCOUNT': 'accounting',
        'FINANCE': 'finance'
    }

    # Resolve the journal text once per area (it does not depend on the
    # university) and warn once about areas that have nothing to cite.
    journals_by_area = {}
    for area in field_names:
        journals = [
            str(journal)
            for journal in field_journals.get(area, [])
            if not pd.isna(journal)
        ]
        if journals:
            journals_by_area[area] = ', '.join(journals)
        else:
            print(f"警告: 领域 {area} 没有期刊数据，跳过")

    output_dir = Path(r"D:\Desktop\HW_School_Application\Generated_SOPs")
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"文档将保存到: {output_dir}")

    template_path = create_custom_template()

    personal_info = {
        'name': 'Yuki',
        'program': 'Master of Finance Program',
        'career_goal': 'quant researcher'
    }

    # Spaces and characters that are not allowed in Windows file names are
    # replaced with underscores when building the output file name.
    unsafe_chars = str.maketrans({ch: '_' for ch in ' /\\:*?"<>|'})

    count = 0

    for raw_name in university_df['University']:
        # Blank cells come back as NaN; they cannot name a letter or a file.
        if pd.isna(raw_name) or not str(raw_name).strip():
            print("警告: 跳过缺少大学名称的行")
            continue
        university = str(raw_name).strip()

        for area, journals_text in journals_by_area.items():
            selected_skills = select_skills(all_skills)
            skills_text = format_skills_text(selected_skills)

            context = {
                'name': personal_info['name'],
                'university': university,
                'program': personal_info['program'],
                'field': field_names[area],
                'career_goal': personal_info['career_goal'],
                'skills': skills_text,
                'journals': journals_text
            }

            # Load a fresh template for every letter so each render starts
            # from the pristine placeholders rather than relying on a
            # previously rendered instance being reset.
            template = DocxTemplate(template_path)
            template.render(context)

            filename = f"SOP_{university.translate(unsafe_chars)}_{area}.docx"
            output_path = output_dir / filename
            template.save(output_path)

            count += 1
            print(f"已生成: {filename}")

    print(f"\n成功生成 {count} 份申请信!")
    return output_dir

def convert_to_pdf(output_dir):
    """Convert every ``.docx`` file in ``output_dir`` to PDF using docx2pdf.

    Each file is converted independently: a failure is reported and the
    remaining files are still processed. The final summary counts only the
    files that were actually converted.

    Args:
        output_dir: Directory object exposing ``glob`` (e.g. ``pathlib.Path``)
            that holds the Word documents.

    Returns:
        None. Progress and errors are printed.
    """
    try:
        from docx2pdf import convert
    except ImportError:
        print("docx2pdf未安装，跳过PDF转换")
        return

    try:
        word_files = list(output_dir.glob("*.docx"))
        converted = 0
        for word_file in word_files:
            pdf_file = word_file.with_suffix('.pdf')
            try:
                convert(str(word_file), str(pdf_file))
            except Exception as e:
                # Keep going: one bad document should not stop the batch.
                print(f"转换 {word_file.name} 为PDF时出错: {e}")
            else:
                converted += 1
                print(f"已转换为PDF: {pdf_file.name}")

        # Report successes only, not the number of files attempted.
        print(f"\n成功转换 {converted} 个文件为PDF格式")

    except Exception as e:
        print(f"PDF转换过程中出错: {e}")

def main():
    """Run the whole workflow from the command line or a notebook cell.

    Generates the letters, lists up to ten of the ``.docx`` files found in
    the output directory, optionally converts them to PDF after asking the
    user, and finally tries to open the output directory.
    """
    print("开始生成申请信...")

    output_dir = generate_sop_documents()

    if output_dir is None:
        print("生成申请信失败")
        return

    print(f"\n处理完成! 文件保存在: {output_dir}")

    # Show at most the first ten files to keep the output short.
    files = list(output_dir.glob("*.docx"))
    print(f"\n已生成 {len(files)} 个文件:")
    for i, file in enumerate(files[:10], 1):
        print(f"  {i}. {file.name}")
    if len(files) > 10:
        print(f"  ... 以及另外 {len(files) - 10} 个文件")

    # Only the prompt itself is guarded: no usable stdin (or Ctrl-C at the
    # prompt) means "skip", while errors during conversion are handled by
    # convert_to_pdf and an interrupt there is no longer swallowed.
    try:
        response = input("\n是否要将Word文档转换为PDF? (y/n): ")
    except (EOFError, KeyboardInterrupt, NotImplementedError, OSError):
        print("跳过PDF转换")
    else:
        if response.strip().lower() == 'y':
            convert_to_pdf(output_dir)

    # os.startfile exists only on Windows (AttributeError elsewhere) and
    # raises OSError when the directory cannot be opened.
    try:
        os.startfile(output_dir)
        print(f"已尝试打开输出目录: {output_dir}")
    except (AttributeError, OSError):
        print(f"无法自动打开目录，请手动访问: {output_dir}")

# Seed the random module with a fixed value so results are reproducible
# across runs. Note this runs at import time, not only when executed as a
# script.
random.seed(42)

# Run the main program when the file is executed directly
if __name__ == "__main__":
    main()