# Tips

- Split the code into multiple cells (multiple cells can be executed independently and easily debugged)


- Try to use `os.path.join` to join paths instead of template string (The file path may be wrong when you run the code in other systems)

In [1]:
import pandas as pd
import os
import random
from docxtpl import DocxTemplate

# Step 1: 创建更新后的Word模板（如果不存在）
def create_template():
    from docx import Document
    doc = Document()
    
    content = """Dear Admission Committee,

My name is Boheng Shen, and I am pleased to apply for the Master of {{research_area}} program at {{university_name}}.

In my free time, I enjoy reading top-tier academic research to stay updated with the latest advancements in {{research_area}}. I occasionally study articles from leading ABS 4+ rated journals such as {{journal1}}, {{journal2}}, and {{journal3}}, among others. This habit not only deepens my understanding of theoretical and empirical approaches in {{research_area}} but also sharpens my ability to critically analyze complex phenomena.

I want to be a {{job}}. To achieve my dream, I have developed strong technical and analytical skills, including proficiency in {{skills}}, which enable me to transform complex data into meaningful insights.

I am particularly drawn to {{university_name}} due to its strong academic environment and research-oriented approach.

Thank you for considering my application. I am eager to contribute to and benefit from the rigorous academic culture at {{university_name}}.

Sincerely,

Boheng Shen"""
    
    doc.add_paragraph(content)
    doc.save('template.docx')
    print("✅ Created updated template.docx")

# 创建模板
create_template()

# Step 2: 从两个Excel文件中读取数据
try:
    # 读取大学列表
    df_uni = pd.read_excel('universities.xlsx')
    all_universities = df_uni['university'].tolist()
    print(f"✅ Loaded {len(all_universities)} universities")

    # 读取研究领域、期刊、技能和职业目标
    df_areas = pd.read_excel('Area_Journals_Job_Skills.xlsx')
    
    # 创建映射字典
    journals = {}
    skills_list = []
    jobs_list = []  # 新增：存储每个领域的职业目标
    research_areas = []  # 存储所有研究领域名称

    print("\n🔍 Parsing research areas and details:")
    for _, row in df_areas.iterrows():
        area = str(row['Area']).strip()
        journals_str = str(row['Journals'])
        skills_str = str(row['Skills'])
        job_str = str(row['Job']) 
        
        # 处理期刊字符串
        journals_clean = journals_str.replace(' and ', ', ').replace(' & ', ', ')
        journal_list = [j.strip() for j in journals_clean.split(',') if j.strip()]
        
        # 确保至少有3个期刊
        while len(journal_list) < 3:
            journal_list.append("General Research Journal")
        journal_list = journal_list[:3]  
        
        # 存储数据
        journals[area] = journal_list
        skills_list.append(skills_str)
        jobs_list.append(job_str)
        research_areas.append(area)
        
        print(f"   → {area}")
        print(f"     Journals: {journal_list}")
        print(f"     Skills: {skills_str}")
        print(f"     Job: {job_str}")
        print("   ---")

    print(f"\n✅ Successfully loaded {len(research_areas)} research areas\n")

except FileNotFoundError as e:
    print(f"❌ File not found error: {e}")
    print("Please ensure you have:")
    print("1. universities.xlsx with a 'university' column")
    print("2. Area_Journals_Job_Skills.xlsx with 'Area', 'Journals', 'Skills', 'job' columns")
    exit()
except KeyError as e:
    print(f"❌ Missing column in Excel: {e}")
    print("Please check that your Area_Journals_Job_Skills.xlsx has these columns:")
    print("   - Area")
    print("   - Journals") 
    print("   - Skills")
    print("   - job")
    exit()
except Exception as e:
    print(f"❌ Error reading Excel files: {e}")
    exit()

# Step 3: 创建输出目录
os.makedirs('HW_School_Application', exist_ok=True)

# Step 4: 按顺序生成90份申请信
application_count = 0
print("🚀 Generating 90 application letters...")

for university in all_universities:
    for area in research_areas:
        if application_count >= 90:
            break
            
        application_count += 1
        
        # 获取当前领域的数据
        area_journals = journals[area]
        skill_combo = random.choice(skills_list)
        job_title = jobs_list[research_areas.index(area)]  
        
        # 创建上下文（完全匹配新模板）
        context = {
            'university_name': university,
            'research_area': area,
            'journal1': area_journals[0],
            'journal2': area_journals[1], 
            'journal3': area_journals[2],
            'skills': skill_combo,
            'job': job_title  
        }
        
        # 加载模板并生成
        tpl = DocxTemplate('template.docx')
        tpl.render(context)
        
        # 生成文件名（清理特殊字符）
        clean_uni = university.replace(",", "").replace(" ", "_").replace("/", "_")
        clean_area = area.replace(" ", "_")
        filename = f'HW_School_Application/SOP_{application_count:02d}_{clean_uni}_{clean_area}.docx'
        tpl.save(filename)
        
        print(f"   ✅ Generated: SOP_{application_count:02d}_{clean_uni}_{clean_area}.docx")

print(f"\n Success! Generated {application_count} application letters.")
print("📁 All saved in 'HW_School_Application/' folder")
print("📄 Template updated and saved as 'template.docx'")




✅ Created updated template.docx
✅ Loaded 30 universities

🔍 Parsing research areas and details:
   → economics
     Journals: ['American Economic Review，Quarterly Journal of Economics', 'Econometrica', 'General Research Journal']
     Skills: Python, SQL, Linux
     Job: Data Scientist
   ---
   → finance
     Journals: ['Journal of Finance', 'Journal of Financial Economics', 'Review of Financial Studies']
     Skills: C++, Python,  Q/KDB
     Job: Quant Researcher
   ---
   → information management
     Journals: ['Information Systems Research', 'Journal of the Association for Information Systems', 'MIS Quarterly: Management Information Systems']
     Skills: Dataiku, Python, Bash
     Job: Product Manager
   ---

✅ Successfully loaded 3 research areas

🚀 Generating 90 application letters...
   ✅ Generated: SOP_01_Department_of_Economics_Harvard_University_economics.docx
   ✅ Generated: SOP_02_Department_of_Economics_Harvard_University_finance.docx
   ✅ Generated: SOP_03_Department_of