# Tips:

- Good job using advanced libraries such as `re`, `BeautifulSoup`, `itertools`, and `subprocess`!

- Write some comments to explain how `fetch_university_rankings` and `select_top_ranges` work.

- Split the last cell into multiple cells (multiple cells can be executed independently and easily debugged)

- For the last cell:
    - Some steps in the last cell are redundant; try to simplify them. For example, you have already installed `docxtpl` in the third cell, so you don't need to install it again in the last cell.
    - The part of `automatic recognize cols` is difficult to read; try to simplify it.

    

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

def fetch_university_rankings(url="https://ideas.repec.org/top/top.econdept.html", timeout=30):
    """Scrape institution names from an HTML ranking page.

    The page is downloaded, every ``<tr>`` inside a ``<table>`` is visited, and
    the text of the first ``<a href=...>`` found in each row is taken as a name.

    Args:
        url: Address of the ranking page to download.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook cell indefinitely.

    Returns:
        A list of names in the order their rows appear in the page.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    univ_list = []
    for tr in soup.select("table tr"):
        link = tr.find("a", href=True)
        if not link:
            # Rows without a hyperlink carry no name to collect.
            continue
        name = link.get_text(strip=True)
        # Filter out anomalous entries: the literal link text "short"
        # (presumably a page control rather than an institution -- verify
        # against the live page) and anything shorter than 5 characters.
        # "short" is exactly 5 characters long, so it needs its own check.
        if name.lower() == "short" or len(name) < 5:
            continue
        univ_list.append(name)
    return univ_list

def select_top_ranges(univ_list, n1=10, n2=10, n3=10):
    """Pick the leading entries from each of the three 30-place ranking tiers.

    The ranking is cut into positions 1-30, 31-60 and 61-90; the first ``n1``,
    ``n2`` and ``n3`` entries of those tiers are concatenated in tier order.
    Because each tier holds 30 entries, a larger request yields only those 30.

    Args:
        univ_list: Ranked sequence of institution names, best first.
        n1: How many entries to take from positions 1-30.
        n2: How many entries to take from positions 31-60.
        n3: How many entries to take from positions 61-90.

    Returns:
        The selected names: tier one first, then tier two, then tier three.

    Raises:
        ValueError: If fewer than 90 institutions are supplied.
    """
    if len(univ_list) < 90:
        raise ValueError(f"Ranking list has only {len(univ_list)} institutions, need at least 90")

    # (tier start offset, number of entries wanted from that tier)
    tier_plan = ((0, n1), (30, n2), (60, n3))
    first, second, third = (
        univ_list[start:start + 30][:quota] for start, quota in tier_plan
    )
    return first + second + third

# Script entry point: scrape the ranking, keep the first 10 names from each of
# the three 30-name tiers, and save the 30 selected names to an Excel workbook.
if __name__ == "__main__":
    all_unis = fetch_university_rankings()
    selected = select_top_ranges(all_unis, 10, 10, 10)
    df = pd.DataFrame({"university": selected})

    # Create the output folder if it does not exist yet.
    save_dir = r"D:\FFOutput"
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, "selected_universities.xlsx")

    # index=False keeps the DataFrame's row index out of the workbook.
    df.to_excel(save_path, index=False)
    print(f"Excel 文件已保存到: {save_path}")


Excel 文件已保存到: D:\FFOutput\selected_universities.xlsx


In [7]:
import pandas as pd
import os

# Step 4: choose the research areas.
research_areas = ["Economics", "Management", "Finance"]

# Step 5: top journals for each research area (hand-picked examples; replace
# them with journals you have looked up yourself).
top_journals = {
    "Economics": ["American Economic Review", "Quarterly Journal of Economics", "Journal of Political Economy"],
    "Management": ["Academy of Management Journal", "Strategic Management Journal", "Journal of Management"],
    "Finance": ["Journal of Finance", "Review of Financial Studies", "Journal of Financial Economics"]
}

# Step 6: skill list (examples of skills commonly listed on Glassdoor).
skills = ["Python", "SQL", "Tableau", "Data Analysis", "Machine Learning", "Excel"]

# Build the DataFrames: one row per (research area, journal) pair, and one
# row per skill.
data = [
    [area, journal]
    for area in research_areas
    for journal in top_journals[area]
]
df_research = pd.DataFrame(data, columns=["Research Area", "Top Journal"])
df_skills = pd.DataFrame({"Skills": skills})

# Output location.
save_dir = r"D:\FFOutput"
os.makedirs(save_dir, exist_ok=True)
save_path = os.path.join(save_dir, "research_journals_skills.xlsx")

# Write both tables into a single workbook, one sheet per table.
sheets = {"Research & Journals": df_research, "Skills": df_skills}
with pd.ExcelWriter(save_path, engine="openpyxl") as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print(f"Excel 文件已保存到: {save_path}")


Excel 文件已保存到: D:\FFOutput\research_journals_skills.xlsx


In [4]:
import sys
!{sys.executable} -m pip install docxtpl


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple


In [5]:
# Sanity check: confirm docxtpl can be imported in this kernel and print its version.
import docxtpl
print(docxtpl.__version__)


0.20.1


In [11]:
# -------------------- Complete SOP generation notebook --------------------
import sys
import subprocess
from pathlib import Path
import pandas as pd
import itertools
import datetime
import csv
import platform
import re

# ---------------- Install dependencies ----------------
# docxtpl is required: if the import fails, install it with pip for the
# interpreter running this kernel and import it again.
try:
    import docxtpl
except ModuleNotFoundError:
    print("docxtpl 未安装，正在安装...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "docxtpl"])
    import docxtpl
print("docxtpl 版本:", docxtpl.__version__)

# docx2pdf is optional: any failure while importing it only clears this flag,
# which the generation loop checks before attempting a PDF conversion.
try:
    from docx2pdf import convert as docx2pdf_convert
    DOCX2PDF_AVAILABLE = True
except Exception:
    DOCX2PDF_AVAILABLE = False

from docxtpl import DocxTemplate
from docx import Document

# ---------------- Parameters ----------------
INPUT_PATH = Path(r"D:\FFOutput\datainput.xlsx")  # spreadsheet holding universities / research / skills
TEMPLATE_PATH = Path(r"D:\FFOutput\template.docx")  # Word template containing {{ ... }} placeholders
OUTPUT_DIR = Path(r"D:\FFOutput")  # generated documents and manifest.csv are written here
APPLICANT_NAME = "Fangyu Zhao"
PROGRAM = "Master of Finance program"
MAX_DOCS = 90  # the generation loop stops once this many documents exist
DATE_STR = datetime.date.today().strftime('%Y-%m-%d')  # today's date as YYYY-MM-DD

# Make sure the output folder exists before anything is written into it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- Create default template ----------------
# If no template file exists yet, write a one-paragraph letter whose
# {{ ... }} placeholders (applicant_name, program, university, research_area,
# journals, skills, date) match the context keys used when rendering.
if not TEMPLATE_PATH.exists():
    print("模板不存在，创建默认模板:", TEMPLATE_PATH)
    doc = Document()
    doc.add_paragraph(
        "Dear Admission Committee,\n\n"
        "My name is {{ applicant_name }}, and I am pleased to apply for the {{ program }} at {{ university }}.\n\n"
        "In my free time, I enjoy reading top-tier academic research to stay updated with the latest advancements in {{ research_area }}. "
        "I occasionally study articles from journals such as {{ journals }}.\n\n"
        "I have practical skills such as {{ skills }}.\n\n"
        "Thank you for considering my application.\n\n"
        "Sincerely,\n\n{{ applicant_name }}\nDate: {{ date }}"
    )
    doc.save(TEMPLATE_PATH)

# ---------------- Read the input spreadsheet ----------------
# Preferred source is INPUT_PATH read as an Excel workbook. If that fails, the
# same file is retried as CSV (it may be a CSV saved under an Excel extension).
# If INPUT_PATH is missing, a sibling file with a .csv suffix is used instead.
if INPUT_PATH.exists():
    try:
        df = pd.read_excel(INPUT_PATH)
    except Exception as excel_error:
        try:
            df = pd.read_csv(INPUT_PATH)
        except Exception:
            # The CSV fallback failing says nothing useful about a real
            # workbook; re-raise the original Excel error (for example a
            # missing reader engine) so the actual cause stays visible.
            raise excel_error from None
else:
    csv_path = INPUT_PATH.with_suffix('.csv')
    if csv_path.exists():
        df = pd.read_csv(csv_path)
    else:
        raise FileNotFoundError(f"输入文件不存在: {INPUT_PATH} 或 {csv_path}")

# ---------------- Auto-detect columns ----------------
def _find_column(columns, keywords, fallback_index):
    """Return the first column whose lower-cased label contains any keyword.

    Labels are converted with ``str()`` first so that non-string headers
    (for example integer column labels) do not raise. When nothing matches,
    the column at ``fallback_index`` is returned if it exists, otherwise None.
    """
    for column in columns:
        label = str(column).lower()
        if any(keyword in label for keyword in keywords):
            return column
    if fallback_index < len(columns):
        return columns[fallback_index]
    return None


def _column_values(frame, column, default):
    """Return the non-null cells of ``column`` as stripped strings, or ``default`` if there is no column."""
    if not column:
        return list(default)
    return frame[column].dropna().astype(str).str.strip().tolist()


# Each role is matched by keyword first and by position (1st / 2nd / 3rd
# column) second.
col_univ = _find_column(df.columns, ('univers', 'school'), 0)
col_research = _find_column(df.columns, ('research', 'journal'), 1)
col_skills = _find_column(df.columns, ('skill',), 2)

if col_univ is None:
    # The sheet has no columns at all, so there is nothing to combine.
    raise RuntimeError("从输入数据生成组合失败")

universities = _column_values(df, col_univ, [])
research_entries = _column_values(df, col_research, ["economics"])
skills_entries = _column_values(df, col_skills, ["Python, SQL, Math"])

# Every (university, research, skills) combination, in university-major order.
product_iter = list(itertools.product(universities, research_entries, skills_entries))
if not product_iter:
    raise RuntimeError("从输入数据生成组合失败")

# ---------------- SOP generation ----------------
manifest = []  # one dict per generated document; written to manifest.csv afterwards
count = 0  # number of documents generated so far

def sanitize_filename(name):
    """Return ``name`` with every character outside A-Z, a-z, 0-9, ``_`` and ``-`` replaced by ``_``."""
    unsafe_chars = re.compile(r'[^A-Za-z0-9_-]')
    return unsafe_chars.sub('_', name)

# Walk the combinations in order, starting over from the first one when they
# run out (itertools.cycle), and stop once MAX_DOCS documents have been written.
for uni, research, skills in itertools.cycle(product_iter):
    if count >= MAX_DOCS:
        break

    # NOTE(review): the same input value fills both 'research_area' and
    # 'journals'; confirm whether the input sheet should supply them separately.
    ctx = {
        'applicant_name': APPLICANT_NAME,
        'program': PROGRAM,
        'university': uni,
        'research_area': research,
        'journals': research,
        'skills': skills,
        'date': DATE_STR,
    }

    # The running index keeps file names unique even when a university repeats.
    filename_base = f"SOP_{sanitize_filename(APPLICANT_NAME)}_{sanitize_filename(uni)}_{count+1}"
    docx_out = OUTPUT_DIR / f"{filename_base}.docx"

    # Load the template anew for each document, fill it in, and save it.
    template = DocxTemplate(str(TEMPLATE_PATH))
    template.render(ctx)
    template.save(str(docx_out))

    # Optional PDF export. pdf_out is set only after a successful conversion,
    # so the manifest never points at a PDF that was not actually produced.
    pdf_out = None
    if DOCX2PDF_AVAILABLE and platform.system() == 'Windows':
        pdf_candidate = OUTPUT_DIR / f"{filename_base}.pdf"
        try:
            docx2pdf_convert(str(docx_out), str(pdf_candidate))
        except Exception as e:
            # Best effort: report the failure and keep generating documents.
            print("PDF 转换失败:", e)
        else:
            pdf_out = pdf_candidate

    manifest.append({
        'index': count+1,
        'university': uni,
        'research_area': research,
        'journals': research,
        'skills': skills,
        'docx_path': str(docx_out),
        'pdf_path': str(pdf_out) if pdf_out else ''
    })
    count += 1

# ---------------- Save manifest ----------------
# Write one CSV row per generated document so the batch can be reviewed later.
manifest_path = OUTPUT_DIR / 'manifest.csv'
manifest_fields = ['index', 'university', 'research_area', 'journals', 'skills', 'docx_path', 'pdf_path']
with open(manifest_path, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=manifest_fields)
    writer.writeheader()
    writer.writerows(manifest)

print(f"生成完成 {count} 个文档，输出目录: {OUTPUT_DIR}")
print(f"manifest 保存至: {manifest_path}")
# PDF export is attempted only when docx2pdf imported successfully AND the
# platform is Windows, so the notice has to cover both conditions it mentions.
if not DOCX2PDF_AVAILABLE or platform.system() != 'Windows':
    print("未安装 docx2pdf 或非 Windows 系统，PDF 未生成。")


docxtpl 版本: 0.20.1


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: <unknown>.Open


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: (-2147023170, '远程过程调用失败。', None, None)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: Word.Application.Documents


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

PDF 转换失败: (-2147023170, '远程过程调用失败。', None, None)
生成完成 90 个文档，输出目录: D:\FFOutput
manifest 保存至: D:\FFOutput\manifest.csv
