# 生成申请信

### 下载大学排名

In [4]:
pip install requests beautifulsoup4 pandas openpyxl

Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [2]:
def scrape_repec_rankings(target_url, timeout=5):
    """Download a ranking page and extract the rank / institution pairs.

    The parser looks for an ``<h2>`` whose text is exactly "The rankings",
    takes the first ``<table>`` that follows it, skips the first row as a
    header and reads the rank from the first cell and the institution name
    from the second cell of every remaining row.

    Args:
        target_url: URL of the ranking page to download.
        timeout: Seconds to wait for the HTTP response (default 5).

    Returns:
        A list of ``{"Ranking": <str>, "Institutions": <str>}`` dicts in page
        order. Both values are the stripped text of the table cells.

    Raises:
        requests.RequestException: On connection problems, timeouts or an
            HTTP error status (via ``raise_for_status``).
        ValueError: If the "The rankings" heading or the table after it is
            not present in the downloaded page.
    """
    # Browser-like request headers; "Connection: keep-alive" asks for a persistent connection.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive"
    }

    response = requests.get(target_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML so the table can be navigated structurally.
    soup = BeautifulSoup(response.text, "html.parser")

    # Locate the table that follows the "The rankings" heading. Both lookups
    # return None when nothing matches, so fail with a clear message instead
    # of an AttributeError on None further down.
    rankings_heading = soup.find("h2", string="The rankings")
    if rankings_heading is None:
        raise ValueError(f'页面中未找到 "The rankings" 标题：{target_url}')
    ranking_table = rankings_heading.find_next("table")
    if ranking_table is None:
        raise ValueError(f'"The rankings" 标题之后未找到表格：{target_url}')
    table_rows = ranking_table.find_all("tr")

    rankings_data = []

    # Walk over the data rows; the first row is treated as the header.
    for row in table_rows[1:]:
        try:
            cells = row.find_all("td")
            # Rows without both a rank cell and a name cell cannot be parsed.
            if len(cells) < 2:
                print(f"跳过无效行（单元格数量不足）：{row}")
                continue

            rank = cells[0].get_text(strip=True)

            # Prefer the text of the link inside the name cell; fall back to
            # the plain cell text when the cell holds no link.
            univ_name_tag = cells[1].find("a")
            if univ_name_tag:
                university_name = univ_name_tag.get_text(strip=True)
            else:
                university_name = cells[1].get_text(strip=True)

            if not rank or not university_name:
                print(f"空数据行：{row}")
                continue

            rankings_data.append({
                "Ranking": rank,
                "Institutions": university_name
            })

        except Exception as row_error:
            # Best effort: one malformed row must not abort the whole scrape.
            print(f"解析单条排名出错：{str(row_error)}，跳过")
            continue

    return rankings_data

In [3]:
def save_to_excel(data, excel_filename="repec_university_rankings.xlsx"):
    """Write structured records to an Excel workbook and report the file name.

    Args:
        data: Anything ``pd.DataFrame`` accepts as input.
        excel_filename: Path of the ``.xlsx`` file to create.
    """
    # Build the table and write it in one go; index=False leaves out the row index column.
    pd.DataFrame(data).to_excel(excel_filename, index=False, engine="openpyxl")
    print(f"数据已保存到：{excel_filename}")

In [5]:
if __name__ == "__main__":
    # Page to scrape.
    target_ranking_url = "https://ideas.repec.org/top/top.econdept.html"
    ranked_universities = scrape_repec_rankings(target_ranking_url)

    # Write the workbook only when the scrape produced at least one row.
    if ranked_universities:
        save_to_excel(ranked_universities)

数据已保存到：repec_university_rankings.xlsx


### 随机筛选大学，生成 list1 Excel 表格（template_list1.xlsx）

In [6]:
import pandas as pd
import random
import itertools
import numpy as np

In [7]:
# Read the workbook from the working directory, which is where the scraping cell's
# save_to_excel() call writes it, instead of a machine-specific absolute path.
df1 = pd.read_excel("repec_university_rankings.xlsx")
print(df1[:90])

    Ranking                                       Institutions
0         1        Department of Economics, Harvard University
1         2  Economics Department, Massachusetts Institute ...
2         3  Department of Economics, University of Califor...
3         4     Department of Economics, University of Chicago
4         5                          Paris School of Economics
..      ...                                                ...
85       86  Department of Economics, Mitch Daniels School ...
86       87  School of Economics, Faculty of Arts and Socia...
87       88  Department of Economics, George Washington Uni...
88       89         Department of Economics, McGill University
89       90  Department of Economics, University of Washington

[90 rows x 2 columns]


In [8]:
# Stratified random draw: 10 ranks from each of the tiers 1-30, 31-60 and 61-90.
# A dedicated, seeded generator makes the draw repeatable, so the file names
# produced by later cells stay the same from one run to the next.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

population1 = list(range(1, 31))
population2 = list(range(31, 61))
population3 = list(range(61, 91))
numbers1 = rng.sample(population1, 10)
numbers2 = rng.sample(population2, 10)
numbers3 = rng.sample(population3, 10)
numbers_all = numbers1 + numbers2 + numbers3

# Keep the rows whose rank was drawn. The rank is coerced to a number first so the
# comparison with the integer ranks does not depend on how the column was typed
# when the workbook was read; values that are not numeric become NaN and never match.
ranking_numeric = pd.to_numeric(df1['Ranking'], errors='coerce')
df2 = df1[ranking_numeric.isin(numbers_all)]
df3 = df2.copy()
df3.to_excel("template_list1.xlsx", index=False, engine="openpyxl")  # save the selection as an Excel file

#### 手动创建 list2 Excel 表格（template_list2.xlsx，需包含 Area、Journals、Skills 三列）

### 循环生成申请信

In [96]:
pip install docxtpl

Note: you may need to restart the kernel to use updated packages.


In [12]:
import pandas as pd
from docxtpl import DocxTemplate
import os

In [13]:
class TemplateProcessor:
    """Render one application letter per (institution, area) combination.

    Every row of the institutions workbook is combined with every row of the
    areas workbook. For each pair the Word template is rendered with the
    context keys ``University_Name``, ``Area``, ``Journals`` and ``Skills``
    and saved as ``Application_<institution>_<area>.docx``.
    """

    def __init__(self, template_file, universities_file, area_file):
        """Store the input paths; nothing is read until ``load_data`` runs."""
        self.template_file = template_file  # Word template (.docx)
        self.universities_file = universities_file  # list1: workbook with an 'Institutions' column
        self.area_file = area_file  # list2: workbook with 'Area', 'Journals', 'Skills' columns
        self.universities_df = None
        self.area_df = None
        self.template = None

    def load_data(self):
        """Read both Excel workbooks and open the Word template."""
        self.universities_df = pd.read_excel(self.universities_file)
        self.area_df = pd.read_excel(self.area_file)

        self.template = DocxTemplate(self.template_file)

    def _new_template(self):
        """Return a freshly loaded template used to render exactly one document."""
        return DocxTemplate(self.template_file)

    @staticmethod
    def _safe_filename_part(text):
        """Keep letters, digits, spaces, '-' and '_' of ``text``; strip trailing whitespace."""
        return "".join(c for c in str(text) if c.isalnum() or c in (' ', '-', '_')).rstrip()

    def replace_and_save_document(self, university, area_data, output_dir="output"):
        """Render the template for one institution/area pair and save it.

        Args:
            university: Institution name inserted as ``University_Name``.
            area_data: Mapping with the keys ``'Area'``, ``'Journals'`` and
                ``'Skills'``.
            output_dir: Directory for the generated file; created if missing.

        Returns:
            The path of the saved ``.docx`` file.
        """
        # exist_ok avoids the separate existence check and tolerates an existing directory.
        os.makedirs(output_dir, exist_ok=True)

        # Values handed to the template.
        context = {
            "University_Name": university,
            "Area": area_data['Area'],
            "Journals": area_data['Journals'],
            "Skills": area_data['Skills']
        }

        # Both file-name parts go through the same cleaning so that characters
        # such as '/', ',' or parentheses can never end up in the path; spaces
        # in the area part are additionally turned into underscores.
        clear_university = self._safe_filename_part(university)
        clear_area = self._safe_filename_part(area_data['Area']).replace(" ", "_")

        filename = f"Application_{clear_university}_{clear_area}.docx"
        filepath = os.path.join(output_dir, filename)

        # Render into a fresh template object so the content of one letter
        # cannot carry over into the next, whatever the installed docxtpl
        # version does when render() is called repeatedly on one instance.
        template = self._new_template()
        template.render(context)
        template.save(filepath)

        print(f"文件已生成: {filename}")
        return filepath

    def process_all(self, output_dir="output"):
        """Load the inputs and generate a letter for every institution/area pair.

        Args:
            output_dir: Directory that receives the generated files.
        """
        self.load_data()

        if self.universities_df is None or self.area_df is None:
            print("数据加载失败")
            return

        # Number of files written.
        generated_files = 0
        # Every institution row is combined with every area row.
        for _, uni_row in self.universities_df.iterrows():
            university = uni_row['Institutions']

            for _, area_row in self.area_df.iterrows():
                area_data = {
                    'Area': area_row['Area'],
                    'Journals': area_row['Journals'],
                    'Skills': area_row['Skills']
                }

                result = self.replace_and_save_document(university, area_data, output_dir)
                if result:
                    generated_files += 1

        print(f"\n共生成 {generated_files} 个申请文件")
        print(f"文件保存在 '{output_dir}' 目录中")

In [14]:
if __name__ == "__main__":  # run only when executed directly, not when imported as a module
    # Input files: the Word template, the institutions workbook and the areas workbook.
    # NOTE(review): these are absolute, machine-specific paths; confirm they match the
    # folder the earlier cells write template_list1.xlsx to.
    input_paths = {
        "template_file": r"C:\Users\activ\ApplicationTemplate.docx",
        "universities_file": r"C:\Users\activ\template_list1.xlsx",
        "area_file": r"C:\Users\activ\template_list2.xlsx",
    }
    processor = TemplateProcessor(**input_paths)

    # Generate a letter for every combination.
    processor.process_all("applications_output")

文件已生成: Application_Economics Department Massachusetts Institute of Technology MIT_ECON.docx
文件已生成: Application_Economics Department Massachusetts Institute of Technology MIT_FINANCE.docx
文件已生成: Application_Economics Department Massachusetts Institute of Technology MIT_SOC_SCI.docx
文件已生成: Application_Paris School of Economics_ECON.docx
文件已生成: Application_Paris School of Economics_FINANCE.docx
文件已生成: Application_Paris School of Economics_SOC_SCI.docx
文件已生成: Application_Department of Economics Princeton University_ECON.docx
文件已生成: Application_Department of Economics Princeton University_FINANCE.docx
文件已生成: Application_Department of Economics Princeton University_SOC_SCI.docx
文件已生成: Application_Economics Department Brown University_ECON.docx
文件已生成: Application_Economics Department Brown University_FINANCE.docx
文件已生成: Application_Economics Department Brown University_SOC_SCI.docx
文件已生成: Application_Department of Economics University College London UCL_ECON.docx
文件已生成: Application_Department

### 选择一封已生成的申请信，转换为 PDF 版本

In [98]:
pip install docx2pdf

Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Downloading docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Installing collected packages: docx2pdf
Successfully installed docx2pdf-0.1.8
Note: you may need to restart the kernel to use updated packages.


In [16]:
from docx2pdf import convert
import os

def convert_docx_to_pdf_docx2pdf(docx_path, pdf_path=None):
    """Convert a Word document to PDF with docx2pdf.

    Args:
        docx_path: Path of the ``.docx`` file to convert.
        pdf_path: Destination path of the PDF. When omitted, the PDF is placed
            next to the source file with the same base name.

    Returns:
        True once the conversion call has returned.
    """
    if pdf_path is None:
        # Swap only the final extension. A plain str.replace('.docx', '.pdf')
        # would also rewrite '.docx' inside directory names and would leave
        # an upper-case '.DOCX' untouched.
        pdf_path = os.path.splitext(docx_path)[0] + '.pdf'

    # Run the conversion.
    convert(docx_path, pdf_path)
    print(f"{docx_path} -> {pdf_path}")
    return True

if __name__ == "__main__":
    # One of the files generated in this run is used here; the selected universities
    # differ between runs, so this path may have to be adjusted.
    source_docx = "C:/Users/activ/applications_output/Application_Department of Economics Boston College_ECON.docx"
    target_pdf = "Application_Department of Economics Boston College_ECON_pdf.pdf"
    convert_docx_to_pdf_docx2pdf(source_docx, target_pdf)

  0%|          | 0/1 [00:00<?, ?it/s]

C:/Users/activ/applications_output/Application_Department of Economics Boston College_ECON.docx -> Application_Department of Economics Boston College_ECON_pdf.pdf
