In [4]:
pip install docxtpl doc2pdf requests beautifulsoup4 pandas openpyxl




In [6]:
from docxtpl import DocxTemplate
import os

In [8]:
# Open the application-letter Word template with docxtpl.
# NOTE(review): no cell visible in this notebook reads `doc` afterwards —
# TemplateProcessor opens the template file itself — so this may be a leftover.
doc=DocxTemplate(r"D:\HuaweiMoveData\Users\24522\Desktop\application letter.docx")

In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import random
import itertools
import time


In [12]:
# Load the university ranking workbook; the printed output below shows two
# columns, "Rank" and "Institution".
df1 = pd.read_excel(r"C:\Users\24522\python25\university rank.xlsx")
# Show the first 90 rows (pandas abbreviates the middle of long frames).
print(df1[:90])

    Rank                                        Institution
0      1        Department of Economics, Harvard University
1      2  Economics Department, Massachusetts Institute ...
2      3  Department of Economics, University of Califor...
3      4     Department of Economics, University of Chicago
4      5                          Paris School of Economics
..   ...                                                ...
85    86  Department of Economics, George Washington Uni...
86    87  Department of Economics, Mitch Daniels School ...
87    88  School of Economics, Faculty of Arts and Socia...
88    89         Department of Economics, McGill University
89    90  Department of Economics, University of Washington

[90 rows x 2 columns]


In [31]:
# Stratified random pick: split ranks 1-90 into three tiers of 30 ranks each
# and draw 10 distinct ranks from every tier (30 ranks in total).
# The draw is unseeded, so every run selects a different set.
population1 = [*range(1, 31)]    # ranks 1-30
population2 = [*range(31, 61)]   # ranks 31-60
population3 = [*range(61, 91)]   # ranks 61-90
nums1 = random.sample(population1, k=10)
nums2 = random.sample(population2, k=10)
nums3 = random.sample(population3, k=10)
nums = [*nums1, *nums2, *nums3]

# Keep only the rows whose rank was drawn, then write them to a workbook in
# the current working directory.
df2 = df1.loc[df1["Rank"].isin(nums)]
df3 = pd.DataFrame(df2)
df3.to_excel("chosen_universities.xlsx", index=False, engine="openpyxl")

In [32]:
class TemplateProcessor:
    """Render one application letter per (university, research area) pair.

    The universities workbook must provide an ``Institution`` column and the
    area workbook must provide ``Area``, ``Journal`` and ``Skill`` columns.
    Each letter is rendered from the Word template with the context keys
    ``school``, ``major``, ``journal`` and ``skill``.
    """

    # Columns every area row must provide.
    _AREA_FIELDS = ('Area', 'Journal', 'Skill')
    # Characters that are removed from generated file names (reserved on Windows).
    _ILLEGAL_FILENAME_CHARS = frozenset('<>:"/\\|?*')
    # Fixed leading part of every generated file name.
    _FILE_PREFIX = "Application_"
    # Maximum length of the file-name stem, i.e. without the ".docx" extension.
    _MAX_STEM_LENGTH = 200

    # 1. Store the input paths; nothing is read until load_data() is called.
    def __init__(self, template_file, universities_file, area_file):
        """Remember the three input paths.

        Args:
            template_file: Path of the .docx template.
            universities_file: Path of the Excel workbook listing universities.
            area_file: Path of the Excel workbook listing research areas.
        """
        self.template_file = template_file
        self.universities_file = universities_file
        self.area_file = area_file
        self.universities_df = None
        self.area_df = None
        self.template = None

    # 2. Read the input files.
    def load_data(self):
        """Load both workbooks and the template.

        Returns:
            True when everything loaded; False (after printing the reason)
            when a file is missing or any other error occurs.
        """
        try:
            self.universities_df = pd.read_excel(self.universities_file)
            self.area_df = pd.read_excel(self.area_file)
            self.template=DocxTemplate(self.template_file)
            print("数据加载成功")
            return True
        except FileNotFoundError as e:
            print(f"文件未找到: {e}")
            return False
        except Exception as e:
            print(f"数据加载失败: {e}")
            return False

    @classmethod
    def _clean_component(cls, value):
        """Return ``str(value)`` without reserved or control characters."""
        return "".join(
            ch for ch in str(value)
            if ch not in cls._ILLEGAL_FILENAME_CHARS and ord(ch) >= 32
        )

    @classmethod
    def _build_file_name(cls, university, area):
        """Build ``Application_<university>_<area>.docx`` as a safe file name.

        When the stem exceeds the length limit, only the university part is
        shortened so that the ``_<area>`` suffix survives. Cutting the whole
        stem would drop the area and make every area of a long-named
        university map to the same file, silently overwriting earlier letters.
        """
        school = cls._clean_component(university)
        field = cls._clean_component(area)
        stem = f"{cls._FILE_PREFIX}{school}_{field}"
        if len(stem) > cls._MAX_STEM_LENGTH:
            suffix = f"_{field}"
            room = cls._MAX_STEM_LENGTH - len(cls._FILE_PREFIX) - len(suffix)
            if room > 0:
                stem = f"{cls._FILE_PREFIX}{school[:room]}{suffix}"
            else:
                # The area alone fills the budget; fall back to a plain cut.
                stem = stem[:cls._MAX_STEM_LENGTH]
        return stem + ".docx"

    # 3. Fill the template with one record and save it.
    def replace_and_save_document(self, university, area_data, output_dir="output"):
        """Render the template for one university/area pair and save it.

        Args:
            university: Institution name, used as ``school`` in the template.
            area_data: Mapping with the keys ``Area``, ``Journal``, ``Skill``.
            output_dir: Directory for the generated file; created if missing.

        Returns:
            The path of the written file, or None when ``area_data`` is
            incomplete or rendering/saving failed (the reason is printed).
        """
        try:
            # exist_ok avoids the check-then-create race of exists() + makedirs().
            os.makedirs(output_dir, exist_ok=True)

            # Validate that every required field is present.
            if not all(key in area_data for key in self._AREA_FIELDS):
                print(f"数据不完整: {area_data}")
                return None

            context = {
                "school": university,
                "major": area_data['Area'],
                "journal": area_data['Journal'],
                "skill": area_data['Skill']
            }

            # 4. Build a file name that is safe and keeps the area suffix.
            file_name = self._build_file_name(university, area_data['Area'])
            file_path = os.path.join(output_dir, file_name)

            # A fresh DocxTemplate is opened for every letter instead of reusing
            # self.template — presumably so each render starts from the
            # untouched template.
            template = DocxTemplate(self.template_file)
            template.render(context)
            template.save(file_path)

            print(f"文件已生成：{file_name}")
            return file_path

        except Exception as e:
            print(f"生成文档失败 {university}-{area_data.get('Area', 'Unknown')}: {e}")
            return None

    def onebyone(self, output_dir="applications_output"):
        """Generate a letter for every university x area combination.

        Loads the data, validates the required columns, skips rows with a
        blank institution or missing area fields, and prints a summary of how
        many files were written.

        Args:
            output_dir: Directory that receives the generated files.
        """
        # Load the inputs; stop when that fails.
        if not self.load_data():
            print("数据加载失败，无法继续")
            return

        # Verify the required columns exist.
        required_uni_columns = ['Institution']
        required_area_columns = list(self._AREA_FIELDS)

        if not all(col in self.universities_df.columns for col in required_uni_columns):
            print("大学数据文件缺少必要列")
            return

        if not all(col in self.area_df.columns for col in required_area_columns):
            print("领域数据文件缺少必要列")
            return

        # Refuse to continue with empty inputs.
        if self.universities_df.empty:
            print("大学数据文件为空")
            return

        if self.area_df.empty:
            print("领域数据文件为空")
            return

        file_count = 0
        success_count = 0

        # Walk every university/area combination.
        for _, uni_row in self.universities_df.iterrows():
            university = uni_row['Institution']
            # Skip rows without an institution name.
            if pd.isna(university) or str(university).strip() == "":
                print("跳过空大学名称")
                continue

            for _, area_row in self.area_df.iterrows():
                # Skip area rows with any missing field.
                if any(pd.isna(area_row[col]) for col in self._AREA_FIELDS):
                    print(f"跳过不完整的数据行: {area_row}")
                    continue

                file_count += 1
                area_data = {
                    'Area': area_row['Area'],
                    'Journal': area_row['Journal'],
                    'Skill': area_row['Skill']
                }

                result = self.replace_and_save_document(university, area_data, output_dir)
                if result:
                    success_count += 1

        print(f"处理完成: 成功生成 {success_count}/{file_count} 个申请文件")
        print(f"保存于 {output_dir} 目录中")

In [74]:
# Driver cell: build a processor from the three input files and generate every
# university x area letter into ./applications_output.
# NOTE(review): universities_file points at the full ranking workbook; the
# sampled "chosen_universities.xlsx" written by an earlier cell is not used
# here — confirm which list is intended.
if __name__=="__main__":
    processor = TemplateProcessor(
        template_file=r"D:\HuaweiMoveData\Users\24522\Desktop\application letter.docx",
        universities_file=r"C:\Users\24522\python25\university rank.xlsx", 
        area_file=r"D:\HuaweiMoveData\Users\24522\Desktop\area.xlsx"
    )
    processor.onebyone("applications_output")


文件已生成：Application_Department of Economics, Harvard University_FINANCE.docx
文件已生成：Application_Department of Economics, Harvard University_ECON.docx
文件已生成：Application_Department of Economics, Harvard University_SOC SCI.docx
文件已生成：Application_Economics Department, Massachusetts Institute of Technology (MIT)_FINANCE.docx
文件已生成：Application_Economics Department, Massachusetts Institute of Technology (MIT)_ECON.docx
文件已生成：Application_Economics Department, Massachusetts Institute of Technology (MIT)_SOC SCI.docx
文件已生成：Application_Department of Economics, University of California-Berkeley_FINANCE.docx
文件已生成：Application_Department of Economics, University of California-Berkeley_ECON.docx
文件已生成：Application_Department of Economics, University of California-Berkeley_SOC SCI.docx
文件已生成：Application_Department of Economics, University of Chicago_FINANCE.docx
文件已生成：Application_Department of Economics, University of Chicago_ECON.docx
文件已生成：Application_Department of Economics, University of Chicago_SOC S

OSError: [Errno 22] Invalid argument: 'applications_output\\Application_Facoltà di Economia, Università degli Studi di Roma "Tor Vergata"_FINANCE.docx'

In [93]:
!pip install docx2pdf
from docx2pdf import convert
def to_pdf(docx_path,pdf_path=None):
    """Convert a Word document to PDF with docx2pdf.

    Args:
        docx_path: Path of the source document (str or path-like).
        pdf_path: Target PDF path. When omitted, the source path is reused
            with its extension replaced by ``.pdf``.

    Returns:
        True once the conversion call has returned; errors raised by the
        converter propagate to the caller.
    """
    docx_path = os.fspath(docx_path)
    if pdf_path is None:
        # Swap only the extension. A plain str.replace('docx', 'pdf') would
        # also rewrite "docx" inside directory or file names, and would leave
        # the path unchanged for an upper-case ".DOCX" extension.
        pdf_path = os.path.splitext(docx_path)[0] + ".pdf"
    convert(docx_path, pdf_path)
    print("转换PDF成功")
    return True

# Convert one generated letter to PDF; with no explicit target the PDF path is
# derived from the .docx path by to_pdf.
if __name__ =="__main__":
    to_pdf(r"C:\Users\24522\python25\applications_output\Application_Barcelona School of Economics (BSE)_ECON.docx")



  0%|          | 0/1 [00:00<?, ?it/s]

转换PDF成功
