# HW3 2023201150

In [192]:
import pandas as pd
import requests, re
import random
from docxtpl import DocxTemplate
from docx2pdf import convert
from bs4 import BeautifulSoup, Tag

## 1 抓取所需数据

### 1.1 大学数据抓取与Excel生成

In [193]:
# Download the RePEc/IDEAS ranking page of economics departments.
url = "https://ideas.repec.org/top/top.econdept.html"
response = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the run
response.raise_for_status()  # fail on an HTTP error instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, "lxml")

# The page source shows that the "Top 12.5%..." caption sits in an <h3> heading.
# The value handed to a `string` filter may be None (e.g. a heading with nested tags),
# so test for None before using `in`.
heading_text = "Top 12.5% Economics Departments, all authors, all publication years"
title = soup.find("h3", string=lambda s: s is not None and heading_text in s)
if title is None:
    raise ValueError(f"Heading {heading_text!r} not found on {url}")
university_table = title.find_next("table")
if university_table is None:
    raise ValueError(f"No table follows the heading {heading_text!r} on {url}")

# Collect the rows. The institution name is taken from the <a> tag only, which leaves out the
# location held in the <p> tag; locations can span several words and could not be separated later.
rank_pattern = re.compile(r"\d+")
rows = []
for tr in university_table.select("tr"):
    tds = tr.select("td")
    if len(tds) < 2:  # skip empty rows
        continue
    num = rank_pattern.match(tds[0].get_text(strip=True))  # rank is the leading number of the first cell
    uni_tag = tds[1].find("a")  # second cell holds the department and university
    if num is None or uni_tag is None:
        # Not a ranking row: no leading rank number or no institution link.
        continue
    rows.append({"Rank": int(num.group(0)), "Department_University": uni_tag.get_text(strip=True)})
print(rows)

[{'Rank': 1, 'Department_University': 'Department of Economics, Harvard University'}, {'Rank': 2, 'Department_University': 'Economics Department, Massachusetts Institute of Technology (MIT)'}, {'Rank': 3, 'Department_University': 'Department of Economics, University of California-Berkeley'}, {'Rank': 4, 'Department_University': 'Department of Economics, University of Chicago'}, {'Rank': 5, 'Department_University': 'Paris School of Economics'}, {'Rank': 6, 'Department_University': 'Department of Economics, Princeton University'}, {'Rank': 7, 'Department_University': 'Department of Economics, Stanford University'}, {'Rank': 8, 'Department_University': 'Economics Department, Yale University'}, {'Rank': 9, 'Department_University': 'Toulouse School of Economics (TSE)'}, {'Rank': 10, 'Department_University': 'Department of Economics, Oxford University'}, {'Rank': 11, 'Department_University': 'Department of Economics, School of Arts and Sciences, Columbia University'}, {'Rank': 12, 'Departmen

In [194]:
# Gather the scraped rows into a DataFrame (easier to sample from and to export to Excel),
# ordered by rank.
rank_df = pd.DataFrame(rows)
rank_df = rank_df.sort_values(by="Rank")
rank_df
# The rank values line up with the row numbers here, so there is no separate check
# that all of the first 90 ranks are present.

Unnamed: 0,Rank,Department_University
0,1,"Department of Economics, Harvard University"
1,2,"Economics Department, Massachusetts Institute ..."
2,3,"Department of Economics, University of Califor..."
3,4,"Department of Economics, University of Chicago"
4,5,Paris School of Economics
...,...,...
333,334,"Department of Economics, Faculty of Economic a..."
334,335,"Department of Economics, Colorado State Univer..."
335,336,"School of Economics, Fudan University"
336,337,"Fakultät Wirtschaftswissenschaften, Leuphana U..."


In [195]:
def extract_uni(dep_uni):
    """Return the university part of a "Department, ..., University" label.

    The label is split on commas and the last non-empty piece is taken as the
    university name. A label without a comma (e.g. "Paris School of Economics")
    is already a bare institution name and is returned as a whole. Surrounding
    whitespace is stripped in every case.

    Args:
        dep_uni: Label text; converted with ``str()`` before splitting.

    Returns:
        The university name, or an empty string when the label holds no text.
    """
    pieces = [piece.strip() for piece in str(dep_uni).split(",")]
    # Ignore empty pieces so a stray trailing comma does not produce an empty name.
    named = [piece for piece in pieces if piece]
    return named[-1] if named else ""

In [196]:
# Reduce the combined department/university label to the university name only,
# then rename the column to match its new content.
rank_df = rank_df.assign(
    Department_University=rank_df["Department_University"].map(extract_uni)
).rename(columns={"Department_University": "University"})

In [197]:
# Draw 10 universities at random from each rank band (1-30, 31-60, 61-90) for the Excel export.
# `between` is inclusive on both ends; the fixed random_state makes the draw repeatable.
top30 = rank_df[rank_df["Rank"].between(1, 30)].sample(n=10, random_state=17)
top60 = rank_df[rank_df["Rank"].between(31, 60)].sample(n=10, random_state=17)
top90 = rank_df[rank_df["Rank"].between(61, 90)].sample(n=10, random_state=17)
top90

Unnamed: 0,Rank,University
60,61,University of Virginia
78,79,Universidad Carlos III de Madrid
81,82,Arizona State University
71,72,Ohio State University
65,66,University of California-Santa Barbara (UCSB)
83,84,Hebrew University of Jerusalem
87,88,George Washington University
79,80,Universitetet i Oslo
84,85,"Università degli Studi di Roma ""Tor Vergata"""
86,87,University of Sydney


In [198]:
# Stack the three samples into one frame and drop the Rank column.
university_df = pd.concat([top30, top60, top90], ignore_index=True).drop(columns=["Rank"])
# Write the selected universities to Excel.
university_df.to_excel("university.xlsx", index=False)

### 1.2 抓取领域和期刊数据

In [199]:
# Download the page that holds the area/journal table.
url = "https://www.scmor.com/view/10554"
response = requests.get(url, timeout=30)  # a timeout keeps a stalled connection from hanging the run
response.raise_for_status()  # fail on an HTTP error instead of parsing an error page
html = response.text
soup = BeautifulSoup(html, "lxml")

In [200]:
# The heading text is nested inside child tags, so match on the full text under each <h3>.
title = soup.find(lambda t: t.name == "h3" and "ABS4*" in t.get_text(strip=True))
if title is None:
    raise ValueError("No <h3> heading containing 'ABS4*' was found on the page")
area_table = title.find_next("table")
if area_table is None:
    raise ValueError("No table follows the 'ABS4*' heading")

# Second cell of each row is the area, third cell is the journal title.
rows = []
for tr in area_table.select("tr"):
    tds = tr.select("td")
    if len(tds) >= 3:
        area_text = tds[1].get_text(strip=True)
        journal_text = tds[2].get_text(strip=True)
        rows.append({"Area": area_text, "Journal": journal_text})
if not rows:
    # Without rows the frame has no "Area" column and sort_values would fail with a KeyError.
    raise ValueError("The table after the 'ABS4*' heading has no rows with at least three cells")

# Gather the rows into a DataFrame, ordered by area.
area_journal_df = pd.DataFrame(rows).sort_values("Area")
area_journal_df

Unnamed: 0,Area,Journal
0,ACCOUNT,Accounting Review
1,ACCOUNT,"Accounting, Organizations and Society"
2,ACCOUNT,Journal of Accounting and Economics
3,ACCOUNT,Journal of Accounting Research
4,ECON,American Economic Review
5,ECON,Annals of Statistics
6,ECON,Econometrica
7,ECON,Journal of Political Economy
8,ECON,Quarterly Journal of Economics
9,ECON,Review of Economic Studies


In [201]:
# Collect every distinct area, in order of first appearance.
areas = area_journal_df["Area"].drop_duplicates().tolist()

# Pick three areas that each have at least three journals in the table.
# Filtering first (instead of re-drawing until a draw is valid) cannot loop forever
# when too few areas qualify.
journal_counts = area_journal_df["Area"].value_counts()
eligible_areas = [area for area in areas if journal_counts[area] >= 3]
if len(eligible_areas) < 3:
    raise ValueError(
        f"Need 3 areas with at least 3 journals each, found {len(eligible_areas)}: {eligible_areas}"
    )
area_3 = random.sample(eligible_areas, 3)

# For each chosen area pick three journals (all of them when the area has exactly three).
selected_journal = {}
for a in area_3:
    journals = area_journal_df.loc[area_journal_df["Area"] == a, "Journal"].tolist()
    if len(journals) > 3:
        selected_journal[a] = random.sample(journals, 3)
    else:
        selected_journal[a] = journals

# Flatten to one row per (area, journal) pair so it can be exported.
rows = [
    {'Area': area, 'Journal': journal}
    for area, area_journals in selected_journal.items()
    for journal in area_journals
]
selected_area_journal_df = pd.DataFrame(rows)

# Hand-written skills, added as a third column of the same sheet.
# insert() needs exactly one value per row: 3 areas x 3 journals = 9 rows, 9 skills.
skills = ["Python", "C++", "R", "time-series analysis", "pattern recognition", "NLP", "Wind", "SQL", "Math"]
selected_area_journal_df.insert(loc=2, column='Skills', value=skills)

# Export to Excel.
selected_area_journal_df.to_excel("area_journal_skill.xlsx",index=False)

## 2 文书套用

In [219]:
def generate_doc(university, a):
    """Render template.docx for one university/area pair and save it under outputs/.

    Three skills are drawn at random from the module-level ``skills`` list; the
    journals are the ones stored for area ``a`` in the module-level
    ``selected_journal`` mapping.

    Args:
        university: University name; used in the document and in the file name.
        a: Area; must be a key of ``selected_journal``. It is lower-cased for
            the document and the file name.

    Returns:
        The path of the saved .docx file.

    Raises:
        KeyError: If ``a`` is not a key of ``selected_journal``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    s = random.sample(skills, 3)  # three random skills
    j = selected_journal[a]  # journals drawn earlier for this area
    # Join into comma-separated strings; rendering the lists directly would show brackets and quotes.
    s_str = ", ".join(s)
    j_str = ", ".join(j)
    a_str = str(a).lower()
    content = {
        "university": university,
        "area": a_str,
        "skills": s_str,
        "journals": j_str,
    }
    doc = DocxTemplate("template.docx")
    doc.render(content)  # fill the template placeholders with the values in content

    # doc.save does not create missing directories.
    os.makedirs("outputs", exist_ok=True)
    # Replace characters that are path separators or are not allowed in file names on
    # common filesystems (some institution names contain double quotes).
    file_stem = re.sub(r'[<>:"/\\|?*]', "_", f"{university}_{a_str}")
    doc_path = f"outputs/{file_stem}.docx"
    doc.save(doc_path)
    # No PDF conversion here: per the original author's note, PDF generation did not work on macOS.
    return doc_path

In [220]:
# One document per (university, area) pair: the outer loop walks the universities and the
# inner loop the three areas, giving 90 outputs. The three skills are drawn at random
# inside generate_doc.
for uni in university_df["University"]:
    for a in area_3:
        generate_doc(uni, a)