In [80]:
import json
import openai

def process_complicated_author(author):

    with open('./openai_auth.json', 'r') as json_file:
        json_load = json.load(json_file)

    openai.api_key = json_load["api_key"]

    prompt = f"""
    Please give me the names of the authors included in this text, and remove any unnecessary spaces within names. Please give me the names, seperated by a space, and nothing else. For example, if the text was "求是》杂志记者 何雯雯 新县融媒体中心记者 韩 燕", I would want you to tell me "何雯雯 韩燕". The text is: {author}
    """

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{
            "role":"user",
            "content":prompt
        }]
    )

    return response.choices[0].message.content
'''given a url, scrape the article information'''

from newspaper import Article
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json

class QiuShiArticleParser:
    def __init__(self, url):
        # get the beautiful soup version
        self.url = url
        page = urlopen(url)
        html = page.read().decode("utf-8")
        self.soup = BeautifulSoup(html, "html.parser")

        # get the newspaper article version
        self.article = Article(url, language='zh')
        self.article.download()
        self.article.parse()

    def get_authors(self):
        span_els = self.soup.find_all("span", class_="appellation")
        try:
            # author is the second appel element
            author_el = span_els[1].text.strip()
            # remove the "Author:" piece
            author = author_el.split("：")[1]

            # see if it needs additional processing
            # no spaces, should be good to go. Or if None, return
            if author == None:
                return [author]
            elif " " not in author:
                return [author]
            # spaces, but length three - a 2 char name with a space
            elif len(author) == 3:
                return [author[0] + author[2]]
            # something more complicated is going on - could be multiple authors, could have additional info 
            else:
                print("Calling ChatGPT...")
                try:
                    author = process_complicated_author(author)
                    print(f"Author returned by ChatGPT: {author}")
                    # split the result in case there is more than one author
                    return author.split(" ")
                except:
                    print("ChatGPT call failed")
                return "CHATGPT" + author

        except:
            return None
    
    def get_title(self):
        return self.soup.find('h1').text.strip()
    
    def get_year_edition(self):
        span_els = self.soup.find_all("span", class_="appellation")
        # year/edition is the first appel element
        ye_text = span_els[0].text.strip()
        # get the last piece with the date
        cleaned_ye_text = ye_text[-7:]

        year = cleaned_ye_text[:4]
        edition = cleaned_ye_text[-2:]

        return year, edition
    
    def get_date(self):
        date_el = self.soup.find("span", class_="pubtime").text.strip()
        # remove the time piece
        date = date_el.split(" ")[0]
        return date
    
    def get_url(self):
        return self.url
    
    def get_text(self):
        return self.article.text.replace('\n', "")
    
class QiuShiArticle:
    def __init__(self, article_parser):
        self.authors = article_parser.get_authors()
        self.title = article_parser.get_title()
        self.date = article_parser.get_date()
        self.year, self.edition = article_parser.get_year_edition()
        self.url = article_parser.get_url()

        try:
            self.text = article_parser.get_text()
        except:
            self.text = None
        

    def toJson(self):
        return self.__dict__

In [82]:
random_links = ['http://www.qstheory.cn/dukan/qs/2023-04/16/c_1129525662.htm',
 'http://www.qstheory.cn/dukan/qs/2023-06/01/c_1129655751.htm',
 'http://www.qstheory.cn/dukan/qs/2023-12/16/c_1130027650.htm',
 'http://www.qstheory.cn/dukan/qs/2021-07/16/c_1127657270.htm',
 'http://www.qstheory.cn/dukan/qs/2022-08/16/c_1128913890.htm',
 'http://www.qstheory.cn/dukan/qs/2020-11/16/c_1126739322.htm',
 'http://www.qstheory.cn/dukan/qs/2019-07/01/c_1124690399.htm',
 'http://www.qstheory.cn/dukan/qs/2022-03/16/c_1128467651.htm',
 'http://www.qstheory.cn/dukan/qs/2022-06/16/c_1128738827.htm',
 'http://www.qstheory.cn/dukan/qs/2022-12/01/c_1129173253.htm',
 'http://www.qstheory.cn/dukan/qs/2022-08/16/c_1128913669.htm',
 'http://www.qstheory.cn/dukan/qs/2019-07/16/c_1124750207.htm',
 'http://www.qstheory.cn/dukan/qs/2021-01/16/c_1126985579.htm',
 'http://www.qstheory.cn/dukan/qs/2023-11/30/c_1129998594.htm',
 'http://www.qstheory.cn/dukan/qs/2021-07/16/c_1127657502.htm',
 'http://www.qstheory.cn/dukan/qs/2019-04/01/c_1124302988.htm',
 'http://www.qstheory.cn/dukan/qs/2023-05/01/c_1129582104.htm',
 'http://www.qstheory.cn/dukan/qs/2019-09/01/c_1124940057.htm',
 'http://www.qstheory.cn/dukan/qs/2023-12/01/c_1129998497.htm',
 'http://www.qstheory.cn/dukan/qs/2022-06/16/c_1128739216.htm',
 'http://www.qstheory.cn/dukan/qs/2023-02/16/c_1129363477.htm',
 'http://www.qstheory.cn/dukan/qs/2020-06/01/c_1126055488.htm',
 'http://www.qstheory.cn/dukan/qs/2019-11/01/c_1125178859.htm',
 'http://www.qstheory.cn/dukan/qs/2020-03/15/c_1125710695.htm',
 'http://www.qstheory.cn/dukan/qs/2021-02/01/c_1127044259.htm',
 'http://www.qstheory.cn/dukan/qs/2022-03/16/c_1128468584.htm',
 'http://www.qstheory.cn/dukan/qs/2021-03/31/c_1127274750.htm',
 'http://www.qstheory.cn/dukan/qs/2023-11/16/c_1129973905.htm',
 'http://www.qstheory.cn/dukan/qs/2023-08/16/c_1129801705.htm',
 'http://www.qstheory.cn/dukan/qs/2023-12/16/c_1130027702.htm',
 'http://www.qstheory.cn/dukan/qs/2019-08/01/c_1124819317.htm']

In [76]:
for article_link in random_links:
    article_parser = QiuShiArticleParser(article_link)
    article = QiuShiArticle(article_parser)
    print(article.__dict__)

{'author': '齐玉', 'title': '深刻领会中国式现代化的世界意义 凝聚奋进新征程磅礴力量', 'date': '2023-04-16', 'year': '2023', 'edition': '08', 'url': 'http://www.qstheory.cn/dukan/qs/2023-04/16/c_1129525662.htm', 'text': '深刻领会中国式现代化的世界意义 凝聚奋进新征程磅礴力量齐 玉习近平总书记在学习贯彻党的二十大精神研讨班开班式上的重要讲话，深刻阐述中国式现代化一系列重大理论和实践问题，在党的二十大重大理论创新的基础上进一步深化、拓展、升华，形成了科学完备的理论体系，标志着我们党对中国式现代化的认识提升到新高度，为人类现代化理论与实践创新作出了重大原创性贡献，为全面建成社会主义现代化强国、全面推进中华民族伟大复兴进一步指明了前进方向，提供了根本遵循。我们要结合深入开展学习贯彻习近平新时代中国特色社会主义思想主题教育，深刻领会习近平总书记重要讲话的指导意义和实践要求，从统筹中华民族伟大复兴战略全局和世界百年未有之大变局的高度，准确把握新时代新征程的目标任务，切实增强政治自觉、思想自觉、行动自觉，以更加奋发有为的使命担当为推进中国式现代化作出应有贡献。一、全面领会中国式现代化的深远世界意义，在掌握历史主动中坚定不移走中国式现代化道路中国式现代化坚持把马克思主义基本原理同中国现代化具体实际相结合、同中华优秀传统文化相结合，准确把握共产党执政规律、社会主义建设规律、人类社会发展规律，借鉴吸收人类一切优秀文明成果，既是中国的又是世界的，既立足时代又引领时代，对当今世界开创现代化模式新图景、重塑现代化建设新格局、创造人类文明新形态、开辟社会主义发展新境界具有示范性引领性意义。中国式现代化是对世界现代化理论和实践的重大创新。习近平总书记指出，世界上既不存在定于一尊的现代化模式，也不存在放之四海而皆准的现代化标准。很长时期以来，少数国家垄断现代化的话语权，抱持“西方中心论”，鼓吹所谓“华盛顿共识”，不断放大“现代化就是西方化”、“西方文明就是现代文明”的错觉。个别西方大国号称全球经济最发达、科技最先进，贫富分化却最严重，新冠疫情死亡病例也是全球最多的；号称“自由民主”，却不断发生针对平民的枪击案、针对少数族裔的歧视仇杀，社会