In [1]:
'''given a url, scrape the article information'''

from newspaper import Article
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import openai

def process_complicated_author(author):

    with open('../openai_auth.json', 'r') as json_file:
        json_load = json.load(json_file)

    openai.api_key = json_load["api_key"]

    prompt = f"""
    Please give me the names of the authors included in this text, and remove any unnecessary spaces within names. Please give me the names, seperated by a space, and nothing else. For example, if the text was "求是》杂志记者 何雯雯 新县融媒体中心记者 韩 燕", I would want you to tell me "何雯雯 韩燕". The text is: {author}
    """

    response = openai.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[{
            "role":"user",
            "content":prompt
        }]
    )

    return response.choices[0].message.content

class QiuShiArticleParser:
    def __init__(self, url):
        # get the beautiful soup version
        self.url = url
        try:
            page = urlopen(url)
            html = page.read().decode("utf-8")
            self.soup = BeautifulSoup(html, "html.parser")

            # get the newspaper article version
            self.article = Article(url, language='zh')
            self.article.download()
            self.article.parse()
        except:
            print(f"Article with URL: {url} failed to parse")
            self.soup = None
            self.article = None

    def get_authors(self):
        if self.soup == None:
            return None
        span_els = self.soup.find_all("span", class_="appellation")
        try:
            # author is the second appel element
            author_el = span_els[1].text.strip()
            # remove the "Author:" piece
            author = author_el.split("：")[1]

            # see if it needs additional processing
            # no spaces, should be good to go. Or if None, return
            if author == None:
                return [author]
            elif " " not in author:
                return [author]
            # spaces, but length three - a 2 char name with a space
            elif len(author) == 3:
                return [author[0] + author[2]]
            # something more complicated is going on - could be multiple authors, could have additional info 
            else:
                print("Calling ChatGPT...")
                try:
                    author = process_complicated_author(author)
                    print(f"Author returned by ChatGPT: {author}")
                    # split the result in case there is more than one author
                    return author.split(" ")
                except:
                    print("ChatGPT call failed")
                return "CHATGPT" + author

        except:
            return None
    
    def get_title(self):
        try:
            return self.soup.find('h1').text.strip()
        except:
            return None
    
    def get_year_edition(self):
        try: 
            span_els = self.soup.find_all("span", class_="appellation")
            # year/edition is the first appel element
            ye_text = span_els[0].text.strip()
            # get the last piece with the date
            cleaned_ye_text = ye_text[-7:]

            year = cleaned_ye_text[:4]
            edition = cleaned_ye_text[-2:]

            return year, edition
        except:
            return None, None
    
    def get_date(self):
        try: 
            date_el = self.soup.find("span", class_="pubtime").text.strip()
            # remove the time piece
            date = date_el.split(" ")[0]
            return date
        except:
            return None
    
    def get_url(self):
        return self.url
    
    def get_text(self):
        try: 
            return self.article.text.replace('\n', "")
        except:
            return None

    
class QiuShiArticle:
    def __init__(self, article_parser):
        self.authors = article_parser.get_authors()
        self.title = article_parser.get_title()
        self.date = article_parser.get_date()
        self.year, self.edition = article_parser.get_year_edition()
        self.url = article_parser.get_url()

        try:
            self.text = article_parser.get_text()
        except:
            self.text = None
        

    def toJson(self):
        return self.__dict__

In [4]:
import pandas as pd
links_df = pd.read_csv('./links.csv')
links_df.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,year_name,edition_name,link
0,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
1,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
2,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
3,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
4,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...


In [5]:
links_df = links_df.drop(columns=["year_name", "edition_name"])
print(len(links_df))
links_df.head()

2190


Unnamed: 0,link
0,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
1,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
2,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
3,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
4,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...


In [6]:
sample_articles_df = pd.DataFrame(columns=["authors", "title", "date", "year", "edition", "url", "text"])

In [7]:
from tqdm import tqdm

In [8]:
for link in tqdm(links_df["link"]):
    article_parser = QiuShiArticleParser(link)
    article = QiuShiArticle(article_parser)

    sample_articles_df.loc[len(sample_articles_df)] = [article.authors, article.title, article.date, article.year, article.edition, article.url, article.text]


  0%|          | 0/2190 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/r4/qqn8tv4d7g18t_5fsxt6y1340000gn/T/jieba.cache
Loading model cost 0.311 seconds.
Prefix dict has been built successfully.
  2%|▏         | 45/2190 [00:58<39:24,  1.10s/it]  

Calling ChatGPT...


  2%|▏         | 46/2190 [01:01<57:18,  1.60s/it]

Author returned by ChatGPT: 周昭成 陈聪
Calling ChatGPT...


  2%|▏         | 47/2190 [01:02<55:42,  1.56s/it]

Author returned by ChatGPT: 聂悄语
Calling ChatGPT...


  2%|▏         | 48/2190 [01:04<55:06,  1.54s/it]

Author returned by ChatGPT: 黎海华 吴擒虎
Calling ChatGPT...


  2%|▏         | 49/2190 [01:06<1:03:03,  1.77s/it]

Author returned by ChatGPT: 梁佩韵 刘骏娇
Calling ChatGPT...


  2%|▏         | 50/2190 [01:08<1:05:08,  1.83s/it]

Author returned by ChatGPT: 李飞 姜小薇


  3%|▎         | 67/2190 [01:29<45:14,  1.28s/it]  

Calling ChatGPT...


  3%|▎         | 68/2190 [01:32<55:45,  1.58s/it]

Author returned by ChatGPT: 魏天舒 吉文磊
Calling ChatGPT...


  3%|▎         | 69/2190 [01:34<1:08:21,  1.93s/it]

Author returned by ChatGPT: 蔡春玲 樊遂桥
Calling ChatGPT...


  3%|▎         | 70/2190 [01:36<1:07:21,  1.91s/it]

Author returned by ChatGPT: 申小提 张颐佳
Calling ChatGPT...


  3%|▎         | 71/2190 [01:38<1:09:38,  1.97s/it]

Author returned by ChatGPT: 狄英娜 翟汝增


  4%|▎         | 78/2190 [01:58<2:29:06,  4.24s/it]

Article with URL: http://www.qstheory.cn/dukan/qs/2023-01/31/c_1129324011.htm failed to parse


  4%|▍         | 90/2190 [02:21<53:19,  1.52s/it]  

Calling ChatGPT...


  4%|▍         | 91/2190 [02:23<1:00:39,  1.73s/it]

Author returned by ChatGPT: 盛玮
Calling ChatGPT...


  4%|▍         | 92/2190 [02:26<1:13:21,  2.10s/it]

Author returned by ChatGPT: 吴晓迪 杜晨薇
Calling ChatGPT...


  4%|▍         | 93/2190 [02:29<1:21:57,  2.34s/it]

Author returned by ChatGPT: 陈有勇 王磊


  5%|▌         | 110/2190 [02:52<46:50,  1.35s/it] 

Calling ChatGPT...


  5%|▌         | 111/2190 [02:55<1:00:11,  1.74s/it]

Author returned by ChatGPT: 周昭成 王成果
Calling ChatGPT...


  5%|▌         | 112/2190 [02:57<1:06:29,  1.92s/it]

Author returned by ChatGPT: 周璐铭 张春林
Calling ChatGPT...


  5%|▌         | 113/2190 [03:00<1:18:59,  2.28s/it]

Author returned by ChatGPT: 旷思思 章丹
Calling ChatGPT...


  5%|▌         | 114/2190 [03:04<1:26:39,  2.50s/it]

Author returned by ChatGPT: 侯亚景 咸文静


  6%|▌         | 132/2190 [03:43<47:20,  1.38s/it]  

Calling ChatGPT...


  6%|▌         | 133/2190 [03:45<49:23,  1.44s/it]

Author returned by ChatGPT: 陈有勇 谭雅竹
Calling ChatGPT...


  6%|▌         | 134/2190 [03:47<55:39,  1.62s/it]

Author returned by ChatGPT: 张淑虹 汪纯
Calling ChatGPT...


  6%|▌         | 135/2190 [03:48<55:03,  1.61s/it]

Author returned by ChatGPT: 李民圣 李旭


  7%|▋         | 143/2190 [03:59<47:53,  1.40s/it]  

Calling ChatGPT...


  7%|▋         | 144/2190 [04:03<1:10:57,  2.08s/it]

Author returned by ChatGPT: 中共吉林省委 吉林省人民政府


  7%|▋         | 149/2190 [04:09<47:19,  1.39s/it]  

Calling ChatGPT...


  7%|▋         | 150/2190 [04:11<53:46,  1.58s/it]

Author returned by ChatGPT: 高天鼎 李宏伟
Calling ChatGPT...


  7%|▋         | 151/2190 [04:14<59:09,  1.74s/it]

Author returned by ChatGPT: 侯亚景 刘秉承
Calling ChatGPT...


  7%|▋         | 152/2190 [04:16<1:02:37,  1.84s/it]

Author returned by ChatGPT: 陈金霞 王嘉楠
Calling ChatGPT...


  7%|▋         | 153/2190 [04:17<59:48,  1.76s/it]  

Author returned by ChatGPT: 梁佩韵 伏润之


  8%|▊         | 170/2190 [04:42<47:03,  1.40s/it]  

Calling ChatGPT...


  8%|▊         | 171/2190 [04:46<1:09:52,  2.08s/it]

Author returned by ChatGPT: 李斌 周文强


 10%|█         | 222/2190 [06:26<45:04,  1.37s/it]  

Calling ChatGPT...


 10%|█         | 223/2190 [06:27<48:54,  1.49s/it]

Author returned by ChatGPT: 张淑虹 赵凝
Calling ChatGPT...


 10%|█         | 224/2190 [06:29<54:10,  1.65s/it]

Author returned by ChatGPT: 周昭成
Calling ChatGPT...


 10%|█         | 225/2190 [06:32<1:03:30,  1.94s/it]

Author returned by ChatGPT: 刘振峰 周璐铭
Calling ChatGPT...


 10%|█         | 226/2190 [06:33<58:02,  1.77s/it]  

Author returned by ChatGPT: 狄英娜 杨文娟
Calling ChatGPT...


 10%|█         | 227/2190 [06:35<57:52,  1.77s/it]

Author returned by ChatGPT: 魏天舒 刘祥元


 11%|█         | 244/2190 [06:59<43:28,  1.34s/it]  

Calling ChatGPT...


 11%|█         | 245/2190 [07:01<44:49,  1.38s/it]

Author returned by ChatGPT: 梁佩韵
Calling ChatGPT...


 11%|█         | 246/2190 [07:03<53:41,  1.66s/it]

Author returned by ChatGPT: 陈有勇
Calling ChatGPT...


 11%|█▏        | 247/2190 [07:05<56:04,  1.73s/it]

Author returned by ChatGPT: 孙煜华 宋东泽
Calling ChatGPT...


 11%|█▏        | 248/2190 [07:07<58:52,  1.82s/it]

Author returned by ChatGPT: 孙洋洋
Calling ChatGPT...


 11%|█▏        | 249/2190 [07:09<58:36,  1.81s/it]

Author returned by ChatGPT: 高天鼎 刘名美


 12%|█▏        | 265/2190 [07:31<53:13,  1.66s/it]

Calling ChatGPT...


 12%|█▏        | 266/2190 [07:34<1:06:45,  2.08s/it]

Author returned by ChatGPT: 周昭成
Calling ChatGPT...


 12%|█▏        | 267/2190 [07:37<1:12:13,  2.25s/it]

Author returned by ChatGPT: 陈有勇 丁燕
Calling ChatGPT...


 12%|█▏        | 268/2190 [07:39<1:10:14,  2.19s/it]

Author returned by ChatGPT: 盛玮
Calling ChatGPT...


 12%|█▏        | 269/2190 [07:41<1:09:17,  2.16s/it]

Author returned by ChatGPT: 黎海华 苏晓琼
Calling ChatGPT...


 12%|█▏        | 270/2190 [07:43<1:09:20,  2.17s/it]

Author returned by ChatGPT: 刘名美


 13%|█▎        | 274/2190 [18:20<101:39:44, 191.01s/it]

Article with URL: http://www.qstheory.cn/dukan/qs/2023-06/16/c_1129695716.htm failed to parse


 13%|█▎        | 286/2190 [31:16<4:29:56,  8.51s/it]   

Calling ChatGPT...


 13%|█▎        | 287/2190 [31:18<3:32:16,  6.69s/it]

Author returned by ChatGPT: 狄英娜
Calling ChatGPT...


 13%|█▎        | 288/2190 [31:20<2:43:44,  5.17s/it]

Author returned by ChatGPT: 王文婷 旷思思
Calling ChatGPT...


 13%|█▎        | 289/2190 [31:22<2:14:18,  4.24s/it]

Author returned by ChatGPT: 蔡春玲 赵嘉宾
Calling ChatGPT...


 13%|█▎        | 290/2190 [31:24<1:53:59,  3.60s/it]

Author returned by ChatGPT: 梁佩韵


 14%|█▍        | 308/2190 [31:50<39:07,  1.25s/it]  

Calling ChatGPT...


 14%|█▍        | 309/2190 [31:51<41:37,  1.33s/it]

Author returned by ChatGPT: 盛玮
Calling ChatGPT...


 14%|█▍        | 310/2190 [31:54<53:41,  1.71s/it]

Author returned by ChatGPT: 李民圣
Calling ChatGPT...


 14%|█▍        | 311/2190 [31:56<52:21,  1.67s/it]

Author returned by ChatGPT: 周璐铭 李坤
Calling ChatGPT...


 14%|█▍        | 312/2190 [31:58<1:01:20,  1.96s/it]

Author returned by ChatGPT: 那非丁 杨世智


 16%|█▌        | 347/2190 [32:44<40:06,  1.31s/it]  

Calling ChatGPT...


 16%|█▌        | 348/2190 [32:46<46:48,  1.52s/it]

Author returned by ChatGPT: 魏天舒
Calling ChatGPT...


 16%|█▌        | 349/2190 [32:48<51:57,  1.69s/it]

Author returned by ChatGPT: 盛玮
Calling ChatGPT...


 16%|█▌        | 350/2190 [32:51<1:00:31,  1.97s/it]

Author returned by ChatGPT: 何雯雯
Calling ChatGPT...


 16%|█▌        | 351/2190 [32:54<1:06:26,  2.17s/it]

Author returned by ChatGPT: 高天鼎
Calling ChatGPT...


 16%|█▌        | 352/2190 [32:56<1:05:43,  2.15s/it]

Author returned by ChatGPT: 周璐铭 徐阳


 17%|█▋        | 367/2190 [33:23<43:49,  1.44s/it]  

Calling ChatGPT...


 17%|█▋        | 368/2190 [33:25<49:22,  1.63s/it]

Author returned by ChatGPT: 侯亚景
Calling ChatGPT...


 17%|█▋        | 369/2190 [33:26<48:38,  1.60s/it]

Author returned by ChatGPT: 陈有勇
Calling ChatGPT...


 17%|█▋        | 370/2190 [33:28<53:06,  1.75s/it]

Author returned by ChatGPT: 黄炜信 旷思思


 19%|█▊        | 406/2190 [34:14<38:20,  1.29s/it]

Calling ChatGPT...


 19%|█▊        | 407/2190 [34:16<41:23,  1.39s/it]

Author returned by ChatGPT: 周璐铭
Calling ChatGPT...


 19%|█▊        | 408/2190 [34:18<43:20,  1.46s/it]

Author returned by ChatGPT: 狄英娜 张昕
Calling ChatGPT...


 19%|█▊        | 409/2190 [34:20<47:03,  1.59s/it]

Author returned by ChatGPT: 刘名美


 20%|██        | 443/2190 [35:02<42:58,  1.48s/it]

Calling ChatGPT...


 20%|██        | 444/2190 [35:03<42:09,  1.45s/it]

Author returned by ChatGPT: 王小蕾 黎海华


 22%|██▏       | 478/2190 [35:47<55:37,  1.95s/it]

Calling ChatGPT...


 22%|██▏       | 479/2190 [35:49<1:02:16,  2.18s/it]

Author returned by ChatGPT: 李雯博
Calling ChatGPT...


 22%|██▏       | 480/2190 [35:54<1:18:22,  2.75s/it]

Author returned by ChatGPT: 刘名美
Calling ChatGPT...


 22%|██▏       | 481/2190 [35:56<1:17:12,  2.71s/it]

Author returned by ChatGPT: 张淑虹


 23%|██▎       | 499/2190 [36:23<46:10,  1.64s/it]  

Calling ChatGPT...


 23%|██▎       | 500/2190 [36:25<45:51,  1.63s/it]

Author returned by ChatGPT: 刘名美
Calling ChatGPT...


 23%|██▎       | 501/2190 [36:27<45:15,  1.61s/it]

Author returned by ChatGPT: 黎海华
Calling ChatGPT...


 23%|██▎       | 502/2190 [36:29<48:26,  1.72s/it]

Author returned by ChatGPT: 盛玮


 27%|██▋       | 585/2190 [38:38<34:32,  1.29s/it]  

Calling ChatGPT...


 27%|██▋       | 586/2190 [38:39<37:10,  1.39s/it]

Author returned by ChatGPT: 张淑虹
Calling ChatGPT...


 27%|██▋       | 587/2190 [38:42<51:11,  1.92s/it]

Author returned by ChatGPT: 梁佩韵 张春林
Calling ChatGPT...


 27%|██▋       | 588/2190 [38:45<56:17,  2.11s/it]

Author returned by ChatGPT: 郭斐然 秋菊
Calling ChatGPT...


 27%|██▋       | 589/2190 [38:49<1:13:22,  2.75s/it]

Author returned by ChatGPT: 李坤 蔡春玲
Calling ChatGPT...


 27%|██▋       | 590/2190 [38:51<1:03:56,  2.40s/it]

Author returned by ChatGPT: 李达


 28%|██▊       | 606/2190 [39:15<42:48,  1.62s/it]  

Calling ChatGPT...


 28%|██▊       | 607/2190 [39:18<50:25,  1.91s/it]

Author returned by ChatGPT: 陈亦琳 王春楠
Calling ChatGPT...


 28%|██▊       | 608/2190 [39:20<46:53,  1.78s/it]

Author returned by ChatGPT: 杨绍华 倪琴
Calling ChatGPT...


 28%|██▊       | 609/2190 [39:22<54:30,  2.07s/it]

Author returned by ChatGPT: 尹霞 梁生树 李彬
Calling ChatGPT...


 28%|██▊       | 610/2190 [39:24<50:55,  1.93s/it]

Author returned by ChatGPT: 何雯雯 韩燕
Calling ChatGPT...


 28%|██▊       | 611/2190 [39:26<51:05,  1.94s/it]

Author returned by ChatGPT: 侯亚景 王云 何勤华


 28%|██▊       | 612/2190 [39:27<44:39,  1.70s/it]

Calling ChatGPT...


 28%|██▊       | 613/2190 [39:28<42:38,  1.62s/it]

Author returned by ChatGPT: 李育蒙 刘艳辉


 29%|██▊       | 626/2190 [39:48<37:47,  1.45s/it]

Calling ChatGPT...


 29%|██▊       | 627/2190 [39:50<41:51,  1.61s/it]

Author returned by ChatGPT: 申小提 王力
Calling ChatGPT...


 29%|██▊       | 628/2190 [39:52<42:39,  1.64s/it]

Author returned by ChatGPT: 魏天舒 郑亚丽
Calling ChatGPT...


 29%|██▊       | 629/2190 [39:54<46:11,  1.78s/it]

Author returned by ChatGPT: 黎海华 蔡萌
Calling ChatGPT...


 29%|██▉       | 630/2190 [39:55<44:36,  1.72s/it]

Author returned by ChatGPT: 梁佩韵 伏润之


 29%|██▉       | 645/2190 [40:16<37:31,  1.46s/it]

Calling ChatGPT...


 29%|██▉       | 646/2190 [40:18<40:23,  1.57s/it]

Author returned by ChatGPT: 何雯雯 李连成
Calling ChatGPT...


 30%|██▉       | 647/2190 [40:37<2:59:50,  6.99s/it]

Author returned by ChatGPT: 狄英娜 赵丰
Calling ChatGPT...


 30%|██▉       | 648/2190 [40:39<2:21:06,  5.49s/it]

Author returned by ChatGPT: 旷思思 董凤龙
Calling ChatGPT...


 30%|██▉       | 649/2190 [40:43<2:06:04,  4.91s/it]

Author returned by ChatGPT: 黎海华 张智伟
Calling ChatGPT...


 30%|██▉       | 650/2190 [40:44<1:40:03,  3.90s/it]

Author returned by ChatGPT: 杨绍华 石雷


 31%|███       | 669/2190 [41:22<37:06,  1.46s/it]  

Calling ChatGPT...


 31%|███       | 670/2190 [41:23<37:38,  1.49s/it]

Author returned by ChatGPT: 郭斐然 丁佳文
Calling ChatGPT...


 31%|███       | 671/2190 [41:25<38:10,  1.51s/it]

Author returned by ChatGPT: 吴晓迪 段利军
Calling ChatGPT...


 31%|███       | 672/2190 [41:26<38:26,  1.52s/it]

Author returned by ChatGPT: 魏天舒 姜斌
Calling ChatGPT...


 31%|███       | 673/2190 [41:28<38:50,  1.54s/it]

Author returned by ChatGPT: 岳劲松 郑颖


 31%|███       | 683/2190 [41:41<31:53,  1.27s/it]

Calling ChatGPT...


 31%|███       | 684/2190 [41:44<45:28,  1.81s/it]

Author returned by ChatGPT: 王金南 蔡博峰


 31%|███▏      | 689/2190 [41:49<32:23,  1.29s/it]

Calling ChatGPT...


 32%|███▏      | 690/2190 [41:53<47:03,  1.88s/it]

Author returned by ChatGPT: 旷思思
Calling ChatGPT...


 32%|███▏      | 691/2190 [41:54<43:47,  1.75s/it]

Author returned by ChatGPT: 周昭成 龙丹梅
Calling ChatGPT...


 32%|███▏      | 692/2190 [41:56<47:14,  1.89s/it]

Author returned by ChatGPT: 侯亚景 李贵文
Calling ChatGPT...


 32%|███▏      | 693/2190 [41:58<47:15,  1.89s/it]

Author returned by ChatGPT: 孙洋洋 张猛
Calling ChatGPT...


 32%|███▏      | 694/2190 [42:00<49:38,  1.99s/it]

Author returned by ChatGPT: 李民圣 潘彦云


 33%|███▎      | 712/2190 [42:23<30:03,  1.22s/it]

Calling ChatGPT...


 33%|███▎      | 713/2190 [42:25<35:55,  1.46s/it]

Author returned by ChatGPT: 刘玉成 孙大卫
Calling ChatGPT...


 33%|███▎      | 714/2190 [42:27<41:04,  1.67s/it]

Author returned by ChatGPT: 蔡春玲 张力军
Calling ChatGPT...


 33%|███▎      | 715/2190 [42:30<47:43,  1.94s/it]

Author returned by ChatGPT: 李孝纯 晏培娟


 33%|███▎      | 730/2190 [42:50<37:25,  1.54s/it]

Calling ChatGPT...


 33%|███▎      | 731/2190 [42:52<41:09,  1.69s/it]

Author returned by ChatGPT: 侯亚景 史泽奇
Calling ChatGPT...


 33%|███▎      | 732/2190 [42:54<44:03,  1.81s/it]

Author returned by ChatGPT: 李达 贾红路
Calling ChatGPT...


 33%|███▎      | 733/2190 [42:56<46:05,  1.90s/it]

Author returned by ChatGPT: 魏天舒 谭雅竹
Calling ChatGPT...


 34%|███▎      | 734/2190 [42:58<43:43,  1.80s/it]

Author returned by ChatGPT: 高天鼎 黄志武
Calling ChatGPT...


 34%|███▎      | 735/2190 [43:00<44:27,  1.83s/it]

Author returned by ChatGPT: 李民圣 朱兴


 34%|███▍      | 749/2190 [43:23<28:56,  1.21s/it]  

Calling ChatGPT...


 34%|███▍      | 750/2190 [43:25<32:23,  1.35s/it]

Author returned by ChatGPT: 何雯雯 李保健
Calling ChatGPT...


 34%|███▍      | 751/2190 [43:27<33:52,  1.41s/it]

Author returned by ChatGPT: 岳劲松 郑颖
Calling ChatGPT...


 34%|███▍      | 752/2190 [43:28<35:39,  1.49s/it]

Author returned by ChatGPT: 狄英娜 王珊珊
Calling ChatGPT...


 34%|███▍      | 753/2190 [43:30<39:19,  1.64s/it]

Author returned by ChatGPT: 陈有勇 周亚明
Calling ChatGPT...


 34%|███▍      | 754/2190 [43:32<38:48,  1.62s/it]

Author returned by ChatGPT: 尹霞 肖亮


 35%|███▌      | 771/2190 [43:55<25:44,  1.09s/it]

Calling ChatGPT...


 35%|███▌      | 772/2190 [43:57<29:19,  1.24s/it]

Author returned by ChatGPT: 梁佩韵 白续宏
Calling ChatGPT...


 35%|███▌      | 773/2190 [43:59<35:15,  1.49s/it]

Author returned by ChatGPT: 柴潇凡 陈凌墨
Calling ChatGPT...


 35%|███▌      | 774/2190 [44:00<37:35,  1.59s/it]

Author returned by ChatGPT: 李飞 秦丽云
Calling ChatGPT...


 35%|███▌      | 775/2190 [44:02<38:53,  1.65s/it]

Author returned by ChatGPT: 侯亚景 蔡萌


 36%|███▌      | 790/2190 [44:23<29:52,  1.28s/it]

Calling ChatGPT...


 36%|███▌      | 791/2190 [44:26<35:45,  1.53s/it]

Author returned by ChatGPT: 岳劲松 张大鹏
Calling ChatGPT...


 36%|███▌      | 792/2190 [44:27<34:55,  1.50s/it]

Author returned by ChatGPT: 侯亚景 陈文君
Calling ChatGPT...


 36%|███▌      | 793/2190 [44:30<44:32,  1.91s/it]

Author returned by ChatGPT: 何雯雯 曹雯


 37%|███▋      | 810/2190 [44:51<29:24,  1.28s/it]

Calling ChatGPT...


 37%|███▋      | 811/2190 [44:53<36:04,  1.57s/it]

Author returned by ChatGPT: 魏天舒 翁杰
Calling ChatGPT...


 37%|███▋      | 812/2190 [44:54<33:58,  1.48s/it]

Author returned by ChatGPT: 郭斐然 刘勇
Calling ChatGPT...


 37%|███▋      | 813/2190 [44:56<35:27,  1.55s/it]

Author returned by ChatGPT: 黎海华 刘艳辉


 38%|███▊      | 826/2190 [45:11<29:00,  1.28s/it]

Calling ChatGPT...


 38%|███▊      | 827/2190 [45:13<37:49,  1.67s/it]

Author returned by ChatGPT: 狄英娜 高天鼎 冷兴邦


 38%|███▊      | 828/2190 [45:15<34:04,  1.50s/it]

Calling ChatGPT...


 38%|███▊      | 829/2190 [45:16<34:01,  1.50s/it]

Author returned by ChatGPT: 陈有勇 丁佳文
Calling ChatGPT...


 38%|███▊      | 830/2190 [45:18<34:41,  1.53s/it]

Author returned by ChatGPT: 蔡春玲 李抑嫱
Calling ChatGPT...


 38%|███▊      | 831/2190 [45:20<38:20,  1.69s/it]

Author returned by ChatGPT: 柴潇凡 王瑞欣


 39%|███▊      | 848/2190 [45:40<25:38,  1.15s/it]

Calling ChatGPT...


 39%|███▉      | 849/2190 [45:42<28:35,  1.28s/it]

Author returned by ChatGPT: 狄英娜 柳萍
Calling ChatGPT...


 39%|███▉      | 850/2190 [45:43<30:39,  1.37s/it]

Author returned by ChatGPT: 何雯雯 龚砚庆
Calling ChatGPT...


 39%|███▉      | 851/2190 [45:45<31:53,  1.43s/it]

Author returned by ChatGPT: 盛玮 张文博
Calling ChatGPT...


 39%|███▉      | 852/2190 [45:47<32:48,  1.47s/it]

Author returned by ChatGPT: 陈亦琳 李志廷


 39%|███▉      | 853/2190 [45:47<29:15,  1.31s/it]

Calling ChatGPT...


 39%|███▉      | 854/2190 [45:49<31:40,  1.42s/it]

Author returned by ChatGPT: 谢尚芸 王春英


 40%|███▉      | 870/2190 [46:13<30:44,  1.40s/it]

Calling ChatGPT...


 40%|███▉      | 871/2190 [46:15<31:29,  1.43s/it]

Author returned by ChatGPT: 梁佩韵
Calling ChatGPT...


 40%|███▉      | 872/2190 [46:16<32:20,  1.47s/it]

Author returned by ChatGPT: 吴晓迪 宰飞
Calling ChatGPT...


 40%|███▉      | 873/2190 [46:18<33:01,  1.50s/it]

Author returned by ChatGPT: 柴潇凡 张冠年


 41%|████      | 891/2190 [46:40<23:02,  1.06s/it]

Calling ChatGPT...


 41%|████      | 892/2190 [46:42<27:12,  1.26s/it]

Author returned by ChatGPT: 陈晋 钟波


 41%|████      | 893/2190 [46:43<30:01,  1.39s/it]

Calling ChatGPT...


 41%|████      | 894/2190 [46:45<33:18,  1.54s/it]

Author returned by ChatGPT: 张淑虹 金晓玲
Calling ChatGPT...


 41%|████      | 895/2190 [46:47<33:54,  1.57s/it]

Author returned by ChatGPT: 侯亚景 咸文静
Calling ChatGPT...


 41%|████      | 896/2190 [46:48<33:29,  1.55s/it]

Author returned by ChatGPT: 旷思思 赵星月
Calling ChatGPT...


 41%|████      | 897/2190 [46:50<34:08,  1.58s/it]

Author returned by ChatGPT: 李民圣 马军权


 42%|████▏     | 921/2190 [47:23<24:30,  1.16s/it]

Calling ChatGPT...


 42%|████▏     | 922/2190 [47:26<37:07,  1.76s/it]

Author returned by ChatGPT: 张旭东 丁小溪
Calling ChatGPT...


 42%|████▏     | 923/2190 [47:27<32:35,  1.54s/it]

Author returned by ChatGPT: 张旭东 丁小溪


 43%|████▎     | 952/2190 [48:10<24:49,  1.20s/it]

Calling ChatGPT...


 44%|████▎     | 953/2190 [48:12<34:53,  1.69s/it]

Author returned by ChatGPT: 黎海华 孙敏


 44%|████▎     | 954/2190 [48:14<33:45,  1.64s/it]

Calling ChatGPT...


 44%|████▎     | 955/2190 [48:16<35:58,  1.75s/it]

Author returned by ChatGPT: 陈有勇 丁佳文
Calling ChatGPT...


 44%|████▎     | 956/2190 [48:18<35:37,  1.73s/it]

Author returned by ChatGPT: 魏天舒 方炜杭
Calling ChatGPT...


 44%|████▎     | 957/2190 [48:21<46:40,  2.27s/it]

Author returned by ChatGPT: 何雯雯 刘晓波
Calling ChatGPT...


 44%|████▎     | 958/2190 [48:23<45:19,  2.21s/it]

Author returned by ChatGPT: 吴晓迪 蒋君芳
Calling ChatGPT...


 44%|████▍     | 959/2190 [48:25<40:11,  1.96s/it]

Author returned by ChatGPT: 尹霞 梁生树


 45%|████▍     | 976/2190 [48:47<32:03,  1.58s/it]

Calling ChatGPT...


 45%|████▍     | 977/2190 [48:50<38:13,  1.89s/it]

Author returned by ChatGPT: 蔡春玲 孙娇杨
Calling ChatGPT...


 45%|████▍     | 978/2190 [48:52<36:56,  1.83s/it]

Author returned by ChatGPT: 王文婷 纪伟
Calling ChatGPT...


 45%|████▍     | 979/2190 [48:53<37:33,  1.86s/it]

Author returned by ChatGPT: 黎海华 蔡萌
Calling ChatGPT...


 45%|████▍     | 980/2190 [48:56<39:09,  1.94s/it]

Author returned by ChatGPT: 李雯博 张文峰


 47%|████▋     | 1038/2190 [50:13<26:33,  1.38s/it]

Calling ChatGPT...


 47%|████▋     | 1039/2190 [50:16<32:17,  1.68s/it]

Author returned by ChatGPT: 高志顺 马彦铭 潘文静


 48%|████▊     | 1048/2190 [50:27<24:44,  1.30s/it]

Calling ChatGPT...


 48%|████▊     | 1049/2190 [50:29<31:25,  1.65s/it]

Author returned by ChatGPT: 闫玉清
Calling ChatGPT...


 48%|████▊     | 1050/2190 [50:31<34:23,  1.81s/it]

Author returned by ChatGPT: 宗言
Calling ChatGPT...


 48%|████▊     | 1051/2190 [50:37<52:58,  2.79s/it]

Author returned by ChatGPT: 海华


 49%|████▊     | 1066/2190 [50:56<22:05,  1.18s/it]

Calling ChatGPT...


 49%|████▊     | 1067/2190 [50:57<24:27,  1.31s/it]

Author returned by ChatGPT: 周昭成
Calling ChatGPT...


 49%|████▉     | 1068/2190 [50:59<28:52,  1.54s/it]

Author returned by ChatGPT: 李孝纯
Calling ChatGPT...


 49%|████▉     | 1069/2190 [51:02<37:41,  2.02s/it]

Author returned by ChatGPT: 海华
Calling ChatGPT...


 49%|████▉     | 1070/2190 [51:05<40:30,  2.17s/it]

Author returned by ChatGPT: 狄英娜


 49%|████▉     | 1071/2190 [51:07<39:47,  2.13s/it]

Calling ChatGPT...


 49%|████▉     | 1072/2190 [51:09<37:35,  2.02s/it]

Author returned by ChatGPT: 李育蒙 黄曦


 50%|████▉     | 1087/2190 [51:28<23:09,  1.26s/it]

Calling ChatGPT...


 50%|████▉     | 1088/2190 [51:30<27:39,  1.51s/it]

Author returned by ChatGPT: 闫玉清
Calling ChatGPT...


 50%|████▉     | 1089/2190 [51:32<30:39,  1.67s/it]

Author returned by ChatGPT: 旷思思
Calling ChatGPT...


 50%|████▉     | 1090/2190 [51:34<30:09,  1.64s/it]

Author returned by ChatGPT: 尹霞


 51%|█████     | 1106/2190 [51:53<19:58,  1.11s/it]

Calling ChatGPT...


 51%|█████     | 1107/2190 [51:54<22:01,  1.22s/it]

Author returned by ChatGPT: 李达
Calling ChatGPT...


 51%|█████     | 1108/2190 [51:56<23:46,  1.32s/it]

Author returned by ChatGPT: 宗言
Calling ChatGPT...


 51%|█████     | 1109/2190 [51:57<25:29,  1.42s/it]

Author returned by ChatGPT: 高天鼎


 51%|█████     | 1113/2190 [52:06<50:25,  2.81s/it]


AttributeError: 'NoneType' object has no attribute 'text'

In [9]:
len(sample_articles_df)

1113

In [10]:
sample_articles_df.head()

Unnamed: 0,authors,title,date,year,edition,url,text
0,,本期导读,2024-01-01,2024,1,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...,本期发表了习近平总书记的重要文章《以美丽中国建设全面推进人与自然和谐共生的现代化》。文章强调...
1,[习近平],以美丽中国建设全面推进人与自然和谐共生的现代化,2023-12-31,2024,1,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...,以美丽中国建设全面推进人与自然和谐共生的现代化※习近平今后5年是美丽中国建设的重要时期，要深...
2,[《求是》杂志编辑部],全面推进美丽中国建设的系统部署,2023-12-31,2024,1,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...,全面推进美丽中国建设的系统部署《求是》杂志编辑部“今后5年是美丽中国建设的重要时期，要深入贯...
3,[中共政协全国委员会机关党组],为美丽中国建设贡献政协智慧和力量,2024-01-01,2024,1,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...,为美丽中国建设贡献政协智慧和力量中共政协全国委员会机关党组党的二十大对推动绿色发展、促进人与...
4,[自然资源部],抓好海洋资源开发保护 为建设美丽中国提供蓝色动力,2024-01-01,2024,1,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...,抓好海洋资源开发保护 为建设美丽中国提供蓝色动力自然资源部海洋是支撑未来发展的资源宝库和战略...


In [11]:
sample_articles_df.to_csv("articles.csv", index=False)

## Try picking up from where it left off

In [24]:
articles_df = pd.read_csv("articles.csv")

In [25]:
len(articles_df)

1113

In [21]:
links_df = links_df.iloc[1114:, :]

In [26]:
len(links_df)

1076

In [27]:
for link in tqdm(links_df["link"]):
    article_parser = QiuShiArticleParser(link)
    article = QiuShiArticle(article_parser)

    articles_df.loc[len(articles_df)] = [article.authors, article.title, article.date, article.year, article.edition, article.url, article.text]


  1%|          | 6/1076 [00:07<20:49,  1.17s/it]

Calling ChatGPT...


  1%|          | 7/1076 [00:09<23:48,  1.34s/it]

Author returned by ChatGPT: 国务院港澳事务办公室


  8%|▊         | 85/1076 [01:49<18:36,  1.13s/it]

Calling ChatGPT...


  8%|▊         | 86/1076 [01:50<20:30,  1.24s/it]

Author returned by ChatGPT: 无需答复.


  8%|▊         | 88/1076 [01:52<18:50,  1.14s/it]

Calling ChatGPT...


  8%|▊         | 89/1076 [01:54<20:29,  1.25s/it]

Author returned by ChatGPT: 罗平汉 宋庆伟


 11%|█         | 119/1076 [02:30<19:07,  1.20s/it]

Calling ChatGPT...


 11%|█         | 120/1076 [02:32<20:55,  1.31s/it]

Author returned by ChatGPT: 沈壮海 王芸婷


 13%|█▎        | 138/1076 [02:51<16:57,  1.08s/it]

Calling ChatGPT...


 13%|█▎        | 139/1076 [02:52<18:44,  1.20s/it]

Author returned by ChatGPT: 梁佩韵 顾海凇 付松


 16%|█▌        | 167/1076 [03:23<16:53,  1.11s/it]

Calling ChatGPT...


 16%|█▌        | 168/1076 [03:25<21:03,  1.39s/it]

Author returned by ChatGPT: 黄曦 李育蒙


 20%|█▉        | 214/1076 [04:18<13:40,  1.05it/s]

Calling ChatGPT...


 20%|█▉        | 215/1076 [04:19<15:36,  1.09s/it]

Author returned by ChatGPT: 孙金龙 黄润秋


 24%|██▍       | 256/1076 [05:09<16:36,  1.22s/it]

Calling ChatGPT...


 24%|██▍       | 257/1076 [05:11<19:30,  1.43s/it]

Author returned by ChatGPT: 陈旭 邱勇


 25%|██▌       | 273/1076 [05:28<14:02,  1.05s/it]

Calling ChatGPT...


 25%|██▌       | 274/1076 [05:31<20:12,  1.51s/it]

Author returned by ChatGPT: 中共湖北省委 湖北省人民政府 中央军委训练管理部


 30%|██▉       | 318/1076 [06:16<13:17,  1.05s/it]

Calling ChatGPT...


 30%|██▉       | 319/1076 [06:18<15:42,  1.24s/it]

Author returned by ChatGPT: 李学勇 李宣良 樊永强


 32%|███▏      | 348/1076 [06:52<15:14,  1.26s/it]

Calling ChatGPT...


 32%|███▏      | 349/1076 [06:54<17:50,  1.47s/it]

Author returned by ChatGPT: 陈芳 董瑞丰 陈聪 田晓航


 33%|███▎      | 357/1076 [07:03<14:06,  1.18s/it]

Calling ChatGPT...


 33%|███▎      | 358/1076 [07:05<15:29,  1.29s/it]

Author returned by ChatGPT: 张士涛 江勇


 34%|███▎      | 362/1076 [07:09<13:29,  1.13s/it]

Calling ChatGPT...


 34%|███▎      | 363/1076 [07:10<14:56,  1.26s/it]

Author returned by ChatGPT: 国家卫生健康委员会宣传司 健康报社


 34%|███▍      | 365/1076 [07:13<13:56,  1.18s/it]

Calling ChatGPT...


 34%|███▍      | 366/1076 [07:14<14:28,  1.22s/it]

Author returned by ChatGPT: 姚瑜坪


 35%|███▍      | 374/1076 [07:22<13:27,  1.15s/it]

Calling ChatGPT...


 35%|███▍      | 375/1076 [07:24<15:42,  1.34s/it]

Author returned by ChatGPT: 周芳 周志兵 彭小萍 黄璐
Calling ChatGPT...


 35%|███▍      | 376/1076 [07:26<17:24,  1.49s/it]

Author returned by ChatGPT: 王子铭 孙少龙 黄玥


 35%|███▌      | 379/1076 [07:29<13:37,  1.17s/it]

Calling ChatGPT...


 35%|███▌      | 380/1076 [07:30<14:21,  1.24s/it]

Author returned by ChatGPT: 李利 焦红


 39%|███▉      | 418/1076 [08:08<10:53,  1.01it/s]

Calling ChatGPT...


 39%|███▉      | 419/1076 [08:10<12:52,  1.18s/it]

Author returned by ChatGPT: 林晖 高蕾 黄玥


 42%|████▏     | 456/1076 [08:52<12:31,  1.21s/it]

Calling ChatGPT...


 42%|████▏     | 457/1076 [08:54<14:36,  1.42s/it]

Author returned by ChatGPT: 郝薇薇 郑汉根 商婧 陈杉


 44%|████▍     | 472/1076 [09:11<11:21,  1.13s/it]

Calling ChatGPT...


 44%|████▍     | 473/1076 [09:13<14:13,  1.41s/it]

Author returned by ChatGPT: 盛玮 徐辉冠 郭煦
Calling ChatGPT...


 44%|████▍     | 474/1076 [09:14<14:52,  1.48s/it]

Author returned by ChatGPT: 闫玉清 赵雁
Calling ChatGPT...


 44%|████▍     | 475/1076 [09:16<15:57,  1.59s/it]

Author returned by ChatGPT: 张少义 靳晶


 46%|████▌     | 491/1076 [09:37<11:57,  1.23s/it]

Calling ChatGPT...


 46%|████▌     | 492/1076 [09:38<13:35,  1.40s/it]

Author returned by ChatGPT: 黎海华 王光煦
Calling ChatGPT...


 46%|████▌     | 493/1076 [09:41<15:47,  1.62s/it]

Author returned by ChatGPT: 蔡春玲 鄂璠 王文婷
Calling ChatGPT...


 46%|████▌     | 494/1076 [09:42<15:28,  1.60s/it]

Author returned by ChatGPT: 王兆斌 张芯蕊


 48%|████▊     | 514/1076 [10:08<12:30,  1.33s/it]

Calling ChatGPT...


 48%|████▊     | 515/1076 [10:11<15:28,  1.66s/it]

Author returned by ChatGPT: 王寅 韩辰
Calling ChatGPT...


 48%|████▊     | 516/1076 [10:13<16:48,  1.80s/it]

Author returned by ChatGPT: 包俊洪 郭玲
Calling ChatGPT...


 48%|████▊     | 517/1076 [10:15<17:43,  1.90s/it]

Author returned by ChatGPT: 刘名美 李露倩


 49%|████▉     | 530/1076 [10:32<11:50,  1.30s/it]

Calling ChatGPT...


 49%|████▉     | 531/1076 [10:34<15:51,  1.75s/it]

Author returned by ChatGPT: 余艳红 于文明


 49%|████▉     | 532/1076 [10:36<15:14,  1.68s/it]

Calling ChatGPT...


 50%|████▉     | 533/1076 [10:38<15:10,  1.68s/it]

Author returned by ChatGPT: 梁佩韵 周彪
Calling ChatGPT...


 50%|████▉     | 534/1076 [10:39<15:31,  1.72s/it]

Author returned by ChatGPT: 那非丁
Calling ChatGPT...


 50%|████▉     | 535/1076 [10:41<14:46,  1.64s/it]

Author returned by ChatGPT: 许华卿
Calling ChatGPT...


 50%|████▉     | 536/1076 [10:42<14:19,  1.59s/it]

Author returned by ChatGPT: 侯亚景
Calling ChatGPT...


 50%|████▉     | 537/1076 [10:44<14:59,  1.67s/it]

Author returned by ChatGPT: 狄英娜 刘唯达


 51%|█████     | 549/1076 [11:03<13:22,  1.52s/it]

Calling ChatGPT...


 51%|█████     | 550/1076 [11:05<16:13,  1.85s/it]

Author returned by ChatGPT: 杨学军 方向


 51%|█████▏    | 552/1076 [11:08<14:04,  1.61s/it]

Calling ChatGPT...


 51%|█████▏    | 553/1076 [11:10<14:27,  1.66s/it]

Author returned by ChatGPT: 周昭成 柴潇凡
Calling ChatGPT...


 51%|█████▏    | 554/1076 [11:12<16:24,  1.89s/it]

Author returned by ChatGPT: 旷思思 王慧
Calling ChatGPT...


 52%|█████▏    | 555/1076 [11:14<16:06,  1.86s/it]

Author returned by ChatGPT: 李飞 曾嘉雯
Calling ChatGPT...


 52%|█████▏    | 556/1076 [11:17<17:43,  2.05s/it]

Author returned by ChatGPT: 陈金霞 唐淑楠
Calling ChatGPT...


 52%|█████▏    | 557/1076 [11:19<17:06,  1.98s/it]

Author returned by ChatGPT: 周璐铭 李丹华


 53%|█████▎    | 570/1076 [11:35<11:10,  1.33s/it]

Calling ChatGPT...


 53%|█████▎    | 571/1076 [11:39<16:08,  1.92s/it]

Author returned by ChatGPT: 申小提 张利英
Calling ChatGPT...


 53%|█████▎    | 572/1076 [11:40<15:28,  1.84s/it]

Author returned by ChatGPT: 李孝纯 龚紫陌
Calling ChatGPT...


 53%|█████▎    | 573/1076 [11:42<15:05,  1.80s/it]

Author returned by ChatGPT: 孙洋洋 周兵
Calling ChatGPT...


 53%|█████▎    | 574/1076 [11:43<14:25,  1.72s/it]

Author returned by ChatGPT: 李民圣 韩辰
Calling ChatGPT...


 53%|█████▎    | 575/1076 [11:45<14:27,  1.73s/it]

Author returned by ChatGPT: 刘名美 乔雪


 55%|█████▍    | 589/1076 [12:08<27:42,  3.41s/it]

Calling ChatGPT...


 55%|█████▍    | 590/1076 [12:17<39:21,  4.86s/it]

Author returned by ChatGPT: 梁佩韵 侯亚景
Calling ChatGPT...


 55%|█████▍    | 591/1076 [12:20<35:13,  4.36s/it]

Author returned by ChatGPT: 刘健 李兴文 侯雪静 高皓亮 郭强


 55%|█████▌    | 592/1076 [12:22<30:32,  3.79s/it]

Calling ChatGPT...


 55%|█████▌    | 593/1076 [12:24<25:20,  3.15s/it]

Author returned by ChatGPT: 杨绍华 鄂璠
Calling ChatGPT...


 55%|█████▌    | 594/1076 [12:26<23:19,  2.90s/it]

Author returned by ChatGPT: 王寅 刘建华
Calling ChatGPT...


 55%|█████▌    | 595/1076 [12:29<23:28,  2.93s/it]

Author returned by ChatGPT: 易赛键 袁帅
Calling ChatGPT...


 55%|█████▌    | 596/1076 [12:39<40:57,  5.12s/it]

Author returned by ChatGPT: 高天鼎 周彪
Calling ChatGPT...


 55%|█████▌    | 597/1076 [12:41<31:48,  3.98s/it]

Author returned by ChatGPT: 郭斐然 刘玉冰
Calling ChatGPT...


 56%|█████▌    | 598/1076 [12:43<26:16,  3.30s/it]

Author returned by ChatGPT: 李艳玲 高天鼎


 58%|█████▊    | 628/1076 [13:22<10:31,  1.41s/it]

Calling ChatGPT...


 58%|█████▊    | 629/1076 [13:26<16:11,  2.17s/it]

Author returned by ChatGPT: 李达 周昭成


 61%|██████▏   | 661/1076 [14:08<09:44,  1.41s/it]

Calling ChatGPT...


 62%|██████▏   | 662/1076 [14:10<11:25,  1.66s/it]

Author returned by ChatGPT: 陈芳 屈婷 陈聪 董瑞丰


 63%|██████▎   | 677/1076 [14:31<09:39,  1.45s/it]

Calling ChatGPT...


 63%|██████▎   | 678/1076 [14:35<13:55,  2.10s/it]

Author returned by ChatGPT: 闫玉清


 64%|██████▍   | 694/1076 [14:56<10:13,  1.60s/it]

Calling ChatGPT...


 65%|██████▍   | 695/1076 [14:59<12:14,  1.93s/it]

Author returned by ChatGPT: 刘雅鸣 陈聪 李亚楠 宋晓东


 66%|██████▌   | 706/1076 [15:15<08:28,  1.37s/it]

Calling ChatGPT...


 66%|██████▌   | 707/1076 [15:17<09:10,  1.49s/it]

Author returned by ChatGPT: 丁来杭 于忠福


 66%|██████▌   | 711/1076 [15:23<09:29,  1.56s/it]

Calling ChatGPT...


 66%|██████▌   | 712/1076 [15:25<10:40,  1.76s/it]

Author returned by ChatGPT: 李继明 毛盛勇


 71%|███████   | 765/1076 [16:37<06:08,  1.18s/it]

Calling ChatGPT...


 71%|███████   | 766/1076 [16:39<07:03,  1.37s/it]

Author returned by ChatGPT: 王兆斌


 74%|███████▎  | 792/1076 [17:13<05:34,  1.18s/it]

Calling ChatGPT...


 74%|███████▎  | 793/1076 [17:15<07:13,  1.53s/it]

Author returned by ChatGPT: 沈金龙 秦生祥


 74%|███████▍  | 796/1076 [17:19<05:36,  1.20s/it]

Article with URL: http://www.qstheory.cn/dukan/qs/2019-04/16/c_1124364204.htm failed to parse
Calling ChatGPT...


 74%|███████▍  | 797/1076 [17:20<06:18,  1.36s/it]

Author returned by ChatGPT: 南开大学


 75%|███████▌  | 810/1076 [17:35<04:24,  1.01it/s]

Article with URL: http://www.qstheory.cn/dukan/qs/2019-05/01/c_1124440784.htm failed to parse


 76%|███████▌  | 815/1076 [17:43<06:06,  1.41s/it]

Calling ChatGPT...


 76%|███████▌  | 816/1076 [17:46<07:46,  1.80s/it]

Author returned by ChatGPT: 徐永辉 何苏鸣


 79%|███████▉  | 854/1076 [18:35<04:28,  1.21s/it]

Calling ChatGPT...


 79%|███████▉  | 855/1076 [18:37<05:19,  1.45s/it]

Author returned by ChatGPT: 那非丁


 81%|████████  | 874/1076 [19:03<04:41,  1.39s/it]

Calling ChatGPT...


 81%|████████▏ | 875/1076 [19:06<06:11,  1.85s/it]

Author returned by ChatGPT: 孙波 沈虹冰 梁娟 陈晨 张斌
Calling ChatGPT...


 81%|████████▏ | 876/1076 [19:08<06:05,  1.83s/it]

Author returned by ChatGPT: 韩卫国 刘雷


 82%|████████▏ | 878/1076 [19:11<05:09,  1.56s/it]

Calling ChatGPT...


 82%|████████▏ | 879/1076 [19:12<04:56,  1.50s/it]

Author returned by ChatGPT: 梁佩韵


 83%|████████▎ | 894/1076 [19:32<04:38,  1.53s/it]

Calling ChatGPT...


 83%|████████▎ | 895/1076 [19:34<04:58,  1.65s/it]

Author returned by ChatGPT: 侯亚景 李淼 侯冲 樊邦平 张雨


 90%|████████▉ | 966/1076 [21:05<02:12,  1.21s/it]

Calling ChatGPT...


 90%|████████▉ | 967/1076 [21:06<02:23,  1.32s/it]

Author returned by ChatGPT: 伍正华 王通化


 92%|█████████▏| 993/1076 [21:36<01:43,  1.24s/it]

Calling ChatGPT...


 92%|█████████▏| 994/1076 [21:37<01:47,  1.31s/it]

Author returned by ChatGPT: 李军 孙继炼


 96%|█████████▌| 1032/1076 [22:23<00:51,  1.18s/it]

Calling ChatGPT...


 96%|█████████▌| 1033/1076 [22:25<01:00,  1.41s/it]

Author returned by ChatGPT: 丁来杭 于忠福
Calling ChatGPT...


 96%|█████████▌| 1034/1076 [22:26<01:00,  1.44s/it]

Author returned by ChatGPT: 荆博 庞高杰


 98%|█████████▊| 1054/1076 [22:51<00:24,  1.10s/it]

Calling ChatGPT...


 98%|█████████▊| 1055/1076 [22:53<00:26,  1.26s/it]

Author returned by ChatGPT: 吴强 旷思思


 99%|█████████▉| 1067/1076 [23:06<00:09,  1.07s/it]

Calling ChatGPT...


 99%|█████████▉| 1068/1076 [23:07<00:09,  1.22s/it]

Author returned by ChatGPT: 袁誉柏 王建武


100%|██████████| 1076/1076 [23:17<00:00,  1.30s/it]


In [28]:
len(articles_df)

2189

In [29]:
articles_df.to_csv("all_articles.csv", index=False)

## Fix two missing articles

In [29]:
missing_articles = ['http://www.qstheory.cn/dukan/qs/2023-01/31/c_1129324011.htm', 'http://www.qstheory.cn/dukan/qs/2023-06/16/c_1129695716.htm']

In [30]:
import pandas as pd

# import the previously run results
articles = pd.read_csv('all_articles.csv')
len(articles)

2191

In [21]:
# drop the two articles that did not got loaded correctly before
articles = articles[(articles['url'] != missing_articles[0]) & (articles['url'] != missing_articles[1])]
len(articles)

2187

In [None]:
# write it back
articles.to_csv('all_articles.csv', index=False)

In [25]:
# re-run them and add them to a new df

missing_articles_df = pd.DataFrame(columns=["authors", "title", "date", "year", "edition", "url", "text"])

for link in missing_articles:
    article_parser = QiuShiArticleParser(link)
    article = QiuShiArticle(article_parser)
    print(article.__dict__)

    missing_articles_df.loc[len(missing_articles_df)] = [article.authors, article.title, article.date, article.year, article.edition, article.url, article.text]


{'authors': ['《求是》杂志编辑部'], 'title': '把党的伟大自我革命进行到底', 'date': '2023-01-31', 'year': '2023', 'edition': '03', 'url': 'http://www.qstheory.cn/dukan/qs/2023-01/31/c_1129324011.htm', 'text': '把党的伟大自我革命进行到底《求是》杂志编辑部“要站在事关党长期执政、国家长治久安、人民幸福安康的高度，把全面从严治党作为党的长期战略、永恒课题，始终坚持问题导向，保持战略定力，发扬彻底的自我革命精神，永远吹冲锋号，把严的基调、严的措施、严的氛围长期坚持下去，把党的伟大自我革命进行到底。”在二十届中央纪委二次全会上，习近平总书记从新时代新征程党和国家事业发展全局的高度，深刻阐明健全全面从严治党体系的目标任务、实践要求，对坚定不移深入推进全面从严治党作出战略部署。治国必先治党，党兴才能国强。党的十八大以来，习近平总书记发表一系列重要讲话、提出一系列重要思想和重要要求，为深入推进全面从严治党指明前进方向、提供根本遵循。以习近平同志为核心的党中央把全面从严治党纳入“四个全面”战略布局，刀刃向内、刮骨疗毒，猛药祛疴、重典治乱，管党治党宽松软状况得到根本扭转，党在革命性锻造中更加坚强有力，开辟了百年大党自我革命的新境界，引领保障党和国家事业取得历史性成就、发生历史性变革。《全面从严治党探索出依靠党的自我革命跳出历史周期率的成功路径》一文，是习近平总书记2022年1月18日在十九届中央纪委六次全会上重要讲话的一部分。在这篇重要文章中，习近平总书记深刻阐述全面从严治党取得的历史性开创性成就、产生的全方位深层次影响，深刻总结新时代党的自我革命的成功实践，精辟概括对建设什么样的长期执政的马克思主义政党、怎样建设长期执政的马克思主义政党的规律性认识，为推进新时代党的建设新的伟大工程提供了基本遵循。要深刻学习领会这篇重要文章，学习好贯彻好习近平总书记党的二十大以来的一系列重要讲话精神，学习好贯彻好二十届中央纪委二次全会等一系列重要会议精神，切实把党的二十大精神学深悟透、融会贯通、落实落地，以党的伟大自我革命引领伟大社会革命，为全面建设社会主义现代化国家开好局起好步提供坚强保障。跳出历史周期率的第二个答案“我经常讲跳出历史周

In [26]:
missing_articles_df

Unnamed: 0,authors,title,date,year,edition,url,text
0,[《求是》杂志编辑部],把党的伟大自我革命进行到底,2023-01-31,2023,3,http://www.qstheory.cn/dukan/qs/2023-01/31/c_1...,把党的伟大自我革命进行到底《求是》杂志编辑部“要站在事关党长期执政、国家长治久安、人民幸福安...
1,,时代楷模：敦煌研究院文物保护利用群体,2023-06-16,2023,12,http://www.qstheory.cn/dukan/qs/2023-06/16/c_1...,时代楷模：敦煌研究院文物保护利用群体图为敦煌研究院文物保护利用群体代表在“时代楷模”发布仪式...


In [27]:
missing_articles_df.to_csv('missing_articles.csv', index=False)

In [28]:
# merge the two CSV files
with open('missing_articles.csv', 'r') as f1:
    missing_articles = f1.read()

with open('all_articles.csv', 'a') as f2:
    f2.write('\n')
    f2.write(missing_articles)


after this the file had a few issues, and I ended up going through and deleting the link-only entries that I had confirmed were invalid, and finalizing the addition of the new articles to the end. so it was not totally programatic, which is obviously less than ideal.