In [42]:
'''given a url, scrape the article information'''

from newspaper import Article
from urllib.request import urlopen
from bs4 import BeautifulSoup
import json
import openai

def process_complicated_author(author):
    """Ask the OpenAI chat API to extract clean author names from a messy byline.

    Args:
        author: Raw byline text; it is interpolated verbatim into the prompt.

    Returns:
        The message content of the first completion choice. The prompt asks
        for the names separated by single spaces.

    Side effects:
        Reads ``./openai_auth.json`` on every call and sets ``openai.api_key``
        from its ``"api_key"`` entry.
    """
    # Credentials live in a local JSON file rather than in the notebook itself.
    with open('./openai_auth.json', 'r') as auth_file:
        credentials = json.load(auth_file)
    openai.api_key = credentials["api_key"]

    # NOTE: the prompt text is runtime data and is kept exactly as written.
    instructions = f"""
    Please give me the names of the authors included in this text, and remove any unnecessary spaces within names. Please give me the names, seperated by a space, and nothing else. For example, if the text was "求是》杂志记者 何雯雯 新县融媒体中心记者 韩 燕", I would want you to tell me "何雯雯 韩燕". The text is: {author}
    """

    user_message = {"role": "user", "content": instructions}
    completion = openai.chat.completions.create(
        model="gpt-3.5-turbo-1106",
        messages=[user_message],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

class QiuShiArticleParser:
    """Download one article page and expose its metadata and body text.

    The page is fetched twice: once into a BeautifulSoup tree (used for the
    authors, title, date and year/edition fields) and once through
    ``newspaper.Article`` (used for the body text). If anything in that
    process fails, both ``soup`` and ``article`` are set to ``None`` and every
    getter except ``get_url`` returns ``None`` (``(None, None)`` for
    ``get_year_edition``).

    Getters also return ``None`` when the element they look for is absent
    from the page, instead of raising.
    """

    def __init__(self, url):
        """Fetch and parse ``url``; on any failure leave soup/article as None."""
        self.url = url
        try:
            # Use the response as a context manager so the connection is
            # closed deterministically instead of being leaked.
            with urlopen(url) as page:
                html = page.read().decode("utf-8")
            self.soup = BeautifulSoup(html, "html.parser")

            # get the newspaper article version
            self.article = Article(url, language='zh')
            self.article.download()
            self.article.parse()
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C still interrupts a long scrape.
            print(f"Article with URL: {url} failed to parse")
            self.soup = None
            self.article = None

    def get_authors(self):
        """Return the article's authors as a list of names, or None.

        The byline is read from the second ``span.appellation`` element, and
        the text after the first full-width colon is taken as the author text.

        Returns:
            * ``None`` if the page failed to load or the byline is missing.
            * ``[name]`` when the text has no spaces, or is a two-character
              name split by a single space (e.g. ``"韩 燕"`` -> ``["韩燕"]``).
            * Otherwise the whitespace-separated names returned by
              ``process_complicated_author``.
            * ``["CHATGPT" + raw_text]`` when that call fails, so failures stay
              identifiable while the return type remains a list.
        """
        if self.soup is None:
            return None

        span_els = self.soup.find_all("span", class_="appellation")
        # author is the second appellation element
        if len(span_els) < 2:
            return None

        # remove the "Author:" piece (label and value are separated by "：")
        pieces = span_els[1].text.strip().split("：")
        if len(pieces) < 2:
            return None
        # Strip again: a space right after the colon would otherwise be
        # mistaken for a space inside the name.
        author = pieces[1].strip()

        # no spaces, should be good to go
        if " " not in author:
            return [author]
        # spaces, but length three - a 2 char name with a space in the middle
        if len(author) == 3:
            return [author.replace(" ", "")]

        # something more complicated is going on - could be multiple authors,
        # could have additional info
        print("Calling ChatGPT...")
        try:
            cleaned = process_complicated_author(author)
            print(f"Author returned by ChatGPT: {cleaned}")
            # split on any whitespace so stray newlines / double spaces do not
            # produce empty names
            return cleaned.split()
        except Exception:
            print("ChatGPT call failed")
        return ["CHATGPT" + author]

    def get_title(self):
        """Return the stripped text of the first ``<h1>``, or None if unavailable."""
        if self.soup is None:
            return None
        heading = self.soup.find('h1')
        if heading is None:
            return None
        return heading.text.strip()

    def get_year_edition(self):
        """Return ``(year, edition)`` strings, or ``(None, None)`` if unavailable.

        Both values are sliced from the last seven characters of the first
        ``span.appellation`` element: the first four are the year and the last
        two the edition.
        NOTE(review): this presumes the text ends in a ``YYYY?NN`` token - confirm
        against the live page markup.
        """
        if self.soup is None:
            return None, None

        span_els = self.soup.find_all("span", class_="appellation")
        # year/edition is the first appellation element
        if not span_els:
            return None, None
        ye_text = span_els[0].text.strip()
        # get the last piece with the date
        cleaned_ye_text = ye_text[-7:]

        return cleaned_ye_text[:4], cleaned_ye_text[-2:]

    def get_date(self):
        """Return the date part of ``span.pubtime`` (text before the first space), or None."""
        if self.soup is None:
            return None

        pubtime_el = self.soup.find("span", class_="pubtime")
        if pubtime_el is None:
            return None
        # remove the time piece
        return pubtime_el.text.strip().split(" ")[0]

    def get_url(self):
        """Return the URL this parser was created with."""
        return self.url

    def get_text(self):
        """Return the newspaper-extracted body text with newlines removed, or None."""
        if self.article is None:
            return None

        return self.article.text.replace('\n', "")
    
class QiuShiArticle:
    """Plain record of one article's fields, filled from a parser object.

    Attributes (set in ``__init__`` from the parser's getters):
        authors, title, date, year, edition, url, text
    """

    def __init__(self, article_parser):
        """Read every field from ``article_parser``.

        Args:
            article_parser: Object providing ``get_authors``, ``get_title``,
                ``get_date``, ``get_year_edition`` (returning a 2-tuple),
                ``get_url`` and ``get_text``.
        """
        self.authors = article_parser.get_authors()
        self.title = article_parser.get_title()
        self.date = article_parser.get_date()
        self.year, self.edition = article_parser.get_year_edition()
        self.url = article_parser.get_url()

        # Text extraction is best-effort: a failure leaves text as None.
        # Catch Exception (not a bare except) so Ctrl-C still interrupts.
        try:
            self.text = article_parser.get_text()
        except Exception:
            self.text = None

    def toJson(self):
        """Return the article's fields as a dict.

        A shallow copy is returned so that callers editing the result do not
        silently modify this object's attributes.
        """
        return dict(self.__dict__)

In [30]:
import pandas as pd
# Load the table of article links; the columns used below are
# year_name, edition_name and link.
links_df = pd.read_csv('./links.csv')
# Preview the first rows.
links_df.head()

Unnamed: 0,year_name,edition_name,link
0,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
1,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
2,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2023-12/31/c_1...
3,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...
4,《求是》2024年,2024年第1期《求是》目录,http://www.qstheory.cn/dukan/qs/2024-01/01/c_1...


In [31]:
# Keep only the link column and draw 100 articles to scrape.
# - random_state pins the sample so a fresh "Restart & Run All" scrapes the
#   same articles every time.
# - errors="ignore" keeps the cell re-runnable: links_df is overwritten here,
#   so on a second run the two columns are already gone.
links_df = (
    links_df
    .drop(columns=["year_name", "edition_name"], errors="ignore")
    .sample(100, random_state=42)
)
links_df.head()

Unnamed: 0,link
583,http://www.qstheory.cn/dukan/qs/2022-03/01/c_1...
1388,http://www.qstheory.cn/dukan/qs/2020-01/01/c_1...
908,http://www.qstheory.cn/dukan/qs/2022-11/01/c_1...
1681,http://www.qstheory.cn/dukan/qs/2020-09/16/c_1...
1790,http://www.qstheory.cn/dukan/qs/2020-12/16/c_1...


In [43]:
# Empty results frame with one column per article field.
sample_articles_df = pd.DataFrame(
    columns=[
        "authors",
        "title",
        "date",
        "year",
        "edition",
        "url",
        "text",
    ]
)

In [44]:
from tqdm import tqdm

In [46]:
# Scrape every sampled link. Rows are collected in a plain list and the
# DataFrame is built once at the end: growing a frame with .loc[len(df)] copies
# data on every insert, and re-running the cell would append duplicate rows.
article_rows = []
for link in tqdm(links_df["link"]):
    article_parser = QiuShiArticleParser(link)
    article = QiuShiArticle(article_parser)

    article_rows.append([
        article.authors,
        article.title,
        article.date,
        article.year,
        article.edition,
        article.url,
        article.text,
    ])

# Rebuilt from scratch on each run, so the cell is idempotent.
sample_articles_df = pd.DataFrame(
    article_rows,
    columns=["authors", "title", "date", "year", "edition", "url", "text"],
)


  0%|          | 0/100 [00:00<?, ?it/s]

 23%|██▎       | 23/100 [00:29<01:36,  1.26s/it]

Calling ChatGPT...


 24%|██▍       | 24/100 [00:30<01:40,  1.32s/it]

Author returned by ChatGPT: 侯亚景 李贵文


 32%|███▏      | 32/100 [00:40<01:29,  1.31s/it]

Calling ChatGPT...


 33%|███▎      | 33/100 [00:42<01:42,  1.52s/it]

Author returned by ChatGPT: 中央港澳工作领导小组办公室 国务院港澳事务办公室


 41%|████      | 41/100 [00:53<01:17,  1.31s/it]

Calling ChatGPT...


 42%|████▏     | 42/100 [00:55<01:41,  1.74s/it]

Author returned by ChatGPT: 周昭成 龙丹梅


 50%|█████     | 50/100 [01:07<01:14,  1.50s/it]

Calling ChatGPT...


 51%|█████     | 51/100 [01:08<01:20,  1.64s/it]

Author returned by ChatGPT: 王寅 韩辰


 58%|█████▊    | 58/100 [01:16<00:44,  1.06s/it]

Article with URL: http://www.qstheory.cn/dukan/qs/2019-04/16/c_1124364204.htm failed to parse


 63%|██████▎   | 63/100 [01:22<00:39,  1.07s/it]

Calling ChatGPT...


 64%|██████▍   | 64/100 [01:25<00:54,  1.51s/it]

Author returned by ChatGPT: 尹霞


 83%|████████▎ | 83/100 [01:50<00:19,  1.18s/it]

Calling ChatGPT...


 84%|████████▍ | 84/100 [01:51<00:19,  1.19s/it]

Author returned by ChatGPT: 刘名美


 89%|████████▉ | 89/100 [01:57<00:12,  1.11s/it]

Calling ChatGPT...


 90%|█████████ | 90/100 [01:58<00:12,  1.25s/it]

Author returned by ChatGPT: 李孝纯 晏培娟


 98%|█████████▊| 98/100 [02:08<00:02,  1.20s/it]

Calling ChatGPT...


 99%|█████████▉| 99/100 [02:10<00:01,  1.31s/it]

Author returned by ChatGPT: 杨学军 方向


100%|██████████| 100/100 [02:11<00:00,  1.31s/it]


In [47]:
# Sanity check: expect one row per sampled link (100 links were sampled above).
len(sample_articles_df)

100

In [48]:
# Preview the first scraped articles.
sample_articles_df.head()

Unnamed: 0,authors,title,date,year,edition,url,text
0,[新华社记者],改革开放启新局,2022-03-01,2022,5,http://www.qstheory.cn/dukan/qs/2022-03/01/c_1...,改革开放启新局新华社记者改革开放是我们党的一次伟大觉醒，是决定当代中国前途命运的关键一招，是...
1,[王玉强],重温《论共产党员的修养》,2020-01-01,2020,1,http://www.qstheory.cn/dukan/qs/2020-01/01/c_1...,1939年7月，刘少奇在延安发表《论共产党员的修养》（以下简称《修养》）演讲。这部名篇丰富了...
2,,中国共产党第二十次全国代表大会关于十九届中央委员会报告的决议,2022-11-01,2022,21,http://www.qstheory.cn/dukan/qs/2022-11/01/c_1...,中国共产党第二十次全国代表大会关于十九届中央委员会报告的决议（2022年10月22日中国共产...
3,[谢伏瞻],抗疫彰显中华优秀传统文化的强大力量,2020-09-16,2020,18,http://www.qstheory.cn/dukan/qs/2020-09/16/c_1...,在全国抗击新冠肺炎疫情表彰大会上，习近平总书记指出：“抗疫斗争伟大实践再次证明，社会主义核心...
4,[新华社记者],赤胆忠魂写初心,2020-12-16,2020,24,http://www.qstheory.cn/dukan/qs/2020-12/16/c_1...,赤胆忠魂写初心新华社记者旌旗猎猎，战鼓催征。党的十八大以来，中国共产党带领亿万中华儿女向绝对...


In [49]:
# Persist the scraped sample without the positional index.
# NOTE(review): the authors column holds Python lists (or None); these are
# presumably written as their string representation and will need parsing when
# the CSV is read back - confirm before relying on the file downstream.
sample_articles_df.to_csv("sample_articles.csv", index=False)