In [2]:
import csv
import datetime
import os
import time

import requests
from bs4 import BeautifulSoup

In [None]:
# Scrape configuration: the date window to walk (both ends are included by
# the download loop) and the arXiv listing page used as the URL prefix.
end_date = datetime.date.today()
start_date = datetime.date(year=2013, month=3, day=20)

url = "https://arxiv.org/list/cs.HC/recent"

Step 3: Define the function that extracts each paper's title, authors, abstract and submission date from a page and flags which keywords it matches

In [4]:
def extract_paper_info(html, keywords):
    """Parse a results page and collect metadata for every listed paper.

    Each ``<li class="arxiv-result">`` element in ``html`` yields one record.
    A field whose element is missing from the markup is recorded as an empty
    string instead of aborting the whole page.

    Args:
        html: Page markup (``str`` or ``bytes``) handed to BeautifulSoup.
        keywords: Iterable of keyword strings; each is searched for
            case-insensitively in the paper's title and abstract.

    Returns:
        A list of dicts, one per paper, with the keys ``title``, ``authors``,
        ``abstract``, ``submission_date``, ``matched_keywords`` (matched
        keywords joined by ", ", or "" when none matched) and
        ``keywords_flag`` (``True`` when at least one keyword matched).
    """
    soup = BeautifulSoup(html, "html.parser")

    # Materialise and lower-case the keywords once: avoids repeating the
    # work per paper and keeps a one-shot iterable from being exhausted
    # after the first paper.
    keyword_pairs = [(keyword, keyword.lower()) for keyword in keywords]

    paper_info_list = []
    # NOTE(review): the "arxiv-result" container and the "list-*" field
    # classes look like they may belong to different arXiv page layouts;
    # if this returns no rows (or only blank fields), confirm the selectors
    # against the live markup.
    for paper in soup.find_all("li", class_="arxiv-result"):
        title = _find_text(paper, "div", "list-title mathjax")
        abstract = _find_text(paper, "p", "mathjax")

        title_lower = title.lower()
        abstract_lower = abstract.lower()
        matched_keywords = [
            keyword
            for keyword, lowered in keyword_pairs
            if lowered in title_lower or lowered in abstract_lower
        ]

        paper_info_list.append({
            "title": title,
            # Authors are separated by newlines in the element text.
            "authors": _find_text(paper, "div", "list-authors").replace("\n", ", "),
            "abstract": abstract,
            "submission_date": _find_text(paper, "div", "list-dateline"),
            "matched_keywords": ", ".join(matched_keywords),
            "keywords_flag": bool(matched_keywords),
        })

    return paper_info_list


def _find_text(parent, tag, css_class):
    """Return the stripped text of the first matching descendant, or ""."""
    node = parent.find(tag, class_=css_class)
    # find() yields None when nothing matches, so guard before reading .text.
    return "" if node is None else node.text.strip()


In [5]:
def save_to_csv(paper_info_list, directory, file_date=None):
    """Write paper records to ``<directory>/arxiv_<YYYY-MM-DD>.csv``.

    An existing file with the same name is overwritten.

    Args:
        paper_info_list: Iterable of dicts keyed by the CSV column names
            (``title``, ``authors``, ``abstract``, ``submission_date``,
            ``keywords_flag``, ``matched_keywords``).
        directory: Destination folder; it is created if it does not exist.
        file_date: Optional ``datetime.date`` or ``datetime.datetime`` used
            for the date stamp in the filename. Defaults to today, which is
            what the two-argument call has always used.

    Returns:
        The path of the CSV file that was written.
    """
    if file_date is None:
        file_date = datetime.datetime.today()
    date_string = file_date.strftime('%Y-%m-%d')

    # Create the target folder up front so a fresh machine or a new output
    # location does not fail with FileNotFoundError on open().
    if directory:
        os.makedirs(directory, exist_ok=True)
    filename = os.path.join(directory, f"arxiv_{date_string}.csv")

    fieldnames = ["title", "authors", "abstract", "submission_date", "keywords_flag", "matched_keywords"]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(paper_info_list)

    return filename

In [6]:
# Terms searched for (case-insensitively) in each paper's title and abstract.
keywords = [
    "cognitive bias",
    "psychology",
    "human-ai interaction",
    "human behavior",
    "behavioral science",
    "heuristics",
]

# Folder the CSV is written to. Set the ARXIV_OUTPUT_DIR environment variable
# to run the notebook on another machine; the original location is kept as
# the default so existing runs are unaffected.
directory = os.environ.get("ARXIV_OUTPUT_DIR", "/Users/annaking/Documents/LSE/Dissertation")

In [None]:
# Walk the date window one day at a time, collect the records from every page
# that loads, and write ONE CSV at the end. Saving inside the loop targeted
# the same today-stamped file on every iteration, so each day overwrote the
# previous one and only the last successful day survived.
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection hangs forever
REQUEST_DELAY_SECONDS = 3     # pause between requests to avoid hammering the server

all_papers = []
date = start_date
while date <= end_date:
    date_str = date.strftime("%y%m%d")
    # NOTE(review): confirm that arXiv actually serves per-day listings at
    # "<listing url>/<yymmdd>"; if every request fails, the URL scheme is
    # the first thing to check.
    page_url = f"{url}/{date_str}"

    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        # A network hiccup on one day should not abort a multi-year crawl.
        print(f"Failed to scrape data for {date}: {exc}")
    else:
        if response.status_code == 200:
            all_papers.extend(extract_paper_info(response.content, keywords))
            print(f"Scraped data for {date}")
        else:
            print(f"Failed to scrape data for {date}")

    date += datetime.timedelta(days=1)
    time.sleep(REQUEST_DELAY_SECONDS)

save_to_csv(all_papers, directory)
print(f"Saved {len(all_papers)} records")