In [1]:
import csv
import requests
import json
import sys
from requests import Session
from typing import Generator, Union
from requests import Session
import subprocess
import pandas as pd
import logging 
import numpy as np


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import os

# API key sent as the "x-api-key" header on Semantic Scholar requests.
# Read from the S2APIKEY environment variable; None when that variable is unset.
API_KEY = os.getenv('S2APIKEY')

In [3]:
# Root directory for everything this notebook writes to disk.
base_dir = "data"

In [None]:
# Faculty rosters, grouped by appointment type (source noted by the author: "website").
# NOTE(review): "Bhiksha Ramakrishnan" and "Bhiksha Raj" may be two spellings of the
# same person -- confirm before relying on the total count.
lti_faculty = ["Yonatan Bisk", "Ralf Brown", "Jamie Callan", "Justine Cassell", "Mona Diab", "Fernando Diaz",
               "Scott Fahlman", "Robert Frederking", "Daniel Fried", "Anatole Gershman", "Alexander Hauptmann",
               "Daphne Ippolito", "Lori S Levin", "Lei Li", "Teruko Mitamura", "Louis-Philippe Morency", "David R Mortensen",
               "Graham Neubig", "Eric Nyberg", "Kemal Oflazer", "Bhiksha Ramakrishnan", "Bhiksha Raj", "Carolyn Rosé", "Alexander Rudnicky",
               "Maarten Sap", "Michael Shamos", "Rita Singh", "Emma Strubell", "Alexander Waibel", "Shinji Watanabe",
               "Sean Welleck", "Eric P. Xing", "Chenyan Xiong", "Yiming Yang"]

affiliated_faculty = ["Jeffrey Bigham", "Matt Gormley", "Ian Lane", "Brian MacWhinney", "Tom Mitchell", "Jack Mostow",
                      "Raj Reddy", "Roni Rosenfeld", "Norman Sadeh", "Richard Stern", "Rodolfo M Vega"]
adjunct_faculty = ["Malihe Alikhani", "Taylor Berg-Kirkpatrick", "William Cohen", "Christopher Dyer",
                   "Madhavi Ganapathiraju", "Matthias Grabmair", "Lu Jiang", "Alon Lavie", "Michael Mauldin",
                   "Florian Metze", "Thomas Schaaf", "Ravi Starzl", "Yulia Tsvetkov", "Monika Woszczyna"]

expected_length = len(lti_faculty) + len(affiliated_faculty) + len(adjunct_faculty)

# sorted() builds a fresh list, so re-running this cell always yields the same result
# and the three source rosters are never reordered.
lti_complete_faculty = sorted(lti_faculty + affiliated_faculty + adjunct_faculty)
assert len(lti_complete_faculty) == expected_length, "Length does not match the expected value."

# The length check above holds by construction; the mistake worth catching is the
# same name being listed twice (within one roster or across rosters).
duplicate_names = sorted({name for name in lti_complete_faculty if lti_complete_faculty.count(name) > 1})
assert not duplicate_names, f"Duplicate faculty names: {duplicate_names}"

print(len(lti_complete_faculty))

59


In [5]:
# Load the author name -> author id mapping and show every row whose
# author_id occurs more than once (keep=False marks all occurrences).
author_id_path = 'manual_authorId.csv'
author_df = pd.read_csv(author_id_path)
repeated_id_mask = author_df.duplicated(subset='author_id', keep=False)
duplicates = author_df[repeated_id_mask]
print(duplicates)

Empty DataFrame
Columns: [author_name, author_id]
Index: []


### Fetching author data + paperIds

In [6]:
def add_paper_id_csv(paper_id, paper_title, author_name, author_id, field, csv_file_path='data/paper_logs/paperId.csv'):
    """Append one paper record to the paper-id CSV unless that paper id is already logged.

    The CSV is created with a header row on first use, including any missing
    parent directories.

    Args:
        paper_id: Identifier written to the first column and used for de-duplication.
        paper_title: Title written to the second column.
        author_name: Author name written to the third column.
        author_id: Author identifier written to the fourth column.
        field: Value written to the fifth column (written via the csv module's
            default string conversion, so a list is stored as its repr).
        csv_file_path: Path of the CSV file to create or extend.

    Returns:
        None.
    """
    header = ['paper_id', 'paper_title', 'author_name', 'author_id', 'field']

    # open(..., 'w') does not create missing directories, so make them first.
    parent_dir = os.path.dirname(csv_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if not os.path.exists(csv_file_path):
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(header)

    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row
        # csv.reader yields [] for blank lines; `if row` keeps those from raising IndexError.
        known_paper_ids = {row[0] for row in reader if row}

    # Only append papers that are not logged yet.
    if paper_id not in known_paper_ids:
        with open(csv_file_path, 'a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow([paper_id, paper_title, author_name, author_id, field])

In [7]:
def fetch_paper_json(author_id, 
                    year, 
                    open_access = True,
                    manual=True, #['Computer Science', 'Linguistics']
                    save_raw = False,
                    parent_dir =  'data'):
    """Fetch one author's papers from the Semantic Scholar Graph API and save the filtered set.

    Requests the author record (with nested paper fields), keeps the papers whose
    year matches ``year`` (and, when ``open_access`` is true, whose ``isOpenAccess``
    flag is truthy), logs each kept paper id through ``add_paper_id_csv`` and writes
    the kept papers, each merged with the author-level fields, to
    ``<parent_dir>/paper_jsons/<name>_<author_id>.json``. Progress and errors are
    reported with ``print``.

    Args:
        author_id: Author identifier placed in the request URL.
        year: Publication year to keep; compared as a string.
        open_access: When true, request ``papers.isOpenAccess`` and keep only open
            access papers. When false, no open-access filtering is applied.
        manual: Currently unused; kept for interface compatibility (the
            field-of-study filter it used to control is disabled).
        save_raw: When true, also dump the unfiltered API response to
            ``<parent_dir>/paper_jsons/raw_<name>_<author_id>.json``.
        parent_dir: Root directory for the JSON outputs and the paper-id CSV log.

    Returns:
        None.

    Raises:
        EnvironmentError: If ``API_KEY`` is empty or unset.
    """
    if not API_KEY:
        # The key is read from the S2APIKEY variable, so name that one in the message.
        raise EnvironmentError("S2APIKEY environment variable not set.")
    # Set up the headers with the API key
    headers = {
        "x-api-key": API_KEY
    }
    print("--------------------------------------------------------------------------------------------")

    fields = "authorId,name,url,hIndex,affiliations,paperCount,citationCount"
    fields += ",papers.paperId,papers.title,papers.year,papers.url,papers.abstract,papers.authors"
    fields += ",papers.externalIds,papers.openAccessPdf,papers.fieldsOfStudy"
    fields += ",papers.influentialCitationCount,papers.journal"
    if open_access:
        fields += ",papers.isOpenAccess"

    response = requests.get( # following their git modules
        f'https://api.semanticscholar.org/graph/v1/author/{author_id}',
        headers=headers,
        params={'fields': fields},
        timeout=30,  # without a timeout a stalled connection would block the whole run
    )

    if response.status_code != 200:
        print(f"Error fetching author data: {response.status_code}")
        if response.text:
            try:
                print(json.dumps(response.json(), indent=2))
            except ValueError:
                # The error body is not JSON (e.g. an HTML error page); show it verbatim.
                print(response.text)
        else:
            print("No additional error information is provided.")
        return

    author_data = response.json()
    name_fetched = author_data.get("name")
    # A path separator inside the author name would otherwise point into a
    # non-existent sub-directory when the file is opened.
    safe_name = str(name_fetched).replace('/', '_').replace(os.sep, '_')

    # Paths for saving jsons (raw and processed files share one directory)
    raw_data_dir = parent_dir + '/paper_jsons'
    processed_data_dir = parent_dir + '/paper_jsons'
    raw_file_name = f'raw_{safe_name}_{author_id}.json'
    processed_file_name = f'{safe_name}_{author_id}.json'

    # If want to save the raw output (not filtered)
    if save_raw:
        os.makedirs(raw_data_dir, exist_ok=True)
        raw_json_file_path = os.path.join(raw_data_dir, raw_file_name)
        with open(raw_json_file_path, 'w') as raw_file:
            json.dump(author_data, raw_file, indent=2)
            print(f"Saved raw data to {raw_json_file_path}")

    all_papers = author_data.get('papers') or []

    # Author-level fields copied onto every kept paper record.
    author_fields = {
        "authorId": author_data.get("authorId"),
        "authorName": author_data.get("name"),
        "authorUrl": author_data.get("url"),
        "authorHIndex": author_data.get("hIndex"),
        "authorAffiliations": author_data.get("affiliations", []),
        "authorPaperCount": author_data.get("paperCount"),
        "authorCitationCount": author_data.get("citationCount"),
    }
    # Filters: year, plus open access only when requested. isOpenAccess is not even
    # fetched when open_access is False, so it must not be tested in that case.
    filtered_papers = [
        {**author_fields, **paper}
        for paper in all_papers
        if str(paper.get('year')) == str(year)
        and (not open_access or paper.get('isOpenAccess'))
    ]

    print(f"Author: {name_fetched}, id:{author_id}, All: {len(all_papers)}" )
    # Keep the paper-id log under the same root directory as the JSON outputs.
    paper_id_csv_path = parent_dir + '/paper_logs/paperId.csv'
    for paper in filtered_papers:
        paper_id = paper.get('paperId')
        if paper_id:
            add_paper_id_csv(paper_id, paper.get('title'), name_fetched, author_id,
                             paper.get('fieldsOfStudy'), csv_file_path=paper_id_csv_path)

    print(f"Author: {name_fetched}, id:{author_id}, {year}+OpenAccess+Fields: {len(filtered_papers)}" )
    if not filtered_papers:
        print(f"No OpenAccess CS+L papers found for {name_fetched} in {year}")
        return

    # Save the processed data
    os.makedirs(processed_data_dir, exist_ok=True)
    processed_json_file_path = os.path.join(processed_data_dir, processed_file_name)
    with open(processed_json_file_path, 'w') as processed_file:
        json.dump(filtered_papers, processed_file, indent=2)
        print(f"Saved processed data to {processed_json_file_path}")

In [8]:
# Parallel lists taken from the author table: names and their ids, in row order.
name_column = author_df['author_name']
id_column = author_df['author_id']
authors = name_column.to_list()
authorId = id_column.to_list()
len(authors)

65

In [14]:
# Fetch the 2023 papers of every author, capturing fetch_paper_json's printed
# progress in a text file instead of the notebook output.
output_file = '/paper_jsons/paperJson.txt'
output_dir = base_dir + output_file
# open(..., 'w') does not create missing directories.
os.makedirs(os.path.dirname(output_dir), exist_ok=True)
original_stdout = sys.stdout  
with open(output_dir, 'w') as f:
    sys.stdout = f 
    try:
        for current_author_id in authorId:
            fetch_paper_json(author_id = current_author_id, year = 2023)
    finally:
        # Restore stdout even if a fetch raises; otherwise every later print in the
        # kernel would target a closed file.
        sys.stdout = original_stdout  

In [9]:
# Sanity check: count the files ending in .json inside data/paper_jsons.
# fetch_paper_json writes no file for an author with zero matching papers, so this
# count can be lower than the number of authors.
! cd data/paper_jsons && ls | grep '\.json$' | wc -l

      50


In [10]:
# Load the paper-id log into three parallel lists.
# Index 0 of each list holds the CSV header row; downstream loops start at index 1.
# NOTE: `authors` is rebound here to the per-paper author names from the CSV,
# replacing the author list built from author_df earlier in the notebook.
input_name = '/paper_logs/paperId.csv'
input_dir = base_dir + input_name
paper_ids = []
titles = []
authors = []
# The context manager closes the file handle; utf-8 and newline='' match how the
# file is written by add_paper_id_csv.
with open(input_dir, 'r', newline='', encoding='utf-8') as paper_id_file:
    for row in csv.reader(paper_id_file):
        if len(row) < 3:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        paper_ids.append(row[0])  # Access the first column for paper ID
        titles.append(row[1])  # Access the second column for title
        authors.append(row[2])  # Access the third column for author name

In [11]:
# Number of logged papers; the slice drops index 0, which holds the CSV header row.
len(paper_ids[1:])

337

In [12]:
# output = subprocess.getoutput(f'python semantic_scholar_simple.py -d {dir} {paper_ids[1]}')
# output

In [14]:
# Download the PDF of every logged paper that is not on disk yet, writing progress
# to a log file and collecting the failures in `failed_downloads`.

# Setup logging
logging_output = 'data/paper_logs/download_logs_manual_updated.txt'
os.makedirs(os.path.dirname(logging_output), exist_ok=True)
# force=True replaces handlers left on the root logger by an earlier basicConfig
# call; without it this call is silently ignored and nothing reaches this file.
logging.basicConfig(filename=logging_output, level=logging.INFO, format='%(asctime)s - %(message)s', force=True)

# Named papers_dir rather than `dir` so the builtin dir() is not shadowed.
papers_dir = os.path.abspath('data/papers')  # Use absolute paths
os.makedirs(papers_dir, exist_ok=True)  # Ensure directory exists

failed_download_records = []  # one dict per failed download, turned into a DataFrame below

for i in range(1,len(paper_ids)):  # index 0 is the CSV header row
    paper_id = paper_ids[i]
    title = titles[i]
    author = authors[i]
    file_path = os.path.join(papers_dir, f"{paper_id}.pdf")
    logging.info('_______________________________________________________________________')
    logging.info(f'i: {i}, paper_id: {paper_id}, title: {title}, author:{author}')
    if os.path.exists(file_path):
        logging.info(f"Downloaded already")
        continue

    # Pass the arguments as a list (no shell), so a directory containing spaces or a
    # malformed id cannot break or alter the command. stderr is merged into stdout
    # to capture the same text as before.
    completed = subprocess.run(
        [sys.executable or 'python', 'semantic_scholar_simple.py', '-d', papers_dir, paper_id],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    output = completed.stdout.rstrip('\n')
    logging.info(output)
    if "Downloaded" in output and paper_id in output:  
        logging.info(f"Successfully downloaded")
    else:
        logging.error(f"Download failed")
        failed_download_records.append({'author': author, 'paper_id': paper_id, 'title': title})

# Built once after the loop instead of growing a DataFrame row by row.
failed_downloads = pd.DataFrame(failed_download_records, columns=['author', 'paper_id', 'title'])

In [43]:
# Download the PDF of every logged paper that is not on disk yet, printing the
# downloader output to the notebook, logging the outcome per paper and collecting
# the failures in `failed_downloads`.

# Setup logging
logging_output = 'data/paper_logs/download_logs_manual.txt'
os.makedirs(os.path.dirname(logging_output), exist_ok=True)
# force=True replaces handlers left on the root logger by an earlier basicConfig
# call; without it this call is silently ignored and nothing reaches this file.
logging.basicConfig(filename=logging_output, level=logging.INFO, format='%(asctime)s - %(message)s', force=True)

# Named papers_dir rather than `dir` so the builtin dir() is not shadowed.
papers_dir = os.path.abspath('data/papers')  # Use absolute paths
os.makedirs(papers_dir, exist_ok=True)  # Ensure directory exists

failed_download_records = []  # one dict per failed download, turned into a DataFrame below

for i in range(1,len(paper_ids)):  # index 0 is the CSV header row
    paper_id = paper_ids[i]
    title = titles[i]
    author = authors[i]
    file_path = os.path.join(papers_dir, f"{paper_id}.pdf")
    print('_____________________________________________________________________')
    print(f"title: {title}, author:{author}")
    if os.path.exists(file_path):
        logging.info(f"Downloaded already '{paper_id}, title: {title}, author:{author}'")
        continue

    # Pass the arguments as a list (no shell), so a directory containing spaces or a
    # malformed id cannot break or alter the command. stderr is merged into stdout
    # to capture the same text as before.
    completed = subprocess.run(
        [sys.executable or 'python', 'semantic_scholar_simple.py', '-d', papers_dir, paper_id],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    output = completed.stdout.rstrip('\n')

    print(output)

    if "Downloaded" in output and paper_id in output:  
        logging.info(f"Successfully downloaded '{paper_id}, title: {title}, author:{author}'")
    else:
        logging.error(f"Download failed for {paper_id}, title: {title}, author:{author}")
        failed_download_records.append({'author': author, 'paper_id': paper_id, 'title': title})

# Built once after the loop instead of growing a DataFrame row by row.
failed_downloads = pd.DataFrame(failed_download_records, columns=['author', 'paper_id', 'title'])

_____________________________________________________________________
title: Towards Open-Domain Twitter User Profile Inference, author:Alexander Hauptmann
Downloaded '72cce47fd053bf916314d89a8174726c58c05e02' to '/Users/vashisth/Documents/GitHub/ANLP_projects/NLP-RAG/data/papers/72cce47fd053bf916314d89a8174726c58c05e02.pdf'
_____________________________________________________________________
title: Zero-Shot and Few-Shot Stance Detection on Varied Topics via Conditional Generation, author:A. Hauptmann
Downloaded '2107b867cb8f8afa30a9a940288d7c8b657f8aa5' to '/Users/vashisth/Documents/GitHub/ANLP_projects/NLP-RAG/data/papers/2107b867cb8f8afa30a9a940288d7c8b657f8aa5.pdf'
_____________________________________________________________________
title: SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs, author:A. Hauptmann
Downloaded '376f494126d1ea4f571ea0263c43ac2b6331800a' to '/Users/vashisth/Documents/GitHub/ANLP_projects/NLP-RAG/data/papers/376f494126d1ea4f571

In [44]:
# Sanity check: count the files ending in .pdf inside data/papers, i.e. the papers
# that were actually downloaded.
! cd data/papers && ls | grep '\.pdf$' | wc -l

     290


In [45]:
# Papers listed in the paper-id CSV that have no PDF in data/papers.
# Both counts are derived from the current state rather than copied from earlier
# cell outputs, so the result stays correct when the data changes.
expected_pdf_count = len(paper_ids[1:])  # index 0 is the CSV header row
downloaded_pdf_count = sum(
    1
    for entry in os.listdir('data/papers')
    # skip dotfiles so the count matches the plain `ls | grep '\.pdf$'` check
    if entry.endswith('.pdf') and not entry.startswith('.')
)
no_failed = expected_pdf_count - downloaded_pdf_count
no_failed

47