In [192]:
from semanticscholar import SemanticScholar
import csv
import requests
import json
import os
from requests import Session
from typing import Generator, Union
# Semantic Scholar API key, taken from the S2APIKEY environment variable (None when it is unset).
API_KEY = os.environ.get('S2APIKEY')

### [Semantic Scholar API Calls](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/post_graph_get_papers)

- Check their Git repository for usage examples.
- out = sch.search_author(query="Graham Neubig")
  - then out[0] is a dict with keys dict_keys(['authorId', 'externalIds', 'url', 'name', 'affiliations', 'homepage', 'paperCount', 'citationCount', 'hIndex', 'papers'])
    - out[0].papers[0] has dict_keys(['paperId', 'externalIds', 'corpusId', 'publicationVenue', 'url', 'title', 'abstract', 'venue', 'year', 'referenceCount', 'citationCount', 'influentialCitationCount', 'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 's2FieldsOfStudy', 'publicationTypes', 'publicationDate', 'journal', 'authors'])

        - out[0].papers[0].externalIds.keys() has dict_keys(['ArXiv', 'DBLP', 'DOI', 'CorpusId'])
        - out[0].papers[0].publicationVenue.keys() has dict_keys(['id', 'name', 'alternate_names', 'issn', 'url'])
        - out[0].papers[0].journal.keys() has dict_keys(['volume', 'name'])
        - out[0].papers[0].authors is a list of dicts
- Example `authors` value:
- [{'authorId': '2279677197', 'name': 'Abhika Mishra'}, {'authorId': '35584853', 'name': 'Akari Asai'}, {'authorId': '143820870', 'name': 'Vidhisha Balachandran'}]
- If reading any of the `out` attributes raises an `AttributeError`, skip that attribute rather than adding it.


In [191]:

def fetch_and_process_author_papers(author_name:str, year:int, save_raw=False):
    """Fetch one author's papers from Semantic Scholar and save the open-access ones from ``year``.

    The name is resolved through the Graph API author-search endpoint (the
    first hit is used). The author record is then requested together with its
    papers, and every paper whose ``year`` matches ``year`` and whose
    ``isOpenAccess`` flag is truthy is written to
    ``data/paper/processed_output/filtered_output_<author_name>_<author_id>.json``.

    Args:
        author_name: Search query for the author; also embedded verbatim in
            the output file names.
        year: Publication year to keep. Compared as a string, so ``2023`` and
            ``"2023"`` select the same papers.
        save_raw: When true, the unfiltered author record is additionally
            written to
            ``data/paper/raw_output/raw_output_<author_name>_<author_id>.json``.

    Returns:
        None. Results go to disk; progress and fetch errors are printed.

    Raises:
        EnvironmentError: ``API_KEY`` is empty or ``None``.
        Exception: the author search answered with a non-200 status.
        IndexError: the author search succeeded but returned no authors.
    """
    if not API_KEY:
        # API_KEY is loaded from the S2APIKEY variable, so report that exact name.
        raise EnvironmentError("S2APIKEY environment variable not set.")

    # Every request authenticates with the API key header.
    headers = {
        "x-api-key": API_KEY
    }

    # Resolve the author name to an author ID.
    # Alternative via the client library:
    #   sch = SemanticScholar(api_key=API_KEY); sch.search_author(query=author_name)
    search_response = requests.get(
        'https://api.semanticscholar.org/graph/v1/author/search',
        headers=headers,
        params={'query': author_name}
    )
    if search_response.status_code != 200: # from their git
        raise Exception(f"Error finding author ID: {search_response.status_code}")

    matches = search_response.json()['data']
    if not matches:
        # Same exception type as indexing the empty list would raise, but it
        # now says which query came back empty.
        raise IndexError(f"No Semantic Scholar author found for query {author_name!r}")
    author_id = matches[0]['authorId']

    # Field names taken from the API documentation: author-level fields first,
    # then the per-paper fields (prefixed with "papers.").
    fields = "authorId,name,url,hIndex,affiliations,paperCount,citationCount"
    fields += ",papers.paperId,papers.title,papers.year,papers.url,papers.abstract,papers.authors"
    fields += ",papers.externalIds,papers.isOpenAccess,papers.openAccessPdf,papers.fieldsOfStudy"
    fields += ",papers.influentialCitationCount,papers.journal"

    response = requests.get( # following their git modules
        f'https://api.semanticscholar.org/graph/v1/author/{author_id}',
        headers=headers,
        params={'fields': fields}
    )

    if response.status_code != 200:
        # A failed fetch is reported, not raised.
        print(f"Error fetching author data: {response.status_code}")
        if response.text:
            try:
                print(json.dumps(response.json(), indent=2))
            except ValueError:
                # Error bodies are not guaranteed to be JSON (e.g. an HTML
                # gateway page); show them verbatim instead of crashing here.
                print(response.text)
        else:
            print("No additional error information is provided.")
        return

    author_data = response.json()

    # Paths for saving jsons
    raw_data_dir = 'data/paper/raw_output'
    processed_data_dir = 'data/paper/processed_output'
    raw_file_name = f'raw_output_{author_name}_{author_id}.json'
    processed_file_name = f'filtered_output_{author_name}_{author_id}.json'

    # Optionally keep the untouched API response next to the filtered one.
    if save_raw:
        os.makedirs(raw_data_dir, exist_ok=True)
        raw_json_file_path = os.path.join(raw_data_dir, raw_file_name)
        with open(raw_json_file_path, 'w') as raw_file:
            json.dump(author_data, raw_file, indent=2)
            print(f"Saved raw data to {raw_json_file_path}")

    # Keep only open-access papers from the requested year. Comparing as
    # strings lets `year` be passed as either an int or a str.
    filtered_papers = [
        paper for paper in author_data.get('papers', [])
        if str(paper.get('year')) == str(year) and paper.get('isOpenAccess')
    ]

    # Save the processed data
    os.makedirs(processed_data_dir, exist_ok=True)
    processed_json_file_path = os.path.join(processed_data_dir, processed_file_name)
    with open(processed_json_file_path, 'w') as processed_file:
        json.dump(filtered_papers, processed_file, indent=2)
        print(f"Saved processed data to {processed_json_file_path}")

Saved processed data to data/paper/processed_output/filtered_output_Graham Neubig_1700325.json


In [193]:
# Write the filtered 2023 open-access paper file for each name in lti_faculty;
# the raw API responses are not saved.
for faculty in lti_faculty:
    fetch_and_process_author_papers(faculty, 2023, save_raw=False)


year
