In [68]:
from semanticscholar import SemanticScholar
import csv
import requests
import json
import os
from requests import Session
from typing import Generator, Union
from requests import Session
import subprocess
# Semantic Scholar API key, taken from the S2APIKEY environment variable (None when unset).
API_KEY = os.environ.get('S2APIKEY')

### [Semantic Scholar API Calls](https://api.semanticscholar.org/api-docs/graph#tag/Paper-Data/operation/post_graph_get_papers)

- See their GitHub repository for usage examples.
- out = sch.search_author(query="Graham Neubig")
  - then out[0] is a dict with keys dict_keys(['authorId', 'externalIds', 'url', 'name', 'affiliations', 'homepage', 'paperCount', 'citationCount', 'hIndex', 'papers'])
    - out[0].papers[0] has dict_keys(['paperId', 'externalIds', 'corpusId', 'publicationVenue', 'url', 'title', 'abstract', 'venue', 'year', 'referenceCount', 'citationCount', 'influentialCitationCount', 'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 's2FieldsOfStudy', 'publicationTypes', 'publicationDate', 'journal', 'authors'])

        - out[0].papers[0].externalIds.keys() has dict_keys(['ArXiv', 'DBLP', 'DOI', 'CorpusId'])
        - out[0].papers[0].publicationVenue.keys() has dict_keys(['id', 'name', 'alternate_names', 'issn', 'url'])
        - out[0].papers[0].journal.keys() has dict_keys(['volume', 'name'])
        - out[0].papers[0].authors is a list of dicts
- Eg:
- [{'authorId': '2279677197', 'name': 'Abhika Mishra'}, {'authorId': '35584853', 'name': 'Akari Asai'}, {'authorId': '143820870', 'name': 'Vidhisha Balachandran'}]
- If accessing any of these attributes on `out` raises an AttributeError, skip that field when adding the record.


In [69]:
import csv
import os

def add_paper_details_to_csv(paper_id, paper_title, author_name, csv_file_path='data/paper/processed_output/paperId.csv'):
    """Append one paper record to the CSV index unless its ID is already listed.

    The CSV has the columns ``paper_id, paper_title, author_name``. The file
    (and its parent directory) is created with a header row on first use.

    Args:
        paper_id: Identifier stored in the first column and used for de-duplication.
        paper_title: Title stored in the second column.
        author_name: Author name stored in the third column.
        csv_file_path: Location of the CSV index file.

    Returns:
        None. The function only has the side effect of updating the CSV file.
    """
    # open(..., 'w') does not create missing directories, so make sure the
    # parent exists before touching the file.
    parent_dir = os.path.dirname(csv_file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Create the file with a header row if it does not exist yet.
    if not os.path.exists(csv_file_path):
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(['paper_id', 'paper_title', 'author_name'])

    # Collect the IDs already present; skip the header and ignore blank rows,
    # which would otherwise raise IndexError on row[0].
    with open(csv_file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)
        known_ids = {row[0] for row in reader if row}

    # Only append records whose ID has not been seen before.
    if paper_id not in known_ids:
        with open(csv_file_path, 'a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow([paper_id, paper_title, author_name])

In [70]:
def fetch_and_process_author_papers(author_name:str, year:int, save_raw=False):
    """Fetch an author's papers from Semantic Scholar and save the open-access ones from ``year``.

    The author ID is looked up through the author search endpoint, first with
    ``author_name`` as given and then, if that returns no hits, with spaces
    replaced by hyphens. The first search hit is used. The author record is
    then fetched together with its papers; papers whose year matches ``year``
    and whose ``isOpenAccess`` flag is truthy are kept, enriched with
    author-level fields, registered in the paper-ID CSV and written to
    ``data/paper/processed_output/filtered_output_<author_name>_<author_id>.json``.

    Args:
        author_name: Name used for the author search and in output file names.
        year: Publication year to keep (compared as a string).
        save_raw: When true, also dump the unfiltered author record to
            ``data/paper/raw_output``.

    Returns:
        None. Results are written to disk and progress is printed.

    Raises:
        EnvironmentError: If the API key is not configured.
        Exception: If an author search request returns a non-200 status.
        IndexError: If neither spelling of the name yields a search hit.
    """
    if not API_KEY:
        # API_KEY is read from S2APIKEY, so that is the name the user must set.
        raise EnvironmentError("S2APIKEY environment variable not set.")

    # Set up the headers with the API key
    headers = {"x-api-key": API_KEY}
    # Seconds to wait per request; without a timeout a stalled connection blocks forever.
    request_timeout = 30
    search_url = 'https://api.semanticscholar.org/graph/v1/author/search'

    # A lot of authors are indexed as First-Last instead of First Last, so the
    # hyphenated spelling is tried when the plain name returns nothing.
    queries = [author_name]
    hyphenated_name = author_name.replace(' ', '-')
    if hyphenated_name != author_name:
        queries.append(hyphenated_name)

    matches = []
    for query in queries:
        search_response = requests.get(
            search_url,
            headers=headers,
            params={'query': query},
            timeout=request_timeout,
        )
        if search_response.status_code != 200:
            raise Exception(f"Error finding author ID: {search_response.status_code}")
        matches = search_response.json().get('data') or []
        if matches:
            break

    if not matches:
        # Same exception type the bare [0] lookup used to raise, but with context.
        raise IndexError(f"No Semantic Scholar author found for '{author_name}'")

    author_id = matches[0]['authorId']

    # Field names taken from the API documentation.
    fields = "authorId,name,url,hIndex,affiliations,paperCount,citationCount"
    fields += ",papers.paperId,papers.title,papers.year,papers.url,papers.abstract,papers.authors"
    fields += ",papers.externalIds,papers.isOpenAccess,papers.openAccessPdf,papers.fieldsOfStudy"
    fields += ",papers.influentialCitationCount,papers.journal"

    response = requests.get(
        f'https://api.semanticscholar.org/graph/v1/author/{author_id}',
        headers=headers,
        params={'fields': fields},
        timeout=request_timeout,
    )

    if response.status_code != 200:
        print(f"Error fetching author data: {response.status_code}")
        if response.text:
            try:
                print(json.dumps(response.json(), indent=2))
            except ValueError:
                # Error bodies are not guaranteed to be JSON; show them verbatim.
                print(response.text)
        else:
            print("No additional error information is provided.")
        return

    author_data = response.json()
    papers = author_data.get('papers') or []

    # Paths for saving jsons
    raw_data_dir = 'data/paper/raw_output'
    processed_data_dir = 'data/paper/processed_output'
    raw_file_name = f'raw_output_{author_name}_{author_id}.json'
    processed_file_name = f'filtered_output_{author_name}_{author_id}.json'

    # Optionally keep the raw (unfiltered) API output.
    if save_raw:
        os.makedirs(raw_data_dir, exist_ok=True)
        raw_json_file_path = os.path.join(raw_data_dir, raw_file_name)
        with open(raw_json_file_path, 'w', encoding='utf-8') as raw_file:
            json.dump(author_data, raw_file, indent=2)
            print(f"Saved raw data to {raw_json_file_path}")

    # Author-level fields repeated on every paper record.
    author_fields = {
        "authorId": author_data.get("authorId"),
        "authorName": author_data.get("name"),
        "authorUrl": author_data.get("url"),
        "authorHIndex": author_data.get("hIndex"),
        "authorAffiliations": author_data.get("affiliations", []),
        "authorPaperCount": author_data.get("paperCount"),
        "authorCitationCount": author_data.get("citationCount"),
    }

    # Keep only open-access papers from the requested year.
    filtered_papers = [
        {**author_fields, **paper}
        for paper in papers
        if str(paper.get('year')) == str(year) and paper.get('isOpenAccess')
    ]

    print(f"Author: {author_name} len json: {len(papers)}" )

    # The CSV index lives in the processed directory by default, so create it
    # before the first CSV write rather than after.
    os.makedirs(processed_data_dir, exist_ok=True)
    for paper in filtered_papers:
        paper_id = paper.get('paperId')
        paper_title = paper.get('title')
        paper_authors = author_name
        if paper_id:
            add_paper_details_to_csv(paper_id, paper_title, paper_authors)

    print(f"Author: {author_name}, No of filtered papers: {len(filtered_papers)}" )

    # Save the processed data
    processed_json_file_path = os.path.join(processed_data_dir, processed_file_name)
    with open(processed_json_file_path, 'w', encoding='utf-8') as processed_file:
        json.dump(filtered_papers, processed_file, indent=2)
        print(f"Saved processed data to {processed_json_file_path}")
    print("--------------------------------------------------------------------------------------------")

In [71]:
# website
# Core LTI faculty.
lti_faculty = [
    "Yonatan Bisk", "Ralf Brown", "Jamie Callan", "Justine Cassell",
    "Mona Diab", "Fernando Diaz", "Scott Fahlman", "Robert Frederking",
    "Daniel Fried", "Anatole Gershman", "Alexander Hauptmann", "Daphne Ippolito",
    "Lori Levin", "Lei Li", "Teruko Mitamura", "Louis-Philippe Morency",
    "David Mortensen", "Graham Neubig", "Eric Nyberg", "Kemal Oflazer",
    "Bhiksha Ramakrishnan", "Carolyn Rosé", "Alexander Rudnicky", "Maarten Sap",
    "Michael Shamos", "Rita Singh", "Emma Strubell", "Alexander Waibel",
    "Shinji Watanabe", "Sean Welleck", "Eric P. Xing", "Chenyan Xiong",
    "Yiming Yang",
]

# Affiliated faculty.
affiliated_faculty = [
    "Jeffrey Bigham", "Matt Gormley", "Ian Lane", "Brian MacWhinney",
    "Tom Mitchell", "Jack Mostow", "Raj Reddy", "Roni Rosenfeld",
    "Norman Sadeh", "Richard Stern", "Rodolfo M Vega",
]

# Adjunct faculty.
adjunct_faculty = [
    "Malihe Alikhani", "Taylor Berg-Kirkpatrick", "William Cohen", "Christopher Dyer",
    "Madhavi Ganapathiraju", "Matthias Grabmair", "Lu Jiang", "Alon Lavie",
    "Michael Mauldin", "Florian Metze", "Thomas Schaaf", "Ravi Starzl",
    "Yulia Tsvetkov", "Monika Woszczyna",
]

# Full roster: core faculty first, then affiliated, then adjunct.
lti_complete_faculty = [*lti_faculty, *affiliated_faculty, *adjunct_faculty]
expected_length = sum(map(len, (lti_faculty, affiliated_faculty, adjunct_faculty)))
assert expected_length == len(lti_complete_faculty), "Length does not match the expected value."
print(len(lti_complete_faculty))

58


In [72]:
# Fetch and store the 2023 papers for each faculty member in turn (raw dumps disabled).
for faculty in lti_complete_faculty:
    fetch_and_process_author_papers(faculty, 2023, save_raw=False)

Author: Yonatan Bisk len json: 98
Author: Yonatan Bisk, No of filtered papers: 8
Saved processed data to data/paper/processed_output/filtered_output_Yonatan Bisk_3312309.json
--------------------------------------------------------------------------------------------
Author: Ralf Brown len json: 78
Author: Ralf Brown, No of filtered papers: 0
Saved processed data to data/paper/processed_output/filtered_output_Ralf Brown_2109449533.json
--------------------------------------------------------------------------------------------
Author: Jamie Callan len json: 4
Author: Jamie Callan, No of filtered papers: 0
Saved processed data to data/paper/processed_output/filtered_output_Jamie Callan_17038253.json
--------------------------------------------------------------------------------------------
Author: Justine Cassell len json: 3
Author: Justine Cassell, No of filtered papers: 0
Saved processed data to data/paper/processed_output/filtered_output_Justine Cassell_2065308530.json
-------------

In [73]:
# Sanity check: count the .json files in data/paper/processed_output.
# One filtered_output_*.json is written per successfully fetched author, so the
# count should match the number of faculty names processed above.
! cd data/paper/processed_output && ls | grep '\.json$' | wc -l

      58


In [93]:
# Load the rows of the paper index. Reading inside a `with` block closes the
# file handle, which a bare open() passed straight to csv.reader would leak;
# the rows are materialised as a list so they stay usable after the file closes.
with open('data/paper/processed_output/paperId.csv', 'r', newline='', encoding='utf-8') as paper_id_file:
    paper_id = list(csv.reader(paper_id_file))

In [96]:
# Read the paper index into three parallel lists. The header row is kept at
# index 0 of each list, which is why the download loop starts at index 1.
# The file is written as UTF-8, so it must be read back as UTF-8 (the platform
# default encoding can differ); newline='' is what the csv module expects.
with open('data/paper/processed_output/paperId.csv', 'r', newline='', encoding='utf-8') as file:
    paper_id_reader = csv.reader(file)
    paper_ids = []
    titles = []
    authors = []
    for row in paper_id_reader:
        if len(row) < 3:
            # Blank or truncated line: skip it instead of raising IndexError.
            continue
        paper_ids.append(row[0])  # first column: paper ID
        titles.append(row[1])  # second column: title
        authors.append(row[2])  # third column: author name


In [97]:
# Download the PDF of every indexed paper that is not already on disk.
# Index 0 of each list is the CSV header row, so iteration starts at 1.
for i in range(1, len(paper_ids)):
    paper_id = paper_ids[i]
    title = titles[i]
    author = authors[i]
    pdf_path = f'papers/{paper_id}.pdf'

    if os.path.exists(pdf_path):
        print(f"Downloaded already '{paper_id}, title: {title}, author:{author}")
        continue

    # Pass the command as an argument list (no shell) so that a paper ID read
    # from the CSV can never be interpreted as shell syntax. stderr is merged
    # into stdout to keep the combined output subprocess.getoutput used to give.
    try:
        completed = subprocess.run(
            ['python', 'semantic_scholar_simple.py', '-d', 'papers', paper_id],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
        )
        output = completed.stdout
    except OSError as launch_error:
        # The interpreter could not be started; treat it as a failed download.
        output = str(launch_error)

    expected_output = f"Downloaded '{paper_id}, title= {title}, author:{author}' to 'papers/{paper_id}.pdf'"
    # NOTE(review): presumably the script saves to papers/<paper_id>.pdf (the same
    # path the skip check above uses) -- confirm. The file on disk is accepted as
    # proof of success in addition to the expected output message.
    # A plain condition replaces assert/except, which is stripped under `python -O`.
    if expected_output not in output and not os.path.exists(pdf_path):
        print(f"Download failed for {paper_id}, title: {title}, author:{author}")

Downloaded already '376f494126d1ea4f571ea0263c43ac2b6331800a, title: SPAE: Semantic Pyramid AutoEncoder for Multimodal Generation with Frozen LLMs, author:Yonatan Bisk
Downloaded already '3b0c02955e88f5862e61b560c7f70ba8cf235b1d, title: HomeRobot: Open-Vocabulary Mobile Manipulation, author:Yonatan Bisk
Downloaded already '5ce2f1dff23a5620f77f9b11f1e534422ab8ff3f, title: Plan, Eliminate, and Track - Language Models are Good Teachers for Embodied Agents, author:Yonatan Bisk
Downloaded already '69b8cd15966c4c9c3e44e71769e557f1c87fb3f9, title: MOSAIC: Learning Unified Multi-Sensory Object Property Representations for Robot Perception, author:Yonatan Bisk
Downloaded already '8035a247980cb18abf2bb7b9d96e7d4c63622ef2, title: Reasoning about the Unseen for Efficient Outdoor Object Navigation, author:Yonatan Bisk
Downloaded already 'b777aa86b5a1d49ce8eababc5c2ee56d3562801e, title: The Framework Tax: Disparities Between Inference Efficiency in Research and Deployment, author:Yonatan Bisk
Downlo