# Corpus Building Notebook

In [1]:
#Import libraries

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
from collections import Counter

In [4]:
#Installing client - run below command in terminal

#pip install pyalex #pyalex client: https://github.com/J535D165/pyalex

## Seed Sources

### Create and Obtain List of Seed Sources

In [2]:
#Seed sources for the corpus, identified by DOI.
#TODO: Fill in the rest of our seed sources in DOI_list below
DOI_list = ['https://doi.org/10.1098/rsta.2017.0357',
           'https://doi.org/10.1098/rsta.2017.0359',
           'https://doi.org/10.1146/annurev-lawsocsci-041221-023808',
           #NOTE(review): the next entry is a bare DOI while the others are full https://doi.org/ URLs.
           #The "Grabbed 7 out of 7 articles" output further down suggests both forms resolve -
           #confirm before normalizing.
           '10.1080/14719037.2022.2048685',
           'https://doi.org/10.1145/3476089',
           'https://doi.org/10.1162/DAED_a_01922',
           'https://doi.org/10.1098/rsta.2017.0364']

### Grab Information for Each Seed Source

In [5]:
#Initializing pyalex client
#(of the imported entity classes, only Works is used in the cells shown in this part of the notebook)
from pyalex import Works, Authors, Venues, Institutions, Concepts
import pyalex
#To get into the polite pool, you set your email.
#The polite pool has much faster and more consistent response times.
pyalex.config.email = "zmintz@utexas.edu"

In [6]:
#Pull list of seed DOIs in OpenAlex API
# pipe_separated_DOI_list = "|".join(DOI_list)
# r = requests.get(f"https://api.openalex.org/works?filter=doi:{pipe_separated_DOI_list}&per-page=50&mailto=support@openalex.org")
# works = r.json()["results"]

In [8]:
#Join the seed DOIs with "|" so they can be passed as a single doi filter value
pipe_separated_DOI_list = "|".join(DOI_list)

#Fetch the OpenAlex work record of each seed source in one query.
#NOTE(review): is_oa=True presumably restricts the query to open-access works, so a seed
#that is not open access would be missing from the result - confirm this is intended.
grabbed_seed_sources_dicts = Works().filter(doi=pipe_separated_DOI_list, is_oa=True).get()

#Check how many of the requested seeds came back
print("Grabbed %d out of %d articles" % (len(grabbed_seed_sources_dicts), len(DOI_list)))

#If we need to investigate which DOIs didn't get grabbed
#grabbed_DOI_list = [dict["doi"] for dict in grabbed_seed_sources_dicts]
#grabbed_DOI_list

Grabbed 7 out of 7 articles


### Snapshot of One Work in Seed Source Dictionary 
JSON viewer: http://jsonviewer.stack.hu/ --> view json in tree here
JSON editor: https://jsoneditoronline.org/#left=local.zacayi --> reformat output here

Pertinent attributes in each work dictionary include:
* id : "https://openalex.org/W3186947646"
* doi : "https://doi.org/10.1146/annurev-lawsocsci-041221-023808"
* title : "Algorithms and Decision-Making in the Public Sector"
* display_name : "Algorithms and Decision-Making in the Public Sector"
* publication_year : 2021
* publication_date : "2021-10-13"
* ids
* primary_location
* host_venue
* type : "journal-article"
* open_access
* authorships
* cited_by_count : 15
* biblio
* is_retracted : false
* is_paratext : false
* concepts
* mesh
* locations
* best_oa_location
* alternate_host_venues
* referenced_works
* related_works
* ngrams_url : "https://api.openalex.org/works/W3186947646/ngrams"
* abstract_inverted_index
* cited_by_api_url : "https://api.openalex.org/works?filter=cites:W3186947646"
* counts_by_year
* updated_date : "2023-02-23T16:34:40.366204"
* created_date : "2021-08-02"

## Obtain References and Related Works

### Get List of References and Related Works

In [9]:
#Get referenced works (one list of OpenAlex ids per seed source)
ref_works_id_list = [work["referenced_works"] for work in grabbed_seed_sources_dicts]

#Get related works (one list of OpenAlex ids per seed source)
related_works_id_list = [work["related_works"] for work in grabbed_seed_sources_dicts]

#Flatten the per-seed lists into a single list of ids each
non_deduped_ref = [item for sublist in ref_works_id_list for item in sublist]
non_deduped_rel = [item for sublist in related_works_id_list for item in sublist]

#Dedupe ids while keeping first-seen order.
#dict.fromkeys does this in one linear pass, instead of scanning the growing result
#list for every id, and avoids building throwaway lists of None from .append().
ref_ids = list(dict.fromkeys(non_deduped_ref))
related_ids = list(dict.fromkeys(non_deduped_rel))

print("Related Works: %d total, %d deduped" % (len(non_deduped_rel), len(related_ids)))
print("Referenced Works: %d total, %d deduped" % (len(non_deduped_ref), len(ref_ids)))

Related Works: 70 total, 62 deduped
Referenced Works: 353 total, 344 deduped


In [10]:
#Union of the referenced and related ids; ids present in both lists are kept once
combined_rel_ref_ids = set(ref_ids).union(related_ids)
print("Total number of Deduped Referenced and Related Works:", len(combined_rel_ref_ids))

Total number of Deduped Referenced and Related Works: 406


### Grab Information for Each Reference

In [11]:
#Build pipe-separated strings of the deduped reference ids, one string per slice of ref_ids
# pipe_separated_ref_list = "|".join(ref_ids)

#OpenAlex API can only handle calls for 50 articles at once
#NOTE(review): the slice boundaries are hard-coded. The first six strings hold at most 50 ids
#each, but the last one takes everything from index 300 onward, so it holds more than 50 ids
#as soon as ref_ids has more than 350 entries.
pipe_separated_ref_list1 = "|".join(ref_ids[0:50])
pipe_separated_ref_list2 = "|".join(ref_ids[50:100])
pipe_separated_ref_list3 = "|".join(ref_ids[100:150])
pipe_separated_ref_list4 = "|".join(ref_ids[150:200])
pipe_separated_ref_list5 = "|".join(ref_ids[200:250])
pipe_separated_ref_list6 = "|".join(ref_ids[250:300])
pipe_separated_ref_list7 = "|".join(ref_ids[300::])

In [16]:
#OpenAlex is only grabbing ~half of the articles due to pagination

#Not sure how to change this using the python wrapper
# grabbed_ref_works_dict1 = Works().filter(openalex=pipe_separated_ref_list1, is_oa=False, per_page=200).get()

#Workaround: query the REST endpoint directly for the first slice of reference ids.
#NOTE(review): mailto here is support@openalex.org rather than the address assigned to
#pyalex.config.email earlier - confirm which address should identify these requests.
r = requests.get(f"https://api.openalex.org/works?filter=openalex:{pipe_separated_ref_list1}&per-page=200&mailto=support@openalex.org")
works = r.json()["results"]
#Number of works returned for this slice
len(works)

50

# Functions for Corpus Building

In [94]:
import math

def openalex(article_list, id_type="doi"):
    """Fetch the OpenAlex work record for every identifier in ``article_list``.

    The identifiers are split into batches with ``batch``; each batch is sent as
    one pipe-separated ``filter`` query to the OpenAlex ``/works`` endpoint.

    Args:
        article_list: identifiers to look up (e.g. DOIs or OpenAlex ids).
        id_type: name of the OpenAlex filter the identifiers belong to,
            e.g. "doi" (default) or "openalex".

    Returns:
        A list holding the ``results`` entries of every batch response,
        concatenated in request order.

    Raises:
        requests.HTTPError: if OpenAlex answers a batch with an error status.
        Exception: if the number of returned works differs from
            ``len(article_list)``.
    """
    results = []
    # One request per batch of identifiers
    for article_batch in batch(article_list):
        piped_article_batch = '|'.join(article_batch)
        # Only the first page of each response is read; the cursor is never followed.
        url = f'https://api.openalex.org/works?filter={id_type}:{piped_article_batch}&per-page=50&cursor=*&mailto=support@openalex.org'
        # Bound the wait so a stalled connection cannot hang the notebook forever
        response = requests.get(url, timeout=30)
        # Surface HTTP errors here instead of as a KeyError / JSON error below
        response.raise_for_status()
        results += response.json()['results']
    # Make sure all results are returned; otherwise, raise errors.
    if len(results) != len(article_list):
        raise Exception("Number of articles in (%s) does not equal number returned (%s)" % (len(article_list), len(results)))
    return results
   
    
def batch(articles, batch_size=50):
    """Split ``articles`` into consecutive slices of at most ``batch_size`` items.

    Args:
        articles: sequence to split (anything supporting ``len`` and slicing).
        batch_size: maximum number of items per slice; defaults to 50.

    Returns:
        A list of slices in the original order. The last slice may be shorter;
        an empty input gives an empty list.

    Raises:
        ValueError: if ``batch_size`` is less than 1.
    """
    # A zero step would make range() fail and a negative one would silently return nothing
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    return [articles[start:start + batch_size]
            for start in range(0, len(articles), batch_size)]

7

In [108]:
def grab_references(openalex_list):
    """Fetch the works that the given OpenAlex works reference or relate to.

    Args:
        openalex_list: OpenAlex work dictionaries; each must have the keys
            "referenced_works" and "related_works" holding lists of ids.

    Returns:
        Whatever ``openalex(..., id_type="openalex")`` returns for the
        deduplicated union of all referenced and related ids.
    """
    #Get referenced works: one list of ids per input work, so the count printed
    #here equals the number of input works
    ref_works_id_list = [work["referenced_works"] for work in openalex_list]
    print("num of ref works %s" % (len(ref_works_id_list)))

    #Get related works: again one list of ids per input work
    related_works_id_list = [work["related_works"] for work in openalex_list]
    print("num of rel works %s" % (len(related_works_id_list)))

    #Flatten the per-work lists into a single list of ids each
    non_deduped_ref = [item for sublist in ref_works_id_list for item in sublist]
    non_deduped_rel = [item for sublist in related_works_id_list for item in sublist]

    #Dedupe ids in one linear pass, keeping first-seen order
    ref_ids = list(dict.fromkeys(non_deduped_ref))
    related_ids = list(dict.fromkeys(non_deduped_rel))

    print("Related Works: %d total, %d deduped" % (len(non_deduped_rel), len(related_ids)))
    print("Referenced Works: %d total, %d deduped" % (len(non_deduped_ref), len(ref_ids)))

    #Ordered union of both id lists. Unlike iterating a set, this keeps the request
    #order (and so the batch composition) the same on every run.
    combined_rel_ref_ids = list(dict.fromkeys(ref_ids + related_ids))
    print("Total number of Deduped Referenced and Related Works:", len(combined_rel_ref_ids))

    #NOTE: the input (seed) works are not removed from the combined ids here
    return openalex(combined_rel_ref_ids, id_type="openalex")

In [107]:
#Full list of seed sources, given as bare DOIs.
#Entries that are commented out are not sent to OpenAlex.
#NOTE(review): the reason for excluding them is not recorded here. openalex() raises when any
#requested DOI is not returned, so presumably these did not resolve - confirm.
full_seed_src = ["10.1162/daed_a_01922",
                 "10.1098/rsta.2017.0364",
                 "10.1007/978-3-030-92644-1",
                 # "10.2307/48662048", 
                 "10.1111/puar.12979",
                 "10.3390/pr8111374",
                 "10.1002/poi3.165",
                 "10.1145/3476089",
                 "10.1080/14719037.2022.2048685",
                 "10.1146/annurev-lawsocsci-041221-023808",
                 # "10.2307/26601757",
                 # "10.2307/26601763",
                 "10.1177/08944393211034087",
                 "10.1145/3025453.3025579",
                 "10.1016/j.giq.2005.11.005",
                 "10.1016/j.techsoc.2016.07.001",
                 "10.3390/su11205791",
                 # "10.2307/27170531"
                ]

#Look up the seed works by DOI (id_type defaults to "doi")
full_seed_openalex = openalex(full_seed_src)
# len(full_seed_openalex)
#Fetch every work that the seeds reference or list as related
full_ref_openalex = grab_references(full_seed_openalex)

Related Works: 70 total, 62 deduped
Referenced Works: 353 total, 344 deduped
Total number of Deduped Referenced and Related Works: 406
