# Corpus Building Notebook

In [1]:
#Install libraries

import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests
from tqdm import tqdm
from collections import Counter

In [4]:
#Installing client - run below command in terminal

#pip install pyalex #pyalex client: https://github.com/J535D165/pyalex

## Seed Sources

### Create and Obtain List of Seed Sources

In [3]:
#TODO: Fill in the rest of our seed sources in DOI_list below
DOI_list = ['https://doi.org/10.1098/rsta.2017.0357',
           'https://doi.org/10.1098/rsta.2017.0359',
           'https://doi.org/10.1146/annurev-lawsocsci-041221-023808',
           '10.1080/14719037.2022.2048685',
           'https://doi.org/10.1145/3476089',
           'https://doi.org/10.1162/DAED_a_01922',
           'https://doi.org/10.1098/rsta.2017.0364']

### Grab Information for Each Seed Source

In [4]:
#Initializing pyalex client
from pyalex import Works, Authors, Venues, Institutions, Concepts
import pyalex #The polite pool has much faster and more consistent response times.
pyalex.config.email = "zmintz@utexas.edu" #To get into the polite pool, you set your email

In [5]:
#Pull list of seed DOIs in OpenAlex API
# pipe_separated_DOI_list = "|".join(DOI_list)
# r = requests.get(f"https://api.openalex.org/works?filter=doi:{pipe_separated_DOI_list}&per-page=50&mailto=support@openalex.org")
# works = r.json()["results"]

In [246]:
#Grab citations and references from each seed source
grabbed_seed_sources_dicts = Works().filter(doi=pipe_separated_DOI_list, is_oa=True).get()

#Check how many we grabbed
print("Grabbed %d out of %d articles" % (len(grabbed_seed_sources_dicts), len(DOI_list)))

#If we need to investigate which dois didn't get grabbed
#grabbed_DOI_list = [dict["doi"] for dict in grabbed_seed_sources_dicts]
#grabbed_DOI_list

Grabbed 7 out of 7 articles


### Snapshot of One Work in Seed Source Dictionary 
JSON viewer: http://jsonviewer.stack.hu/ --> view json in tree here
JSON editor: https://jsoneditoronline.org/#left=local.zacayi --> reformat output here

Pertinent attributes in dictionary include
* id : "https://openalex.org/W3186947646"
* doi : "https://doi.org/10.1146/annurev-lawsocsci-041221-023808"
* title : "Algorithms and Decision-Making in the Public Sector"
* display_name : "Algorithms and Decision-Making in the Public Sector"
* publication_year : 2021
* publication_date : "2021-10-13"
* ids
* primary_location
* host_venue
* type : "journal-article"
* open_access
* authorships
* cited_by_count : 15
* biblio
* is_retracted : false
* is_paratext : false
* concepts
* mesh
* locations
* best_oa_location
* alternate_host_venues
* referenced_works
* related_works
* ngrams_url : "https://api.openalex.org/works/W3186947646/ngrams"
* abstract_inverted_index
* cited_by_api_url : "https://api.openalex.org/works?filter=cites:W3186947646"
* counts_by_year
* updated_date : "2023-02-23T16:34:40.366204"
* created_date : "2021-08-02"

## Obtain References and Related Works

### Get List of References and Related Works

In [247]:
#Get referenced works
ref_works_id_list = [dict["referenced_works"] for dict in grabbed_seed_sources_dicts]

#Get related works
related_works_id_list = [dict["related_works"] for dict in grabbed_seed_sources_dicts]

#Flatten list of lists and dedupe ids
ref_ids = [] #initialize
related_ids = []

non_deduped_ref = [item for sublist in ref_works_id_list for item in sublist]
non_deduped_rel = [item for sublist in related_works_id_list for item in sublist]
[ref_ids.append(item) for sublist in ref_works_id_list for item in sublist if item not in ref_ids]
[related_ids.append(item) for sublist in related_works_id_list for item in sublist if item not in related_ids]

print("Related Works: %d total, %d deduped" % (len(non_deduped_rel), len(related_ids)))
print("Referenced Works: %d total, %d deduped" % (len(non_deduped_ref), len(ref_ids)))

Related Works: 70 total, 62 deduped
Referenced Works: 353 total, 344 deduped


In [248]:
#Combine and dedupe the related works and referenced works
combined_rel_ref_ids = set(ref_ids + related_ids)
print("Total number of Deduped Referenced and Related Works:", len(combined_rel_ref_ids))

Total number of Deduped Referenced and Related Works: 406


### Grab Information for Each Reference

In [249]:
#Pull list of seed DOIs in OpenAlex API
# pipe_separated_ref_list = "|".join(ref_ids)

#OpenAlex API can only handle calls for 50 articles at once
pipe_separated_ref_list1 = "|".join(ref_ids[0:50])
pipe_separated_ref_list2 = "|".join(ref_ids[50:100])
pipe_separated_ref_list3 = "|".join(ref_ids[100:150])
pipe_separated_ref_list4 = "|".join(ref_ids[150:200])
pipe_separated_ref_list5 = "|".join(ref_ids[200:250])
pipe_separated_ref_list6 = "|".join(ref_ids[250:300])
pipe_separated_ref_list7 = "|".join(ref_ids[300::])

In [253]:
#OpenAlex is only grabbing ~half of the articles - do not understand why
grabbed_ref_works_dict1 = Works().filter(openalex=pipe_separated_ref_list1, is_oa=False).get()

In [254]:
grabbed_oa_list = [dict["id"] for dict in grabbed_ref_works_dict1]

not_grabbed = list(set(ref_ids[0:50]) - set(grabbed_oa_list))
print("Not Grabbed:", len(not_grabbed))
not_grabbed

Not Grabbed: 25


['https://openalex.org/W2621644951',
 'https://openalex.org/W2599165923',
 'https://openalex.org/W1583390633',
 'https://openalex.org/W2761004239',
 'https://openalex.org/W2121508014',
 'https://openalex.org/W2117767124',
 'https://openalex.org/W2155317500',
 'https://openalex.org/W2019075367',
 'https://openalex.org/W1996358684',
 'https://openalex.org/W1981235228',
 'https://openalex.org/W2017481266',
 'https://openalex.org/W2032653717',
 'https://openalex.org/W2084341220',
 'https://openalex.org/W3125353836',
 'https://openalex.org/W2607562496',
 'https://openalex.org/W1582164576',
 'https://openalex.org/W2528072246',
 'https://openalex.org/W2014352947',
 'https://openalex.org/W1494192115',
 'https://openalex.org/W1528117011',
 'https://openalex.org/W1515468801',
 'https://openalex.org/W1996392459',
 'https://openalex.org/W2098614273',
 'https://openalex.org/W1534601691',
 'https://openalex.org/W2148889331']

In [255]:
print("Grabbed:", len(grabbed_oa_list))
grabbed_oa_list

Grabbed: 25


['https://openalex.org/W1901616594',
 'https://openalex.org/W2132773425',
 'https://openalex.org/W1686065478',
 'https://openalex.org/W2077494655',
 'https://openalex.org/W1994451842',
 'https://openalex.org/W2149583356',
 'https://openalex.org/W2123992955',
 'https://openalex.org/W1975070330',
 'https://openalex.org/W2096166601',
 'https://openalex.org/W2099436484',
 'https://openalex.org/W2088641112',
 'https://openalex.org/W1976548852',
 'https://openalex.org/W1976237711',
 'https://openalex.org/W2162577040',
 'https://openalex.org/W2052432121',
 'https://openalex.org/W2150363226',
 'https://openalex.org/W2165610448',
 'https://openalex.org/W1993781022',
 'https://openalex.org/W2102124654',
 'https://openalex.org/W1965919670',
 'https://openalex.org/W1550609238',
 'https://openalex.org/W1899305834',
 'https://openalex.org/W2000771887',
 'https://openalex.org/W2159821153',
 'https://openalex.org/W4251425504']