In [14]:
import elsapy
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json
import urllib

import httpx
import time
import requests
import textwrap

import pandas as pd
import numpy as np
import sys

import config

In [2]:
sys.path.append('C:/Users/ad882/OneDrive - University of Exeter\Miscellaneous/Research-Paper-Explorer')

In [21]:
# Specify the keys and tokens

api_key = config.elsevier_api_key
inst_token = config.elsevier_inst_token


headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }

In [22]:
json_data = []

query = 'sexism%20OR%20misogy'
fields = 'dc:title' # Just focus on titles
year = 2013

scopus_search_url = 'https://api.elsevier.com/content/search/scopus?'
offset = 0

print(f'Starting {year} now')

while True: # As long as the url exists
    # print('Requesting', url)
    # print('Offset', offset)
    # Make the request
    responses = requests.get(f'{scopus_search_url}query={query}&field={fields}&date={year}&start={offset}&view=STANDARD', headers=headers, stream = True)
    json = responses.json()
    if 'search-results' in json:
        json = json['search-results']
        # Check if any values are left
        if 'entry' in json : # The 'entry' key only exists if the request was successful, meaning values are left
            offset = offset + 25
            # If data found, add them to the variable, 
            # add them to the list and move to the next offset
            json_data.extend(json['entry'])
            time.sleep(3)
        else:
            year += 1
            if year > 2022:
                print(f'Reached {year}, so ending search now')
                break
            else: 
                offset = 0
                responses = requests.get(f'{scopus_search_url}query={query}&field={fields}&date={year}&start={offset}&view=STANDARD', headers=headers, stream = True)
                json = responses.json()
                if 'search-results' in json:
                    json = json['search-results']
                    if 'entry' in json : # The 'entry' key only exists if the request was successful, meaning values are left
                        offset = offset + 25
                        json_data.extend(json['entry'])
                        time.sleep(3)
                        print(f'Starting {year} now')
                    else:
                        print(f'Reached here')
                        continue
                else:
                    print(f'Reached the year {year}, but no search results')
                    continue
    else:
        print(f'Response 400 Bad request') # check the query for the request then
        break

Starting 2013 now
Starting 2014 now
Starting 2015 now
Starting 2016 now
Starting 2017 now
Starting 2018 now
Starting 2019 now
Starting 2020 now
Starting 2021 now
Starting 2022 now
Response 400 Bad request


In [23]:
titles = pd.DataFrame(json_data)
titles = titles.drop_duplicates(subset='dc:title')
len(titles)

32009

In [24]:
titles

Unnamed: 0,@_fa,prism:url,dc:title
0,true,https://api.elsevier.com/content/abstract/scop...,The scale of ethical attitude toward ethnic hu...
1,true,https://api.elsevier.com/content/abstract/scop...,Womanist ways and pentecostalism: The work of ...
2,true,https://api.elsevier.com/content/abstract/scop...,A feminist pentecostal theological anthropolog...
3,true,https://api.elsevier.com/content/abstract/scop...,Dying to get out of the asylum: Mortality and ...
4,true,https://api.elsevier.com/content/abstract/scop...,"Citizenship without representation? Blackface,..."
...,...,...,...
32204,true,https://api.elsevier.com/content/abstract/scop...,Gender disparities among leadership in academi...
32205,true,https://api.elsevier.com/content/abstract/scop...,Towards a digital football studies: current tr...
32206,true,https://api.elsevier.com/content/abstract/scop...,Gender and potential impacts on decision-makin...
32207,true,https://api.elsevier.com/content/abstract/scop...,Women representation in academic and leadershi...


## Using the Semantic Scholar API now

In [25]:
def url_encode(text):
    text = urllib.parse.quote(str(text))
    return text

In [26]:
list_titles = titles['dc:title'].apply(url_encode)
# Store it as a list
list_titles = list_titles.to_list()

In [27]:
api_key = config.semantic_api_key

headers = {
        'Content-type': 'application/json',
        'x-api-key': api_key,
        'Connection': 'close'
    }

In [34]:
# Automated, but without the API credentials
limit= 1 # Because we only want exact match
json_data = []
print('Starting to collect results')

for title in list_titles:
    # Make the request
    #print(f'{title}')
    session = requests.Session()
    # What the TimeoutHTTPAdapter class actually does is it overrides any call-specific timeout arguments 
    # with the instance-defined timeout attribute when sending requests.
    # So only using the max retries argument here and calling the timeout in the calls itself
    adapter = requests.adapters.HTTPAdapter(max_retries=20) # Mounts a custom adapter to a given schema. Here just increases the number of allowed retries.
    # Now let's mount the instance of the custom TimeoutHTTPAdapter class with the session object, and then the GET request was specifically executed via the 
    # session object (session.get()), not via the requests.get() global function.       
    session.mount("https://", adapter)
    # Actually the timeout argument is a bit more complex: you can specify a tuple of two values. 
    # The first value is the connect timeout and the second is the read timeout.
    # e.g., timeout=Timeout(connect=2, read=60)
    responses = session.get(f'https://api.semanticscholar.org/graph/v1/paper/search?query=title:({title})&limit={limit}', headers=headers, stream = True, timeout= (50, 120))
    json = responses.json()
    responses.close()
    # Check if the sum of offset and limit exceeds 10000 or if any values are left
    # To avoid {'error': 'offset + limit must be < 10000'} Why? Check the About.md for answers
    #if 'data' in json: 
    if ('data' not in json) or (not json['data']):
        #print('Going to the next title')
        continue
    else:
        try:
            paper_id = json['data'][0]['paperId']
            responses = session.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields=url,abstract,authors,title,venue,year,publicationTypes,referenceCount,citationCount,influentialCitationCount,isOpenAccess,fieldsOfStudy,tldr', headers=headers, stream = True, timeout= (50, 120))
            json = responses.json()
            responses.close()
            if 'title' in json: 
                try:
                    # The next step is needed. Otherwise will give unhashable dict error
                    json_extension = {}
                    # Add all the different kinds of objects which are of interest
                    json_extension['Title'] = json['title']
                    json_extension['Authors'] = json['authors']
                    json_extension['Abstract'] = json['abstract']
                    json_extension['Venue'] = json['venue']
                    json_extension['Year'] = json['year']
                    json_extension['Publication_Type'] = json['publicationTypes']
                    json_extension['Reference_Count'] = json['referenceCount']
                    json_extension['Citation_Count'] = json['citationCount']
                    json_extension['Influential_Citation_Count'] = json['influentialCitationCount']
                    json_extension['Open_Access'] = json['isOpenAccess']
                    json_extension['fieldsOfStudy'] = json['fieldsOfStudy']
                    json_extension['TLDR'] = json['tldr']
                    json_data.append(json_extension)
                    time.sleep(5)
                    print(f'Results of {paper_id} started')
                except requests.exceptions.Timeout:
                    print("Paper information timeout occurred")
                    continue
            else:
                print(f'No data could be retrieved')
                continue
        except requests.exceptions.Timeout:
            print("Search paper timeout occurred")
            continue


Starting to collect results
Results of 13a5af8e07a219eb0e44ca628f5e5afc1d580f75 started
Results of 3f08db4b3f1b4ae0724e17a9cda4e5b6c05c36ad started
Results of b0fb71ea3120011b0a11e51158ade0bdfd50cc3a started
Results of 16e2e7522907c43202fbfd1236e892e085345da5 started
Results of 2f872da85c2e1dfed1bfcda2921107ac27d5bf54 started
Results of dd129191b001189a673649712626d05b6f51fb48 started
Results of 0b496d7bd574eb6a1711004efa4e424e2a5aa7c0 started
Results of 8c72da105fdc10ad918eb3732fd1d491be1803d1 started
Results of 9a948d0ae399d4f1f012bfb4f2a3774a1644b14f started
Results of 96f983b1c853b15d6d4fb2dfbb27d3b13417b416 started
Results of a0d1ddea097b95f9460e9b0e4a3aabedeb7710b1 started
Results of 3faba61266dc7ca97398d75f8ee26ef1e666aea9 started
Results of 6e95bfb6ee64b9423848fecd665a43c11fcbae6d started
Results of e770cc9e56c700ee6edc488a6ad2a8ce0609b33f started
Results of 31591aa605e4c482a22a1e09760ebfaf95368ee9 started
Results of ad6a4d6fcaec301f08443ef1095a0268077007da started
Results of f

KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(json_data)
df = df.drop_duplicates(subset=['Title', 'Abstract'])
df['index'] = df.index
df

Unnamed: 0,Title,Authors,Abstract,Venue,Year,Publication_Type,Reference_Count,Citation_Count,Influential_Citation_Count,Open_Access,fieldsOfStudy,TLDR,index
0,The Scale of Ethical Attitude toward Ethnic Hu...,"[{'authorId': '1391152009', 'name': 'E. F. Hag...",Research on ethnic humor has been centered on ...,,2013,,77,5,0,True,[Psychology],,0
1,A Feminist Pentecostal Theological Anthropolog...,"[{'authorId': '40164727', 'name': 'Lisa P. Ste...",Abstract In this article I focus on developing...,,2013,,1,1,0,False,[Sociology],,1
2,Dying to Get Out of the Asylum: Mortality and ...,"[{'authorId': '2144036837', 'name': 'D. Wright...",This article examines the status of lunatic ca...,Bulletin of The History of Medicine,2013,[JournalArticle],74,4,1,False,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'Cause of dea...",2
3,"Citizenship without representation? Blackface,...","[{'authorId': '73721487', 'name': 'Adam Haupt'}]",Abstract This article explores Die Antwoord's ...,,2013,,41,2,0,False,[Sociology],,3
4,"Schools, Sex Education, and Support for Sexual...","[{'authorId': '1401799456', 'name': 'David McC...",School-based adolescent sexual health educatio...,,2013,[Review],84,40,6,False,[Psychology],,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,"Racial Diversity, Religion, and Morality: Exam...","[{'authorId': '6004434', 'name': 'Samuel L. Pe...",,,2013,[Review],41,10,0,False,[Sociology],,446
447,Penalizing Men Who Request a Family Leave: Is ...,"[{'authorId': '3156773', 'name': 'L. Rudman'},...",Men who request a family leave are viewed as p...,,2013,,45,391,21,False,[Psychology],,447
448,Comparative study of the quality of life assoc...,"[{'authorId': '145894542', 'name': 'F. Ferrand...",ObjectiveBecause the experience of menopause v...,Menopause,2013,"[JournalArticle, Study, Review]",59,46,0,False,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'This epidemi...",448
449,Rethinking racial formation theory: a systemic...,"[{'authorId': '93292986', 'name': 'J. Feagin'}...",Abstract In this theoretical analysis of US ra...,,2013,,97,162,4,False,[Sociology],,449


In [None]:
df.Year.unique()

array([2013, 1999, 1988, 2022, 1986, 1989, 2012, 2016, 2020, 2014, 1963,
       2006, 2021, 2010, 2018, 1977, 2011, 1995, 2019, 1985, 1964, 2015,
       2009, 2001, 1982, 2008], dtype=int64)

## Authors

In [51]:
authors = pd.json_normalize(df['Authors'])
print("Number of null values in authors: \n First author: {first}\n Second author: {second} \n Third author: {third} \n Fourth author: {fourth}\n Fifth author: {fifth}\n Sixth author: {sixth}".format(first=authors[0].isna().sum(), second= authors[1].isna().sum(), third= authors[2].isna().sum(), fourth= authors[3].isna().sum(), fifth= authors[4].isna().sum(), sixth= authors[5].isna().sum()))

Number of null values in authors: 
 First author: 9
 Second author: 565 
 Third author: 917 
 Fourth author: 1149
 Fifth author: 1272
 Sixth author: 1325


In [52]:
nested_authors1 = pd.json_normalize(authors[0]).rename(columns={'authorId':'author_1_id', 'name':'author_1_name'})
nested_authors1['index'] = nested_authors1.index
nested_authors1_comb = nested_authors1.groupby(['author_1_id','author_1_name']).size().reset_index().rename(columns={0:'count'})
nested_authors1_comb = nested_authors1_comb.sort_values(by='count', ascending=False)

# Following the same technique from the previous author 1 set
# Author 2
nested_authors2 = pd.json_normalize(authors[1]).rename(columns={'authorId':'author_2_id', 'name':'author_2_name'})
nested_authors2['index'] = nested_authors2.index
nested_authors2_comb = nested_authors2.groupby(['author_2_id','author_2_name']).size().reset_index().rename(columns={0:'count'})
nested_authors2_comb = nested_authors2_comb.sort_values(by='count', ascending=False)

# Author 3
nested_authors3 = pd.json_normalize(authors[2]).rename(columns={'authorId':'author_3_id', 'name':'author_3_name'})
nested_authors3['index'] = nested_authors3.index
nested_authors3_comb = nested_authors3.groupby(['author_3_id','author_3_name']).size().reset_index().rename(columns={0:'count'})
nested_authors3_comb = nested_authors3_comb.sort_values(by='count', ascending=False)

# Author 4
nested_authors4 = pd.json_normalize(authors[3]).rename(columns={'authorId':'author_4_id', 'name':'author_4_name'})
nested_authors4['index'] = nested_authors4.index
nested_authors4_comb = nested_authors4.groupby(['author_4_id','author_4_name']).size().reset_index().rename(columns={0:'count'})
nested_authors4_comb = nested_authors4_comb.sort_values(by='count', ascending=False)

# Author 5
nested_authors5 = pd.json_normalize(authors[4]).rename(columns={'authorId':'author_5_id', 'name':'author_5_name'})
nested_authors5['index'] = nested_authors5.index
nested_authors5_comb = nested_authors5.groupby(['author_5_id','author_5_name']).size().reset_index().rename(columns={0:'count'})
nested_authors5_comb = nested_authors5_comb.sort_values(by='count', ascending=False)

# Author 6
nested_authors6 = pd.json_normalize(authors[5]).rename(columns={'authorId':'author_6_id', 'name':'author_6_name'})
nested_authors6['index'] = nested_authors6.index
nested_authors6_comb = nested_authors6.groupby(['author_6_id','author_6_name']).size().reset_index().rename(columns={0:'count'})
nested_authors6_comb = nested_authors6_comb.sort_values(by='count', ascending=False)

In [53]:
#only run once
df = pd.merge(df, nested_authors1, on="index")
df = pd.merge(df, nested_authors2, on="index")
df = pd.merge(df, nested_authors3, on="index")
df = pd.merge(df, nested_authors4, on="index")
df = pd.merge(df, nested_authors5, on="index")
df = pd.merge(df, nested_authors6, on="index")

In [54]:
df = df.drop(columns=['Authors'])

## Fields

In [55]:
fields = df.fieldsOfStudy.copy()

fields = fields.explode()
fields_index = fields.index.to_list()
fields_name = fields.to_list()
fields = [fields_index, fields_name]
fields = pd.DataFrame(fields).transpose()
fields.columns = ['index', 'name']

# unique_fields = list(fields.name.unique()) # Will be useful later

fields = fields.groupby(by='index')['name'].apply(lambda x:x.str.cat(sep=", ")).reset_index(drop=True)
fields = pd.DataFrame(fields)
fields['index'] = fields.index

fields

Unnamed: 0,name,index
0,Psychology,0
1,Sociology,1
2,Medicine,2
3,Sociology,3
4,Psychology,4
...,...,...
1398,Medicine,1398
1399,Philosophy,1399
1400,Physics,1400
1401,,1401


In [56]:
# drop the column 'fieldsOfStudy'
df = df.drop(columns=['fieldsOfStudy'])

# merge the above dataframe to this one
df = pd.merge(df, fields, on="index").rename(columns={'name': 'fieldsOfStudy'})

## Publication Type

In [57]:
pub_type = df['Publication_Type'].copy()

pub_type = pub_type.explode()
pub_type_index = pub_type.index.to_list()
pub_type_name = pub_type.to_list()
pub_type = [pub_type_index, pub_type_name]
pub_type = pd.DataFrame(pub_type).transpose()
pub_type.columns = ['index', 'name']

# unique_fields = list(fields.name.unique()) # Will be useful later

pub_type = pub_type.groupby(by='index')['name'].apply(lambda x:x.str.cat(sep=", ")).reset_index(drop=True)
pub_type = pd.DataFrame(pub_type)
pub_type['index'] = pub_type.index

pub_type

Unnamed: 0,name,index
0,,0
1,,1
2,JournalArticle,2
3,,3
4,Review,4
...,...,...
1387,,1387
1388,JournalArticle,1388
1389,,1389
1390,"JournalArticle, Review",1390


In [58]:
# drop the column 'fieldsOfStudy'
df = df.drop(columns=['Publication_Type'])

# merge the above dataframe to this one
df = pd.merge(df, pub_type, on="index").rename(columns={'name': 'Publication_Type'})

In [59]:
df.to_csv('collected_elsevier_data.csv')

In [60]:
df

Unnamed: 0,Title,Abstract,Venue,Year,Reference_Count,Citation_Count,Influential_Citation_Count,Open_Access,TLDR,index,...,author_3_id,author_3_name,author_4_id,author_4_name,author_5_id,author_5_name,author_6_id,author_6_name,fieldsOfStudy,Publication_Type
0,The Scale of Ethical Attitude toward Ethnic Hu...,Research on ethnic humor has been centered on ...,,2013,77,5,0,True,,0,...,2001444,G. Pfuhl,34585107,R. Biegler,4994730,A. Teymoori,,,Psychology,
1,A Feminist Pentecostal Theological Anthropolog...,Abstract In this article I focus on developing...,,2013,1,1,0,False,,1,...,,,,,,,,,Sociology,
2,Dying to Get Out of the Asylum: Mortality and ...,This article examines the status of lunatic ca...,Bulletin of The History of Medicine,2013,74,4,1,False,"{'model': 'tldr@v2.0.0', 'text': 'Cause of dea...",2,...,2149888528,Tom Themeles,,,,,,,Medicine,JournalArticle
3,"Citizenship without representation? Blackface,...",Abstract This article explores Die Antwoord's ...,,2013,41,2,0,False,,3,...,,,,,,,,,Sociology,
4,"Schools, Sex Education, and Support for Sexual...",School-based adolescent sexual health educatio...,,2013,84,40,6,False,,4,...,,,,,,,,,Psychology,Review
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376,"Body, Eugenics and Nationalism: Women in the F...",This research analyses images of the female bo...,,2014,47,2,0,False,,1387,...,,,,,,,,,Sociology,
1377,What Looks Like Sexism and Why? The Effect of ...,ABSTRACT. Sexist comments are not perceived eq...,The Journal of general psychology,2014,45,24,4,False,"{'model': 'tldr@v2.0.0', 'text': 'It is demons...",1388,...,,,,,,,,,Sociology,JournalArticle
1378,The Standardized Experimental Situation in Exp...,,,2014,79,39,3,False,,1389,...,114601026,Mateja Melink,,,,,,,Sociology,
1379,Playing Games and Asking Questions in a Non-WE...,A recent debate across the social sciences que...,,2014,34,0,0,False,,1390,...,,,,,,,,,"Medicine, Economics","JournalArticle, Review"


In [61]:
# Create two dataframes based on Computer Science and Social Science

# All computer science and related fields
df_comp = df[df.fieldsOfStudy.str.contains('Computer Science') == True]
df_engg = df[df.fieldsOfStudy.str.contains('Engineering') == True]
df_maths = df[df.fieldsOfStudy.str.contains('Mathematics') == True]
df_comp = pd.concat([df_comp, df_engg, df_maths])
df_comp = df_comp.drop_duplicates(subset=['index']).reset_index(drop=True) #dropping rows with same index values

# All social science and related fields
df_soc = df[df.fieldsOfStudy.str.contains('Sociology') == True]
df_pol = df[df.fieldsOfStudy.str.contains('Political Science') == True]
df_phy = df[df.fieldsOfStudy.str.contains('Philosophy') == True]
df_psc = df[df.fieldsOfStudy.str.contains('Psychology') == True]
df_soc = pd.concat([df_soc, df_pol, df_phy, df_psc])
df_soc = df_soc.drop_duplicates(subset=['index']).reset_index(drop=True) #dropping rows with same index values

In [62]:
len(df_comp), len(df_soc)

(47, 1055)