In [14]:
import elsapy
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json

import httpx
import time
import requests
import textwrap

import pandas as pd
import numpy as np
import sys

import config

### Scopus Search API

In [15]:
# Specify the keys and tokens

api_key = config.elsevier_api_key
inst_token = config.elsevier_inst_token

In [21]:
headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    
# Actual query: 
# (misogyny OR sexism OR gender violence) AND (detection OR identification OR prediction OR classification)

# https://dev.elsevier.com/sc_search_tips.html
# URL encoding
#The Boolean search is submitted through the query string parameter 'query'. 
# As with all other query string parameters, the contents of the submitted search must be URL-encoded. 
# It should be noted that the '+' character serves a special purpose as a query string value, 
# functioning as an equivalent to the space character (i.e. %20). In order to submit a literal character '+' it must be properly URL-encoded (i.e. %2B).

# query = 'KEY%28misogyny%20OR%20sexism%20OR%20gender%20violence%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
query = 'KEY%28misogyny%20OR%20sexism%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'

# dc:description (Abstract) is not available in Standard type, and the Complete type is not accessible
fields = 'dc:identifier' # just need the scopus ID for the next step
# ,eid,dc:title,dc:creator,prism:aggregationType,subtype,subtypeDescription,citedby-count,prism:publicationName,prism:coverDate,prism:doi,pii,orcid,openaccess,affiliation,author
offset = 0
years = '2012-2022'

In [22]:
responses = requests.get(f'https://api.elsevier.com/content/search/scopus?query={query}&field={fields}&date={years}', headers=headers, stream = True)
responses

<Response [200]>

In [23]:
json = responses.json()
json = json['search-results']
json.keys()

dict_keys(['opensearch:totalResults', 'opensearch:startIndex', 'opensearch:itemsPerPage', 'opensearch:Query', 'link', 'entry'])

In [24]:
json['opensearch:totalResults']

'1347'

In [25]:
json['opensearch:startIndex'], json['opensearch:itemsPerPage'], json['link']
# not that useful

('0',
 '25',
 [{'@_fa': 'true',
   '@ref': 'self',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=KEY%28misogyny+OR+sexism%29+AND+%28detection+OR+identification+OR+prediction+OR+classification%29&field=dc:identifier&date=2012-2022',
   '@type': 'application/json'},
  {'@_fa': 'true',
   '@ref': 'first',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=KEY%28misogyny+OR+sexism%29+AND+%28detection+OR+identification+OR+prediction+OR+classification%29&field=dc:identifier&date=2012-2022',
   '@type': 'application/json'},
  {'@_fa': 'true',
   '@ref': 'next',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=25&count=25&query=KEY%28misogyny+OR+sexism%29+AND+%28detection+OR+identification+OR+prediction+OR+classification%29&field=dc:identifier&date=2012-2022',
   '@type': 'application/json'},
  {'@_fa': 'true',
   '@ref': 'last',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=1322&count=25&

In [26]:
json['opensearch:Query']

{'@role': 'request',
 '@searchTerms': 'KEY(misogyny OR sexism) AND (detection OR identification OR prediction OR classification)',
 '@startPage': '0'}

In [27]:
json['entry'][0]

{'@_fa': 'true',
 'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/85144861178',
 'dc:identifier': 'SCOPUS_ID:85144861178'}

Now let's use the credentials again to get ALL the results

In [30]:
headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }

#query = 'KEY%28misogyny%20OR%20sexism%20OR%20gender%20violence%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
query = 'KEY%28misogyny%20OR%20sexism%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'

fields = 'dc:identifier,prism:doi'
offset = 0
years = '2012-2022'

scopus_search_url = 'https://api.elsevier.com/content/search/scopus?'

In [31]:
json_data1 = []
offset = 0

while True: # As long as the url exists
    # print('Requesting', url)
    # print('Offset', offset)
    # Make the request
    responses = requests.get(f'{scopus_search_url}query={query}&field={fields}&date={years}&start={offset}&view=STANDARD', headers=headers, stream = True)
    json = responses.json()
    json = json['search-results']
    # Check if any values are left
    if 'entry' in json : # The 'entry' key only exists if the request was successful, meaning values are left
        offset = offset + 25
        # If data found, add them to the variable, 
        # add them to the list and move to the next offset
        json_data1.extend(json['entry'])
    else:
        break

In [33]:
df_int = pd.DataFrame(json_data1)
df_int

Unnamed: 0,@_fa,prism:url,dc:identifier,prism:doi
0,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85144861178,10.37467/revhuman.v11.4317
1,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85145218361,10.1515/mc-2022-0008
2,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85144594877,10.1371/journal.pone.0279363
3,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85144332605,10.1038/s41533-022-00306-7
4,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85143126544,10.1016/j.cjca.2022.09.009
...,...,...,...,...
1342,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84867082151,10.1177/0149206310365902
1343,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84866035736,10.1016/j.ijgo.2012.03.024
1344,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84860903407,10.1590/s1555-79602012000200009
1345,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84155164678,10.1007/s11199-011-0058-6


Collecting only the scopus IDs so that it can be used for abstract retrieval.

In [34]:
scopus_ids = df_int['dc:identifier'].to_list()

In [35]:
df_int.columns

Index(['@_fa', 'prism:url', 'dc:identifier', 'prism:doi'], dtype='object')

### Abstract API
Use the previous information to retrieve the data from all scopus IDs.

In [36]:
json_data2 = []
i = 0

for id in scopus_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1
    # Make the request
    responses = requests.get(f'https://api.elsevier.com/content/abstract/scopus_id/{id}', headers=headers, stream = True)
    json = responses.json()
    if 'abstracts-retrieval-response' in json:
        json = json['abstracts-retrieval-response']
        # If data found, add them to the variable, 
        # add them to the list and move to the next offset
        json_data2.append(json['coredata'])
    else:
        continue

ID number 0: SCOPUS_ID:85144861178
ID number 1: SCOPUS_ID:85145218361
ID number 2: SCOPUS_ID:85144594877
ID number 3: SCOPUS_ID:85144332605
ID number 4: SCOPUS_ID:85143126544
ID number 5: SCOPUS_ID:85142928092
ID number 6: SCOPUS_ID:85142208112
ID number 7: SCOPUS_ID:85141482956
ID number 8: SCOPUS_ID:85139490357
ID number 9: SCOPUS_ID:85139407609
ID number 10: SCOPUS_ID:85135876150
ID number 11: SCOPUS_ID:85135737290
ID number 12: SCOPUS_ID:85135124915
ID number 13: SCOPUS_ID:85133467154
ID number 14: SCOPUS_ID:85130436480
ID number 15: SCOPUS_ID:85127352655
ID number 16: SCOPUS_ID:85126890376
ID number 17: SCOPUS_ID:85125003315
ID number 18: SCOPUS_ID:85124817748
ID number 19: SCOPUS_ID:85122176586
ID number 20: SCOPUS_ID:85099760450
ID number 21: SCOPUS_ID:85141961420
ID number 22: SCOPUS_ID:85143784780
ID number 23: SCOPUS_ID:85141623980
ID number 24: SCOPUS_ID:85138955510
ID number 25: SCOPUS_ID:85138772142
ID number 26: SCOPUS_ID:85136601987
ID number 27: SCOPUS_ID:85134843435
ID

ID number 225: SCOPUS_ID:85120923668
ID number 226: SCOPUS_ID:85119903652
ID number 227: SCOPUS_ID:85119145228
ID number 228: SCOPUS_ID:85118103879
ID number 229: SCOPUS_ID:85117230288
ID number 230: SCOPUS_ID:85117197894
ID number 231: SCOPUS_ID:85116966626
ID number 232: SCOPUS_ID:85116937178
ID number 233: SCOPUS_ID:85116677003
ID number 234: SCOPUS_ID:85112780485
ID number 235: SCOPUS_ID:85111524615
ID number 236: SCOPUS_ID:85109613870
ID number 237: SCOPUS_ID:85109143582
ID number 238: SCOPUS_ID:85108075769
ID number 239: SCOPUS_ID:85105961424
ID number 240: SCOPUS_ID:85101722756
ID number 241: SCOPUS_ID:85099835829
ID number 242: SCOPUS_ID:85093857433
ID number 243: SCOPUS_ID:85085508577
ID number 244: SCOPUS_ID:85120345743
ID number 245: SCOPUS_ID:85120004533
ID number 246: SCOPUS_ID:85119289623
ID number 247: SCOPUS_ID:85119038252
ID number 248: SCOPUS_ID:85118560798
ID number 249: SCOPUS_ID:85118409656
ID number 250: SCOPUS_ID:85115610048
ID number 251: SCOPUS_ID:85114360950
I

ID number 447: SCOPUS_ID:85093092545
ID number 448: SCOPUS_ID:85088028306
ID number 449: SCOPUS_ID:85082962904
ID number 450: SCOPUS_ID:85078825482
ID number 451: SCOPUS_ID:85095666006
ID number 452: SCOPUS_ID:85094622803
ID number 453: SCOPUS_ID:85090959046
ID number 454: SCOPUS_ID:85088596134
ID number 455: SCOPUS_ID:85088384744
ID number 456: SCOPUS_ID:85087127550
ID number 457: SCOPUS_ID:85082871373
ID number 458: SCOPUS_ID:85095388590
ID number 459: SCOPUS_ID:85095118534
ID number 460: SCOPUS_ID:85092408644
ID number 461: SCOPUS_ID:85085691350
ID number 462: SCOPUS_ID:85096309094
ID number 463: SCOPUS_ID:85093892016
ID number 464: SCOPUS_ID:85092680304
ID number 465: SCOPUS_ID:85091588889
ID number 466: SCOPUS_ID:85090217934
ID number 467: SCOPUS_ID:85089871513
ID number 468: SCOPUS_ID:85087567364
ID number 469: SCOPUS_ID:85087168817
ID number 470: SCOPUS_ID:85077367780
ID number 471: SCOPUS_ID:85074974992
ID number 472: SCOPUS_ID:85090817693
ID number 473: SCOPUS_ID:85095597869
I

ID number 669: SCOPUS_ID:85063993452
ID number 670: SCOPUS_ID:85062649477
ID number 671: SCOPUS_ID:85062299297
ID number 672: SCOPUS_ID:85060144123
ID number 673: SCOPUS_ID:85057236109
ID number 674: SCOPUS_ID:85069292738
ID number 675: SCOPUS_ID:85068506670
ID number 676: SCOPUS_ID:85071059592
ID number 677: SCOPUS_ID:85070184679
ID number 678: SCOPUS_ID:85069629194
ID number 679: SCOPUS_ID:85063750075
ID number 680: SCOPUS_ID:85062999807
ID number 681: SCOPUS_ID:85062734886
ID number 682: SCOPUS_ID:85059344980
ID number 683: SCOPUS_ID:85059043975
ID number 684: SCOPUS_ID:85055005934
ID number 685: SCOPUS_ID:85069468491
ID number 686: SCOPUS_ID:85042412818
ID number 687: SCOPUS_ID:85067272210
ID number 688: SCOPUS_ID:85074494757
ID number 689: SCOPUS_ID:85071578374
ID number 690: SCOPUS_ID:85067180575
ID number 691: SCOPUS_ID:85064888013
ID number 692: SCOPUS_ID:85059305060
ID number 693: SCOPUS_ID:85043286708
ID number 694: SCOPUS_ID:85051988825
ID number 695: SCOPUS_ID:85046724568
I

ID number 891: SCOPUS_ID:85040310473
ID number 892: SCOPUS_ID:85039845042
ID number 893: SCOPUS_ID:85039166535
ID number 894: SCOPUS_ID:85038258538
ID number 895: SCOPUS_ID:85028006448
ID number 896: SCOPUS_ID:84991080568
ID number 897: SCOPUS_ID:85038625885
ID number 898: SCOPUS_ID:85037975012
ID number 899: SCOPUS_ID:85034654541
ID number 900: SCOPUS_ID:85033448333
ID number 901: SCOPUS_ID:85027977746
ID number 902: SCOPUS_ID:85025471605
ID number 903: SCOPUS_ID:85021122350
ID number 904: SCOPUS_ID:85049077014
ID number 905: SCOPUS_ID:85049064096
ID number 906: SCOPUS_ID:85034580355
ID number 907: SCOPUS_ID:85032969656
ID number 908: SCOPUS_ID:85030102699
ID number 909: SCOPUS_ID:85015207163
ID number 910: SCOPUS_ID:85030687042
ID number 911: SCOPUS_ID:85028702543
ID number 912: SCOPUS_ID:85014665412
ID number 913: SCOPUS_ID:85011891371
ID number 914: SCOPUS_ID:84992521862
ID number 915: SCOPUS_ID:84983775573
ID number 916: SCOPUS_ID:85013815713
ID number 917: SCOPUS_ID:85032432286
I

ID number 1110: SCOPUS_ID:84930670382
ID number 1111: SCOPUS_ID:85043241047
ID number 1112: SCOPUS_ID:84937972417
ID number 1113: SCOPUS_ID:84941308748
ID number 1114: SCOPUS_ID:84949636001
ID number 1115: SCOPUS_ID:84959485283
ID number 1116: SCOPUS_ID:84940657176
ID number 1117: SCOPUS_ID:84937524916
ID number 1118: SCOPUS_ID:84937420869
ID number 1119: SCOPUS_ID:84931570952
ID number 1120: SCOPUS_ID:84930747085
ID number 1121: SCOPUS_ID:84939225773
ID number 1122: SCOPUS_ID:84930959080
ID number 1123: SCOPUS_ID:84929441794
ID number 1124: SCOPUS_ID:84927582812
ID number 1125: SCOPUS_ID:84953273822
ID number 1126: SCOPUS_ID:84930227069
ID number 1127: SCOPUS_ID:84929339960
ID number 1128: SCOPUS_ID:84931044581
ID number 1129: SCOPUS_ID:84928683723
ID number 1130: SCOPUS_ID:84924771157
ID number 1131: SCOPUS_ID:84928583134
ID number 1132: SCOPUS_ID:84928361525
ID number 1133: SCOPUS_ID:84921612430
ID number 1134: SCOPUS_ID:84927625462
ID number 1135: SCOPUS_ID:84926286800
ID number 11

ID number 1326: SCOPUS_ID:84865299369
ID number 1327: SCOPUS_ID:84864843599
ID number 1328: SCOPUS_ID:84864835915
ID number 1329: SCOPUS_ID:84864596158
ID number 1330: SCOPUS_ID:84866669386
ID number 1331: SCOPUS_ID:84863101627
ID number 1332: SCOPUS_ID:84864219525
ID number 1333: SCOPUS_ID:84865312103
ID number 1334: SCOPUS_ID:84863512979
ID number 1335: SCOPUS_ID:84863193922
ID number 1336: SCOPUS_ID:84861720746
ID number 1337: SCOPUS_ID:84861761892
ID number 1338: SCOPUS_ID:84858339802
ID number 1339: SCOPUS_ID:84857916330
ID number 1340: SCOPUS_ID:84855803265
ID number 1341: SCOPUS_ID:84875177460
ID number 1342: SCOPUS_ID:84867082151
ID number 1343: SCOPUS_ID:84866035736
ID number 1344: SCOPUS_ID:84860903407
ID number 1345: SCOPUS_ID:84155164678
ID number 1346: SCOPUS_ID:83555176137


In [37]:
df = pd.DataFrame(json_data2)
df['index'] = df.index
df

Unnamed: 0,srctype,eid,dc:description,prism:coverDate,prism:aggregationType,prism:url,subtypeDescription,dc:creator,link,prism:publicationName,...,dc:publisher,prism:issueIdentifier,prism:pageRange,prism:endingPage,prism:startingPage,pubmed-id,article-number,pii,prism:isbn,index
0,j,2-s2.0-85144861178,"© GKA Ediciones, authors.This study aims to an...",2022-12-26,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Patricia', 'pre...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Human Review. International Humanities Review ...,...,Eagora Science,,,,,,,,,0
1,j,2-s2.0-85145218361,"© 2022 Walter de Gruyter GmbH, Berlin/Boston.S...",2022-12-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Reem', 'preferr...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Multimodal Communication,...,Walter de Gruyter GmbH,3,235-246,246,235,,,,,1
2,j,2-s2.0-85144594877,© 2022 Flores et al. This is an open access ar...,2022-12-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Andrew R.', 'pr...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",PLoS ONE,...,Public Library of Science,12 December,,,,36542637,e0279363,,,2
3,j,2-s2.0-85144332605,"© 2022, The Author(s).Sex (whether one is ‘mal...",2022-12-01,Journal,https://api.elsevier.com/content/abstract/scop...,Review,{'author': [{'ce:given-name': 'Louis-Philippe'...,"[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",npj Primary Care Respiratory Medicine,...,Nature Research,1,,,,36539451,56,,,3
4,j,2-s2.0-85143126544,© 2022 Canadian Cardiovascular SocietyDefined ...,2022-12-01,Journal,https://api.elsevier.com/content/abstract/scop...,Review,"{'author': [{'ce:given-name': 'Isabel', 'prefe...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Canadian Journal of Cardiology,...,Elsevier Inc.,12,1865-1880,1880,1865,36116747,,S0828282X22008583,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,j,2-s2.0-84867082151,The current research draws from ambivalent sex...,2012-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Eden B.', 'pref...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Journal of Management,...,SAGE Publications Inc.claims@sagepub.com,6,1835-1866,1866,1835,,,,,1342
1343,j,2-s2.0-84866035736,"Cancer, particularly when it affects women and...",2012-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Felicia M.', 'p...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",International Journal of Gynecology and Obstet...,...,John Wiley and Sons Ltd,SUPPL.1,,,,22883910,,S0020729212001658,,1343
1344,j,2-s2.0-84860903407,Gender identity is a sociocultural construct b...,2012-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,Short Survey,"{'author': [{'ce:given-name': 'Alberto Roque',...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",MEDICC Review,...,MEDICC Medical Education Cooperation with Cuba...,2,35-38,38,35,22580552,,,,1344
1345,j,2-s2.0-84155164678,This study examined the relationship between p...,2012-01-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Jennifer L.', '...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Sex Roles,...,,1-2,94-104,104,94,,,,,1345


In [38]:
df['dc:description'][1]
# Some of the abstracts have the copyright and author's name in the abstract. It needs to be removed.

'© 2022 Walter de Gruyter GmbH, Berlin/Boston.Since the COVID-19 pandemic began, extensive research has been done on how the pandemic has been metaphorised. However, little research has focused on how the pandemic is associated with the depiction of gender relations in political cartoons. Therefore, this study showcases sexism and gender relations by examining how both gender and gender relationships have been expressed metaphorically. It draws on conceptual metaphor theory as well as concepts related to visual metaphors in multimodal discourse, covert sexism, and dehumanisation to analyse a corpus of 100 Arabic cartoons depicting men and women alone and as couples that were published during the COVID-19 pandemic. The results demonstrate that typo-pictorial metaphors and those related to body modification, dehumanisation, and the coronavirus are associated with prevalent covert sexism during the pandemic. In short, the findings suggest that COVID-19 has contributed to how women in rela

In [39]:
df['dc:description'][0]

'© GKA Ediciones, authors.This study aims to analyse sexism and feminism in prospective teachers. The sample is composed of 692 people and the data were obtained by administering the ISA scale to measure ambivalent sexism (hostile and benevolent) and the self-identification as feminist scale to measure feminist identification. The results show that men have higher levels of both hostile and benevolent sexism than women, and that women identify more with feminism. We also find that the higher the level of education, the lower the sexist attitudes. In conclusion, it seems that education is a good tool to work on both sexism and feminism.'

In [40]:
df['dc:description'][2]

'© 2022 Flores et al. This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.We estimate the prevalence and characteristics of violent hate crime victimization of lesbian, gay, bisexual, and transgender (LGBT) people in the United States, and we compare them to non-LGBT hate crime victims and to LGBT victims of violent non-hate crime. We analyze pooled 2017-2019 data from the National Crime Victimization Survey (n persons = 553, 925; n incidents = 32, 470), the first nationally representative and comprehensive survey on crime that allows identification of LGBT persons aged 16 or older. Descriptive and bivariate analysis show that LGBT people experienced 6.6 violent hate crime victimizations per 1,000 persons compared with non-LGBT people’s 0.6 per 1,000 persons (odds ratio = 8.30, 95% confidence interval = 1.

In [41]:
df['prism:doi'].isna().sum() # Only a handful of them do not have a DOI

90

In [42]:
df.subtypeDescription.unique()

array(['Article', 'Review', 'Note', 'Editorial', 'Data Paper',
       'Conference Paper', 'Letter', 'Short Survey', 'Book Chapter',
       'Book'], dtype=object)

In [43]:
df['prism:aggregationType'].unique()

array(['Journal', 'Conference Proceeding', 'Book Series', 'Book'],
      dtype=object)

In [44]:
df['dc:creator']

0       {'author': [{'ce:given-name': 'Patricia', 'pre...
1       {'author': [{'ce:given-name': 'Reem', 'preferr...
2       {'author': [{'ce:given-name': 'Andrew R.', 'pr...
3       {'author': [{'ce:given-name': 'Louis-Philippe'...
4       {'author': [{'ce:given-name': 'Isabel', 'prefe...
                              ...                        
1342    {'author': [{'ce:given-name': 'Eden B.', 'pref...
1343    {'author': [{'ce:given-name': 'Felicia M.', 'p...
1344    {'author': [{'ce:given-name': 'Alberto Roque',...
1345    {'author': [{'ce:given-name': 'Jennifer L.', '...
1346    {'author': [{'ce:given-name': 'Gabriela', 'pre...
Name: dc:creator, Length: 1347, dtype: object

Make a list of the DOIs which would help in extracting the abstracts of the articles.

But first join the two dataframes, so that it could be used as a single dataframe.

In [45]:
df.columns

Index(['srctype', 'eid', 'dc:description', 'prism:coverDate',
       'prism:aggregationType', 'prism:url', 'subtypeDescription',
       'dc:creator', 'link', 'prism:publicationName', 'source-id',
       'citedby-count', 'prism:volume', 'subtype', 'dc:title', 'openaccess',
       'openaccessFlag', 'prism:doi', 'prism:issn', 'dc:identifier',
       'dc:publisher', 'prism:issueIdentifier', 'prism:pageRange',
       'prism:endingPage', 'prism:startingPage', 'pubmed-id', 'article-number',
       'pii', 'prism:isbn', 'index'],
      dtype='object')

In [46]:
# Only take the dois which do not have null values
doi_ids = df['prism:doi'][~df['prism:doi'].isna()].to_list() 
len(doi_ids)

1257

In [47]:
df['dc:creator'][0]

{'author': [{'ce:given-name': 'Patricia',
   'preferred-name': {'ce:given-name': 'Patricia',
    'ce:initials': 'P.',
    'ce:surname': 'Fernández Rotaeche',
    'ce:indexed-name': 'Fernández Rotaeche P.'},
   '@seq': '1',
   'ce:initials': 'P.',
   '@_fa': 'true',
   'affiliation': {'@id': '60027856',
    '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60027856'},
   'ce:surname': 'Fernández Rotaeche',
   '@auid': '58032204600',
   'author-url': 'https://api.elsevier.com/content/author/author_id/58032204600',
   'ce:indexed-name': 'Fernandez Rotaeche P.'}]}

In [48]:
df_creator = pd.json_normalize(df['dc:creator'])
df_creator['index'] = df_creator.index
df_creator['author'] = pd.json_normalize(df_creator['author'])
df_creator['author'][22]

{'ce:given-name': 'Nabila',
 '@seq': '1',
 'ce:initials': 'N.',
 '@_fa': 'true',
 'ce:surname': 'Sarwar',
 '@auid': '58000690600',
 'author-url': 'https://api.elsevier.com/content/author/author_id/58000690600',
 'ce:indexed-name': 'Sarwar N.',
 'preferred-name.ce:given-name': 'Nabila',
 'preferred-name.ce:initials': 'N.',
 'preferred-name.ce:surname': 'Sarwar',
 'preferred-name.ce:indexed-name': 'Sarwar N.',
 'affiliation.@id': '60005816',
 'affiliation.@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60005816'}

After trying the authors, it seems that this API only gives information about the first author. This is why Semantic Scholar API could be used alongside this to extract the information on the co-authors.

### Semantic API
Since the primary keys are different, use the DOIs of the manuscripts to search for them through the Semantic Scholar API.

In [49]:
semantic_api_key = config.semantic_api_key

headers = {
        'Content-type': 'application/json',
        'x-api-key': semantic_api_key
    }


#### Details about the paper based on Paper ID

In [50]:
fields = 'title,abstract,authors,venue,publicationVenue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr'

In [51]:
json_data3 = []
i = 0

for id in doi_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1

    # Make the request to get the paper ID which is needed for the next step
    responses = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{id}', headers=headers, stream = True)
    json = responses.json()
    # Check if 'paperId' is there in the json file
    if 'paperId' in json:
        paper_id = json['paperId']

        # make the second request to get the details of the paper 
        response = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields={fields}', headers=headers, stream = True)
        json = response.json()
        json_data3.append(json)

    else: # if the request does not fetch anything
        continue
    

ID number 0: 10.37467/revhuman.v11.4317
ID number 1: 10.1515/mc-2022-0008
ID number 2: 10.1371/journal.pone.0279363
ID number 3: 10.1038/s41533-022-00306-7
ID number 4: 10.1016/j.cjca.2022.09.009
ID number 5: 10.1007/s11199-022-01338-6
ID number 6: 10.1007/s11920-022-01382-9
ID number 7: 10.1186/s13071-022-05503-4
ID number 8: 10.1038/s41598-022-19116-5
ID number 9: 10.1186/s41235-022-00438-x
ID number 10: 10.1016/j.paid.2022.111860
ID number 11: 10.1186/s13643-022-02025-z
ID number 12: 10.1186/s13293-022-00448-w
ID number 13: 10.1007/s12119-022-09986-2
ID number 14: 10.1007/s13178-022-00725-8
ID number 15: 10.1177/08862605221080151
ID number 16: 10.1186/s12877-022-02885-z
ID number 17: 10.1186/s12889-022-12693-0
ID number 18: 10.1177/08862605211073095
ID number 19: 10.1186/s12939-021-01581-5
ID number 20: 10.1007/s12144-020-01343-6
ID number 21: 10.1073/pnas.2205988119
ID number 22: 10.3389/fpsyg.2022.837078
ID number 23: 10.3390/ijerph192114524
ID number 24: 10.1002/brb3.2757
ID numb

ID number 204: 10.1371/journal.pone.0260409
ID number 205: 10.1371/journal.pone.0260791
ID number 206: 10.1016/S2468-2667(21)00236-X
ID number 207: 10.1038/s41467-021-26905-5
ID number 208: 10.1007/s11199-021-01248-z
ID number 209: 10.1016/j.cellsig.2021.110171
ID number 210: 10.1016/j.amjsurg.2021.09.034
ID number 211: 10.1007/s11199-021-01244-3
ID number 212: 10.1016/j.cpr.2021.102087
ID number 213: 10.1016/j.neuroimage.2021.118644
ID number 214: 10.1007/s41019-021-00168-y
ID number 215: 10.1111/medu.14593
ID number 216: 10.1111/sjop.12763
ID number 217: 10.1038/s41598-021-93216-6
ID number 218: 10.1038/s41598-021-92254-4
ID number 219: 10.1177/1097184X211017954
ID number 220: 10.1007/s10880-021-09766-4
ID number 221: 10.1186/s12887-021-02503-8
ID number 222: 10.1177/1368430220961838
ID number 223: 10.1177/1368430220913610
ID number 224: 10.1371/journal.pone.0260333
ID number 225: 10.1016/j.micinf.2021.104850
ID number 226: 10.1371/journal.pone.0259843
ID number 227: 10.3390/children

ID number 405: 10.1016/j.ajog.2020.06.038
ID number 406: 10.1093/jnci/djaa013
ID number 407: 10.1080/17475759.2020.1780465
ID number 408: 10.1007/s11199-020-01135-z
ID number 409: 10.1126/SCIENCE.370.6516.514
ID number 410: 10.1145/3406865.3418588
ID number 411: 10.1001/jama.2020.17709
ID number 412: 10.1080/17441692.2020.1767676
ID number 413: 10.5209/ESMP.69257
ID number 414: 10.6018/analesps.411301
ID number 415: 10.1093/aje/kwaa075
ID number 416: 10.3390/ijerph17197040
ID number 417: 10.1002/ejsp.2702
ID number 418: 10.1002/ejsp.2696
ID number 419: 10.1002/ejsp.2695
ID number 420: 10.1002/ejsp.2692
ID number 421: 10.1177/0033294119896045
ID number 422: 10.1177/1461444819887141
ID number 423: 10.1186/s12887-020-02321-4
ID number 424: 10.1075/jls.19014.her
ID number 425: 10.3390/ijerph17186769
ID number 426: 10.1093/scan/nsaa117
ID number 427: 10.11591/IJEECS.V19.I3.PP1512-1518
ID number 428: 10.1371/journal.pone.0238947
ID number 429: 10.1007/s12110-020-09376-3
ID number 430: 10.137

ID number 607: 10.1007/s13312-019-1584-5
ID number 608: 10.7196/SAMJ.2019.v109i8.14152
ID number 609: 10.1002/cncy.22168
ID number 610: 10.1016/j.ssmph.2019.100446
ID number 611: 10.1177/1548051819849006
ID number 612: 10.1111/sjop.12545
ID number 613: 10.1016/j.paid.2019.03.047
ID number 614: 10.1016/j.jtcvs.2018.12.104
ID number 615: 10.1037/xge0000561
ID number 616: 10.1177/0146167218808500
ID number 617: 10.1037/pspi0000167
ID number 618: 10.5435/JAAOS-D-17-00868
ID number 619: 10.1001/jamanetworkopen.2019.6545
ID number 620: 10.7554/eLife.45374
ID number 621: 10.1089/jwh.2018.7571
ID number 622: 10.1371/journal.pone.0219698
ID number 623: 10.1016/j.chc.2019.02.016
ID number 624: 10.1111/add.14566
ID number 625: 10.1016/j.chb.2018.11.044
ID number 626: 10.1177/1948550618776624
ID number 627: 10.1177/1043659618818722
ID number 628: 10.1037/pspi0000164
ID number 629: 10.1145/3292522.3326045
ID number 630: 10.1007/s11109-018-9446-8
ID number 631: 10.1186/s12939-019-0989-z
ID number 63

ID number 807: 10.1080/17470919.2016.1251965
ID number 808: 10.5958/0974-360X.2018.00603.0
ID number 809: 10.1007/978-3-319-53550-0_13
ID number 810: 10.5093/pi2018a19
ID number 811: 10.1017/prp.2018.10
ID number 812: 10.17323/1813-8918-2018-3-447-463
ID number 813: 10.1097/QAI.0000000000001831
ID number 814: 10.1027/1864-9335/a000341
ID number 815: 10.30849/rip/ijp.v52i1.341
ID number 816: 10.1007/978-3-319-91947-8_6
ID number 817: 10.2106/JBJS.17.00458
ID number 818: 10.1007/978-3-319-72514-7
ID number 819: 10.1155/2018/6358624
ID number 820: 10.1371/journal.pone.0191334
ID number 821: 10.3174/ajnr.A5443
ID number 822: 10.1038/gim.2017.89
ID number 823: 10.1371/journal.pone.0190657
ID number 824: 10.1038/nrurol.2017.207
ID number 825: 10.2214/AJR.17.18303
ID number 826: 10.1177/0886260515604412
ID number 827: 10.1016/j.jesp.2017.07.009
ID number 828: 10.1037/men0000075
ID number 829: 10.1089/cyber.2017.0294
ID number 830: 10.1111/1468-0009.12297
ID number 831: 10.1016/j.encep.2017.08

ID number 1011: 10.3945/ajcn.114.100776
ID number 1012: 10.1371/journal.pone.0141363
ID number 1013: 10.1002/ejsp.2128
ID number 1014: 10.1007/s13524-015-0418-x
ID number 1015: 10.1037/pspi0000027
ID number 1016: 10.1037/lhb0000139
ID number 1017: 10.1177/1077801215592721
ID number 1018: 10.3389/fpsyg.2015.01400
ID number 1019: 10.1177/0261927X15586432
ID number 1020: 10.3109/09540261.2015.1086321
ID number 1021: 10.1080/00324728.2015.1103565
ID number 1022: 10.4074/S000350331400013X
ID number 1023: 10.1111/jcc4.12130
ID number 1024: 10.1111/cdev.12405
ID number 1025: 10.1111/bjso.12100
ID number 1026: 10.1093/heapro/dat087
ID number 1027: 10.1016/j.ssresearch.2015.05.013
ID number 1028: 10.1177/0146167215590987
ID number 1029: 10.1075/aral.38.1.02lee
ID number 1030: 10.1027/1864-9335/a000228
ID number 1031: 10.1037/a0039520
ID number 1032: 10.1037/hea0000192
ID number 1033: 10.1016/j.neuropsychologia.2015.06.013
ID number 1034: 10.1016/j.cortex.2015.05.003
ID number 1035: 10.3389/fpsy

ID number 1208: 10.1177/1948550612448195
ID number 1209: 10.1016/j.actpsy.2013.01.009
ID number 1210: 10.1007/s12552-012-9080-8
ID number 1211: 10.1080/00450618.2013.767374
ID number 1212: 10.1177/0146167212468332
ID number 1213: 10.1007/s11606-012-2207-1
ID number 1214: 10.1371/journal.pone.0053246
ID number 1215: 10.1123/jsep.35.6.585
ID number 1216: 10.1007/978-1-4614-6959-9_3
ID number 1217: 10.1007/978-1-4614-6959-9_7
ID number 1218: 10.18848/2327-0004/cgp/v12i02/39920
ID number 1219: 10.1037/a0030567
ID number 1220: 10.1037/a0028437
ID number 1221: 10.1016/j.jadohealth.2012.04.013
ID number 1222: 10.4992/jjpsy.83.479
ID number 1223: 10.1017/S0140525X12001264
ID number 1224: 10.1007/s11199-012-0181-z
ID number 1225: 10.1177/0361684312456369
ID number 1226: 10.1177/0361684312457659
ID number 1227: 10.1177/0957926512455382
ID number 1228: 10.1016/j.addbeh.2012.06.010
ID number 1229: 10.1024/1421-0185/a000078
ID number 1230: 10.1177/1077801212465151
ID number 1231: 10.1089/apc.2012.0

In [52]:
df2 = pd.DataFrame(json_data3)
df2['index'] = df2.index
df2

Unnamed: 0,paperId,publicationVenue,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,index
0,a44d34267192233c30ae1b7043db85fedd86e3f8,"{'id': 'f0727d48-1e26-4ba1-9c7d-e7789167aa4e',...",Análisis del sexismo y feminismo en el futuro ...,El presente estudio tiene como objetivo analiz...,HUMAN Review International Humanities Review /...,2022,97,0,,,"[{'authorId': '2123662680', 'name': 'Patricia ...",0
1,b1f6cdb3c81e0bdb653e12f2e43f82c87a622ee5,"{'id': 'e946ee38-09e2-4ebc-8f9f-3267777d7536',...",Multimodal metaphors and sexism in Arabic cart...,"Abstract Since the COVID-19 pandemic began, ex...",Multimodal Communication,2022,24,0,[Medicine],,"[{'authorId': '151179319', 'name': 'Reem Alkha...",1
2,ed4389930791ee387f9ebb3898fe3aa25fd96fdf,"{'id': '0aed7a40-85f3-4c66-9e1b-c1556c57001b',...",Hate crimes against LGBT people: National Crim...,We estimate the prevalence and characteristics...,PLoS ONE,2022,31,0,[Medicine],,"[{'authorId': '145827927', 'name': 'A. Flores'...",2
3,fd6152600298cb5c92d03c167dcc010420a6e8dc,"{'id': '534db092-7c47-4084-b861-b3d91dc6cfae',...",Addressing sex and gender to improve asthma ma...,,npj Primary Care Respiratory Medicine,2022,110,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'This review ...","[{'authorId': '145550777', 'name': 'L. Boulet'...",3
4,8d154d4937adba696c99772cd12c82032d2222a3,"{'id': 'ab17701b-8ac0-4766-93ca-78d73ce88cd1',...",Sex and Gender Bias as a Mechanistic Determina...,,Canadian Journal of Cardiology,2022,133,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'To mitigate ...","[{'authorId': '2164157636', 'name': 'Isabel Ki...",4
...,...,...,...,...,...,...,...,...,...,...,...,...
1231,8496b86d3cda8066dea4cbb2aa0ace2e0c61d6ff,,Benevolent Sexism at Work,The current research draws from ambivalent sex...,,2012,87,4,[Psychology],,"[{'authorId': '3525823', 'name': 'E. King'}, {...",1231
1232,b6be610e15da7e9cb9c0cf3b3021b4710e6ba02b,,Meeting the emerging challenge of breast and c...,,International journal of gynaecology and obste...,2012,29,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'The horizons...","[{'authorId': '4740674', 'name': 'F. Knaul'}, ...",1232
1233,f73569bc6250050c4e89e3a9fa31cee8317cdcb1,"{'id': '7606e9e4-fbe1-4463-bc40-02193a462402',...",The right to health care for transsexual peopl...,Gender identity is a sociocultural construct b...,MEDICC Review,2012,20,0,"[Psychology, Medicine]","{'model': 'tldr@v2.0.0', 'text': 'This essay d...","[{'authorId': '49409656', 'name': 'A. Roque'},...",1233
1234,0daa4de1d3c46245668ce5b0998dbb489354b232,"{'id': '01b74bb2-5307-4c96-b3da-d484b80a9ff7',...",Supportive When Not Supported? Male Responses ...,,Sex Roles,2011,49,0,,,"[{'authorId': '39192306', 'name': 'Jennifer L....",1234


In [53]:
df2.columns

Index(['paperId', 'publicationVenue', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'influentialCitationCount', 'fieldsOfStudy', 'tldr',
       'authors', 'index'],
      dtype='object')

#### Details about a paper's authors based on Paper ID

In [54]:
fields = 'affiliations,paperCount,citationCount,hIndex'

In [55]:
json_data4 = []
i = 0

for id in doi_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1

    # Make the request to get the paper ID which is needed for the next step
    responses = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{id}', headers=headers, stream = True)
    json = responses.json()
    # Check if 'paperId' is there in the json file
    if 'paperId' in json:
        paper_id = json['paperId']

        # make the second request to get the details of the authors 
        response = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/authors?fields={fields}', headers=headers, stream = True)
        json = response.json()
        json_data4.append([id, json]) # Adding the doi in this one to help merge with dataframe 'df'

    else: # if the request does not fetch anything
        continue

ID number 0: 10.37467/revhuman.v11.4317
ID number 1: 10.1515/mc-2022-0008
ID number 2: 10.1371/journal.pone.0279363
ID number 3: 10.1038/s41533-022-00306-7
ID number 4: 10.1016/j.cjca.2022.09.009
ID number 5: 10.1007/s11199-022-01338-6
ID number 6: 10.1007/s11920-022-01382-9
ID number 7: 10.1186/s13071-022-05503-4
ID number 8: 10.1038/s41598-022-19116-5
ID number 9: 10.1186/s41235-022-00438-x
ID number 10: 10.1016/j.paid.2022.111860
ID number 11: 10.1186/s13643-022-02025-z
ID number 12: 10.1186/s13293-022-00448-w
ID number 13: 10.1007/s12119-022-09986-2
ID number 14: 10.1007/s13178-022-00725-8
ID number 15: 10.1177/08862605221080151
ID number 16: 10.1186/s12877-022-02885-z
ID number 17: 10.1186/s12889-022-12693-0
ID number 18: 10.1177/08862605211073095
ID number 19: 10.1186/s12939-021-01581-5
ID number 20: 10.1007/s12144-020-01343-6
ID number 21: 10.1073/pnas.2205988119
ID number 22: 10.3389/fpsyg.2022.837078
ID number 23: 10.3390/ijerph192114524
ID number 24: 10.1002/brb3.2757
ID numb

ID number 204: 10.1371/journal.pone.0260409
ID number 205: 10.1371/journal.pone.0260791
ID number 206: 10.1016/S2468-2667(21)00236-X
ID number 207: 10.1038/s41467-021-26905-5
ID number 208: 10.1007/s11199-021-01248-z
ID number 209: 10.1016/j.cellsig.2021.110171
ID number 210: 10.1016/j.amjsurg.2021.09.034
ID number 211: 10.1007/s11199-021-01244-3
ID number 212: 10.1016/j.cpr.2021.102087
ID number 213: 10.1016/j.neuroimage.2021.118644
ID number 214: 10.1007/s41019-021-00168-y
ID number 215: 10.1111/medu.14593
ID number 216: 10.1111/sjop.12763
ID number 217: 10.1038/s41598-021-93216-6
ID number 218: 10.1038/s41598-021-92254-4
ID number 219: 10.1177/1097184X211017954
ID number 220: 10.1007/s10880-021-09766-4
ID number 221: 10.1186/s12887-021-02503-8
ID number 222: 10.1177/1368430220961838
ID number 223: 10.1177/1368430220913610
ID number 224: 10.1371/journal.pone.0260333
ID number 225: 10.1016/j.micinf.2021.104850
ID number 226: 10.1371/journal.pone.0259843
ID number 227: 10.3390/children

ID number 405: 10.1016/j.ajog.2020.06.038
ID number 406: 10.1093/jnci/djaa013
ID number 407: 10.1080/17475759.2020.1780465
ID number 408: 10.1007/s11199-020-01135-z
ID number 409: 10.1126/SCIENCE.370.6516.514
ID number 410: 10.1145/3406865.3418588
ID number 411: 10.1001/jama.2020.17709
ID number 412: 10.1080/17441692.2020.1767676
ID number 413: 10.5209/ESMP.69257
ID number 414: 10.6018/analesps.411301
ID number 415: 10.1093/aje/kwaa075
ID number 416: 10.3390/ijerph17197040
ID number 417: 10.1002/ejsp.2702
ID number 418: 10.1002/ejsp.2696
ID number 419: 10.1002/ejsp.2695
ID number 420: 10.1002/ejsp.2692
ID number 421: 10.1177/0033294119896045
ID number 422: 10.1177/1461444819887141
ID number 423: 10.1186/s12887-020-02321-4
ID number 424: 10.1075/jls.19014.her
ID number 425: 10.3390/ijerph17186769
ID number 426: 10.1093/scan/nsaa117
ID number 427: 10.11591/IJEECS.V19.I3.PP1512-1518
ID number 428: 10.1371/journal.pone.0238947
ID number 429: 10.1007/s12110-020-09376-3
ID number 430: 10.137

ID number 607: 10.1007/s13312-019-1584-5
ID number 608: 10.7196/SAMJ.2019.v109i8.14152
ID number 609: 10.1002/cncy.22168
ID number 610: 10.1016/j.ssmph.2019.100446
ID number 611: 10.1177/1548051819849006
ID number 612: 10.1111/sjop.12545
ID number 613: 10.1016/j.paid.2019.03.047
ID number 614: 10.1016/j.jtcvs.2018.12.104
ID number 615: 10.1037/xge0000561
ID number 616: 10.1177/0146167218808500
ID number 617: 10.1037/pspi0000167
ID number 618: 10.5435/JAAOS-D-17-00868
ID number 619: 10.1001/jamanetworkopen.2019.6545
ID number 620: 10.7554/eLife.45374
ID number 621: 10.1089/jwh.2018.7571
ID number 622: 10.1371/journal.pone.0219698
ID number 623: 10.1016/j.chc.2019.02.016
ID number 624: 10.1111/add.14566
ID number 625: 10.1016/j.chb.2018.11.044
ID number 626: 10.1177/1948550618776624
ID number 627: 10.1177/1043659618818722
ID number 628: 10.1037/pspi0000164
ID number 629: 10.1145/3292522.3326045
ID number 630: 10.1007/s11109-018-9446-8
ID number 631: 10.1186/s12939-019-0989-z
ID number 63

ID number 807: 10.1080/17470919.2016.1251965
ID number 808: 10.5958/0974-360X.2018.00603.0
ID number 809: 10.1007/978-3-319-53550-0_13
ID number 810: 10.5093/pi2018a19
ID number 811: 10.1017/prp.2018.10
ID number 812: 10.17323/1813-8918-2018-3-447-463
ID number 813: 10.1097/QAI.0000000000001831
ID number 814: 10.1027/1864-9335/a000341
ID number 815: 10.30849/rip/ijp.v52i1.341
ID number 816: 10.1007/978-3-319-91947-8_6
ID number 817: 10.2106/JBJS.17.00458
ID number 818: 10.1007/978-3-319-72514-7
ID number 819: 10.1155/2018/6358624
ID number 820: 10.1371/journal.pone.0191334
ID number 821: 10.3174/ajnr.A5443
ID number 822: 10.1038/gim.2017.89
ID number 823: 10.1371/journal.pone.0190657
ID number 824: 10.1038/nrurol.2017.207
ID number 825: 10.2214/AJR.17.18303
ID number 826: 10.1177/0886260515604412
ID number 827: 10.1016/j.jesp.2017.07.009
ID number 828: 10.1037/men0000075
ID number 829: 10.1089/cyber.2017.0294
ID number 830: 10.1111/1468-0009.12297
ID number 831: 10.1016/j.encep.2017.08

ID number 1011: 10.3945/ajcn.114.100776
ID number 1012: 10.1371/journal.pone.0141363
ID number 1013: 10.1002/ejsp.2128
ID number 1014: 10.1007/s13524-015-0418-x
ID number 1015: 10.1037/pspi0000027
ID number 1016: 10.1037/lhb0000139
ID number 1017: 10.1177/1077801215592721
ID number 1018: 10.3389/fpsyg.2015.01400
ID number 1019: 10.1177/0261927X15586432
ID number 1020: 10.3109/09540261.2015.1086321
ID number 1021: 10.1080/00324728.2015.1103565
ID number 1022: 10.4074/S000350331400013X
ID number 1023: 10.1111/jcc4.12130
ID number 1024: 10.1111/cdev.12405
ID number 1025: 10.1111/bjso.12100
ID number 1026: 10.1093/heapro/dat087
ID number 1027: 10.1016/j.ssresearch.2015.05.013
ID number 1028: 10.1177/0146167215590987
ID number 1029: 10.1075/aral.38.1.02lee
ID number 1030: 10.1027/1864-9335/a000228
ID number 1031: 10.1037/a0039520
ID number 1032: 10.1037/hea0000192
ID number 1033: 10.1016/j.neuropsychologia.2015.06.013
ID number 1034: 10.1016/j.cortex.2015.05.003
ID number 1035: 10.3389/fpsy

ID number 1208: 10.1177/1948550612448195
ID number 1209: 10.1016/j.actpsy.2013.01.009
ID number 1210: 10.1007/s12552-012-9080-8
ID number 1211: 10.1080/00450618.2013.767374
ID number 1212: 10.1177/0146167212468332
ID number 1213: 10.1007/s11606-012-2207-1
ID number 1214: 10.1371/journal.pone.0053246
ID number 1215: 10.1123/jsep.35.6.585
ID number 1216: 10.1007/978-1-4614-6959-9_3
ID number 1217: 10.1007/978-1-4614-6959-9_7
ID number 1218: 10.18848/2327-0004/cgp/v12i02/39920
ID number 1219: 10.1037/a0030567
ID number 1220: 10.1037/a0028437
ID number 1221: 10.1016/j.jadohealth.2012.04.013
ID number 1222: 10.4992/jjpsy.83.479
ID number 1223: 10.1017/S0140525X12001264
ID number 1224: 10.1007/s11199-012-0181-z
ID number 1225: 10.1177/0361684312456369
ID number 1226: 10.1177/0361684312457659
ID number 1227: 10.1177/0957926512455382
ID number 1228: 10.1016/j.addbeh.2012.06.010
ID number 1229: 10.1024/1421-0185/a000078
ID number 1230: 10.1177/1077801212465151
ID number 1231: 10.1089/apc.2012.0

In [65]:
df3 = pd.DataFrame(json_data4).rename(columns={0: 'prism:doi', 1: 'author_data'})
df3['author_data'] = pd.json_normalize(df3['author_data']).data
df3['index'] = df3.index
df3

Unnamed: 0,prism:doi,author_data,index
0,10.37467/revhuman.v11.4317,"[{'authorId': '2123662680', 'affiliations': []...",0
1,10.1515/mc-2022-0008,"[{'authorId': '151179319', 'affiliations': [],...",1
2,10.1371/journal.pone.0279363,"[{'authorId': '145827927', 'affiliations': [],...",2
3,10.1038/s41533-022-00306-7,"[{'authorId': '145550777', 'affiliations': [],...",3
4,10.1016/j.cjca.2022.09.009,"[{'authorId': '2164157636', 'affiliations': []...",4
...,...,...,...
1232,10.1177/0149206310365902,"[{'authorId': '3525823', 'affiliations': [], '...",1232
1233,10.1016/j.ijgo.2012.03.024,"[{'authorId': '4740674', 'affiliations': [], '...",1233
1234,10.1590/s1555-79602012000200009,"[{'authorId': '49409656', 'affiliations': [], ...",1234
1235,10.1007/s11199-011-0058-6,"[{'authorId': '39192306', 'affiliations': [], ...",1235


Merge the last two dataframes and remove the drop the columns we do not need.

In [67]:
df_2_3 = pd.merge(df2, df3, on="index", how="left").drop(columns=['paperId', 'message', 'index']) 
df_2_3

Unnamed: 0,publicationVenue,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,prism:doi,author_data
0,"{'id': 'f0727d48-1e26-4ba1-9c7d-e7789167aa4e',...",Análisis del sexismo y feminismo en el futuro ...,El presente estudio tiene como objetivo analiz...,HUMAN Review International Humanities Review /...,2022,97,0,,,"[{'authorId': '2123662680', 'name': 'Patricia ...",10.37467/revhuman.v11.4317,"[{'authorId': '2123662680', 'affiliations': []..."
1,"{'id': 'e946ee38-09e2-4ebc-8f9f-3267777d7536',...",Multimodal metaphors and sexism in Arabic cart...,"Abstract Since the COVID-19 pandemic began, ex...",Multimodal Communication,2022,24,0,[Medicine],,"[{'authorId': '151179319', 'name': 'Reem Alkha...",10.1515/mc-2022-0008,"[{'authorId': '151179319', 'affiliations': [],..."
2,"{'id': '0aed7a40-85f3-4c66-9e1b-c1556c57001b',...",Hate crimes against LGBT people: National Crim...,We estimate the prevalence and characteristics...,PLoS ONE,2022,31,0,[Medicine],,"[{'authorId': '145827927', 'name': 'A. Flores'...",10.1371/journal.pone.0279363,"[{'authorId': '145827927', 'affiliations': [],..."
3,"{'id': '534db092-7c47-4084-b861-b3d91dc6cfae',...",Addressing sex and gender to improve asthma ma...,,npj Primary Care Respiratory Medicine,2022,110,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'This review ...","[{'authorId': '145550777', 'name': 'L. Boulet'...",10.1038/s41533-022-00306-7,"[{'authorId': '145550777', 'affiliations': [],..."
4,"{'id': 'ab17701b-8ac0-4766-93ca-78d73ce88cd1',...",Sex and Gender Bias as a Mechanistic Determina...,,Canadian Journal of Cardiology,2022,133,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'To mitigate ...","[{'authorId': '2164157636', 'name': 'Isabel Ki...",10.1016/j.cjca.2022.09.009,"[{'authorId': '2164157636', 'affiliations': []..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1231,,Benevolent Sexism at Work,The current research draws from ambivalent sex...,,2012,87,4,[Psychology],,"[{'authorId': '3525823', 'name': 'E. King'}, {...",10.1111/j.1467-9450.2011.00900.x,"[{'authorId': '7293320', 'affiliations': [], '..."
1232,,Meeting the emerging challenge of breast and c...,,International journal of gynaecology and obste...,2012,29,0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'The horizons...","[{'authorId': '4740674', 'name': 'F. Knaul'}, ...",10.1177/0149206310365902,"[{'authorId': '3525823', 'affiliations': [], '..."
1233,"{'id': '7606e9e4-fbe1-4463-bc40-02193a462402',...",The right to health care for transsexual peopl...,Gender identity is a sociocultural construct b...,MEDICC Review,2012,20,0,"[Psychology, Medicine]","{'model': 'tldr@v2.0.0', 'text': 'This essay d...","[{'authorId': '49409656', 'name': 'A. Roque'},...",10.1016/j.ijgo.2012.03.024,"[{'authorId': '4740674', 'affiliations': [], '..."
1234,"{'id': '01b74bb2-5307-4c96-b3da-d484b80a9ff7',...",Supportive When Not Supported? Male Responses ...,,Sex Roles,2011,49,0,,,"[{'authorId': '39192306', 'name': 'Jennifer L....",10.1590/s1555-79602012000200009,"[{'authorId': '49409656', 'affiliations': [], ..."


Now check with the df dataframe to remove unneccessary columns.

In [68]:
# choose the columns we can remove
df.srctype.unique() # Not important
df.eid.unique() # (Electronic ID) -- not important
df['dc:description'].unique() # (Abstracts) -- important
df['prism:url'].unique() # (Content Abstract Retrieval API URI) -- not important
df['prism:coverDate'].unique() # (Publication Date) -- important
df['prism:aggregationType'].unique() # (Source Type) -- important
df['subtypeDescription'].unique() # (Document Type description) -- important
df['link'] # not important
df['prism:publicationName'].unique() # (Source Title) -- important
df['source-id'].unique() # keep for later use
df['citedby-count'].unique() # (Cited-by Count) -- important
df['prism:volume'].unique() # (Volume) -- important
df['subtype'].unique() # (Document Type code) -- not important
df['dc:title'].unique() # (Article Title) -- important
df['openaccess'].unique() # (Open Access status) -- keep for later use
df['openaccessFlag'].unique() # keep for later use
df['prism:doi'] # (Document Object Identifier) -- keep to merge the datasets
df['prism:issn'].unique() # (Source identifier) -- not important
df['dc:identifier'] # (Scopus ID) -- not important
df['dc:publisher'] # keep for later use
df['prism:issueIdentifier'] # not important
df['article-number'] # not important
df['pubmed-id'] # not important
df['prism:pageRange'] # not important
df['prism:endingPage'] # not important
df['prism:startingPage'] # not important
df['pii'] # not important
df['prism:isbn'] # (Source identifier) -- not important


AttributeError: 'DataFrame' object has no attribute 'srctype'

In [69]:
df = df.drop(columns=[
    'srctype', 'eid', 'prism:url',
    'link', 'subtype', 'prism:issn', 'dc:identifier',
    'prism:issueIdentifier', 'article-number', 'pubmed-id', 'prism:pageRange',
    'prism:endingPage', 'prism:startingPage', 'pii','prism:isbn'
    ])

KeyError: "['srctype', 'eid', 'prism:url', 'link', 'subtype', 'prism:issn', 'dc:identifier', 'prism:issueIdentifier', 'article-number', 'pubmed-id', 'prism:pageRange', 'prism:endingPage', 'prism:startingPage', 'pii', 'prism:isbn'] not found in axis"

In [70]:
df.columns

Index(['dc:description', 'prism:coverDate', 'prism:aggregationType',
       'subtypeDescription', 'dc:creator', 'prism:publicationName',
       'source-id', 'citedby-count', 'prism:volume', 'dc:title', 'openaccess',
       'openaccessFlag', 'prism:doi', 'dc:publisher', 'index'],
      dtype='object')

In [71]:
# to check later if the dataframes are properly merged
df[df['prism:doi'].isna()].index

Int64Index([ 144,  148,  149,  166,  167,  168,  169,  170,  171,  172,  173,
             174,  175,  176,  177,  178,  179,  180,  186,  189,  329,  365,
             366,  367,  368,  369,  370,  372,  374,  378,  384,  385,  386,
             387,  388,  389,  390,  391,  392,  393,  394,  395,  396,  397,
             398,  399,  402,  411,  416,  501,  511,  534,  577,  581,  585,
             589,  721,  738,  740,  764,  842,  869,  873,  877,  878,  879,
             880,  881,  978, 1061, 1062, 1064, 1065, 1068, 1069, 1077, 1153,
            1154, 1155, 1157, 1172, 1209, 1218, 1244, 1279, 1285, 1301, 1308,
            1323, 1341],
           dtype='int64')

In [72]:
df = pd.merge(df, df_2_3, on='prism:doi', how='left')
df

Unnamed: 0,dc:description,prism:coverDate,prism:aggregationType,subtypeDescription,dc:creator,prism:publicationName,source-id,citedby-count,prism:volume,dc:title,...,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,author_data
0,"© GKA Ediciones, authors.This study aims to an...",2022-12-26,Journal,Article,"{'author': [{'ce:given-name': 'Patricia', 'pre...",Human Review. International Humanities Review ...,21101039068,0,11,Analysis of sexism and feminism in the future ...,...,Análisis del sexismo y feminismo en el futuro ...,El presente estudio tiene como objetivo analiz...,HUMAN Review International Humanities Review /...,2022.0,97.0,0.0,,,"[{'authorId': '2123662680', 'name': 'Patricia ...","[{'authorId': '2123662680', 'affiliations': []..."
1,"© 2022 Walter de Gruyter GmbH, Berlin/Boston.S...",2022-12-01,Journal,Article,"{'author': [{'ce:given-name': 'Reem', 'preferr...",Multimodal Communication,21101048855,0,11,Multimodal metaphors and sexism in Arabic cart...,...,Multimodal metaphors and sexism in Arabic cart...,"Abstract Since the COVID-19 pandemic began, ex...",Multimodal Communication,2022.0,24.0,0.0,[Medicine],,"[{'authorId': '151179319', 'name': 'Reem Alkha...","[{'authorId': '151179319', 'affiliations': [],..."
2,© 2022 Flores et al. This is an open access ar...,2022-12-01,Journal,Article,"{'author': [{'ce:given-name': 'Andrew R.', 'pr...",PLoS ONE,10600153309,0,17,Hate crimes against LGBT people: National Crim...,...,Hate crimes against LGBT people: National Crim...,We estimate the prevalence and characteristics...,PLoS ONE,2022.0,31.0,0.0,[Medicine],,"[{'authorId': '145827927', 'name': 'A. Flores'...","[{'authorId': '145827927', 'affiliations': [],..."
3,"© 2022, The Author(s).Sex (whether one is ‘mal...",2022-12-01,Journal,Review,{'author': [{'ce:given-name': 'Louis-Philippe'...,npj Primary Care Respiratory Medicine,21100324439,0,32,Addressing sex and gender to improve asthma ma...,...,Addressing sex and gender to improve asthma ma...,,npj Primary Care Respiratory Medicine,2022.0,110.0,0.0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'This review ...","[{'authorId': '145550777', 'name': 'L. Boulet'...","[{'authorId': '145550777', 'affiliations': [],..."
4,© 2022 Canadian Cardiovascular SocietyDefined ...,2022-12-01,Journal,Review,"{'author': [{'ce:given-name': 'Isabel', 'prefe...",Canadian Journal of Cardiology,22504,1,38,Sex and Gender Bias as a Mechanistic Determina...,...,Sex and Gender Bias as a Mechanistic Determina...,,Canadian Journal of Cardiology,2022.0,133.0,0.0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'To mitigate ...","[{'authorId': '2164157636', 'name': 'Isabel Ki...","[{'authorId': '2164157636', 'affiliations': []..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,The current research draws from ambivalent sex...,2012-01-01,Journal,Article,"{'author': [{'ce:given-name': 'Eden B.', 'pref...",Journal of Management,20635,118,38,Benevolent Sexism at Work: Gender Differences ...,...,Meeting the emerging challenge of breast and c...,,International journal of gynaecology and obste...,2012.0,29.0,0.0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'The horizons...","[{'authorId': '4740674', 'name': 'F. Knaul'}, ...","[{'authorId': '3525823', 'affiliations': [], '..."
1343,"Cancer, particularly when it affects women and...",2012-01-01,Journal,Article,"{'author': [{'ce:given-name': 'Felicia M.', 'p...",International Journal of Gynecology and Obstet...,27521,34,119,Meeting the emerging challenge of breast and c...,...,The right to health care for transsexual peopl...,Gender identity is a sociocultural construct b...,MEDICC Review,2012.0,20.0,0.0,"[Psychology, Medicine]","{'model': 'tldr@v2.0.0', 'text': 'This essay d...","[{'authorId': '49409656', 'name': 'A. Roque'},...","[{'authorId': '4740674', 'affiliations': [], '..."
1344,Gender identity is a sociocultural construct b...,2012-01-01,Journal,Short Survey,"{'author': [{'ce:given-name': 'Alberto Roque',...",MEDICC Review,17300154961,1,14,The right to health care for transsexual peopl...,...,Supportive When Not Supported? Male Responses ...,,Sex Roles,2011.0,49.0,0.0,,,"[{'authorId': '39192306', 'name': 'Jennifer L....","[{'authorId': '49409656', 'affiliations': [], ..."
1345,This study examined the relationship between p...,2012-01-01,Journal,Article,"{'author': [{'ce:given-name': 'Jennifer L.', '...",Sex Roles,14798,8,66,Supportive When Not Supported? Male Responses ...,...,Unsafe Sexual Behaviors Among HIV-Positive Men...,Background: We conducted a study among HIV-pos...,Sexually Transmitted Diseases,2012.0,27.0,0.0,[Medicine],"{'model': 'tldr@v2.0.0', 'text': 'Programs tar...","[{'authorId': '1390016912', 'name': 'G. Paz-Ba...","[{'authorId': '39192306', 'affiliations': [], ..."


Check with some of the indices as specified before. Let's say 137 and 255.

In [73]:
df.iloc[137]

dc:description              © 2021 Elsevier LtdDiscrimination has been ass...
prism:coverDate                                                    2022-02-01
prism:aggregationType                                                 Journal
subtypeDescription                                                    Article
dc:creator                  {'author': [{'ce:given-name': 'Fares', 'prefer...
prism:publicationName                                     Addictive Behaviors
source-id                                                               24763
citedby-count                                                               2
prism:volume                                                              125
dc:title                    Associations between discrimination and substa...
openaccess                                                                  0
openaccessFlag                                                          false
prism:doi                                        10.1016/j.addbe

In [74]:
df.iloc[255]

dc:description              © 2021 by the authors. Licensee MDPI, Basel, S...
prism:coverDate                                                    2021-10-01
prism:aggregationType                                                 Journal
subtypeDescription                                                    Article
dc:creator                  {'author': [{'ce:given-name': 'Carlos', 'prefe...
prism:publicationName                 Multimodal Technologies and Interaction
source-id                                                         21100945706
citedby-count                                                               6
prism:volume                                                                5
dc:title                    Using shallow and deep learning to automatical...
openaccess                                                                  1
openaccessFlag                                                           true
prism:doi                                                  10.33

Now check with two other random indices not from the previous list.

In [75]:
df.iloc[111]

dc:description              © 2021 by the Society for Personality and Soci...
prism:coverDate                                                    2022-04-01
prism:aggregationType                                                 Journal
subtypeDescription                                                    Article
dc:creator                  {'author': [{'ce:given-name': 'Jennifer K.', '...
prism:publicationName              Personality and Social Psychology Bulletin
source-id                                                               12811
citedby-count                                                               2
prism:volume                                                               48
dc:title                    Curvilinear Sexism and Its Links to Men’s Perc...
openaccess                                                                  0
openaccessFlag                                                          false
prism:doi                                           10.1177/0146

In [76]:
df.iloc[500]

dc:description              © 2020 Sociedad Universitaria de Investigacion...
prism:coverDate                                                    2020-07-01
prism:aggregationType                                                 Journal
subtypeDescription                                                    Article
dc:creator                  {'author': [{'ce:given-name': 'Verónica', 'pre...
prism:publicationName            Revista Iberoamericana de Psicologia y Salud
source-id                                                         21100466462
citedby-count                                                              22
prism:volume                                                               11
dc:title                    Dating violence victimization, perceived gravi...
openaccess                                                                  1
openaccessFlag                                                           true
prism:doi                                         10.23923/j.rip

### Authors

In [77]:
authors = pd.json_normalize(df['authors'])
authors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,"{'authorId': '2123662680', 'name': 'Patricia F...","{'authorId': '2198427883', 'name': 'Joana Jaur...","{'authorId': '2127284874', 'name': 'Nahia Idoi...",,,,,,,,...,,,,,,,,,,
1,"{'authorId': '151179319', 'name': 'Reem Alkham...",,,,,,,,,,...,,,,,,,,,,
2,"{'authorId': '145827927', 'name': 'A. Flores'}","{'authorId': '8135019', 'name': 'R. Stotzer'}","{'authorId': '1864405', 'name': 'I. Meyer'}","{'authorId': '100547354', 'name': 'L. Langton'}",,,,,,,...,,,,,,,,,,
3,"{'authorId': '145550777', 'name': 'L. Boulet'}","{'authorId': '2607357', 'name': 'K. Lavoie'}","{'authorId': '1404074042', 'name': 'C. Raheris...","{'authorId': '144496678', 'name': 'A. Kaplan'}","{'authorId': '2148334977', 'name': 'D. Singh'}","{'authorId': '145601376', 'name': 'C. Jenkins'}",,,,,...,,,,,,,,,,
4,"{'authorId': '2164157636', 'name': 'Isabel Kim'}","{'authorId': '5399772', 'name': 'Thalia Shosha...","{'authorId': '21161085', 'name': 'D. Wan'}","{'authorId': '6491365', 'name': 'K. Humphries'}","{'authorId': '9114144', 'name': 'T. Sedlak'}",,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,"{'authorId': '4740674', 'name': 'F. Knaul'}","{'authorId': '3643315', 'name': 'Afsan Bhadelia'}","{'authorId': '6538145', 'name': 'J. Gralow'}","{'authorId': '1402944292', 'name': 'H. Arreola...","{'authorId': '144991065', 'name': 'A. Langer'}","{'authorId': '144628854', 'name': 'J. Frenk'}",,,,,...,,,,,,,,,,
1343,"{'authorId': '49409656', 'name': 'A. Roque'}","{'authorId': '2107663106', 'name': 'R. Rodrígu...",,,,,,,,,...,,,,,,,,,,
1344,"{'authorId': '39192306', 'name': 'Jennifer L. ...","{'authorId': '5223215', 'name': 'A. Ryan'}",,,,,,,,,...,,,,,,,,,,
1345,"{'authorId': '1390016912', 'name': 'G. Paz-Bai...","{'authorId': '1402475973', 'name': 'Virginia I...","{'authorId': '1445065923', 'name': 'Sonia Mora...","{'authorId': '39972962', 'name': 'J. Jacobson'}","{'authorId': '144972131', 'name': 'S. Mendoza'}","{'authorId': '123577193', 'name': 'M. Paredes'}","{'authorId': '2087173390', 'name': 'Damien C. ...","{'authorId': '2054485167', 'name': 'D. Mabey'}","{'authorId': '4807866', 'name': 'E. Monterroso'}",,...,,,,,,,,,,


In [78]:
authors[0].isna().sum() # Interesting that 23 manuscripts do not have any first author

114

In [79]:
print("Number of null values in authors: \n First author: {first}\n Second author: {second} \n Third author: {third} \n Fourth author: {fourth}\n Fifth author: {fifth}\n Sixth author: {sixth}".format(first=authors[0].isna().sum(), second= authors[1].isna().sum(), third= authors[2].isna().sum(), fourth= authors[3].isna().sum(), fifth= authors[4].isna().sum(), sixth= authors[5].isna().sum()))

Number of null values in authors: 
 First author: 114
 Second author: 256 
 Third author: 491 
 Fourth author: 772
 Fifth author: 985
 Sixth author: 1109


### Separate the authors' name and id from the dictionary

In [80]:
nested_authors1 = pd.json_normalize(authors[0]).rename(columns={'authorId':'author_1_id', 'name':'author_1_name'})
nested_authors1['index'] = nested_authors1.index
nested_authors1

Unnamed: 0,author_1_id,author_1_name,index
0,2123662680,Patricia Fernández Rotaeche,0
1,151179319,Reem Alkhammash,1
2,145827927,A. Flores,2
3,145550777,L. Boulet,3
4,2164157636,Isabel Kim,4
...,...,...,...
1342,4740674,F. Knaul,1342
1343,49409656,A. Roque,1343
1344,39192306,Jennifer L. Wessel,1344
1345,1390016912,G. Paz-Bailey,1345


Let's check if the combinations are unique to each author. (Can also check with the sizes)

This result could be useful later as well, for analysis.

In [81]:
# to get unique combinations of the authors and names
nested_authors1_comb = nested_authors1.groupby(['author_1_id','author_1_name']).size().reset_index().rename(columns={0:'count'})
nested_authors1_comb = nested_authors1_comb.sort_values(by='count', ascending=False)
nested_authors1_comb

Unnamed: 0,author_1_id,author_1_name,count
905,49755508,M. Foster,4
751,40418844,Andrea C. Vial,4
425,2108598929,Jackie F. K. Lee,3
864,48438493,Caroline C. Fitz,3
332,1885803,S. Fiske,3
...,...,...,...
402,2078731,Mahmoudreza Ramin,1
403,2079578017,Antonia Sudkämper,1
404,2080180412,Adam A. Rogers,1
405,2081330564,C. S. Redd,1


Now, use dictionary to map the IDs to the names for later use.

In [82]:
auth_dict_1 = nested_authors1_comb.set_index('author_1_id').to_dict()['author_1_name']
auth_dict_1

{'49755508': 'M. Foster',
 '40418844': 'Andrea C. Vial',
 '2108598929': 'Jackie F. K. Lee',
 '48438493': 'Caroline C. Fitz',
 '1885803': 'S. Fiske',
 '1484246801': 'Miguel Ángel López-Sáez',
 '48010119': 'Z. Zaidi',
 '5472411': 'Jessica J. Good',
 '1398774271': 'C. Moss‐Racusin',
 '145413059': 'C. Ferguson',
 '81862536': 'J. Cundiff',
 '150919133': 'A. Riquelme',
 '6460247': 'Lindsay M. Orchowski',
 '47243777': 'Lacey J. Hilliard',
 '35202793': 'A. Murray',
 '34703961': 'Bonnie Moradi',
 '49714278': 'Xin Shi',
 '3883065': 'M. J. Monteith',
 '47299975': 'Kennedy Carpenter',
 '25696566': 'Dawn M. Szymanski',
 '34883936': 'R. Bigler',
 '3460549': 'Terrin N. Tamati',
 '7798496': 'Mike C. Parent',
 '1398947095': 'I. Cuadrado-Gordillo',
 '1401844002': 'Steve Stewart-Williams',
 '144663422': 'Mark Rubin',
 '5005884': 'Oriane Sarrasin',
 '6949919': 'N. Souchon',
 '2157412634': 'José Martín',
 '145194229': 'Sofia Persson',
 '2068744673': 'F. Teng',
 '51315861': 'Erin C. Cassese',
 '3634546': 'C

In [83]:
# Following the same technique from the previous author 1 set till authro 6

# Author 2
nested_authors2 = pd.json_normalize(authors[1]).rename(columns={'authorId':'author_2_id', 'name':'author_2_name'})
nested_authors2['index'] = nested_authors2.index
nested_authors2_comb = nested_authors2.groupby(['author_2_id','author_2_name']).size().reset_index().rename(columns={0:'count'})
nested_authors2_comb = nested_authors2_comb.sort_values(by='count', ascending=False)
auth_dict_2 = nested_authors2_comb.set_index('author_2_id').to_dict()['author_2_name']

# Author 3
nested_authors3 = pd.json_normalize(authors[2]).rename(columns={'authorId':'author_3_id', 'name':'author_3_name'})
nested_authors3['index'] = nested_authors3.index
nested_authors3_comb = nested_authors3.groupby(['author_3_id','author_3_name']).size().reset_index().rename(columns={0:'count'})
nested_authors3_comb = nested_authors3_comb.sort_values(by='count', ascending=False)
auth_dict_3 = nested_authors3_comb.set_index('author_3_id').to_dict()['author_3_name']

# Author 4
nested_authors4 = pd.json_normalize(authors[3]).rename(columns={'authorId':'author_4_id', 'name':'author_4_name'})
nested_authors4['index'] = nested_authors4.index
nested_authors4_comb = nested_authors4.groupby(['author_4_id','author_4_name']).size().reset_index().rename(columns={0:'count'})
nested_authors4_comb = nested_authors4_comb.sort_values(by='count', ascending=False)
auth_dict_4 = nested_authors4_comb.set_index('author_4_id').to_dict()['author_4_name']

# Author 5
nested_authors5 = pd.json_normalize(authors[4]).rename(columns={'authorId':'author_5_id', 'name':'author_5_name'})
nested_authors5['index'] = nested_authors5.index
nested_authors5_comb = nested_authors5.groupby(['author_5_id','author_5_name']).size().reset_index().rename(columns={0:'count'})
nested_authors5_comb = nested_authors5_comb.sort_values(by='count', ascending=False)
auth_dict_5 = nested_authors5_comb.set_index('author_5_id').to_dict()['author_5_name']

# Author 6
nested_authors6 = pd.json_normalize(authors[5]).rename(columns={'authorId':'author_6_id', 'name':'author_6_name'})
nested_authors6['index'] = nested_authors6.index
nested_authors6_comb = nested_authors6.groupby(['author_6_id','author_6_name']).size().reset_index().rename(columns={0:'count'})
nested_authors6_comb = nested_authors6_comb.sort_values(by='count', ascending=False)
auth_dict_6 = nested_authors6_comb.set_index('author_6_id').to_dict()['author_6_name']

#### Merge the dataframes for the authors

In [84]:
#only run once
df = pd.merge(df, nested_authors1, on="index", how="left")
df = pd.merge(df, nested_authors2, on="index", how="left")
df = pd.merge(df, nested_authors3, on="index", how="left")
df = pd.merge(df, nested_authors4, on="index", how="left")
df = pd.merge(df, nested_authors5, on="index", how="left")
df = pd.merge(df, nested_authors6, on="index", how="left")

Now drop the ```authors``` column.

In [85]:
df = df.drop(columns=['authors'])

### Author's data

In [86]:
authors_data = pd.json_normalize(df['author_data'])
authors_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,"{'authorId': '2123662680', 'affiliations': [],...","{'authorId': '2198427883', 'affiliations': [],...","{'authorId': '2127284874', 'affiliations': [],...",,,,,,,,...,,,,,,,,,,
1,"{'authorId': '151179319', 'affiliations': [], ...",,,,,,,,,,...,,,,,,,,,,
2,"{'authorId': '145827927', 'affiliations': [], ...","{'authorId': '8135019', 'affiliations': [], 'p...","{'authorId': '1864405', 'affiliations': [], 'p...","{'authorId': '100547354', 'affiliations': [], ...",,,,,,,...,,,,,,,,,,
3,"{'authorId': '145550777', 'affiliations': [], ...","{'authorId': '2607357', 'affiliations': [], 'p...","{'authorId': '1404074042', 'affiliations': [],...","{'authorId': '144496678', 'affiliations': [], ...","{'authorId': '2148334977', 'affiliations': [],...","{'authorId': '145601376', 'affiliations': [], ...",,,,,...,,,,,,,,,,
4,"{'authorId': '2164157636', 'affiliations': [],...","{'authorId': '5399772', 'affiliations': ['Univ...","{'authorId': '21161085', 'affiliations': [], '...","{'authorId': '6491365', 'affiliations': [], 'p...","{'authorId': '9114144', 'affiliations': [], 'p...",,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1342,"{'authorId': '3525823', 'affiliations': [], 'p...","{'authorId': '7522876', 'affiliations': [], 'p...","{'authorId': '4220199', 'affiliations': [], 'p...","{'authorId': '40597442', 'affiliations': [], '...","{'authorId': '6576844', 'affiliations': [], 'p...","{'authorId': '113764543', 'affiliations': [], ...",,,,,...,,,,,,,,,,
1343,"{'authorId': '4740674', 'affiliations': [], 'p...","{'authorId': '3643315', 'affiliations': [], 'p...","{'authorId': '6538145', 'affiliations': [], 'p...","{'authorId': '1402944292', 'affiliations': [],...","{'authorId': '144991065', 'affiliations': [], ...","{'authorId': '144628854', 'affiliations': [], ...",,,,,...,,,,,,,,,,
1344,"{'authorId': '49409656', 'affiliations': [], '...","{'authorId': '2107663106', 'affiliations': [],...",,,,,,,,,...,,,,,,,,,,
1345,"{'authorId': '39192306', 'affiliations': [], '...","{'authorId': '5223215', 'affiliations': [], 'p...",,,,,,,,,...,,,,,,,,,,


In [87]:
author1_data = pd.json_normalize(authors_data[0])
author1_data['index'] = author1_data.index
author1_data

Unnamed: 0,authorId,affiliations,paperCount,citationCount,hIndex,index
0,2123662680,[],2.0,1.0,1.0,0
1,151179319,[],15.0,19.0,3.0,1
2,145827927,[],61.0,2133.0,22.0,2
3,145550777,[],816.0,43461.0,99.0,3
4,2164157636,[],2.0,2.0,1.0,4
...,...,...,...,...,...,...
1342,3525823,[],184.0,5845.0,43.0,1342
1343,4740674,[],231.0,9782.0,46.0,1343
1344,49409656,[],9.0,131.0,2.0,1344
1345,39192306,[],39.0,492.0,14.0,1345


The affiliation column looks empty for most of them. Check to see how many are not null.

In [88]:
author1_data.affiliations.value_counts()
# not that useful to keep it then

[]                                                                1219
[Durham University, UK]                                              2
[King Abdullah University of Science and Technology]                 1
[Sapienza University of Rome]                                        1
[Mayo Clinic]                                                        1
[Cyprus University of Technology]                                    1
[Télécom SudParis, Institut Polytechnique de Paris]                  1
[University of Potsdam, Massachusetts Institute of Technology]       1
[Bocconi University]                                                 1
[Wilkes University]                                                  1
[Northern General Hospital, Sheffield, UK.]                          1
[Rutgers University, School of Criminal Justice]                     1
[University of Richmond]                                             1
Name: affiliations, dtype: int64

In [89]:
# Change the names to avoid duplication of values while merging
author1_data = author1_data.rename(columns={
    'paperCount': 'author_1_paperCount', 'citationCount': 'author_1_citationCount', 'hIndex': 'author_1_hIndex'
})
# Drop the unneccessary columns, including the authorId, as it is already there from the previous dataframe
author1_data = author1_data.drop(columns=['authorId', 'affiliations'])
author1_data


Unnamed: 0,author_1_paperCount,author_1_citationCount,author_1_hIndex,index
0,2.0,1.0,1.0,0
1,15.0,19.0,3.0,1
2,61.0,2133.0,22.0,2
3,816.0,43461.0,99.0,3
4,2.0,2.0,1.0,4
...,...,...,...,...
1342,184.0,5845.0,43.0,1342
1343,231.0,9782.0,46.0,1343
1344,9.0,131.0,2.0,1344
1345,39.0,492.0,14.0,1345


Now for the authors from 2 to 6.

In [90]:
# Author 2
author2_data = pd.json_normalize(authors_data[1]).rename(columns={
    'paperCount': 'author_2_paperCount', 'citationCount': 'author_2_citationCount', 'hIndex': 'author_2_hIndex'
})
author2_data['index'] = author2_data.index
author2_data = author2_data.drop(columns=['authorId', 'affiliations'])

# Author 3
author3_data = pd.json_normalize(authors_data[2]).rename(columns={
    'paperCount': 'author_3_paperCount', 'citationCount': 'author_3_citationCount', 'hIndex': 'author_3_hIndex'
})
author3_data['index'] = author3_data.index
author3_data = author3_data.drop(columns=['authorId', 'affiliations'])

# Author 4
author4_data = pd.json_normalize(authors_data[3]).rename(columns={
    'paperCount': 'author_4_paperCount', 'citationCount': 'author_4_citationCount', 'hIndex': 'author_4_hIndex'
})
author4_data['index'] = author4_data.index
author4_data = author4_data.drop(columns=['authorId', 'affiliations'])

# Author 5
author5_data = pd.json_normalize(authors_data[4]).rename(columns={
    'paperCount': 'author_5_paperCount', 'citationCount': 'author_5_citationCount', 'hIndex': 'author_5_hIndex'
})
author5_data['index'] = author5_data.index
author5_data = author5_data.drop(columns=['authorId', 'affiliations'])

# Author 6
author6_data = pd.json_normalize(authors_data[5]).rename(columns={
    'paperCount': 'author_6_paperCount', 'citationCount': 'author_6_citationCount', 'hIndex': 'author_6_hIndex'
})
author6_data['index'] = author6_data.index
author6_data = author6_data.drop(columns=['authorId', 'affiliations'])

#### Merge the dataframes for each authors

In [91]:
#only run once
df = pd.merge(df, author1_data, on="index", how="left")
df = pd.merge(df, author2_data, on="index", how="left")
df = pd.merge(df, author3_data, on="index", how="left")
df = pd.merge(df, author4_data, on="index", how="left")
df = pd.merge(df, author5_data, on="index", how="left")
df = pd.merge(df, author6_data, on="index", how="left")

Drop the ```author_data``` column

In [92]:
df = df.drop(columns=['author_data'])

In [93]:
pd.json_normalize(df.publicationVenue)
# we only need the name, which already exists in the column 'venue', so we can drop this.

Unnamed: 0,id,name,alternate_names,issn,alternate_issns,url,type,alternate_urls
0,f0727d48-1e26-4ba1-9c7d-e7789167aa4e,HUMAN Review International Humanities Review /...,[HUM Rev Int Humanit Rev Rev Int Humanidades],2695-9623,,,,
1,e946ee38-09e2-4ebc-8f9f-3267777d7536,Multimodal Communication,[Multimodal Commun],2230-6579,[2230-6587],https://www.degruyter.com/view/j/mc,,
2,0aed7a40-85f3-4c66-9e1b-c1556c57001b,PLoS ONE,"[Plo ONE, PLOS ONE, PLO ONE]",1932-6203,,https://journals.plos.org/plosone/,journal,[http://www.plosone.org/]
3,534db092-7c47-4084-b861-b3d91dc6cfae,npj Primary Care Respiratory Medicine,[npj Prim Care Respir Med],2055-1010,,http://www.nature.com/npjpcrm/,journal,
4,ab17701b-8ac0-4766-93ca-78d73ce88cd1,Canadian Journal of Cardiology,[Can J Cardiol],0828-282X,,https://www.onlinecjc.ca/,journal,"[http://www.pulsus.com/CARDIOL/index.htm, http..."
...,...,...,...,...,...,...,...,...
1342,,,,,,,,
1343,7606e9e4-fbe1-4463-bc40-02193a462402,MEDICC Review,[MEDICC Rev],1527-3172,,http://ejournals.ebsco.com/direct.asp?JournalI...,journal,[http://www.scielosp.org/scielo.php?lng=en&pid...
1344,01b74bb2-5307-4c96-b3da-d484b80a9ff7,Sex Roles,[Sex Role],0360-0025,,http://www.kluweronline.com/issn/0360-0025/con...,journal,[http://www.springer.com/psychology/personalit...
1345,066498ad-2c28-4d1e-b049-383a0cd9925d,Sexually Transmitted Diseases,[Sex Transm Dis],0148-5717,,http://gateway.ovid.com/ovidweb.cgi?AN=0000743...,journal,[http://www.stdjournal.com/]


In [94]:
df = df.drop(columns=['publicationVenue'])


#### Discipline types
Now we have to drop the original authors variable as it is a dictionary and change the ```fieldsOfStudy``` to hashable form. 

Otherwise it gives ```TypeError: unhashable type: 'dict'``` error on performing drop_duplicates().

In [95]:
df.fieldsOfStudy

0                         None
1                   [Medicine]
2                   [Medicine]
3                   [Medicine]
4                   [Medicine]
                 ...          
1342                [Medicine]
1343    [Psychology, Medicine]
1344                      None
1345                [Medicine]
1346                       NaN
Name: fieldsOfStudy, Length: 1347, dtype: object

Check if any null value is present.

In [96]:
df.fieldsOfStudy.isna().sum() # Quite a lot.

284

Do the necessary changes to change the type.

In [97]:
fields = df.fieldsOfStudy.copy()

fields = fields.explode()
fields_index = fields.index.to_list()
fields_name = fields.to_list()
fields = [fields_index, fields_name]
fields = pd.DataFrame(fields).transpose()
fields.columns = ['index', 'name']

fields = fields.groupby(by='index')['name'].apply(lambda x:x.str.cat(sep=", ")).reset_index(drop=True)
fields = pd.DataFrame(fields)
fields['index'] = fields.index

fields

Unnamed: 0,name,index
0,,0
1,Medicine,1
2,Medicine,2
3,Medicine,3
4,Medicine,4
...,...,...
1342,Medicine,1342
1343,"Psychology, Medicine",1343
1344,,1344
1345,Medicine,1345


In [98]:
fields_type = fields['name'].to_list()
unique_fields = []
for i in fields_type:
    i = i.split(", ")
    for j in i:
        if j not in unique_fields:
            if j != '':
                unique_fields.append(j)
        else:
            continue

unique_fields

['Medicine',
 'Psychology',
 'Computer Science',
 'History',
 'Sociology',
 'Biology',
 'Art',
 'Political Science',
 'Economics',
 'Chemistry',
 'Geography',
 'Philosophy',
 'Mathematics',
 'Physics',
 'Business',
 'Geology',
 'Environmental Science']

Now we can merge this dataframe to the original one, and drop the fieldsOfStudy variable to replace with this one.

In [99]:
# drop the column 'fieldsOfStudy'
df = df.drop(columns=['fieldsOfStudy'])

# merge the above dataframe to this one
df = pd.merge(df, fields, on="index", how="left").rename(columns={'name': 'fieldsOfStudy'})

#### Remove duplicate columns

In [100]:
df['dc:creator'].iloc[0] #not important

{'author': [{'ce:given-name': 'Patricia',
   'preferred-name': {'ce:given-name': 'Patricia',
    'ce:initials': 'P.',
    'ce:surname': 'Fernández Rotaeche',
    'ce:indexed-name': 'Fernández Rotaeche P.'},
   '@seq': '1',
   'ce:initials': 'P.',
   '@_fa': 'true',
   'affiliation': {'@id': '60027856',
    '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60027856'},
   'ce:surname': 'Fernández Rotaeche',
   '@auid': '58032204600',
   'author-url': 'https://api.elsevier.com/content/author/author_id/58032204600',
   'ce:indexed-name': 'Fernandez Rotaeche P.'}]}

In [101]:
df[['dc:description', 'abstract']] # Remove the second one as it has some null values

Unnamed: 0,dc:description,abstract
0,"© GKA Ediciones, authors.This study aims to an...",El presente estudio tiene como objetivo analiz...
1,"© 2022 Walter de Gruyter GmbH, Berlin/Boston.S...","Abstract Since the COVID-19 pandemic began, ex..."
2,© 2022 Flores et al. This is an open access ar...,We estimate the prevalence and characteristics...
3,"© 2022, The Author(s).Sex (whether one is ‘mal...",
4,© 2022 Canadian Cardiovascular SocietyDefined ...,
...,...,...
1342,The current research draws from ambivalent sex...,
1343,"Cancer, particularly when it affects women and...",Gender identity is a sociocultural construct b...
1344,Gender identity is a sociocultural construct b...,
1345,This study examined the relationship between p...,Background: We conducted a study among HIV-pos...


In [102]:
df[['dc:title', 'title']] # Remove the second one as it has different langauge for some

Unnamed: 0,dc:title,title
0,Analysis of sexism and feminism in the future ...,Análisis del sexismo y feminismo en el futuro ...
1,Multimodal metaphors and sexism in Arabic cart...,Multimodal metaphors and sexism in Arabic cart...
2,Hate crimes against LGBT people: National Crim...,Hate crimes against LGBT people: National Crim...
3,Addressing sex and gender to improve asthma ma...,Addressing sex and gender to improve asthma ma...
4,Sex and Gender Bias as a Mechanistic Determina...,Sex and Gender Bias as a Mechanistic Determina...
...,...,...
1342,Benevolent Sexism at Work: Gender Differences ...,Meeting the emerging challenge of breast and c...
1343,Meeting the emerging challenge of breast and c...,The right to health care for transsexual peopl...
1344,The right to health care for transsexual peopl...,Supportive When Not Supported? Male Responses ...
1345,Supportive When Not Supported? Male Responses ...,Unsafe Sexual Behaviors Among HIV-Positive Men...


Based on that, remove these columns from the dataframe. Also remove the rows with no DOIs.

In [103]:
# drop these columns and change the names of the duplicated columns
df = df.drop(columns=['title', 'abstract', 'dc:creator']).rename(columns={'dc:title': 'title', 'dc:description': 'abstract'})
# 1598 rows (for 1st query)

# Drop rows with no DOIs
df = df.dropna(subset=['prism:doi']) 
# 1548 rows (for 1st query)

# Check of some values are duplicated. If so, remove them based on 'title' and 'abstract' column
df = df.drop_duplicates(subset=['title', 'abstract'], keep="first")
# 1548 rows -- no duplicates! (for 1st query)

Now we can store this in a csv file, so that we do not need to use the APIs everytime to fetch the data.

In [104]:
#df.to_csv('search_data.csv')
df.to_csv('search_data_1.csv')