In [2]:
import elsapy
from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch
import json

import httpx
import time
import requests
import textwrap

import pandas as pd
import numpy as np
import sys

import config

### Scopus Search API

In [3]:
# Specify the keys and tokens

api_key = config.elsevier_api_key
inst_token = config.elsevier_inst_token

In [None]:
headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }
    
# Actual query: 
# (misogyny OR sexism OR gender violence) AND (detection OR identification OR prediction OR classification)

# https://dev.elsevier.com/sc_search_tips.html
# URL encoding
#The Boolean search is submitted through the query string parameter 'query'. 
# As with all other query string parameters, the contents of the submitted search must be URL-encoded. 
# It should be noted that the '+' character serves a special purpose as a query string value, 
# functioning as an equivalent to the space character (i.e. %20). In order to submit a literal character '+' it must be properly URL-encoded (i.e. %2B).

# query = 'KEY%28misogyny%20OR%20sexism%20OR%20gender%20violence%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
# query = 'KEY%28misogyny%20OR%20sexism%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
query = 'misogyny%20detection%20OR%20misogyny%20identification%20OR%20misogyny%20prediction%20OR%20misogyny%20classification%20OR%20sexism%20detection%20OR%20sexism%20identification%20OR%20sexism%20prediction%20OR%20sexism%20classification'

# dc:description (Abstract) is not available in Standard type, and the Complete type is not accessible
fields = 'dc:identifier' # just need the scopus ID for the next step
# ,eid,dc:title,dc:creator,prism:aggregationType,subtype,subtypeDescription,citedby-count,prism:publicationName,prism:coverDate,prism:doi,pii,orcid,openaccess,affiliation,author
offset = 0
years = '2012-2023'

In [5]:
responses = requests.get(f'https://api.elsevier.com/content/search/scopus?query={query}&field={fields}&date={years}', headers=headers, stream = True)
responses

<Response [200]>

In [6]:
json = responses.json()
json = json['search-results']
json.keys()

dict_keys(['opensearch:totalResults', 'opensearch:startIndex', 'opensearch:itemsPerPage', 'opensearch:Query', 'link', 'entry'])

In [7]:
json['opensearch:totalResults']

'328'

In [8]:
json['opensearch:startIndex'], json['opensearch:itemsPerPage'], json['link']
# not that useful

('0',
 '25',
 [{'@_fa': 'true',
   '@ref': 'self',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=misogyny+detection+OR+misogyny+identification+OR+misogyny+prediction+OR+misogyny+classification+OR+sexism+detection+OR+sexism+identification+OR+sexism+prediction+OR+sexism+classification&field=dc:identifier&date=2012-2023',
   '@type': 'application/json'},
  {'@_fa': 'true',
   '@ref': 'first',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=0&count=25&query=misogyny+detection+OR+misogyny+identification+OR+misogyny+prediction+OR+misogyny+classification+OR+sexism+detection+OR+sexism+identification+OR+sexism+prediction+OR+sexism+classification&field=dc:identifier&date=2012-2023',
   '@type': 'application/json'},
  {'@_fa': 'true',
   '@ref': 'next',
   '@href': 'https://api.elsevier.com/content/search/scopus?start=25&count=25&query=misogyny+detection+OR+misogyny+identification+OR+misogyny+prediction+OR+misogyny+classification+OR+sexism+de

In [9]:
json['opensearch:Query']

{'@role': 'request',
 '@searchTerms': 'misogyny detection OR misogyny identification OR misogyny prediction OR misogyny classification OR sexism detection OR sexism identification OR sexism prediction OR sexism classification',
 '@startPage': '0'}

In [10]:
json['entry'][0]

{'@_fa': 'true',
 'prism:url': 'https://api.elsevier.com/content/abstract/scopus_id/85145576108',
 'dc:identifier': 'SCOPUS_ID:85145576108'}

Now let's use the credentials again to get ALL the results

In [11]:
headers = {
        'Accept': 'application/json',
        'X-ELS-APIKey': api_key,
        'X-ELS-Insttoken': inst_token
    }

#query = 'KEY%28misogyny%20OR%20sexism%20OR%20gender%20violence%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
# query = 'KEY%28misogyny%20OR%20sexism%29%20AND%20%28detection%20OR%20identification%20OR%20prediction%20OR%20classification%29'
#query = 'misogyny%20detection%20OR%20misogyny%20identification%20OR%20misogyny%20prediction%20OR%20misogyny%20classification%20OR%20sexism%20detection%20OR%20sexism%20identification%20OR%20sexism%20prediction%20OR%20sexism%20classification'
#query = 'sexism%20OR%20misogy'

#fields = 'dc:identifier,prism:doi'
#offset = 0
#years = '2012-2023'
#year = 2013

#scopus_search_url = 'https://api.elsevier.com/content/search/scopus?'

In [None]:
json_data1 = []

query = 'sexism' #'sexism%20OR%20misogy'
fields = 'dc:identifier,prism:doi'
year = 2013

scopus_search_url = 'https://api.elsevier.com/content/search/scopus?'
offset = 0

while True: # As long as the url exists
    # print('Requesting', url)
    # print('Offset', offset)
    # Make the request
    responses = requests.get(f'{scopus_search_url}query={query}&field={fields}&date={year}&start={offset}&view=STANDARD', headers=headers, stream = True)
    json = responses.json()
    if 'search-results' in json:
        json = json['search-results']
        # Check if any values are left
        if 'entry' in json : # The 'entry' key only exists if the request was successful, meaning values are left
            offset = offset + 25
            # If data found, add them to the variable, 
            # add them to the list and move to the next offset
            json_data1.extend(json['entry'])
            time.sleep(2)
        else:
            year += 1
            if year > 2022:
                print(f'Reached {year}, so ending search now')
                break
            else: 
                offset = 0
                responses = requests.get(f'{scopus_search_url}query={query}&field={fields}&date={year}&start={offset}&view=STANDARD', headers=headers, stream = True)
                json = responses.json()
                if 'search-results' in json:
                    json = json['search-results']
                    if 'entry' in json : # The 'entry' key only exists if the request was successful, meaning values are left
                        offset = offset + 25
                        json_data1.extend(json['entry'])
                        time.sleep(2)
                        print(f'Starting {year} now')
                    else:
                        print(f'Reached here')
                        continue
                else:
                    print(f'Reached the year {year}, but no search results')
                    continue
    else:
        print(f'Response 400 Bad request') # check the query for the request then
        break

In [None]:
# Create a dataframe out of it
df_int = pd.DataFrame(json_data1)
# Remove the papers which do not have a DOI
df_int = df_int[~df_int['prism:doi'].isna()]
# Remove duplicates, based on the DOIs
df_int = df_int.drop_duplicates(subset=['prism:doi']).reset_index(drop=True)
df_int

Unnamed: 0,@_fa,prism:url,dc:identifier,prism:doi
0,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84880475988,10.1093/acprof:oso/9780199892631.001.0001
1,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85123129625,10.4324/9780203143308-13
2,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85086983428,10.4324/9780203094020-9
3,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84918877738,10.4324/9780203143308
4,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:84882725830,10.1037/a0028437
...,...,...,...,...
185,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85120923782,10.1080/17539153.2021.2005099
186,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85113770445,10.1007/s10508-021-02100-x
187,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85106513940,10.1007/s12559-021-09862-5
188,true,https://api.elsevier.com/content/abstract/scop...,SCOPUS_ID:85102677351,10.1080/09589236.2021.1881461


Collecting only the scopus IDs so that it can be used for abstract retrieval.

In [27]:
scopus_ids = df_int['dc:identifier'].to_list()

In [28]:
df_int.columns

Index(['@_fa', 'prism:url', 'dc:identifier', 'prism:doi'], dtype='object')

### Abstract API
Use the previous information to retrieve the data from all scopus IDs.

In [29]:
json_data2 = []
i = 0

for id in scopus_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1
    # Make the request
    responses = requests.get(f'https://api.elsevier.com/content/abstract/scopus_id/{id}', headers=headers, stream = True)
    json = responses.json()
    if 'abstracts-retrieval-response' in json:
        json = json['abstracts-retrieval-response']
        # If data found, add them to the variable, 
        # add them to the list and move to the next offset
        json_data2.append(json['coredata'])
        time.sleep(3)
    else:
        continue

ID number 0: SCOPUS_ID:84891790307
ID number 1: SCOPUS_ID:84890960118
ID number 2: SCOPUS_ID:84890949474
ID number 3: SCOPUS_ID:84890926326
ID number 4: SCOPUS_ID:84890779280
ID number 5: SCOPUS_ID:84890729284
ID number 6: SCOPUS_ID:84891020454
ID number 7: SCOPUS_ID:84890618613
ID number 8: SCOPUS_ID:84890453114
ID number 9: SCOPUS_ID:84908664146
ID number 10: SCOPUS_ID:84890952198
ID number 11: SCOPUS_ID:84890418819
ID number 12: SCOPUS_ID:84890346350
ID number 13: SCOPUS_ID:85069326394
ID number 14: SCOPUS_ID:85066058738
ID number 15: SCOPUS_ID:85065460170
ID number 16: SCOPUS_ID:84890023104
ID number 17: SCOPUS_ID:84890011380
ID number 18: SCOPUS_ID:84890349143
ID number 19: SCOPUS_ID:84890339620
ID number 20: SCOPUS_ID:84889663603
ID number 21: SCOPUS_ID:84889054894
ID number 22: SCOPUS_ID:84889040917
ID number 23: SCOPUS_ID:84889039446
ID number 24: SCOPUS_ID:84889004816
ID number 25: SCOPUS_ID:84889048382
ID number 26: SCOPUS_ID:85069615715
ID number 27: SCOPUS_ID:85120164666
ID

ConnectionError: ('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))

In [None]:
df = pd.DataFrame(json_data2)
df['index'] = df.index
df

Unnamed: 0,srctype,eid,dc:description,prism:coverDate,prism:aggregationType,prism:url,subtypeDescription,dc:creator,link,prism:publicationName,...,article-number,dc:identifier,dc:publisher,prism:issueIdentifier,prism:pageRange,prism:endingPage,prism:startingPage,pubmed-id,prism:isbn,index
0,j,2-s2.0-85145576108,© 2022 Elsevier LtdThe spread of Hate Speech o...,2023-04-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Hiren', 'prefer...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Expert Systems with Applications,...,119342,SCOPUS_ID:85145576108,Elsevier Ltd,,,,,,,0
1,j,2-s2.0-85145658852,© 2022 Elsevier B.V.The enormous growth of soc...,2023-03-01,Journal,https://api.elsevier.com/content/abstract/scop...,Review,"{'author': [{'ce:given-name': 'Sneha', 'prefer...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Entertainment Computing,...,100544,SCOPUS_ID:85145658852,Elsevier B.V.,,,,,,,1
2,j,2-s2.0-85144423009,© 2022 Elsevier LtdDetecting suicidal tendenci...,2023-03-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Soumitra', 'pre...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Information Processing and Management,...,103234,SCOPUS_ID:85144423009,Elsevier Ltd,2,,,,,,2
3,j,2-s2.0-85112315917,"© 2021, The Author(s).Abusive language is an i...",2023-02-01,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Endang Wahyu', ...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Personal and Ubiquitous Computing,...,,SCOPUS_ID:85112315917,Springer Science and Business Media Deutschlan...,1,17-43,43,17,,,3
4,j,2-s2.0-85144391539,© 2022 Elsevier B.V.As the number of non-nativ...,2023-01-25,Journal,https://api.elsevier.com/content/abstract/scop...,Article,"{'author': [{'ce:given-name': 'Soumitra', 'pre...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Knowledge-Based Systems,...,110182,SCOPUS_ID:85144391539,Elsevier B.V.,,,,,,,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,b,2-s2.0-84901166940,Argues that South Africa's apartheid system of...,2012-12-01,Book,https://api.elsevier.com/content/abstract/scop...,Book,"{'author': [{'ce:given-name': 'Amanda Lock', '...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",Sex in Transition: Remaking Gender and Race in...,...,,SCOPUS_ID:84901166940,State University of New York Press,,1-329,329,1,,9781438444079,316
317,b,2-s2.0-84900958138,In Russia during the second half of the eighte...,2012-12-01,Book,https://api.elsevier.com/content/abstract/scop...,Book,"{'author': [{'ce:given-name': 'Anna', 'preferr...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",From the Womb to the Body Politic: Raising the...,...,,SCOPUS_ID:84900958138,University of Wisconsin Press,,1-228,228,1,,"[{'$': '029928994X'}, {'$': '9780299289942'}]",317
318,b,2-s2.0-84945564557,© 2012 - John Benjamins B.V.This comprehensive...,2012-09-11,Book,https://api.elsevier.com/content/abstract/scop...,Book,"{'author': [{'ce:given-name': 'Heiko', 'prefer...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...","An Interdisciplinary Bibliography on Language,...",...,,SCOPUS_ID:84945564557,John Benjamins Publishing Company,,1-294,294,1,,"[{'$': '9789027273154'}, {'$': '9789027212009'}]",318
319,b,2-s2.0-84872477278,"© 2012 by Oxford University Press, Inc. All ri...",2012-05-24,Book,https://api.elsevier.com/content/abstract/scop...,Book,"{'author': [{'ce:given-name': 'Lucy Valerie', ...","[{'@_fa': 'true', '@rel': 'self', '@href': 'ht...",State of Peril: Race and Rape in South African...,...,,SCOPUS_ID:84872477278,Oxford University Press,,1-272,272,1,,"[{'$': '9780199933327'}, {'$': '9780199796373'}]",319


In [22]:
df['dc:description'][1]
# Some of the abstracts have the copyright and author's name in the abstract. It needs to be removed.

'© 2022 Elsevier B.V.The enormous growth of social media provides a platform for displaying harmful, offensive online behaviour, which keeps increasing with time. The popularity of smartphones and the anonymity of the internet have made online offensive behaviour very common. Therefore, research on social media offensive behaviour has increased in recent years. In this paper, we have endeavoured to depict the variety of abusive behaviour one can encounter online and the significance of detecting them by classifying them into four categories: Content-Based, Sentiment and Emotion Based, User or Profile Based, and Network or Graph-Based approach. We review the state-of-the-art methods to detect bullies and abusive content on social media and discuss the factors that drive offenders to indulge in offensive activity, preventive actions to avoid online toxicity, and various cyber laws in different countries. Finally, we identify and discuss the future research directions that serve as a refe

In [23]:
df['dc:description'][0]

'© 2022 Elsevier LtdThe spread of Hate Speech on online platforms is a severe issue for societies and requires the identification of offensive content by platforms. Research has modeled Hate Speech recognition as a text classification problem that predicts the class of a message based on the text of the message only. However, context plays a huge role in communication. In particular, for short messages, the text of the preceding tweets can completely change the interpretation of a message within a discourse. This work extends previous efforts to classify Hate Speech by considering the current and previous tweets jointly. In particular, we introduce a clearly defined way of extracting context. We present the development of the first dataset for conversational-based Hate Speech classification with an approach for collecting context from long conversations for code-mixed Hindi (ICHCL dataset). Overall, our benchmark experiments show that the inclusion of context can improve classification

In [24]:
df['dc:description'][2]

"© 2022 Elsevier LtdDetecting suicidal tendencies and preventing suicides is an important social goal. The rise and continuance of emotion, the emotion category, and the intensity of the emotion are important clues about suicidal tendencies. The three determinants of emotion, viz. Valence, Arousal, and Dominance (VAD) can help determine a person's exact emotion(s) and its intensity. This paper introduces an end-to-end VAD-assisted transformer-based multi-task network for detecting emotion (primary task) and its intensity (auxiliary task) in suicide notes. As part of this research, we expand the utility of the emotion-annotated benchmark dataset of suicide notes, CEASE-v2.0, by annotating all its sentences with emotion intensity labels. Empirical results show that our multi-task method performs better than the corresponding single-task systems, with the best attained overall Mean Recall (MR) of 65.25% on the emotion task. On a similar task, we improved MR by 8.78% over the existing stat

In [25]:
df['prism:doi'].isna().sum() # Only a handful of them do not have a DOI

121

In [26]:
df.subtypeDescription.unique()

array(['Article', 'Review', 'Data Paper', 'Conference Paper', 'Book',
       'Book Chapter', 'Conference Review'], dtype=object)

In [27]:
df['prism:aggregationType'].unique()

array(['Journal', 'Conference Proceeding', 'Book', 'Book Series'],
      dtype=object)

In [28]:
df['dc:creator']

0      {'author': [{'ce:given-name': 'Hiren', 'prefer...
1      {'author': [{'ce:given-name': 'Sneha', 'prefer...
2      {'author': [{'ce:given-name': 'Soumitra', 'pre...
3      {'author': [{'ce:given-name': 'Endang Wahyu', ...
4      {'author': [{'ce:given-name': 'Soumitra', 'pre...
                             ...                        
316    {'author': [{'ce:given-name': 'Amanda Lock', '...
317    {'author': [{'ce:given-name': 'Anna', 'preferr...
318    {'author': [{'ce:given-name': 'Heiko', 'prefer...
319    {'author': [{'ce:given-name': 'Lucy Valerie', ...
320    {'author': [{'ce:given-name': 'Wendy', 'prefer...
Name: dc:creator, Length: 321, dtype: object

Make a list of the DOIs which would help in extracting the abstracts of the articles.

But first join the two dataframes, so that it could be used as a single dataframe.

In [29]:
df.columns

Index(['srctype', 'eid', 'dc:description', 'prism:coverDate',
       'prism:aggregationType', 'prism:url', 'subtypeDescription',
       'dc:creator', 'link', 'prism:publicationName', 'source-id', 'pii',
       'citedby-count', 'prism:volume', 'subtype', 'dc:title', 'openaccess',
       'openaccessFlag', 'prism:doi', 'prism:issn', 'article-number',
       'dc:identifier', 'dc:publisher', 'prism:issueIdentifier',
       'prism:pageRange', 'prism:endingPage', 'prism:startingPage',
       'pubmed-id', 'prism:isbn', 'index'],
      dtype='object')

In [30]:
# Only take the dois which do not have null values
doi_ids = df['prism:doi'][~df['prism:doi'].isna()].to_list() 
len(doi_ids)

200

In [31]:
df['dc:creator'][0]

{'author': [{'ce:given-name': 'Hiren',
   'preferred-name': {'ce:given-name': 'Hiren',
    'ce:initials': 'H.',
    'ce:surname': 'Madhu',
    'ce:indexed-name': 'Madhu H.'},
   '@seq': '1',
   'ce:initials': 'H.',
   '@_fa': 'true',
   'affiliation': [{'@id': '60014097',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60014097'},
    {'@id': '125175992',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/125175992'}],
   'ce:surname': 'Madhu',
   '@auid': '57222510587',
   'author-url': 'https://api.elsevier.com/content/author/author_id/57222510587',
   'ce:indexed-name': 'Madhu H.'}]}

In [32]:
df_creator = pd.json_normalize(df['dc:creator'])
df_creator['index'] = df_creator.index
df_creator['author'] = pd.json_normalize(df_creator['author'])
df_creator['author'][22]

{'ce:given-name': 'Ignacio',
 '@seq': '1',
 'ce:initials': 'I.',
 '@_fa': 'true',
 'ce:surname': 'Blanco-Alfonso',
 '@auid': '57214097368',
 'author-url': 'https://api.elsevier.com/content/author/author_id/57214097368',
 'ce:indexed-name': 'Blanco-Alfonso I.',
 'preferred-name.ce:given-name': 'Ignacio',
 'preferred-name.ce:initials': 'I.',
 'preferred-name.ce:surname': 'Blanco-Alfonso',
 'preferred-name.ce:indexed-name': 'Blanco-Alfonso I.',
 'affiliation.@id': '60014037',
 'affiliation.@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60014037'}

After trying the authors, it seems that this API only gives information about the first author. This is why Semantic Scholar API could be used alongside this to extract the information on the co-authors.

### Semantic API
Since the primary keys are different, use the DOIs of the manuscripts to search for them through the Semantic Scholar API.

In [33]:
semantic_api_key = config.semantic_api_key

headers = {
        'Content-type': 'application/json',
        'x-api-key': semantic_api_key
    }


#### Details about the paper based on Paper ID

In [34]:
fields = 'title,abstract,authors,venue,publicationVenue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr'

In [35]:
json_data3 = []
i = 0

for id in doi_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1

    # Make the request to get the paper ID which is needed for the next step
    responses = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{id}', headers=headers, stream = True)
    json = responses.json()
    # Check if 'paperId' is there in the json file
    if 'paperId' in json:
        paper_id = json['paperId']

        # make the second request to get the details of the paper 
        response = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}?fields={fields}', headers=headers, stream = True)
        json = response.json()
        json_data3.append(json)
        time.sleep(3)

    else: # if the request does not fetch anything
        continue
    

ID number 0: 10.1016/j.eswa.2022.119342
ID number 1: 10.1016/j.entcom.2022.100544
ID number 2: 10.1016/j.ipm.2022.103234
ID number 3: 10.1007/s00779-021-01609-1
ID number 4: 10.1016/j.knosys.2022.110182
ID number 5: 10.1016/j.ipm.2022.103118
ID number 6: 10.32604/cmc.2023.032993
ID number 7: 10.1080/00224499.2022.2103071
ID number 8: 10.1177/09579265221099380
ID number 9: 10.1177/08862605221084747
ID number 10: 10.1016/j.knosys.2022.109965
ID number 11: 10.1007/s13278-022-00993-7
ID number 12: 10.1007/s13278-022-00940-6
ID number 13: 10.1145/3555618
ID number 14: 10.1016/j.jjimei.2022.100119
ID number 15: 10.1145/3549498
ID number 16: 10.1371/journal.pone.0274317
ID number 17: 10.1016/j.dib.2022.108526
ID number 18: 10.1016/j.engappai.2022.105283
ID number 19: 10.1145/3551349.3559571
ID number 20: 10.1145/3551349.3559515
ID number 21: 10.23940/ijpe.22.09.p8.668678
ID number 22: 10.26441/RC21.2-2022-A2
ID number 23: 10.26342/2022-69-19
ID number 24: 10.26342/2022-69-20
ID number 25: 10.

In [36]:
df2 = pd.DataFrame(json_data3)
df2['index'] = df2.index
df2

Unnamed: 0,paperId,publicationVenue,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,index
0,b3f6c70f4dcebc26a5d27fcd15efebdf3ba9de0d,"{'id': '987139ae-a65d-49bb-aaf6-fb764dc40b19',...",Detecting offensive speech in conversational c...,,Expert systems with applications,2022.0,51,0,,,"[{'authorId': '2085214622', 'name': 'Hiren Mad...",0
1,52e723ec3c2200fa51004f585750e6a3f3841c79,"{'id': 'ec253390-208b-4979-9d5f-c9f654ad6181',...",Online offensive behaviour in socialmedia: Det...,,Entertainment Computing,2023.0,113,0,,,"[{'authorId': '2198249048', 'name': 'Sneha Chi...",1
2,e7c33ed5676b0a129ca4155641bb13aadf6feb90,,VAD-assisted multitask transformer framework f...,,Information Processing &amp; Management,2023.0,42,0,,,"[{'authorId': '1409869614', 'name': 'Soumitra ...",2
3,b23c72917922454a27b67902ecec4c3191021073,"{'id': '68fd8242-3be0-4b1e-8b5d-ad0a8a02db12',...",Towards multidomain and multilingual abusive l...,,Personal and Ubiquitous Computing,2021.0,161,0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'The current ...","[{'authorId': '9278845', 'name': 'Endang Wahyu...",3
4,72f1dcdfd73cfb56e62ca6b4396b49d344de94cf,"{'id': '12fff95b-d469-49a0-84a5-4fd4696c3f28',...",Multitasking of sentiment detection and emotio...,,Knowledge-Based Systems,2022.0,26,0,[Computer Science],,"[{'authorId': '1409869614', 'name': 'Soumitra ...",4
...,...,...,...,...,...,...,...,...,...,...,...,...
183,b2429f80820c7b06bee837568e5522e125023763,,Violence and war in culture and the media : fi...,1. Introduction: Violence and War in Culture a...,,2012.0,0,0,[Sociology],,"[{'authorId': '66825750', 'name': 'Athina Kara...",183
184,99ba6f0f45ca89f1edf7d1783512769f5d9ad26a,,"Associations Among Men's Sexist Attitudes, Obj...","The present study tested the hypothesis, deriv...",,2013.0,82,5,[Psychology],,"[{'authorId': '3298993', 'name': 'V. Swami'}, ...",184
185,12c4a1ae5abe1b90f7d01e88c7dd33338a21d76b,,"An Interdisciplinary Bibliography on Language,...","This comprehensive, state-of-the-art bibliogra...",,2012.0,0,0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'This compreh...","[{'authorId': '69405544', 'name': 'Heiko Motsc...",185
186,3070324ec5b75e3bb0b475792628f756fe562c2c,,State of Peril: Race and Rape in South African...,Preface Introduction 1. Danger and Desire: Rap...,,2012.0,0,2,[History],,"[{'authorId': '104954883', 'name': 'L. Graham'}]",186


In [37]:
df2.columns

Index(['paperId', 'publicationVenue', 'title', 'abstract', 'venue', 'year',
       'referenceCount', 'influentialCitationCount', 'fieldsOfStudy', 'tldr',
       'authors', 'index'],
      dtype='object')

#### Details about a paper's authors based on Paper ID

In [38]:
fields = 'affiliations,paperCount,citationCount,hIndex'

In [39]:
json_data4 = []
i = 0

for id in doi_ids: # As long as the url exists
    print(f'ID number {i}: {id}')
    i += 1

    # Make the request to get the paper ID which is needed for the next step
    responses = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{id}', headers=headers, stream = True)
    json = responses.json()
    # Check if 'paperId' is there in the json file
    if 'paperId' in json:
        paper_id = json['paperId']

        # make the second request to get the details of the authors 
        response = requests.get(f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}/authors?fields={fields}', headers=headers, stream = True)
        json = response.json()
        json_data4.append([id, json]) # Adding the doi in this one to help merge with dataframe 'df'

    else: # if the request does not fetch anything
        continue

ID number 0: 10.1016/j.eswa.2022.119342
ID number 1: 10.1016/j.entcom.2022.100544
ID number 2: 10.1016/j.ipm.2022.103234
ID number 3: 10.1007/s00779-021-01609-1
ID number 4: 10.1016/j.knosys.2022.110182
ID number 5: 10.1016/j.ipm.2022.103118
ID number 6: 10.32604/cmc.2023.032993
ID number 7: 10.1080/00224499.2022.2103071
ID number 8: 10.1177/09579265221099380
ID number 9: 10.1177/08862605221084747
ID number 10: 10.1016/j.knosys.2022.109965
ID number 11: 10.1007/s13278-022-00993-7
ID number 12: 10.1007/s13278-022-00940-6
ID number 13: 10.1145/3555618
ID number 14: 10.1016/j.jjimei.2022.100119
ID number 15: 10.1145/3549498
ID number 16: 10.1371/journal.pone.0274317
ID number 17: 10.1016/j.dib.2022.108526
ID number 18: 10.1016/j.engappai.2022.105283
ID number 19: 10.1145/3551349.3559571
ID number 20: 10.1145/3551349.3559515
ID number 21: 10.23940/ijpe.22.09.p8.668678
ID number 22: 10.26441/RC21.2-2022-A2
ID number 23: 10.26342/2022-69-19
ID number 24: 10.26342/2022-69-20
ID number 25: 10.

In [40]:
df3 = pd.DataFrame(json_data4).rename(columns={0: 'prism:doi', 1: 'author_data'})
df3['author_data'] = pd.json_normalize(df3['author_data']).data
df3['index'] = df3.index
df3

Unnamed: 0,prism:doi,author_data,index
0,10.1016/j.eswa.2022.119342,"[{'authorId': '2085214622', 'affiliations': []...",0
1,10.1016/j.entcom.2022.100544,"[{'authorId': '2198249048', 'affiliations': []...",1
2,10.1016/j.ipm.2022.103234,"[{'authorId': '1409869614', 'affiliations': ['...",2
3,10.1007/s00779-021-01609-1,"[{'authorId': '9278845', 'affiliations': [], '...",3
4,10.1016/j.knosys.2022.110182,"[{'authorId': '1409869614', 'affiliations': ['...",4
...,...,...,...
183,10.4324/9780203143308,"[{'authorId': '66825750', 'affiliations': [], ...",183
184,10.1037/a0028437,"[{'authorId': '3298993', 'affiliations': [], '...",184
185,10.1075/z.177,"[{'authorId': '69405544', 'affiliations': [], ...",185
186,10.1093/acprof:oso/9780199796373.001.0001,"[{'authorId': '104954883', 'affiliations': [],...",186


Merge the last two dataframes and remove the drop the columns we do not need.

In [42]:
df_2_3 = pd.merge(df2, df3, on="index", how="left").drop(columns=['paperId', 'index']) 
df_2_3

Unnamed: 0,publicationVenue,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,prism:doi,author_data
0,"{'id': '987139ae-a65d-49bb-aaf6-fb764dc40b19',...",Detecting offensive speech in conversational c...,,Expert systems with applications,2022.0,51,0,,,"[{'authorId': '2085214622', 'name': 'Hiren Mad...",10.1016/j.eswa.2022.119342,"[{'authorId': '2085214622', 'affiliations': []..."
1,"{'id': 'ec253390-208b-4979-9d5f-c9f654ad6181',...",Online offensive behaviour in socialmedia: Det...,,Entertainment Computing,2023.0,113,0,,,"[{'authorId': '2198249048', 'name': 'Sneha Chi...",10.1016/j.entcom.2022.100544,"[{'authorId': '2198249048', 'affiliations': []..."
2,,VAD-assisted multitask transformer framework f...,,Information Processing &amp; Management,2023.0,42,0,,,"[{'authorId': '1409869614', 'name': 'Soumitra ...",10.1016/j.ipm.2022.103234,"[{'authorId': '1409869614', 'affiliations': ['..."
3,"{'id': '68fd8242-3be0-4b1e-8b5d-ad0a8a02db12',...",Towards multidomain and multilingual abusive l...,,Personal and Ubiquitous Computing,2021.0,161,0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'The current ...","[{'authorId': '9278845', 'name': 'Endang Wahyu...",10.1007/s00779-021-01609-1,"[{'authorId': '9278845', 'affiliations': [], '..."
4,"{'id': '12fff95b-d469-49a0-84a5-4fd4696c3f28',...",Multitasking of sentiment detection and emotio...,,Knowledge-Based Systems,2022.0,26,0,[Computer Science],,"[{'authorId': '1409869614', 'name': 'Soumitra ...",10.1016/j.knosys.2022.110182,"[{'authorId': '1409869614', 'affiliations': ['..."
...,...,...,...,...,...,...,...,...,...,...,...,...
183,,Violence and war in culture and the media : fi...,1. Introduction: Violence and War in Culture a...,,2012.0,0,0,[Sociology],,"[{'authorId': '66825750', 'name': 'Athina Kara...",10.4324/9780203143308,"[{'authorId': '66825750', 'affiliations': [], ..."
184,,"Associations Among Men's Sexist Attitudes, Obj...","The present study tested the hypothesis, deriv...",,2013.0,82,5,[Psychology],,"[{'authorId': '3298993', 'name': 'V. Swami'}, ...",10.1037/a0028437,"[{'authorId': '3298993', 'affiliations': [], '..."
185,,"An Interdisciplinary Bibliography on Language,...","This comprehensive, state-of-the-art bibliogra...",,2012.0,0,0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'This compreh...","[{'authorId': '69405544', 'name': 'Heiko Motsc...",10.1075/z.177,"[{'authorId': '69405544', 'affiliations': [], ..."
186,,State of Peril: Race and Rape in South African...,Preface Introduction 1. Danger and Desire: Rap...,,2012.0,0,2,[History],,"[{'authorId': '104954883', 'name': 'L. Graham'}]",10.1093/acprof:oso/9780199796373.001.0001,"[{'authorId': '104954883', 'affiliations': [],..."


Now check with the df dataframe to remove unneccessary columns.

In [43]:
# choose the columns we can remove
df.srctype.unique() # Not important
df.eid.unique() # (Electronic ID) -- not important
df['dc:description'].unique() # (Abstracts) -- important
df['prism:url'].unique() # (Content Abstract Retrieval API URI) -- not important
df['prism:coverDate'].unique() # (Publication Date) -- important
df['prism:aggregationType'].unique() # (Source Type) -- important
df['subtypeDescription'].unique() # (Document Type description) -- important
df['link'] # not important
df['prism:publicationName'].unique() # (Source Title) -- important
df['source-id'].unique() # keep for later use
df['citedby-count'].unique() # (Cited-by Count) -- important
df['prism:volume'].unique() # (Volume) -- important
df['subtype'].unique() # (Document Type code) -- not important
df['dc:title'].unique() # (Article Title) -- important
df['openaccess'].unique() # (Open Access status) -- keep for later use
df['openaccessFlag'].unique() # keep for later use
df['prism:doi'] # (Document Object Identifier) -- keep to merge the datasets
df['prism:issn'].unique() # (Source identifier) -- not important
df['dc:identifier'] # (Scopus ID) -- not important
df['dc:publisher'] # keep for later use
df['prism:issueIdentifier'] # not important
df['article-number'] # not important
df['pubmed-id'] # not important
df['prism:pageRange'] # not important
df['prism:endingPage'] # not important
df['prism:startingPage'] # not important
df['pii'] # not important
df['prism:isbn'] # (Source identifier) -- not important


0                                                   NaN
1                                                   NaN
2                                                   NaN
3                                                   NaN
4                                                   NaN
                             ...                       
316                                       9781438444079
317       [{'$': '029928994X'}, {'$': '9780299289942'}]
318    [{'$': '9789027273154'}, {'$': '9789027212009'}]
319    [{'$': '9780199933327'}, {'$': '9780199796373'}]
320                                       9781843146063
Name: prism:isbn, Length: 321, dtype: object

In [44]:
df = df.drop(columns=[
    'srctype', 'eid', 'prism:url',
    'link', 'subtype', 'prism:issn', 'dc:identifier',
    'prism:issueIdentifier', 'article-number', 'pubmed-id', 'prism:pageRange',
    'prism:endingPage', 'prism:startingPage', 'pii','prism:isbn'
    ])

In [45]:
df.columns

Index(['dc:description', 'prism:coverDate', 'prism:aggregationType',
       'subtypeDescription', 'dc:creator', 'prism:publicationName',
       'source-id', 'citedby-count', 'prism:volume', 'dc:title', 'openaccess',
       'openaccessFlag', 'prism:doi', 'dc:publisher', 'index'],
      dtype='object')

In [46]:
# to check later if the dataframes are properly merged
df[df['prism:doi'].isna()].index

Int64Index([ 46,  47,  48,  49,  50,  51,  52,  55,  57,  60,
            ...
            281, 283, 284, 285, 286, 287, 301, 310, 316, 317],
           dtype='int64', length=121)

In [47]:
df = pd.merge(df, df_2_3, on='prism:doi', how='left')
df

Unnamed: 0,dc:description,prism:coverDate,prism:aggregationType,subtypeDescription,dc:creator,prism:publicationName,source-id,citedby-count,prism:volume,dc:title,...,title,abstract,venue,year,referenceCount,influentialCitationCount,fieldsOfStudy,tldr,authors,author_data
0,© 2022 Elsevier LtdThe spread of Hate Speech o...,2023-04-01,Journal,Article,"{'author': [{'ce:given-name': 'Hiren', 'prefer...",Expert Systems with Applications,24201,1,215,Detecting offensive speech in conversational c...,...,Detecting offensive speech in conversational c...,,Expert systems with applications,2022.0,51.0,0.0,,,"[{'authorId': '2085214622', 'name': 'Hiren Mad...","[{'authorId': '2085214622', 'affiliations': []..."
1,© 2022 Elsevier B.V.The enormous growth of soc...,2023-03-01,Journal,Review,"{'author': [{'ce:given-name': 'Sneha', 'prefer...",Entertainment Computing,19400158708,0,45,Online offensive behaviour in socialmedia: Det...,...,Online offensive behaviour in socialmedia: Det...,,Entertainment Computing,2023.0,113.0,0.0,,,"[{'authorId': '2198249048', 'name': 'Sneha Chi...","[{'authorId': '2198249048', 'affiliations': []..."
2,© 2022 Elsevier LtdDetecting suicidal tendenci...,2023-03-01,Journal,Article,"{'author': [{'ce:given-name': 'Soumitra', 'pre...",Information Processing and Management,12689,0,60,VAD-assisted multitask transformer framework f...,...,VAD-assisted multitask transformer framework f...,,Information Processing &amp; Management,2023.0,42.0,0.0,,,"[{'authorId': '1409869614', 'name': 'Soumitra ...","[{'authorId': '1409869614', 'affiliations': ['..."
3,"© 2021, The Author(s).Abusive language is an i...",2023-02-01,Journal,Article,"{'author': [{'ce:given-name': 'Endang Wahyu', ...",Personal and Ubiquitous Computing,22315,3,27,Towards multidomain and multilingual abusive l...,...,Towards multidomain and multilingual abusive l...,,Personal and Ubiquitous Computing,2021.0,161.0,0.0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'The current ...","[{'authorId': '9278845', 'name': 'Endang Wahyu...","[{'authorId': '9278845', 'affiliations': [], '..."
4,© 2022 Elsevier B.V.As the number of non-nativ...,2023-01-25,Journal,Article,"{'author': [{'ce:given-name': 'Soumitra', 'pre...",Knowledge-Based Systems,24772,0,260,Multitasking of sentiment detection and emotio...,...,Multitasking of sentiment detection and emotio...,,Knowledge-Based Systems,2022.0,26.0,0.0,[Computer Science],,"[{'authorId': '1409869614', 'name': 'Soumitra ...","[{'authorId': '1409869614', 'affiliations': ['..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Argues that South Africa's apartheid system of...,2012-12-01,Book,Book,"{'author': [{'ce:given-name': 'Amanda Lock', '...",Sex in Transition: Remaking Gender and Race in...,21100315756,35,,Sex in transition: Remaking gender and race in...,...,,,,,,,,,,
317,In Russia during the second half of the eighte...,2012-12-01,Book,Book,"{'author': [{'ce:given-name': 'Anna', 'preferr...",From the Womb to the Body Politic: Raising the...,21100314688,1,,From the womb to the body politic: Raising the...,...,,,,,,,,,,
318,© 2012 - John Benjamins B.V.This comprehensive...,2012-09-11,Book,Book,"{'author': [{'ce:given-name': 'Heiko', 'prefer...","An Interdisciplinary Bibliography on Language,...",21100420846,8,,"An interdisciplinary bibliography on language,...",...,"An Interdisciplinary Bibliography on Language,...","This comprehensive, state-of-the-art bibliogra...",,2012.0,0.0,0.0,[Computer Science],"{'model': 'tldr@v2.0.0', 'text': 'This compreh...","[{'authorId': '69405544', 'name': 'Heiko Motsc...","[{'authorId': '69405544', 'affiliations': [], ..."
319,"© 2012 by Oxford University Press, Inc. All ri...",2012-05-24,Book,Book,"{'author': [{'ce:given-name': 'Lucy Valerie', ...",State of Peril: Race and Rape in South African...,21100373304,46,,State of Peril: Race and Rape in South African...,...,State of Peril: Race and Rape in South African...,Preface Introduction 1. Danger and Desire: Rap...,,2012.0,0.0,2.0,[History],,"[{'authorId': '104954883', 'name': 'L. Graham'}]","[{'authorId': '104954883', 'affiliations': [],..."


Check with some of the indices as specified before. Let's say 137 and 255.

In [48]:
df.iloc[137]

dc:description              © 2021 Sociedad Española para el Procesamiento...
prism:coverDate                                                    2021-09-01
prism:aggregationType                                                 Journal
subtypeDescription                                                     Review
dc:creator                  {'author': [{'ce:given-name': 'Francisco', 'pr...
prism:publicationName                      Procesamiento del Lenguaje Natural
source-id                                                         21100195304
citedby-count                                                              47
prism:volume                                                               67
dc:title                    Overview of EXIST 2021: sEXism Identification ...
openaccess                                                               None
openaccessFlag                                                           None
prism:doi                                                 10.263

In [49]:
df.iloc[255]

dc:description              Copyright © 2020 by Annual Reviews.Racism. Sex...
prism:coverDate                                                    2019-04-01
prism:aggregationType                                             Book Series
subtypeDescription                                                     Review
dc:creator                  {'author': [{'ce:given-name': 'Nancy', 'prefer...
prism:publicationName                          Annual Review of Public Health
source-id                                                               19584
citedby-count                                                             124
prism:volume                                                               41
dc:title                    Measures of racism, sexism, heterosexism, and ...
openaccess                                                                  1
openaccessFlag                                                           true
prism:doi                            10.1146/annurev-publhealth-

Now check with two other random indices not from the previous list.

In [50]:
df.iloc[111]

dc:description              © 2022 Abdullah Y. Muaad et al.Social media ne...
prism:coverDate                                                    2022-01-01
prism:aggregationType                                                 Journal
subtypeDescription                                                    Article
dc:creator                  {'author': [{'ce:given-name': 'Abdullah Y.', '...
prism:publicationName             Computational Intelligence and Neuroscience
source-id                                                          7000153240
citedby-count                                                               6
prism:volume                                                             2022
dc:title                    Artificial Intelligence-Based Approach for Mis...
openaccess                                                                  1
openaccessFlag                                                           true
prism:doi                                                10.1155

In [51]:
df.iloc[500]

IndexError: single positional indexer is out-of-bounds

### Authors

In [52]:
authors = pd.json_normalize(df['authors'])
authors

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,94,95,96,97,98,99,100,101,102,103
0,"{'authorId': '2085214622', 'name': 'Hiren Madhu'}","{'authorId': '2057059204', 'name': 'Shrey Sata...","{'authorId': '33682998', 'name': 'Sandip J Mod...","{'authorId': '32416012', 'name': 'Thomas Mandl'}","{'authorId': '1911868', 'name': 'Prasenjit Maj...",,,,,,...,,,,,,,,,,
1,"{'authorId': '2198249048', 'name': 'Sneha Chin...","{'authorId': '2198254237', 'name': 'Roopa M.S.'}","{'authorId': '2198254207', 'name': 'Arunalatha...","{'authorId': '2093461647', 'name': 'Venugopal ...",,,,,,,...,,,,,,,,,,
2,"{'authorId': '1409869614', 'name': 'Soumitra G...","{'authorId': '1734904', 'name': 'Asif Ekbal'}","{'authorId': '145532184', 'name': 'P. Bhattach...",,,,,,,,...,,,,,,,,,,
3,"{'authorId': '9278845', 'name': 'Endang Wahyu ...","{'authorId': '3101511', 'name': 'Valerio Basile'}","{'authorId': '1787198', 'name': 'V. Patti'}",,,,,,,,...,,,,,,,,,,
4,"{'authorId': '1409869614', 'name': 'Soumitra G...","{'authorId': '2194830832', 'name': 'Amit Priya...","{'authorId': '1734904', 'name': 'Asif Ekbal'}","{'authorId': '145532184', 'name': 'P. Bhattach...",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,,,,,,,,,,,...,,,,,,,,,,
317,,,,,,,,,,,...,,,,,,,,,,
318,"{'authorId': '69405544', 'name': 'Heiko Motsch...",,,,,,,,,,...,,,,,,,,,,
319,"{'authorId': '104954883', 'name': 'L. Graham'}",,,,,,,,,,...,,,,,,,,,,


In [53]:
authors[0].isna().sum() # Interesting that 23 manuscripts do not have any first author

134

In [54]:
print("Number of null values in authors: \n First author: {first}\n Second author: {second} \n Third author: {third} \n Fourth author: {fourth}\n Fifth author: {fifth}\n Sixth author: {sixth}".format(first=authors[0].isna().sum(), second= authors[1].isna().sum(), third= authors[2].isna().sum(), fourth= authors[3].isna().sum(), fifth= authors[4].isna().sum(), sixth= authors[5].isna().sum()))

Number of null values in authors: 
 First author: 134
 Second author: 168 
 Third author: 213 
 Fourth author: 254
 Fifth author: 286
 Sixth author: 301


### Separate the authors' name and id from the dictionary

In [55]:
nested_authors1 = pd.json_normalize(authors[0]).rename(columns={'authorId':'author_1_id', 'name':'author_1_name'})
nested_authors1['index'] = nested_authors1.index
nested_authors1

Unnamed: 0,author_1_id,author_1_name,index
0,2085214622,Hiren Madhu,0
1,2198249048,Sneha Chinivar,1
2,1409869614,Soumitra Ghosh,2
3,9278845,Endang Wahyu Pamungkas,3
4,1409869614,Soumitra Ghosh,4
...,...,...,...
316,,,316
317,,,317
318,69405544,Heiko Motschenbacher,318
319,104954883,L. Graham,319


Let's check if the combinations are unique to each author. (Can also check with the sizes)

This result could be useful later as well, for analysis.

In [56]:
# to get unique combinations of the authors and names
nested_authors1_comb = nested_authors1.groupby(['author_1_id','author_1_name']).size().reset_index().rename(columns={0:'count'})
nested_authors1_comb = nested_authors1_comb.sort_values(by='count', ascending=False)
nested_authors1_comb

Unnamed: 0,author_1_id,author_1_name,count
21,1409250207,J. García-Díaz,4
48,1847803,E. Fersini,3
157,9278845,Endang Wahyu Pamungkas,3
6,115237513,Nahia Idoiaga Mondragón,3
62,2065405590,M. Fahim,3
...,...,...,...
57,2053441697,E. White,1
58,2054429931,Elizabeth Reid,1
60,2057733996,Julius Reimer,1
61,2061490804,Francimaria Rayanne dos Santos Nascimento,1


Now, use dictionary to map the IDs to the names for later use.

In [57]:
auth_dict_1 = nested_authors1_comb.set_index('author_1_id').to_dict()['author_1_name']
auth_dict_1

{'1409250207': 'J. García-Díaz',
 '1847803': 'E. Fersini',
 '9278845': 'Endang Wahyu Pamungkas',
 '115237513': 'Nahia Idoiaga Mondragón',
 '2065405590': 'M. Fahim',
 '2117276459': 'Sayma Sultana',
 '73456815': 'J. Ringrose',
 '1995519339': 'Pedro Orgeira-Crespo',
 '2057093464': 'Mansi Mahendru',
 '3455118': 'Flor Miriam Plaza del Arco',
 '3442296': 'Harika Abburi',
 '33621471': 'Paula Fortuna',
 '3298993': 'V. Swami',
 '1409869614': 'Soumitra Ghosh',
 '1403518310': 'F. Rodríguez‐Sánchez',
 '95925384': 'Habibe Karayigit',
 '116155021': 'Jemma Tosh',
 '74733212': 'J. Kwarteng',
 '7675229': 'Simona Frenda',
 '2196746400': 'Rizkyta Shainy Angeline',
 '2183083803': 'Muhammad Amien Ibrahim',
 '40866783': 'Sonja Erikainen',
 '4033643': 'Courtland S. Hyatt',
 '40066064': 'Jonah A. Berger',
 '40059516': 'Kaitlynn Mendes',
 '38598921': 'Ezat Ahmadzadeh',
 '3751771': 'Tammi Arford',
 '37415113': 'Jennifer D. Rubin',
 '36715403': 'Sarah Masud',
 '36328737': 'Pulkit Parikh',
 '2188122986': 'Antonio

In [58]:
# Following the same technique from the previous author 1 set till authro 6

# Author 2
nested_authors2 = pd.json_normalize(authors[1]).rename(columns={'authorId':'author_2_id', 'name':'author_2_name'})
nested_authors2['index'] = nested_authors2.index
nested_authors2_comb = nested_authors2.groupby(['author_2_id','author_2_name']).size().reset_index().rename(columns={0:'count'})
nested_authors2_comb = nested_authors2_comb.sort_values(by='count', ascending=False)
auth_dict_2 = nested_authors2_comb.set_index('author_2_id').to_dict()['author_2_name']

# Author 3
nested_authors3 = pd.json_normalize(authors[2]).rename(columns={'authorId':'author_3_id', 'name':'author_3_name'})
nested_authors3['index'] = nested_authors3.index
nested_authors3_comb = nested_authors3.groupby(['author_3_id','author_3_name']).size().reset_index().rename(columns={0:'count'})
nested_authors3_comb = nested_authors3_comb.sort_values(by='count', ascending=False)
auth_dict_3 = nested_authors3_comb.set_index('author_3_id').to_dict()['author_3_name']

# Author 4
nested_authors4 = pd.json_normalize(authors[3]).rename(columns={'authorId':'author_4_id', 'name':'author_4_name'})
nested_authors4['index'] = nested_authors4.index
nested_authors4_comb = nested_authors4.groupby(['author_4_id','author_4_name']).size().reset_index().rename(columns={0:'count'})
nested_authors4_comb = nested_authors4_comb.sort_values(by='count', ascending=False)
auth_dict_4 = nested_authors4_comb.set_index('author_4_id').to_dict()['author_4_name']

# Author 5
nested_authors5 = pd.json_normalize(authors[4]).rename(columns={'authorId':'author_5_id', 'name':'author_5_name'})
nested_authors5['index'] = nested_authors5.index
nested_authors5_comb = nested_authors5.groupby(['author_5_id','author_5_name']).size().reset_index().rename(columns={0:'count'})
nested_authors5_comb = nested_authors5_comb.sort_values(by='count', ascending=False)
auth_dict_5 = nested_authors5_comb.set_index('author_5_id').to_dict()['author_5_name']

# Author 6
nested_authors6 = pd.json_normalize(authors[5]).rename(columns={'authorId':'author_6_id', 'name':'author_6_name'})
nested_authors6['index'] = nested_authors6.index
nested_authors6_comb = nested_authors6.groupby(['author_6_id','author_6_name']).size().reset_index().rename(columns={0:'count'})
nested_authors6_comb = nested_authors6_comb.sort_values(by='count', ascending=False)
auth_dict_6 = nested_authors6_comb.set_index('author_6_id').to_dict()['author_6_name']

#### Merge the dataframes for the authors

In [59]:
#only run once
df = pd.merge(df, nested_authors1, on="index", how="left")
df = pd.merge(df, nested_authors2, on="index", how="left")
df = pd.merge(df, nested_authors3, on="index", how="left")
df = pd.merge(df, nested_authors4, on="index", how="left")
df = pd.merge(df, nested_authors5, on="index", how="left")
df = pd.merge(df, nested_authors6, on="index", how="left")

Now drop the ```authors``` column.

In [60]:
df = df.drop(columns=['authors'])

### Author's data

In [61]:
authors_data = pd.json_normalize(df['author_data'])
authors_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,"{'authorId': '2085214622', 'affiliations': [],...","{'authorId': '2057059204', 'affiliations': [],...","{'authorId': '33682998', 'affiliations': ['LDR...","{'authorId': '32416012', 'affiliations': ['Uni...","{'authorId': '1911868', 'affiliations': [], 'p...",,,,,,...,,,,,,,,,,
1,"{'authorId': '2198249048', 'affiliations': [],...","{'authorId': '2198254237', 'affiliations': [],...","{'authorId': '2198254207', 'affiliations': [],...","{'authorId': '2093461647', 'affiliations': [],...",,,,,,,...,,,,,,,,,,
2,"{'authorId': '1409869614', 'affiliations': ['I...","{'authorId': '1734904', 'affiliations': [], 'p...","{'authorId': '145532184', 'affiliations': [], ...",,,,,,,,...,,,,,,,,,,
3,"{'authorId': '9278845', 'affiliations': [], 'p...","{'authorId': '3101511', 'affiliations': ['Univ...","{'authorId': '1787198', 'affiliations': [], 'p...",,,,,,,,...,,,,,,,,,,
4,"{'authorId': '1409869614', 'affiliations': ['I...","{'authorId': '2194830832', 'affiliations': [],...","{'authorId': '1734904', 'affiliations': [], 'p...","{'authorId': '145532184', 'affiliations': [], ...",,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,,,,,,,,,,,...,,,,,,,,,,
317,,,,,,,,,,,...,,,,,,,,,,
318,"{'authorId': '69405544', 'affiliations': [], '...",,,,,,,,,,...,,,,,,,,,,
319,"{'authorId': '104954883', 'affiliations': [], ...",,,,,,,,,,...,,,,,,,,,,


In [62]:
author1_data = pd.json_normalize(authors_data[0])
author1_data['index'] = author1_data.index
author1_data

Unnamed: 0,authorId,affiliations,paperCount,citationCount,hIndex,index
0,2085214622,[],5.0,141.0,3.0,0
1,2198249048,[],1.0,0.0,0.0,1
2,1409869614,"[Indian Institute of Technology Patna, India]",24.0,69.0,4.0,2
3,9278845,[],19.0,334.0,11.0,3
4,1409869614,"[Indian Institute of Technology Patna, India]",24.0,69.0,4.0,4
...,...,...,...,...,...,...
316,,,,,,316
317,,,,,,317
318,69405544,[],100.0,1000.0,15.0,318
319,104954883,[],42.0,235.0,7.0,319


The affiliation column looks empty for most of them. Check to see how many are not null.

In [63]:
author1_data.affiliations.value_counts()
# not that useful to keep it then

[]                                                     177
[Indian Institute of Technology Patna, India]            2
[National University of Ireland Galway]                  1
[University of Wolverhampton]                            1
[Télécom SudParis, Institut Polytechnique de Paris]      1
[Wroclaw University of Technology]                       1
[Georgia Institute of Technology]                        1
[queen mary university of london]                        1
[Hasso Plattner Institute, University of Potsdam]        1
Name: affiliations, dtype: int64

In [64]:
# Change the names to avoid duplication of values while merging
author1_data = author1_data.rename(columns={
    'paperCount': 'author_1_paperCount', 'citationCount': 'author_1_citationCount', 'hIndex': 'author_1_hIndex'
})
# Drop the unneccessary columns, including the authorId, as it is already there from the previous dataframe
author1_data = author1_data.drop(columns=['authorId', 'affiliations'])
author1_data


Unnamed: 0,author_1_paperCount,author_1_citationCount,author_1_hIndex,index
0,5.0,141.0,3.0,0
1,1.0,0.0,0.0,1
2,24.0,69.0,4.0,2
3,19.0,334.0,11.0,3
4,24.0,69.0,4.0,4
...,...,...,...,...
316,,,,316
317,,,,317
318,100.0,1000.0,15.0,318
319,42.0,235.0,7.0,319


Now for the authors from 2 to 6.

In [65]:
# Author 2
author2_data = pd.json_normalize(authors_data[1]).rename(columns={
    'paperCount': 'author_2_paperCount', 'citationCount': 'author_2_citationCount', 'hIndex': 'author_2_hIndex'
})
author2_data['index'] = author2_data.index
author2_data = author2_data.drop(columns=['authorId', 'affiliations'])

# Author 3
author3_data = pd.json_normalize(authors_data[2]).rename(columns={
    'paperCount': 'author_3_paperCount', 'citationCount': 'author_3_citationCount', 'hIndex': 'author_3_hIndex'
})
author3_data['index'] = author3_data.index
author3_data = author3_data.drop(columns=['authorId', 'affiliations'])

# Author 4
author4_data = pd.json_normalize(authors_data[3]).rename(columns={
    'paperCount': 'author_4_paperCount', 'citationCount': 'author_4_citationCount', 'hIndex': 'author_4_hIndex'
})
author4_data['index'] = author4_data.index
author4_data = author4_data.drop(columns=['authorId', 'affiliations'])

# Author 5
author5_data = pd.json_normalize(authors_data[4]).rename(columns={
    'paperCount': 'author_5_paperCount', 'citationCount': 'author_5_citationCount', 'hIndex': 'author_5_hIndex'
})
author5_data['index'] = author5_data.index
author5_data = author5_data.drop(columns=['authorId', 'affiliations'])

# Author 6
author6_data = pd.json_normalize(authors_data[5]).rename(columns={
    'paperCount': 'author_6_paperCount', 'citationCount': 'author_6_citationCount', 'hIndex': 'author_6_hIndex'
})
author6_data['index'] = author6_data.index
author6_data = author6_data.drop(columns=['authorId', 'affiliations'])

#### Merge the dataframes for each authors

In [66]:
#only run once
df = pd.merge(df, author1_data, on="index", how="left")
df = pd.merge(df, author2_data, on="index", how="left")
df = pd.merge(df, author3_data, on="index", how="left")
df = pd.merge(df, author4_data, on="index", how="left")
df = pd.merge(df, author5_data, on="index", how="left")
df = pd.merge(df, author6_data, on="index", how="left")

Drop the ```author_data``` column

In [67]:
df = df.drop(columns=['author_data'])

In [68]:
pd.json_normalize(df.publicationVenue)
# we only need the name, which already exists in the column 'venue', so we can drop this.

Unnamed: 0,id,name,type,alternate_names,issn,url,alternate_urls,alternate_issns
0,987139ae-a65d-49bb-aaf6-fb764dc40b19,Expert systems with applications,journal,"[Expert syst appl, Expert Systems With Applica...",0957-4174,https://www.journals.elsevier.com/expert-syste...,[https://www.sciencedirect.com/journal/expert-...,
1,ec253390-208b-4979-9d5f-c9f654ad6181,Entertainment Computing,journal,[Entertain Comput],1875-9521,https://www.journals.elsevier.com/entertainmen...,[http://www.sciencedirect.com/science/journal/...,
2,,,,,,,,
3,68fd8242-3be0-4b1e-8b5d-ad0a8a02db12,Personal and Ubiquitous Computing,journal,[Pers Ubiquitous Comput],1617-4909,http://www.springer.com/computer/hci/journal/779,[https://link.springer.com/journal/volumesAndI...,
4,12fff95b-d469-49a0-84a5-4fd4696c3f28,Knowledge-Based Systems,journal,"[Knowl Based Syst, Knowledge Based Systems, Kn...",0950-7051,http://www.elsevier.com/wps/find/journaldescri...,[https://www.journals.elsevier.com/knowledge-b...,
...,...,...,...,...,...,...,...,...
316,,,,,,,,
317,,,,,,,,
318,,,,,,,,
319,,,,,,,,


In [69]:
df = df.drop(columns=['publicationVenue'])


#### Discipline types
Now we have to drop the original authors variable as it is a dictionary and change the ```fieldsOfStudy``` to hashable form. 

Otherwise it gives ```TypeError: unhashable type: 'dict'``` error on performing drop_duplicates().

In [70]:
df.fieldsOfStudy

0                    None
1                    None
2                    None
3      [Computer Science]
4      [Computer Science]
              ...        
316                   NaN
317                   NaN
318    [Computer Science]
319             [History]
320           [Sociology]
Name: fieldsOfStudy, Length: 321, dtype: object

Check if any null value is present.

In [71]:
df.fieldsOfStudy.isna().sum() # Quite a lot.

175

Do the necessary changes to change the type.

In [72]:
fields = df.fieldsOfStudy.copy()

fields = fields.explode()
fields_index = fields.index.to_list()
fields_name = fields.to_list()
fields = [fields_index, fields_name]
fields = pd.DataFrame(fields).transpose()
fields.columns = ['index', 'name']

fields = fields.groupby(by='index')['name'].apply(lambda x:x.str.cat(sep=", ")).reset_index(drop=True)
fields = pd.DataFrame(fields)
fields['index'] = fields.index

fields

Unnamed: 0,name,index
0,,0
1,,1
2,,2
3,Computer Science,3
4,Computer Science,4
...,...,...
316,,316
317,,317
318,Computer Science,318
319,History,319


In [73]:
fields_type = fields['name'].to_list()
unique_fields = []
for i in fields_type:
    i = i.split(", ")
    for j in i:
        if j not in unique_fields:
            if j != '':
                unique_fields.append(j)
        else:
            continue

unique_fields

['Computer Science',
 'Medicine',
 'History',
 'Sociology',
 'Psychology',
 'Political Science',
 'Art',
 'Business',
 'Economics',
 'Geography',
 'Biology',
 'Engineering']

Now we can merge this dataframe to the original one, and drop the fieldsOfStudy variable to replace with this one.

In [74]:
# drop the column 'fieldsOfStudy'
df = df.drop(columns=['fieldsOfStudy'])

# merge the above dataframe to this one
df = pd.merge(df, fields, on="index", how="left").rename(columns={'name': 'fieldsOfStudy'})

#### Remove duplicate columns

In [75]:
df['dc:creator'].iloc[0] #not important

{'author': [{'ce:given-name': 'Hiren',
   'preferred-name': {'ce:given-name': 'Hiren',
    'ce:initials': 'H.',
    'ce:surname': 'Madhu',
    'ce:indexed-name': 'Madhu H.'},
   '@seq': '1',
   'ce:initials': 'H.',
   '@_fa': 'true',
   'affiliation': [{'@id': '60014097',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/60014097'},
    {'@id': '125175992',
     '@href': 'https://api.elsevier.com/content/affiliation/affiliation_id/125175992'}],
   'ce:surname': 'Madhu',
   '@auid': '57222510587',
   'author-url': 'https://api.elsevier.com/content/author/author_id/57222510587',
   'ce:indexed-name': 'Madhu H.'}]}

In [76]:
df[['dc:description', 'abstract']] # Remove the second one as it has some null values

Unnamed: 0,dc:description,abstract
0,© 2022 Elsevier LtdThe spread of Hate Speech o...,
1,© 2022 Elsevier B.V.The enormous growth of soc...,
2,© 2022 Elsevier LtdDetecting suicidal tendenci...,
3,"© 2021, The Author(s).Abusive language is an i...",
4,© 2022 Elsevier B.V.As the number of non-nativ...,
...,...,...
316,Argues that South Africa's apartheid system of...,
317,In Russia during the second half of the eighte...,
318,© 2012 - John Benjamins B.V.This comprehensive...,"This comprehensive, state-of-the-art bibliogra..."
319,"© 2012 by Oxford University Press, Inc. All ri...",Preface Introduction 1. Danger and Desire: Rap...


In [77]:
df[['dc:title', 'title']] # Remove the second one as it has different langauge for some

Unnamed: 0,dc:title,title
0,Detecting offensive speech in conversational c...,Detecting offensive speech in conversational c...
1,Online offensive behaviour in socialmedia: Det...,Online offensive behaviour in socialmedia: Det...
2,VAD-assisted multitask transformer framework f...,VAD-assisted multitask transformer framework f...
3,Towards multidomain and multilingual abusive l...,Towards multidomain and multilingual abusive l...
4,Multitasking of sentiment detection and emotio...,Multitasking of sentiment detection and emotio...
...,...,...
316,Sex in transition: Remaking gender and race in...,
317,From the womb to the body politic: Raising the...,
318,"An interdisciplinary bibliography on language,...","An Interdisciplinary Bibliography on Language,..."
319,State of Peril: Race and Rape in South African...,State of Peril: Race and Rape in South African...


Based on that, remove these columns from the dataframe. Also remove the rows with no DOIs.

In [78]:
# drop these columns and change the names of the duplicated columns
df = df.drop(columns=['title', 'abstract', 'dc:creator']).rename(columns={'dc:title': 'title', 'dc:description': 'abstract'})
# 1598 rows (for 1st query)

# Drop rows with no DOIs
df = df.dropna(subset=['prism:doi']) 
# 1548 rows (for 1st query)

# Check of some values are duplicated. If so, remove them based on 'title' and 'abstract' column
df = df.drop_duplicates(subset=['title', 'abstract'], keep="first")
# 1548 rows -- no duplicates! (for 1st query)

Now we can store this in a csv file, so that we do not need to use the APIs everytime to fetch the data.

In [79]:
#df.to_csv('search_data.csv')
#df.to_csv('search_data_1.csv')
#df.to_csv('search_data_2.csv')
df.to_csv('search_data_3.csv')