In [None]:
SCRIPT PURPOSE

Obtain consensus literature from Scopus

In [None]:
import os
import sys
from collections import Counter

import pandas as pd
from elsapy.elsclient import ElsClient
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elssearch import ElsSearch
from tqdm import tqdm

In [None]:
## Initialize client
# The API key is read from the ELSEVIER_API_KEY environment variable so that a
# real key never has to be typed into (and saved with) the notebook. The
# original 'xxxx' placeholder is kept as the fallback when the variable is unset.
client = ElsClient(os.environ.get('ELSEVIER_API_KEY', 'xxxx'))

In [None]:
# DOIs of the seed articles; each one is looked up in Scopus in the next cell.
doi_list = [
    '10.1016/j.poetic.2007.01.001',
    '10.1177/0003122410388488',
    '10.15195/v3.a45',
    '10.1086/701298',
    '10.1086/704370',
    '10.1162/qss_a_00017',
    '10.1146/annurev-soc-121919-054621',
    '10.1140/epjds/s13688-016-0066-4',
    '10.1177/2378023117738903',
    '10.1371/journal.pone.0066212',
    '10.1038/s41562-017-0079',
    '10.1073/pnas.1613580114',
    '10.1038/nclimate2875',
    '10.1073/pnas.1509433112',
    '10.15195/v2.a9',
    '10.1016/j.ssresearch.2010.10.002',
    '10.1093/sf/sov004',
    '10.1177/0003122415601618',
    '10.1073/pnas.1509757112',
    '10.1073/pnas.1800485115',
    '10.1038/s41597-019-0033-6',
    '10.1177/2329496514540131',
    '10.1126/science.1240474',
    '10.1016/j.socnet.2015.02.006',
    '10.15195/v1.a15',
    '10.1126/science.1136099',
    '10.1126/science.1201765',
    '10.1177/0003122412463574',
    '10.1016/j.ssresearch.2015.06.008',
    # Methods
    '10.1016/j.respol.2014.02.005',
    '10.15195/v3.a32',
    '10.1073/pnas.1719792115',
    '10.1162/tacl_a_00028',
    '10.1016/j.joi.2018.09.002',
    '10.1162/qss_a_00019',
    '10.18653/v1/2020.acl-main.447',
    '10.1007/s11192-020-03690-4',
]

In [None]:
# Run one Scopus search per seed DOI and pool all hits into `lit_list`.
lit_list = []
for doi in doi_list:
    doi_search = ElsSearch(f'doi({doi})', 'scopus')
    doi_search.execute(client, get_all=True)  # get_all: retrieve all results of the query.
    lit_list.extend(doi_search.results)

# Drop result entries that carry an 'error' key, then show how many remain.
lit_list = [hit for hit in lit_list if 'error' not in hit]
len(lit_list)

In [None]:
# Accumulator for the full abstract records of the seed articles.
# It is initialised in its own cell; the fetch loop in the next cell skips ids
# that are already present in this list.
seed_ref_full=[]

In [None]:
# Fetch the full Scopus abstract record for every seed article found above.
# Records already present in `seed_ref_full` are skipped, so this cell can be
# re-run without requesting the same document twice.
#
# The ids retrieved so far are kept in a set that is built once, instead of
# rebuilding a list of them on every iteration (which made the loop quadratic).
retrieved_scp_ids = {
    rec['coredata']['dc:identifier'].split(':')[1] for rec in seed_ref_full
}
for scp in tqdm([s['dc:identifier'] for s in lit_list]):
    scp = scp.split(':')[1]  # Keep the second ':'-separated field of the identifier.
    if scp in retrieved_scp_ids:
        continue  # Already retrieved.
    scp_doc = AbsDoc(scp_id=scp)
    if scp_doc.read(client):  # Successful = True; otherwise = False.
        seed_ref_full.append(scp_doc.data)
        retrieved_scp_ids.add(scp_doc.data['coredata']['dc:identifier'].split(':')[1])
    # Unsuccessful reads are skipped without recording anything.

In [None]:
# Turn the collected seed records into a DataFrame (one row per record) and
# persist it as a bz2-compressed pickle for the analysis section.
df_seed_ref_full=pd.DataFrame(seed_ref_full)
df_seed_ref_full.to_pickle('../data/df_seed_ref_full.pkl.bz2', compression='bz2')

In [None]:
# For each seed article, run a Scopus `refeid(<eid>)` search and pool the hits
# into `citing_ref_list`.
eid_list = [hit['eid'] for hit in lit_list]
citing_ref_list = []
for eid in tqdm(eid_list):
    citing_search = ElsSearch(f'refeid({eid})', 'scopus')
    citing_search.execute(client, get_all=True)
    citing_ref_list.extend(citing_search.results)

In [None]:
# Tabulate the pooled `refeid` search hits (one row per hit) and display them.
df_citing_ref_list=pd.DataFrame(citing_ref_list)
df_citing_ref_list

In [None]:
# Save the search hits as an intermediary file; the next section reloads it
# before fetching the detailed records.
df_citing_ref_list.to_pickle('../data/intermediary/df_citing_ref_list.pkl.bz2', compression='bz2')

In [None]:
Get detailed information (including citations) for the citing references

In [None]:
## Initialize client, change API key.
# This stage uses its own key. It is read from ELSEVIER_API_KEY_2, falling back
# to ELSEVIER_API_KEY and finally to the original 'xxxx' placeholder, so that a
# real key never has to be saved with the notebook.
client = ElsClient(
    os.environ.get('ELSEVIER_API_KEY_2', os.environ.get('ELSEVIER_API_KEY', 'xxxx'))
)

In [None]:
# Reload the saved search hits and start an empty accumulator for the detailed
# records; the fetch loop in the next cell appends to `citing_ref_full`.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load files you produced yourself.
df_citing_ref_list=pd.read_pickle('../data/intermediary/df_citing_ref_list.pkl.bz2', compression='bz2')
citing_ref_full=[]

In [None]:
# Fetch the full Scopus abstract record for every citing document.
# Records already present in `citing_ref_full` are skipped, so duplicates in the
# search hits are requested only once and this cell can be re-run safely.
#
# The ids retrieved so far are kept in a set that is built once, instead of
# rebuilding a list of them on every iteration (which made the loop quadratic).
retrieved_citing_ids = {
    rec['coredata']['dc:identifier'].split(':')[1] for rec in citing_ref_full
}
for scp in tqdm(df_citing_ref_list['dc:identifier']):
    scp = scp.split(':')[1]  # Keep the second ':'-separated field of the identifier.
    if scp in retrieved_citing_ids:
        continue  # Already retrieved.
    scp_doc = AbsDoc(scp_id=scp)
    if scp_doc.read(client):  # Successful = True; otherwise = False.
        citing_ref_full.append(scp_doc.data)
        retrieved_citing_ids.add(scp_doc.data['coredata']['dc:identifier'].split(':')[1])
    # Unsuccessful reads are skipped without recording anything.

In [None]:
# Turn the collected citing-document records into a DataFrame and persist it as
# a bz2-compressed pickle for the analysis section.
df_citing_ref_full=pd.DataFrame(citing_ref_full)
df_citing_ref_full.to_pickle('../data/df_citing_ref_full.pkl.bz2', compression='bz2')

In [None]:
ANALYZE CORPUS

In [None]:
# Reload both saved record tables so the analysis can start from disk without
# repeating the API calls above.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load files you produced yourself.
df_seed_ref_full=pd.read_pickle('../data/df_seed_ref_full.pkl.bz2')
df_citing_ref_full=pd.read_pickle('../data/df_citing_ref_full.pkl.bz2')

In [None]:
# Stack seed and citing records into one corpus with a fresh 0..n-1 index.
# NOTE(review): rows are not de-duplicated here, so a document present in both
# frames appears twice and its references would be counted twice when the edge
# list is built below. Confirm whether that is intended.
df_consensus_corpus=pd.concat([df_seed_ref_full, df_citing_ref_full], axis=0, ignore_index=True)

In [None]:
# Spot-check: display the corpus row with index label 3.
df_consensus_corpus.loc[3]

In [None]:
import networkx as nx

In [None]:
def _cited_sgr_id(itemid):
    """Return the cited document's id from one reference's ``itemid`` entry.

    Args:
        itemid: either a single dict, whose ``'$'`` value is returned, or a list
            of dicts, in which case the ``'$'`` value of the first entry whose
            ``'@idtype'`` equals ``'SGR'`` is returned.

    Raises:
        TypeError: if ``itemid`` is neither a dict nor a list.
        ValueError: if ``itemid`` is a list without any ``'SGR'`` entry.
    """
    if isinstance(itemid, dict):  # Has one id.
        return itemid['$']
    if isinstance(itemid, list):  # Has multiple ids.
        for entry in itemid:
            if entry['@idtype'] == 'SGR':
                return entry['$']
        raise ValueError(f'no SGR id among reference ids: {itemid!r}')
    raise TypeError(f'unexpected itemid type: {type(itemid).__name__}')


# Build the citation edge list: one (citing id, cited id) tuple per reference
# of every document in the corpus.
edge_list = []
for coredata, item in zip(df_consensus_corpus['coredata'], df_consensus_corpus['item']):
    ego_id = coredata['dc:identifier'].split(':')[1]
    ref_list = item['bibrecord']['tail']['bibliography']['reference']
    if isinstance(ref_list, dict):
        # A document with a single reference stores it as a bare dict; wrap it
        # so both cases share the same code path.
        ref_list = [ref_list]
    elif not isinstance(ref_list, list):
        # The original bare `raise` had no active exception to re-raise; raise
        # an explicit error that says what was wrong instead.
        raise TypeError(
            f'unexpected reference container for document {ego_id}: '
            f'{type(ref_list).__name__}'
        )
    for ref in ref_list:
        cited_id = _cited_sgr_id(ref['ref-info']['refd-itemidlist']['itemid'])
        edge_list.append((ego_id, cited_id))

In [None]:
# Collapse repeated (citing, cited) pairs into single directed edges whose
# 'weight' attribute is the number of times the pair occurs in `edge_list`.
edge_list_count = Counter(edge_list)
g = nx.DiGraph()
for (citing_id, cited_id), n_occurrences in tqdm(edge_list_count.items()):
    g.add_edge(citing_id, cited_id, weight=n_occurrences)

In [None]:
Create an SGR-to-title lookup table.

In [None]:
# Add seed article.
df_sgr_title=pd.DataFrame([(s['dc:identifier'].split(':')[1], s['dc:title']) for s in df_consensus_corpus['coredata']]).drop_duplicates().rename(columns={0:'sgr', 1:'title'})
df_sgr_title.sample(3)

In [None]:
# Collect (SGR id, reference text) for every reference cited in the corpus.
#
# The ids and texts are accumulated in plain lists and the DataFrame is built
# once at the end; calling pd.concat inside the loop re-copied all previously
# collected rows on every iteration.
ref_scp_id_all = []
ref_title_all = []
for item in tqdm(df_consensus_corpus['item']):
    ref_list = item['bibrecord']['tail']['bibliography']['reference']
    if isinstance(ref_list, dict):
        # A document with a single reference stores it as a bare dict; wrap it
        # so both cases share the same code path.
        ref_list = [ref_list]
    elif not isinstance(ref_list, list):
        # The original bare `raise` had no active exception to re-raise; raise
        # an explicit error that says what was wrong instead.
        raise TypeError(f'unexpected reference container: {type(ref_list).__name__}')
    for ref in ref_list:
        ref_id = ref['ref-info']['refd-itemidlist']['itemid']
        if isinstance(ref_id, dict):  # Has one id.
            sgr_id = ref_id['$']
        elif isinstance(ref_id, list):  # Has multiple ids: keep the first 'SGR' one.
            sgr_candidates = [t['$'] for t in ref_id if t['@idtype'] == 'SGR']
            if not sgr_candidates:
                raise ValueError(f'no SGR id among reference ids: {ref_id!r}')
            sgr_id = sgr_candidates[0]
        else:
            raise TypeError(f'unexpected itemid type: {type(ref_id).__name__}')
        ref_scp_id_all.append(sgr_id)
        ref_title_all.append(ref['ref-fulltext'])

# Same layout as before: SGR ids form the index, the reference text is column 0.
df_citation_sgr_title = pd.DataFrame(ref_title_all, index=ref_scp_id_all)

In [None]:
# Move the SGR ids out of the index into an 'sgr' column and name the text
# column 'title', matching the layout of `df_sgr_title`.
df_citation_sgr_title = (
    df_citation_sgr_title
    .reset_index()
    .rename(columns={'index': 'sgr', 0: 'title'})
)
df_citation_sgr_title.sample(3)

In [None]:
# Display the reference-level SGR/title table.
df_citation_sgr_title

In [None]:
# Append the reference-level rows below the corpus-level rows and renumber the
# index, so corpus documents come first.
df_sgr_title = pd.concat(
    [df_sgr_title, df_citation_sgr_title],
    axis=0,
    ignore_index=True,
)

In [None]:
# Display the combined SGR/title table (still containing repeated SGR ids).
df_sgr_title

In [None]:
# Keep only the first row seen for each SGR id. Corpus rows were placed first,
# so a document's own title wins over the reference text when both exist.
df_sgr_title = df_sgr_title.drop_duplicates(subset='sgr')
df_sgr_title

In [None]:
# Index the lookup table by SGR id so titles can be fetched by node id.
df_sgr_title = df_sgr_title.set_index('sgr')

In [None]:
# Attach a 'title' attribute to every node, looked up by its SGR id.
title_by_sgr = df_sgr_title['title']
for node_id, attrs in tqdm(g.nodes(data=True)):
    attrs['title'] = title_by_sgr.loc[node_id]

In [None]:
# Spot-check: show the first five edges together with their attribute dicts.
list(g.edges(data=True))[0:5]

In [None]:
import community as cm

In [None]:
# Louvain community detection on the undirected version of the citation graph.
# The algorithm is randomised, so a fixed seed is passed to make the cluster
# labels reproducible across kernel restarts.
LOUVAIN_SEED = 42
cluster_num = cm.best_partition(g.to_undirected(), random_state=LOUVAIN_SEED)

In [None]:
# Store each node's community label as its 'cluster' attribute.
for node_id, attrs in tqdm(g.nodes(data=True)):
    attrs['cluster'] = cluster_num[node_id]

In [None]:
# Betweenness centrality of every node in the directed citation graph.
# NOTE(review): networkx treats `weight` as an edge distance when it computes
# shortest paths, whereas the 'weight' attribute here is the occurrence count
# of a (citing, cited) pair. Confirm that higher counts should mean longer distances.
btw_cent_dict=nx.betweenness_centrality(g, weight='weight')

In [None]:
# Store each node's betweenness centrality as its 'btw_cent' attribute.
# (A stray trailing 'v' on the assignment line made this cell a SyntaxError.)
for node in tqdm(g.nodes()):
    g.nodes[node]['btw_cent']=btw_cent_dict[node]

In [None]:
# Degree centrality, stored per node under 'deg_cent'.
deg_cent_dict = nx.degree_centrality(g)
for node_id, attrs in tqdm(g.nodes(data=True)):
    attrs['deg_cent'] = deg_cent_dict[node_id]

In [None]:
# Closeness centrality, stored per node under 'cls_cent'.
cls_cent_dict = nx.closeness_centrality(g)
for node_id, attrs in tqdm(g.nodes(data=True)):
    attrs['cls_cent'] = cls_cent_dict[node_id]

In [None]:
# Flatten the graph into a table: one row per node, one column per node
# attribute, plus a 'node' column holding the node id.
node_records = pd.DataFrame(g.nodes(data=True))  # column 0: node id, column 1: attribute dict
df_node_attri = (
    pd.json_normalize(node_records[1])
    .merge(node_records[0], left_index=True, right_index=True)
    .rename(columns={0: 'node'})
)

In [None]:
# Export the node attribute table as an Excel workbook.
df_node_attri.to_excel('../output/df_node_attri.xlsx')