In [35]:
import json
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
import pickle as pkl
import numpy as np
import re
import requests

In [2]:
# Read the first metadata shard; every line of the file holds one JSON document.
with open('../20200705v1/full/metadata/metadata_0.jsonl') as f:
    data = f.readlines()

# Decode each line and assemble the resulting dicts into a DataFrame without
# keeping the intermediate list of decoded records around afterwards.
df = pd.DataFrame.from_records([json.loads(line) for line in data])

In [3]:
# List the columns available in the loaded metadata frame
df.columns

Index(['paper_id', 'title', 'authors', 'abstract', 'year', 'arxiv_id',
       'acl_id', 'pmc_id', 'pubmed_id', 'doi', 'venue', 'journal', 'mag_id',
       'mag_field_of_study', 'outbound_citations', 'inbound_citations',
       'has_outbound_citations', 'has_inbound_citations', 'has_pdf_parse',
       's2_url', 'has_pdf_body_text', 'has_pdf_parsed_abstract',
       'has_pdf_parsed_body_text', 'has_pdf_parsed_bib_entries',
       'has_pdf_parsed_ref_entries'],
      dtype='object')

In [4]:
# Keep only the papers that carry at least one identifier (DOI, PubMed id or
# arXiv id) that can be used to look them up on Altmetric
altmetric_id_columns = ['doi', 'pubmed_id', 'arxiv_id']
df = df.loc[df[altmetric_id_columns].notna().any(axis='columns')]

In [5]:
# Papers with a missing or empty field-of-study list get the placeholder label 'None'
df['mag_field_of_study'] = df['mag_field_of_study'].apply(lambda fields: fields or ['None'])

In [None]:
# Stratified down-sampling: draw the same fraction of papers from every
# distinct combination of fields of study.
# NOTE(review): this comment used to say "20%" while the code samples 13%;
# the code is kept as the source of truth — confirm the intended fraction.
SAMPLE_FRAC = .13   # fraction of rows kept within each combination
SAMPLE_SEED = 42    # fixed seed so the sample is reproducible

mlb = MultiLabelBinarizer(sparse_output=True)

col = df['mag_field_of_study']
# One indicator column per field of study, stored sparsely and aligned on df's index
one_hot_fields = pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(col),
    index=col.index,
    columns=mlb.classes_
)
field_columns = one_hot_fields.columns.tolist()
df = pd.concat([df, one_hot_fields], axis='columns')

# Rows with identical indicator values form one group, i.e. one combination of fields
df = (
    df.groupby(field_columns)
      .apply(lambda group: group.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED))
      .reset_index(drop=True)
)
# The indicator columns were only needed as grouping keys
df = df.drop(columns=field_columns)
df

In [7]:
# Write the sampled papers to disk (the row index is not saved)
df.to_csv('../data/sampled.csv', index=False)

In [8]:
# Accumulator for the Altmetric lookups; the request loop appends one entry per row of df
responses = []

In [9]:
# Query the Altmetric API once per sampled paper. The first usable identifier
# wins, in the order DOI -> PubMed id -> arXiv id. Papers without a usable
# identifier, and requests that fail at the network level, are stored as None
# so that `responses` stays aligned position-by-position with the rows of `df`.
ALTMETRIC_ID_ENDPOINTS = (
    ('doi', 'doi'),
    ('pubmed_id', 'pmid'),
    ('arxiv_id', 'arxiv'),
)
REQUEST_TIMEOUT = 30  # seconds; without a timeout a single stalled connection blocks the loop forever

responses = []  # start from scratch so re-running this cell cannot append duplicates
with requests.Session() as session:  # one session reuses connections across all requests
    # iterrows() has no length, so pass the total explicitly for a real progress bar
    for _, row in tqdm(df.iterrows(), total=len(df)):
        url = None
        for column, endpoint in ALTMETRIC_ID_ENDPOINTS:
            identifier = row[column]
            # pd.notna guards against NaN, which is truthy and would otherwise
            # be sent to the API as the literal identifier "nan"
            if pd.notna(identifier) and identifier:
                url = f"https://api.altmetric.com/v1/{endpoint}/{identifier}"
                break

        response = None
        if url is not None:
            try:
                response = session.get(url, timeout=REQUEST_TIMEOUT)
            except requests.RequestException:
                # Keep a None placeholder instead of aborting the whole run;
                # consumers must treat None like "not found".
                response = None
        responses.append(response)

0it [00:00, ?it/s]

In [10]:
# Number of lookups that produced a response with HTTP status 200
sum(1 for resp in responses if resp is not None and resp.status_code == 200)

17598

In [11]:
# Pickle the raw response objects to disk
with open('../data/responses.pkl', 'wb') as f:
    pkl.dump(responses, f)

In [13]:
# Total number of collected entries (successful or not)
len(responses)

113701

In [15]:
# HTTP status code of the first collected response
responses[0].status_code

404

In [16]:
# Boolean mask, aligned with `responses`, flagging the lookups that returned
# HTTP 200. `responses` can contain None placeholders (no request was made for
# that paper), so check for None before reading status_code — the same guard
# that is used when counting the successful responses.
response_found = [resp is not None and resp.status_code == 200 for resp in responses]

In [21]:
# Single-column frame holding only the responses flagged in `response_found`
found_responses = [resp for resp, found in zip(responses, response_found) if found]
response_col = pd.DataFrame({'response': found_responses})

In [27]:
# Papers whose lookup succeeded, renumbered 0..n-1 so they line up with response_col
matched_papers = df.loc[response_found].reset_index(drop=True)
# Attach each paper's response object as an extra 'response' column
df_with_altmetric = pd.concat([matched_papers, response_col], axis=1)

In [28]:
# Display the matched papers together with their response objects
df_with_altmetric

Unnamed: 0,paper_id,title,authors,abstract,year,arxiv_id,acl_id,pmc_id,pubmed_id,doi,...,has_outbound_citations,has_inbound_citations,has_pdf_parse,s2_url,has_pdf_body_text,has_pdf_parsed_abstract,has_pdf_parsed_body_text,has_pdf_parsed_bib_entries,has_pdf_parsed_ref_entries,response
0,144357407,Lessons from the Kosovo Refugee Crisis: Innova...,"[{'first': 'Michael', 'middle': [], 'last': 'B...",The Kosovo refugee crisis that developed in th...,2001.0,,,,,10.1093/jrs/14.2.95,...,False,True,False,https://api.semanticscholar.org/CorpusID:14435...,,,,,,<Response [200]>
1,143940920,Data sharing across biobanks: Epistemic values...,"[{'first': 'Ipek', 'middle': [], 'last': 'Demi...",Despite the centrality of epistemic issues in ...,2013.0,,,,,10.1080/14636778.2013.846582,...,False,True,False,https://api.semanticscholar.org/CorpusID:14394...,,,,,,<Response [200]>
2,146664846,The Rutgers School: A Zerubavelian Culturalist...,"[{'first': 'Wayne', 'middle': ['H.'], 'last': ...","In this article, the Zerubavelian culturalist ...",2007.0,,,,,10.1177/1368431007080705,...,False,True,False,https://api.semanticscholar.org/CorpusID:14666...,,,,,,<Response [200]>
3,144417433,Critique of Self-Actualization Theory,"[{'first': 'ABRAHAM H.', 'middle': [], 'last':...",Edward Hoffman's (Editor) Note. It is ironic t...,1991.0,,,,,10.1002/j.2164-4683.1991.tb00010.x,...,False,True,False,https://api.semanticscholar.org/CorpusID:14441...,,,,,,<Response [200]>
4,151956231,On Conception of Security in Israel,"[{'first': 'Shuzo', 'middle': [], 'last': 'Kim...",,1979.0,,,,,10.11375/kokusaiseiji1957.63_55,...,False,False,False,https://api.semanticscholar.org/CorpusID:15195...,,,,,,<Response [200]>
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17593,144408832,From Text to Work: Digital Tools and the Emerg...,"[{'first': 'Jerome', 'middle': [], 'last': 'Mc...",The essay is a study of how critical editions ...,2006.0,,,,,10.7202/013153ar,...,False,True,False,https://api.semanticscholar.org/CorpusID:14440...,,,,,,<Response [200]>
17594,155826827,United States and Brazil: the first talks betw...,"[{'first': 'Virgílio', 'middle': ['Caixeta'], ...",United States and Brazil are historical allies...,2015.0,,,,,10.20889/M47e16029,...,False,False,False,https://api.semanticscholar.org/CorpusID:15582...,,,,,,<Response [200]>
17595,170438829,The Buddha of Christendom: A Review of The Leg...,"[{'first': 'Philip', 'middle': ['C.'], 'last':...","Through the Manichaeans, the Islamic world, an...",1987.0,,,,,10.1017/S0034412500018941,...,False,True,False,https://api.semanticscholar.org/CorpusID:17043...,,,,,,<Response [200]>
17596,11620579,Advances in 3D echocardiography: From foetus t...,"[{'first': 'Philippe', 'middle': [], 'last': '...",,2016.0,,,,26711543,10.1016/j.acvd.2015.09.004,...,True,True,True,https://api.semanticscholar.org/CorpusID:11620579,True,True,True,True,True,<Response [200]>


In [30]:
# Pick one matched response (row label 1) to explore the payload structure
test_response = df_with_altmetric.loc[1, 'response']

In [41]:
# 'history' entry of the sample payload, decoded from the response body
json.loads(test_response.content)['history']

{'1y': 0,
 '6m': 0,
 '3m': 0,
 '1m': 0,
 '1w': 0,
 '6d': 0,
 '5d': 0,
 '4d': 0,
 '3d': 0,
 '2d': 0,
 '1d': 0,
 'at': 4.5}

In [40]:
# Top-level keys present in the sample payload
json.loads(test_response.content).keys()

dict_keys(['title', 'doi', 'isbns', 'altmetric_jid', 'issns', 'journal', 'cohorts', 'context', 'authors', 'type', 'handles', 'altmetric_id', 'schema', 'is_oa', 'publisher_subjects', 'cited_by_posts_count', 'cited_by_tweeters_count', 'cited_by_accounts_count', 'last_updated', 'score', 'history', 'url', 'added_on', 'published_on', 'scopus_subjects', 'readers', 'readers_count', 'images', 'details_url'])

In [58]:
# 'history' entry of the sample payload (same expression as an earlier cell)
json.loads(test_response.content)['history']

{'1y': 0,
 '6m': 0,
 '3m': 0,
 '1m': 0,
 '1w': 0,
 '6d': 0,
 '5d': 0,
 '4d': 0,
 '3d': 0,
 '2d': 0,
 '1d': 0,
 'at': 4.5}

In [82]:
def get_flattened_history(response):
    """Flatten the 'history' mapping of a response body into prefixed columns.

    The body (`response.content`) is decoded as JSON. Every key `k` of its
    'history' object becomes the key `history_<k>` of the result, and every
    value is wrapped in a one-element list, which is the form `pd.DataFrame`
    accepts for building a one-row frame. Key order is preserved.

    Raises:
        KeyError: if the decoded body has no 'history' entry.
    """
    payload = json.loads(response.content)
    return {f'history_{period}': [value] for period, value in payload['history'].items()}

In [70]:
# Top-level keys of the response payload that extract_relevant_fields copies
# into the output table.
# NOTE(review): 'cited_by_policies_count' does not appear among the keys of the
# sample payload inspected in this notebook; extract_relevant_fields falls back
# to 0 for keys that are absent.
relevant_response_fields = [
    'cited_by_posts_count',
    'cited_by_tweeters_count',
    'cited_by_policies_count',
    'readers_count',
    'score',
]

In [78]:
def extract_relevant_fields(response, fields=None):
    """Pull selected top-level fields out of a JSON response body.

    Args:
        response: object whose `content` attribute holds the JSON body.
        fields: iterable of key names to extract. When omitted, the
            notebook-level `relevant_response_fields` list is used, which
            keeps existing single-argument calls working unchanged.

    Returns:
        dict mapping each requested key to a one-element list containing the
        value found in the body, or 0 when the key is absent. The one-element
        lists are the form `pd.DataFrame` accepts for building a one-row frame.
    """
    if fields is None:
        fields = relevant_response_fields
    payload = json.loads(response.content)
    return {key: [payload.get(key, 0)] for key in fields}

In [83]:
# Extract the selected fields for every matched paper (one dict per paper).
relevant_fields = df_with_altmetric['response'].apply(extract_relevant_fields).to_list()
# extract_relevant_fields wraps each value in a one-element list. Unwrap the
# values and build the frame in a single call, rather than creating one
# single-row DataFrame per paper and concatenating thousands of them (slow, and
# pd.concat raises on an empty list). The index is 0..n-1, in paper order.
relevant_fields_df = pd.DataFrame(
    [{name: values[0] for name, values in fields.items()} for fields in relevant_fields],
    index=np.arange(len(relevant_fields)),
)

In [85]:
# Flatten the 'history' entry of every matched paper (one dict per paper).
history_fields = df_with_altmetric['response'].apply(get_flattened_history).to_list()
# get_flattened_history wraps each value in a one-element list. Unwrap the
# values and build the frame in a single call, rather than creating one
# single-row DataFrame per paper and concatenating thousands of them (slow, and
# pd.concat raises on an empty list). The index is 0..n-1, in paper order.
history_fields_df = pd.DataFrame(
    [{name: values[0] for name, values in history.items()} for history in history_fields],
    index=np.arange(len(history_fields)),
)

In [97]:
# Append the extracted Altmetric columns to the matched papers. The three
# frames share the same 0..n-1 index, so the concat lines rows up by position.
altmetric_columns = pd.concat([relevant_fields_df, history_fields_df], axis='columns')
df_with_altmetric = pd.concat(
    [
        # This cell reassigns its own input: drop columns left behind by an
        # earlier run so that re-running it does not duplicate them.
        df_with_altmetric.drop(columns=altmetric_columns.columns, errors='ignore'),
        altmetric_columns,
    ],
    axis='columns',
)

In [98]:
# Display the papers with the extracted Altmetric columns appended
df_with_altmetric

Unnamed: 0,paper_id,title,authors,abstract,year,arxiv_id,acl_id,pmc_id,pubmed_id,doi,...,history_3m,history_1m,history_1w,history_6d,history_5d,history_4d,history_3d,history_2d,history_1d,history_at
0,144357407,Lessons from the Kosovo Refugee Crisis: Innova...,"[{'first': 'Michael', 'middle': [], 'last': 'B...",The Kosovo refugee crisis that developed in th...,2001.0,,,,,10.1093/jrs/14.2.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.000
1,143940920,Data sharing across biobanks: Epistemic values...,"[{'first': 'Ipek', 'middle': [], 'last': 'Demi...",Despite the centrality of epistemic issues in ...,2013.0,,,,,10.1080/14636778.2013.846582,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.500
2,146664846,The Rutgers School: A Zerubavelian Culturalist...,"[{'first': 'Wayne', 'middle': ['H.'], 'last': ...","In this article, the Zerubavelian culturalist ...",2007.0,,,,,10.1177/1368431007080705,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.850
3,144417433,Critique of Self-Actualization Theory,"[{'first': 'ABRAHAM H.', 'middle': [], 'last':...",Edward Hoffman's (Editor) Note. It is ironic t...,1991.0,,,,,10.1002/j.2164-4683.1991.tb00010.x,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.076
4,151956231,On Conception of Security in Israel,"[{'first': 'Shuzo', 'middle': [], 'last': 'Kim...",,1979.0,,,,,10.11375/kokusaiseiji1957.63_55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17593,144408832,From Text to Work: Digital Tools and the Emerg...,"[{'first': 'Jerome', 'middle': [], 'last': 'Mc...",The essay is a study of how critical editions ...,2006.0,,,,,10.7202/013153ar,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.376
17594,155826827,United States and Brazil: the first talks betw...,"[{'first': 'Virgílio', 'middle': ['Caixeta'], ...",United States and Brazil are historical allies...,2015.0,,,,,10.20889/M47e16029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.500
17595,170438829,The Buddha of Christendom: A Review of The Leg...,"[{'first': 'Philip', 'middle': ['C.'], 'last':...","Through the Manichaeans, the Islamic world, an...",1987.0,,,,,10.1017/S0034412500018941,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.000
17596,11620579,Advances in 3D echocardiography: From foetus t...,"[{'first': 'Philippe', 'middle': [], 'last': '...",,2016.0,,,,26711543,10.1016/j.acvd.2015.09.004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.250


In [100]:
# Write the enriched table to disk (the row index is not saved).
# NOTE(review): the 'response' column holds response objects, so the CSV will
# presumably contain only their text representation (e.g. "<Response [200]>")
# — consider dropping that column before exporting.
df_with_altmetric.to_csv('../data/data_with_altmetric.csv', index=False)

In [102]:
# Column list of the enriched table
df_with_altmetric.columns

Index(['paper_id', 'title', 'authors', 'abstract', 'year', 'arxiv_id',
       'acl_id', 'pmc_id', 'pubmed_id', 'doi', 'venue', 'journal', 'mag_id',
       'mag_field_of_study', 'outbound_citations', 'inbound_citations',
       'has_outbound_citations', 'has_inbound_citations', 'has_pdf_parse',
       's2_url', 'has_pdf_body_text', 'has_pdf_parsed_abstract',
       'has_pdf_parsed_body_text', 'has_pdf_parsed_bib_entries',
       'has_pdf_parsed_ref_entries', 'response', 'cited_by_posts_count',
       'cited_by_tweeters_count', 'cited_by_policies_count', 'readers_count',
       'score', 'history_1y', 'history_6m', 'history_3m', 'history_1m',
       'history_1w', 'history_6d', 'history_5d', 'history_4d', 'history_3d',
       'history_2d', 'history_1d', 'history_at'],
      dtype='object')