# Data extraction
parameters:
- dataset: 2023_02_01
- fetch_api: "https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp"


In [13]:


import os
from pathlib import Path
from IPython.display import display
import requests
import orjson


## PMIDS




In [14]:
# Read the negative / positive PMID samples (one PMID per line).
NEG_FP = '../data/20230201/publication_negative_set.tsv'
POS_FP = '../data/20230201/publication_positive_set.tsv'


def read_pmids(path):
    """Return the PMIDs listed in ``path``, one per line.

    Surrounding whitespace is stripped and blank lines are skipped, so a
    trailing newline or an empty row never ends up as an empty id in the
    comma-joined list sent to the fetch API.

    Args:
        path: Location of a text file holding one PMID per line.

    Returns:
        A list of PMID strings, in file order.
    """
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f.read().splitlines())
        return [pmid for pmid in stripped if pmid]


neg_set = read_pmids(NEG_FP)
print(f'#neg pmids: {len(neg_set)}')

pos_set = read_pmids(POS_FP)
print(f'#pos pmids: {len(pos_set)}')

#neg pmids: 645
#pos pmids: 22720


## SIBiLS

### negatives

In [18]:
URL_API = "https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp"
NEG_JSON_FP = '../data/20230201/SIBiLS_v2/publication_negative_set.json'

# Fetch every negative PMID in a single request.
params = {"ids": ",".join(neg_set)}
# The timeout (seconds) keeps the cell from hanging forever if the server stalls.
r = requests.post(url=URL_API, params=params, timeout=600)
# Fail loudly on an HTTP error instead of saving an error page as if it were JSON.
r.raise_for_status()
with open(NEG_JSON_FP, "w", encoding="utf-8") as f:
    f.write(r.text)

# Count the parsed documents rather than occurrences of the substring 'pmid',
# which over-counts whenever that word shows up anywhere else in the payload.
print('#neg documents:', len(orjson.loads(r.text)['documents']))


#neg documents: 466


No documents were fetched for 179 out of the 645 negative PMIDs!

In [19]:
NEG_FAILED_JSON = '../data/20230201/SIBiLS_v2/publication_negative.failed.json'

# PMIDs of the documents that the fetch API actually returned.
with open(NEG_JSON_FP, "rb") as f:
    out_neg_json = orjson.loads(f.read())
out_neg_pmids = {doc['infons']['pmid'] for doc in out_neg_json['documents']}

# Requested PMIDs for which no document came back.
in_neg_pmids = set(neg_set)
delta_neg_pmids = in_neg_pmids - out_neg_pmids

# Sets have no stable order: sort so the saved file is identical across runs.
with open(NEG_FAILED_JSON, "wb") as f:
    f.write(orjson.dumps(sorted(delta_neg_pmids)))


print('#neg documents not fetched:', len(delta_neg_pmids))
print(sorted(delta_neg_pmids))


#neg documents not fetched: 179
{'36351156', '36183140', '35267247', '20301750', '36585755', '36209394', '36307207', '34914343', '34941236', '20301779', '36270773', '34756847', '34941230', '36505095', '36519010', '34941228', '36400562', '36307213', '36585754', '36270776', '36415204', '36183147', '36307210', '34003615', '36351157r', '35274906', '36606116', '31643906', '20301761', '36241373', '34097371', '36519011', '36241375', '36351161', '36241378', '35704653', '35420740', '36670520', '36209398', '36270769', '35274912', '36670521', '36183145', '20301685', '36400566', '35050562', '36428242', '36428241', '36209392', '36209393', '34756846', '36400561', '36608617', '26389163', '36351162', '36519009', '36332984', '36270770', '20945554', '34756845', '36332981', '36332987', '36636474', '26389437', '24175354', '33270410', '34941232', '36428245', '36608616', '36307209', '36585756', '36209396', '35032962', '24027799', '36332988', '36428243', '27336128', '36447930', '28520346', '36564868', '36451

Some examples (the SIBiLS fetch request, followed by the corresponding PubMed pages):
https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp?ids=20301750,36636475,36400564

https://pubmed.ncbi.nlm.nih.gov/20301750/
https://pubmed.ncbi.nlm.nih.gov/36636475/
https://pubmed.ncbi.nlm.nih.gov/36400564/


### positives

In [12]:
URL_API = "https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp"
# The directory is spelled 'SIBiLS_v2', exactly as in the negative-set paths;
# a different casing points to another directory on case-sensitive filesystems.
# NOTE(review): 'posive' in the file name looks like a typo for 'positive'. It is
# kept because other code may load the file under this name - confirm before renaming.
POS_JSON_FP = '../data/20230201/SIBiLS_v2/publication_posive_set.json'
# Maximum number of PMIDs sent in one request.
CHUNK_SIZE = 500

def split_list(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items.

    Args:
        lst: Sequence to split; it must support ``len()`` and slicing.
        chunk_size: Maximum number of items per yielded slice.

    Yields:
        Slices of ``lst`` in order; the last one may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. A negative size would
            otherwise yield nothing and silently drop every item.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

# Fetch the positive PMIDs chunk by chunk and merge all returned documents
# into one response-shaped dictionary.
concat_response = {'source': "MEDLINE", 'documents': []}
for chunk in split_list(pos_set, CHUNK_SIZE):
    params = {"ids": ",".join(chunk)}
    # The timeout (seconds) keeps the loop from hanging forever on a stalled request.
    r = requests.post(url=URL_API, params=params, timeout=600)
    # Stop on an HTTP error rather than trying to parse an error page as JSON.
    r.raise_for_status()
    concat_response['documents'].extend(orjson.loads(r.text)['documents'])

with open(POS_JSON_FP, "wb") as f:
    f.write(orjson.dumps(concat_response))


print('#pos documents:', len(concat_response['documents']))

#pos documents: 22635
