# Process 2023_02_01 CellTriage dataset

In [38]:


import os
from pathlib import Path
from IPython.display import display
import requests
import orjson


## PMIDS




In [39]:
# Load the PMID lists; each line of a file is treated as one PMID.
NEG_FP = '../data/20230201/publication_negative_set.tsv'
POS_FP = '../data/20230201/publication_positive_set.tsv'

# Negative set
neg_set = Path(NEG_FP).read_text().splitlines()
print(f'#neg pmids: {len(neg_set)}')

# Positive set
pos_set = Path(POS_FP).read_text().splitlines()
print(f'#pos pmids: {len(pos_set)}')

#neg pmids: 645
#pos pmids: 22720


## SIBiLS

### negatives

In [40]:
URL_API = "https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp"
NEG_JSON_FP = '../data/20230201/publication_negative_set.json'

# Request every negative PMID in one call, as a comma-separated `ids` parameter.
params = {"ids": ",".join(neg_set)}
r = requests.post(url = URL_API, params = params)
# Fail loudly on an HTTP error instead of saving an error page as the JSON result.
r.raise_for_status()

# Keep the raw response on disk; the following cell reloads it from this file.
with open(NEG_JSON_FP,"w",encoding="utf-8") as f:
    f.write(r.text)

# Count the parsed documents rather than occurrences of the substring 'pmid',
# which would over-count if 'pmid' appeared anywhere else in the payload.
print('#neg documents:', len(orjson.loads(r.text)['documents']))


#neg documents: 466


No documents were fetched for 179 out of the 645 negative PMIDs!

In [53]:
NEG_FAILED_JSON = '../data/20230201/publication_negative.failed.json'

# Reload the saved SIBiLS response and collect the PMIDs it actually returned.
with open(NEG_JSON_FP, "rb") as f:
    out_neg_json = orjson.loads(f.read())
out_neg_pmids = {doc['infons']['pmid'] for doc in out_neg_json['documents']}

# Requested PMIDs that are absent from the response.
in_neg_pmids = set(neg_set)
delta_neg_pmids = in_neg_pmids - out_neg_pmids

# Sets have no stable order; sort so the saved file and the printed list are
# identical from run to run.
failed_neg_pmids = sorted(delta_neg_pmids)

with open(NEG_FAILED_JSON, "wb") as f:
    f.write(orjson.dumps(failed_neg_pmids))

print('#neg documents not fetched:', len(delta_neg_pmids))
print(failed_neg_pmids)


#neg documents not fetched: 179
{'35032962', '36307212', '36332987', '36209392', '31643973', '36270774', '36183145', '36519010', '36241377', '36209396', '36400560', '20301685', '36585755', '36183147', '36324702', '36332985', '36428249', '33270410', '36415207', '24175354', '36670519', '36564868', '36415204', '31082146', '36636473', '36428244', '36606118', '36187498', '36351162', '36400564', '36270772', '35274912', '36307209', '36670521', '36241374', '36209393', '36400562', '36428243', '36209395', '36332980', '36332984', '36608618', '36241371', '36670520', '36241379', '34003615', '34941236', '20301761', '35704653', '34097371', '36419540', '36400566', '20301490', '35274911', '36519011', '24027799', '36183142', '27336128', '35050561', '29939552', '36270773', '36585756', '36241372', '36564872', '36241376', '36270769', '36564870', '31643906', '36307214', '36209400', '36428247', '34941230', '36636470', '36585757', '36670523', '34756845', '36183140', '36512654', '36307207', '36270768', '363072

Some examples (the SIBiLS query, followed by the corresponding PubMed pages):
https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp?ids=20301750,36636475,36400564

https://pubmed.ncbi.nlm.nih.gov/20301750/
https://pubmed.ncbi.nlm.nih.gov/36636475/
https://pubmed.ncbi.nlm.nih.gov/36400564/


### positives

In [51]:
# Same SIBiLS MEDLINE fetch endpoint as used for the negatives above.
URL_API = "https://candy.hesge.ch/SIBiLS/MEDLINE/fetch.jsp"
# Output file for the merged positive documents.
# NOTE(review): 'posive' looks like a typo for 'positive' (compare POS_FP);
# confirm nothing else reads this file name before renaming it.
POS_JSON_FP = '../data/20230201/publication_posive_set.json'
# Number of PMIDs sent per request when fetching the positive set.
CHUNK_SIZE = 500

def split_list(lst, chunk_size):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items.

    Args:
        lst: Sequence to split; it must support ``len()`` and slicing.
        chunk_size: Maximum number of items per slice; must be >= 1.

    Yields:
        Slices of ``lst`` in order. Only the last one may be shorter than
        ``chunk_size``; an empty ``lst`` yields nothing.

    Raises:
        ValueError: If ``chunk_size`` is less than 1. Raised when iteration
            starts, because this is a generator.
    """
    # A negative step would make range() empty and silently drop every item.
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

# Fetch the positive PMIDs CHUNK_SIZE ids per request and merge the per-chunk
# 'documents' lists into a single response-shaped dict.
concat_response = {'source':"MEDLINE", 'documents':[]}
for chunk in split_list(pos_set, CHUNK_SIZE):
    params = {"ids": ",".join(chunk)}
    r = requests.post(url = URL_API, params = params)
    # Stop on an HTTP error rather than failing later on an unparsable body.
    r.raise_for_status()
    concat_response['documents'].extend(orjson.loads(r.text)['documents'])

# Persist the merged documents as one JSON file.
with open(POS_JSON_FP, "wb") as f:
    f.write(orjson.dumps(concat_response))

print('#pos documents:', len(concat_response['documents']))

#pos documents: 22635
