In [2]:
import json, re, requests
import os
import time
import copy
import csv
from datetime import date
import settings
from importlib import reload
reload(settings)
from settings import http_settings, file_settings


# This workflow can be run for the first time on the to_biotools.json output from Pub2tools,
# and it can be rerun later on the preprints json to check for new publications.
# If there are new publications in the low_tools_prp_date.json file, they are removed from that file
# and moved to the publications file.


In [3]:
# Passed to add_tools(); when it is False, add_tools() prints a notice and returns without posting anything.
WRITE_TO_DB = False
to_curate = 100 # how many high-confidence tools to write to the csv file for manual curation: an int, or the string 'all'

Check if all files are available

In [4]:
# Path of the Pub2tools JSON output; a doubled slash from joining the two settings is collapsed.
json_file = '/'.join([
    file_settings['path_to_pub2tools_month'],
    file_settings['json_tools'],
]).replace('//', '/')
# os.path.isfile() is already False for a path that does not exist.
if not os.path.isfile(json_file):
    print("can't find json tools file")

In [5]:
# Path of the Pub2tools log file, expected in the same output folder as the JSON file.
pub2tools_file = '/'.join([
    file_settings['path_to_pub2tools_month'],
    file_settings['pub2tools_log'],
]).replace('//', '/')
# os.path.isfile() is already False for a path that does not exist.
if not os.path.isfile(pub2tools_file):
    print("can't find pub2tools log file")

In [6]:
# Path of the JSON file that accumulates the tools whose publication is still a preprint.
preprints_file = file_settings['preprints_path']
if not os.path.isfile(preprints_file):
    # The message now names the file that is actually missing (it used to say "pub2tools log file").
    print("can't find preprints file")

In [7]:
# Path of the JSON file that generate_json() rewrites with the tools left out of curation.
lowtools_file = file_settings['lowtools_path']
if not os.path.isfile(lowtools_file):
    # The message now names the file that is actually missing (it used to say "pub2tools log file").
    print("can't find low tools file")

## Process_scrap workflow

Get token for further validation

In [8]:
def login_prod(http_settings):
    """Log in to the production host and return the authentication token.

    Args:
        http_settings (dict): Must provide 'username', 'password', 'host_prod',
            'login' and 'json'. The last three are concatenated to build the
            login URL.

    Returns:
        The value stored under 'key' in the JSON login response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no 'key' entry.
    """
    headers_token = {
        'Content-Type': 'application/json'
    }
    user = json.dumps({
        'username': http_settings['username'],
        'password': http_settings['password']
    })
    login_url = http_settings['host_prod'] + http_settings['login'] + http_settings['json']

    token_r = requests.post(login_url, headers=headers_token, data=user)
    # Report a rejected login with its HTTP status instead of failing later
    # with an opaque KeyError or JSON decoding error.
    token_r.raise_for_status()
    return json.loads(token_r.text)['key']

Process Pub2tools output (only high tools)

In [9]:
def process_tools(json_file):
    """Load the Pub2tools JSON output and return its high-confidence tools.

    Every entry under the 'list' key gets a public 'editPermission' and a
    'biotoolsID' derived from its name. Entries whose 'confidence_flag' is
    'high' (case-insensitive) also get a 'tool_link' built from the
    module-level http_settings['dev'] and are collected for the result.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        list: The high-confidence tool dicts, in file order.
    """
    with open(json_file) as handle:
        payload = json.load(handle)

    selected = []
    for entry in payload['list']:
        entry['editPermission'] = {'type': 'public'}

        # Strip characters outside the allowed set, then turn runs of spaces into one dash.
        slug = re.sub('[^a-zA-Z0-9_~ .-]*', '', entry['name'])
        slug = re.sub('[ ]+', '-', slug)
        entry['biotoolsID'] = ('pub2tools2023__' + slug).lower()

        if entry['confidence_flag'].lower() != 'high':
            continue
        entry['tool_link'] = '{d}{bt}'.format(d=http_settings['dev'], bt=entry['biotoolsID'])
        selected.append(entry)
    return selected

Validate processed tools

In [10]:
def validate_tool(tool, token, http_settings):
    """POST one tool to the validation endpoint of the production host.

    Args:
        tool (dict): Tool description, sent as the JSON request body.
        token (str): Token placed in the Authorization header.
        http_settings (dict): Provides 'host_prod', 'tool', 'validate' and
            'json', concatenated in that order to form the URL.

    Returns:
        tuple: (accepted, response_text), where accepted is True for a 2xx
        status code.
    """
    endpoint = '{h}{t}{v}{f}'.format(
        h=http_settings['host_prod'],
        t=http_settings['tool'],
        v=http_settings['validate'],
        f=http_settings['json'],
    )
    request_headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Token ' + token
    }

    reply = requests.post(endpoint, headers=request_headers, data=json.dumps(tool))
    accepted = 200 <= reply.status_code <= 299
    return (accepted, reply.text)

In [11]:
def validate_tools(tools, token, http_settings):
    """Validate every tool and return the ones that were accepted.

    A rejected tool whose JSON error has a 'name' entry is retried once with
    '_autogenerated' appended to its name. If the retry passes, the tool is
    renamed in place and kept. Progress and a final summary are printed.

    Args:
        tools (list): Tool dicts; each needs 'name' and 'confidence_flag'.
        token (str): Token forwarded to validate_tool().
        http_settings (dict): Settings forwarded to validate_tool().

    Returns:
        list: The tools that passed validation, some possibly renamed.
    """
    tools_count = len(tools)
    to_add = []
    problem_tools = []
    for tool in tools:
        print(tool['confidence_flag'])
        (valid, txt) = validate_tool(tool, token, http_settings)
        if valid:
            to_add.append(tool)
            print("Tool with name {name} is valid.".format(name=tool['name']))
        else:
            print("Tool with name:{name} has the errors: {errors}".format(name=tool['name'],errors=txt))
            print('Checking if there is an error with the name {name} ... '.format(name=tool['name']))
            if is_html_error(txt):
                print("It's an html error message ... ")
                # Record the failure so the summary below does not claim there were no problems.
                problem_tools.append({'tool_name':tool['name'],'error':'HTML error page returned by the server'})
            else:
                try:
                    e = json.loads(txt)
                except ValueError:
                    # An error body that is neither HTML nor JSON (e.g. plain text):
                    # report it as a problem instead of aborting the whole run.
                    e = None
                if isinstance(e, dict) and e.get('name') is not None:
                    print('There is an error with the name {name}'.format(name=tool['name']))
                    # Retry on a copy so the original is only renamed if the retry succeeds.
                    tool_temp = copy.deepcopy(tool)
                    tool_temp['name'] = tool_temp['name'] + '_autogenerated'
                    print('Trying to fix problem by changing tool name to {name}'.format(name=tool_temp['name']))
                    (valid, txt) = validate_tool(tool_temp, token, http_settings)
                    if valid:
                        print('The error was fixed by changing the name to {name}'.format(name=tool_temp['name']))
                        tool['name'] = tool_temp['name']
                        to_add.append(tool)
                    else:
                        print('The error could not be fixed')
                        problem_tools.append({'tool_name':tool['name'],'error':txt})
                else:
                    print('There was a different error')
                    problem_tools.append({'tool_name':tool['name'],'error':txt})
        print('-----------------')
    print("Total tools validated: {added} out of a total of: {total} ".format(added=len(to_add), total=tools_count))
    if len(problem_tools) > 0:
        print("{problem} tools with problems:".format(problem=len(problem_tools)))
        print(problem_tools)
    else:
        print("No tools with problems")
    return to_add

In [12]:
def is_html_error(message):
    """Return True if the text contains an '<html' or '<body' tag (case-insensitive)."""
    lowered = message.lower()
    return any(marker in lowered for marker in ('<html', '<body'))

Check for preprints among validated, and check file with all preprints

In [13]:
def _annotate_preprint_status(tools):
    """Set 'publication_link' and 'is_preprint' on each tool; return (published, preprints)."""
    published, preprints = [], []
    for tool in tools:
        url_pub, is_preprint = identify_preprint(tool)
        tool['publication_link'] = url_pub
        tool['is_preprint'] = is_preprint
        (preprints if is_preprint else published).append(tool)
    return published, preprints


def identify_preprints(tools, json_file):
    """Split tools into preprints and publications and update the preprints file.

    Each tool in `tools` and each tool already stored in `json_file` is passed
    through identify_preprint() and annotated in place. The file is then
    rewritten with the tools that are still preprints (stored ones first,
    then new ones).

    Args:
        tools (list): Newly validated tool dicts.
        json_file: Path of the preprints JSON file; it must contain a 'list' key.

    Returns:
        list: Tools with a published version: stored preprints that are now
        published first, then the published tools from `tools`.
    """
    new_pubs, preprints_new = _annotate_preprint_status(tools)
    print(new_pubs)
    print("There are {preprints} preprints and {pubs} pubs in tools".format(pubs=len(new_pubs), preprints=len(tools)-len(new_pubs)))

    with open(json_file, 'r') as prp_json_file:
        preprints = json.load(prp_json_file)
    tools_prp = preprints['list']
    print("There are {preprints} preprints in the json file. Checking if something was published.....".format(preprints=len(tools_prp)))

    updated_pubs, preprints_old = _annotate_preprint_status(tools_prp)
    print(updated_pubs)
    print("There are {published} published tools in the json file. ".format(published=len(updated_pubs)))
    print("Updating json.....")

    to_upload = updated_pubs + new_pubs  # First newly published preprints, then tools from the current month
    preprints_data = preprints_old + preprints_new
    pp_json = {"count":len(preprints_data),"list":preprints_data}
    with open(json_file, 'w') as preprints_json_file:
        json.dump(pp_json, preprints_json_file, indent=4)

    # Report what was actually written to the file, not only the preprints among the new tools.
    print("There are {preprints} preprints in the file after identification".format(preprints=len(preprints_data)))
    return to_upload


In [14]:
def search_europe_pmc(query):
    """Run a query against the Europe PMC REST search endpoint.

    Returns the decoded JSON body when the response status is 200 and None
    for any other status.
    """
    response = requests.get(
        "https://www.ebi.ac.uk/europepmc/webservices/rest/search",
        params={'query': query, 'format': 'json', 'resultType': 'core'},
    )
    if response.status_code != 200:
        return None
    return response.json()


In [15]:
def identify_preprint(tool):
    """
    Identify if a publication associated with a tool is a preprint.

    Only the first entry of tool['publication'] is inspected. When Europe PMC
    links the preprint to a published record that has a DOI or PMID,
    tool['publication'] is replaced by a single entry describing that record.

    Args:
    - tool (dict): A dictionary containing tool metadata (Pub2Tools output).
    Returns:
    - pub_link (str): A link to the publication.
    - is_preprint (bool): True if the publication is a preprint, False otherwise.
    Raises:
    - KeyError: If the first publication has neither 'doi' nor 'pmid'.
    """
    publication = tool['publication'][0]
    if 'doi' not in publication:
        print("There is no DOI for this tool")
        pmid = publication['pmid']
        # It can still be a preprint, but it's safer to assume it's not and double-check it with curation.
        return f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/", False

    doi = publication['doi']
    doi_link = f"https://doi.org/{doi}"
    print(tool['name'])
    data = search_europe_pmc(f'DOI:"{doi}"')
    # search_europe_pmc() returns None on a failed request; treat that like "no result"
    # instead of crashing on None.get(...).
    if not data or data.get('hitCount', 0) == 0:
        return doi_link, False

    record = data['resultList']['result'][0]
    if record.get('source') != 'PPR':
        # Source is other than preprint, so it is handled as a publication.
        return doi_link, False

    corrections = record.get('commentCorrectionList')
    if corrections is None:
        return doi_link, True

    first_correction = corrections.get('commentCorrection')[0]
    if first_correction.get('source') == 'PPR':
        return doi_link, True

    # The first comment/correction points to a non-preprint record: look it up by its external id.
    ext_id = first_correction.get('id')
    potential_match = search_europe_pmc(f'ext_id:"{ext_id}" NOT DOI:"{doi}"')
    # Both conditions must hold (the original used `or`, which crashed on None
    # and on an empty result list).
    if not potential_match or potential_match.get('hitCount', 0) == 0:
        # No match in the database even though there is a correction list: keep it as a preprint.
        return doi_link, True

    published = potential_match['resultList']['result'][0]
    pubs = {}
    pub_link = None
    if 'doi' in published:
        new_doi = published['doi']
        pubs['doi'] = new_doi
        pub_link = f"https://doi.org/{new_doi}"
        print("Tool {name} has a published version. Changing from {doi} to {new_doi} ".format(name=tool['name'], doi=doi, new_doi=new_doi))
    if 'pmid' in published:
        new_pmid = published['pmid']
        pub_link = f"https://pubmed.ncbi.nlm.nih.gov/{new_pmid}/"
        pubs['pmid'] = new_pmid
        print("Tool {name} has a published version. Changing from {doi} to {pmid} ".format(name=tool['name'], doi=doi, pmid=new_pmid))
    if 'pmcid' in published:
        pubs['pmcid'] = published['pmcid']

    if pub_link is None:
        # The matched record has neither DOI nor PMID to link to; leave the tool
        # untouched rather than returning an unbound link.
        return doi_link, True

    tool.pop('publication')
    tool['publication'] = [pubs]
    return pub_link, False

From the tools that are not preprints, write the first `to_curate` tools to a CSV file for manual curation

In [16]:
def generate_csv_pub(tools,to_curate):
    """Write the first `to_curate` published tools to a curation CSV file.

    The file is named pub2tools_<year>_<month>.csv, using the date that
    check_date() reads from the module-level `pub2tools_file`, and is written
    to the current working directory.

    Args:
        tools (list): Tool dicts carrying 'is_preprint', 'tool_link', 'name',
            'homepage' and 'publication_link'.
        to_curate: Number of non-preprint tools to select, or the string
            'all' to select every non-preprint tool.

    Returns:
        tuple: (tools_to_add, leftover_tools) - the selected tools, and the
        tools from `tools` that are not among the selected ones.
    """
    publications = []
    if to_curate=='all':
        publications = [tool for tool in tools if not tool['is_preprint']]
    else:
        for tool in tools:
            if not tool['is_preprint']:
                publications.append(tool)
                if len(publications) == to_curate:
                    break
    file_date=check_date(pub2tools_file)  # (year, month) taken from the pub2tools log
    file_name='pub2tools_{year}_{month}.csv'.format(year=file_date[0],month=file_date[1])
    print(" Writing {to_curate} files which is {pubs} publications to a {filename} ".format(to_curate=to_curate, pubs=len(publications), filename=file_name))
    # newline='' lets the csv module control line endings; without it an extra
    # carriage return is written on platforms that translate '\n'.
    with open(file_name,'w',newline='') as fileobj:
        writerobj=csv.writer(fileobj)
        writerobj.writerow(['tool_link','tool_name','homepage','publication_link'])
        for tool in publications:
            writerobj.writerow([tool['tool_link'],tool['name'],tool['homepage'],tool['publication_link']])
    leftover_tools=[tool for tool in tools if tool not in publications]
    tools_to_add=publications
    return tools_to_add, leftover_tools

For naming csv file

In [17]:
def check_date(pub2tools_file):
    """Extract the '--month <digits>-<digits>' argument from the pub2tools log.

    Args:
        pub2tools_file: Path of the pub2tools log file.

    Returns:
        tuple: The two digit groups of the first match, as strings
        (e.g. ('2022', '02')).

    Raises:
        ValueError: If the log contains no '--month <digits>-<digits>' argument.
    """
    # The context manager closes the file even if reading fails.
    with open(pub2tools_file, 'r') as log_file:
        textfile = log_file.read()
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    match = re.search(r"--month (\d+)-(\d+)", textfile)
    if match is None:
        raise ValueError("no '--month <year>-<month>' argument found in {f}".format(f=pub2tools_file))
    return match.groups()

Update json for all of the low, published tools

In [18]:
def generate_json(tools, lowtools_file):
    """Append `tools` to the tool list stored in `lowtools_file` and rewrite the file.

    Args:
        tools (list): Tool dicts to append.
        lowtools_file: Path of an existing JSON file with a 'list' key. It is
            rewritten as {"count": <total>, "list": <stored tools + tools>}.
    """
    # Read the file that is being updated (the original read the global
    # `json_file`, i.e. the Pub2tools output, by mistake).
    with open(lowtools_file, 'r') as lowtools_json_file:
        lowtools = json.load(lowtools_json_file)
    tools_update = lowtools['list'] + tools
    lowtools_json = {"count":len(tools_update),"list":tools_update}
    with open(lowtools_file, 'w') as low_tools_json_file:
        json.dump(lowtools_json, low_tools_json_file, indent=4)

In [19]:
def add_tools(tools, token, http_settings, WRITE_TO_DB, delay=2):
    """POST each tool to the tool endpoint of the production host.

    Nothing is sent unless WRITE_TO_DB is truthy. A line is printed per tool
    and a summary is printed at the end.

    Args:
        tools (list): Tool dicts; each needs a 'biotoolsID'.
        token (str): Token placed in the Authorization header.
        http_settings (dict): Provides 'host_prod', 'tool' and 'json',
            concatenated in that order to form the URL.
        WRITE_TO_DB (bool): Safety switch; when falsy the function returns
            immediately.
        delay (float): Seconds to wait after each request (default 2, the
            value that used to be hard-coded).

    Note:
        Uses time.sleep(), so the `time` module must be imported at the top
        of the notebook.
    """
    print(WRITE_TO_DB)
    if not WRITE_TO_DB:
        print("Write flag is False, exiting...")
        return
    url = '{h}{t}{f}'.format(h=http_settings['host_prod'], t=http_settings['tool'], f=http_settings['json'])
    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'Token ' + token
    }
    tools_count = len(tools)
    problem_tools = []
    ok_tools = 0
    for tool in tools:
        r = requests.post(url, headers=headers, data=json.dumps(tool))

        if 200 <= r.status_code <= 299:
            print(tool['biotoolsID'], 'Added', r.status_code)
            ok_tools += 1
        else:
            print("An Error:",tool['biotoolsID'], r.text)
            problem_tools.append({'tool_id':tool['biotoolsID'],'error':r.text})
        print('--------------')
        # Pause between requests.
        time.sleep(delay)

    print("Total tools added: {added} out of a total of: {total} ".format(added=ok_tools, total=tools_count))
    if len(problem_tools) > 0:
        print("{problem} tools with problems:".format(problem=len(problem_tools)))
        print(problem_tools)
    else:
        print("No tools with problems")


In [38]:
# STEP 1. Read the Pub2tools output JSON file; process_tools() returns only the high-confidence tools.
tools = process_tools(json_file)


In [41]:
# STEP 2. Get the API token from the production host; it is passed to validate_tools() and add_tools().
token = login_prod(http_settings)


In [48]:
# STEP 3. Validate the tools from the Pub2tools output; validate_tools() returns only the tools that passed.
tools_v = validate_tools(tools, token, http_settings)


high
Tool with name Motifizer is valid.
-----------------
high
Tool with name CanSig is valid.
-----------------
high
Tool with name CoDMI is valid.
-----------------
high
Tool with name:PPVED has the errors: {"function":[{"operation":[{},{},{},{"general_errors":["Invalid URI: http://edamontology.org/operation_3923."]}]}]}
Checking if there is an error with the name PPVED ... 
There was a different error
-----------------
high
Tool with name HGDTI is valid.
-----------------
high
Tool with name KDmarkers is valid.
-----------------
high
Tool with name HiCHub is valid.
-----------------
high
Tool with name TIGER is valid.
-----------------
high
Tool with name IsoAligner is valid.
-----------------
high
Tool with name:RNAinsecta has the errors: {"biotoolsID":["A resource with this ID already exists. bio.tools IDs need to be unique"]}
Checking if there is an error with the name RNAinsecta ... 
There was a different error
-----------------
high
Tool with name:CACSV has the errors: {"biotoo

In [50]:
# STEP 4. From the validated tools, identify preprints. identify_preprints() also re-checks the tools
# already stored in the preprints file, rewrites that file with everything that is still a preprint,
# and returns the tools (stored and new) that have a published version.
# NOTE(review): the earlier comment mentioned "RERUN = FALSE" and a preprints.py file; identify_preprints()
# as defined in this notebook takes no such flag - confirm whether that note is stale.
tools_prp = identify_preprints(tools_v,preprints_file)


Motifizer
CanSig
CoDMI
HGDTI
KDmarkers
HiCHub
TIGER
IsoAligner
SiCoDEA
OGAR
SHARCQ
ASTool
IMPatienT
TB-Net
EcoFun-MAP
SageNet
CLIN-X
DeepDNAbP
Methylartist
RLTLBO
BANYAN
AttentionDTA
MDGNN
CRISPRedict
PolypSeg+
m5Cpred-XS
ImmuMethy
MicrobioSee
SEACells
fastman
AutoScore-Imbalance
ASQ
RetroSnake
ToxPi
scAEGAN
DeepBtoD
Neuroscout
GlycoQL
RustNet
SmGDB
Onlinemeta
SISS-Geo
TITAN
TubULAR
HiveRel
ROADMAPS
SNARER
E2Style
OC_Finder
citationchaser
Spiky
surge
ZZS similarity tool
BANKSY
Pocket2Drug
IPOscore
TCR-L
epiAneufinder
Rizoma
MOOD
ClinicaDL
DF-SSmVEP
hemispheR
MainSEL
GenMPI
RRGP
COPILOT
iCatcher
COVID-19-associated ARDS non-viral ARDS
AutoDesigner
MILNP
PeakVI
DTSyn
LinQ-View
WalkIm
PhenoBERT
CPIELA
PCRMS
MPI-GWAS
MAPPER
DeepTMHMM
DeSmoke-LAP
MAECI
MetaRelSubNetVis
Read2Tree
PregTox
GEAR
CANon
AI4ACP
PredNitro
scTSSR2
ENDS assumptions
PanGu Drug Model
recolorize
Fast-Higashi
KMSubtraction
LIPSHOK
AJILE12
MASI
BiGAMi
nsink
TransPhos
Kalmag
CSR-Net
RFPDR
SSPNet
FollicleFinder
ceCLC
EEG ME

In [52]:
# STEP 5. Generate the curation CSV from the first `to_curate` publications (value set in the config cell).
tools_to_add, tools_left = generate_csv_pub(tools_prp, to_curate)
# STEP 6. Append the tools that won't be curated (leftover tools = all tools - to_curate tools) to the
# low-tools JSON file. generate_json() requires the target file path as its second argument.
generate_json(tools_left, lowtools_file)
# STEP 7. Add the to_curate tools to the database (left disabled; enable deliberately together with WRITE_TO_DB).
#add_tools(tools_to_add, token, http_settings, WRITE_TO_DB)


 Writing 100 files which is 100 publications to a pub2tools_2022_02.csv 
