In [195]:
import pandas as pd
import re

In [196]:
# Load the predictions table; the cells below parse and rewrite its 'Predictions' column.
data = pd.read_csv('outputs/datadiscoveryengine.csv')

In [197]:
def _is_quoted(label):
    """Return True when ``label`` both starts and ends with a double quote."""
    return label.startswith('"') and label.endswith('"')


# One topic label per line; surrounding whitespace (including the newline) is trimmed.
with open('EDAM/edam_topics.txt', 'r') as topics_file:
    _raw_topics = [line.strip() for line in topics_file]

# Labels that are wrapped in double quotes in the file, kept in their quoted form.
quoted_topics = list(filter(_is_quoted, _raw_topics))

# The full label list with the wrapping quotes removed.
edam_topics = [label[1:-1] if _is_quoted(label) else label for label in _raw_topics]

## Format Outputs

Split outputs on tab, and check for other separators that GPT may have used in error.

In [198]:
# Turn the two-character sequence backslash + 't' into a real tab character.
# regex=False makes this a literal replacement on every pandas version: with the
# pre-2.0 default (regex=True) the pattern '\\t' is interpreted as the regex \t,
# which matches a real tab and leaves the escaped form untouched.
data['Predictions'] = data['Predictions'].str.replace('\\t', '\t', regex=False)


In [199]:
def split_topics(topics, quoted=None):
    """Split a tab-separated prediction string into a set of topic labels.

    Every token is stripped of surrounding whitespace. A token that is exactly
    one of the quoted labels is kept verbatim (quotes included); any other
    token has its double quotes removed.

    The earlier version filtered with ``if topic not in quoted_topics``, which
    discarded correctly quoted labels altogether and could leave an empty set.

    Args:
        topics: Raw prediction string with topics separated by tab characters.
        quoted: Optional collection of quoted labels to keep verbatim.
            Defaults to the notebook-level ``quoted_topics``.

    Returns:
        set[str]: The distinct cleaned labels.
    """
    keep_verbatim = quoted_topics if quoted is None else quoted
    tokens = (token.strip() for token in topics.split('\t'))
    return {
        token if token in keep_verbatim else token.replace('"', '')
        for token in tokens
    }


# Replace each raw prediction string with the set returned by split_topics.
data['Predictions'] = data['Predictions'].apply(split_topics)

In [200]:
# Preview the parsed prediction sets.
data['Predictions']

0      {Sample collections, Infectious disease, Micro...
1      {Molecular biology, Animal study, Transcriptom...
2             {Infectious disease, Immunology, Genomics}
3      {Evolutionary biology, Microbiology, Microbial...
4      {Antimicrobial Resistance   Biology   Microbio...
                             ...                        
363    {Antimicrobial Resistance, Microbiology, Drug ...
364    {Infectious disease, Microbiology, Microbial e...
365             {Metagenomics, Immunology, Microbiology}
366                 {Immunology, Virology, Cell biology}
367       {Infectious disease, Proteomics, Cell biology}
Name: Predictions, Length: 368, dtype: object

In [201]:
## Capture any weirdly formatted outputs

# Separators that may appear instead of a tab, tried in this order.
_FALLBACK_SEPARATORS = ('   ', '\n', '<TAB>')


def _resplit_single_prediction(predictions):
    """Re-split a one-element prediction collection on a fallback separator.

    A collection with exactly one element means the tab split found nothing to
    split on. If that element contains one of the fallback separators it is
    split on it, and the stripped, non-empty fragments are returned as a set.
    Collections with zero or several elements are returned unchanged; the
    earlier ``len(x) <= 1`` test indexed ``list(x)[0]`` on empty collections
    and raised IndexError.

    Args:
        predictions: Collection of prediction strings for one row.

    Returns:
        The original collection, or a set of fragments after re-splitting.
    """
    for separator in _FALLBACK_SEPARATORS:
        if len(predictions) != 1:
            break
        (only,) = predictions
        if separator in only:
            fragments = (part.strip() for part in only.split(separator))
            predictions = {part for part in fragments if part}
    return predictions


data['Predictions'] = data['Predictions'].apply(_resplit_single_prediction)

In [202]:
# Some labels in quoted_topics contain commas. Before a prediction string is
# split on commas, any such label that appears without its quotes is wrapped in
# quotes again, and the split then ignores commas that sit inside quotes.
def _requote_labels(prediction, quoted_labels):
    """Wrap every quoted label that appears bare in ``prediction`` in its quotes."""
    for label in quoted_labels:
        bare = label.replace('"', '')
        # Skip labels that are already quoted so they are never quoted twice.
        if bare and label not in prediction and bare in prediction:
            prediction = prediction.replace(bare, label)
    return prediction


def _split_outside_quotes(prediction):
    """Split on commas outside double quotes; quoted segments keep their quotes."""
    pieces = re.findall(r'"[^"]+"|[^",]+', prediction)
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]


def process_predictions(predictions, quoted=None):
    """Normalise a row's prediction strings into a set of individual topics.

    Each string is re-quoted where a quoted label appears bare, then split on
    commas outside quotes. Fragments are stripped and empty ones are dropped.

    The earlier version re-quoted at most one label per string and returned
    the raw ``re.findall`` fragments, so the text around a quoted label came
    back unsplit with its separator attached (for example ``', Microbiology'``).

    Args:
        predictions: Iterable of prediction strings for one row.
        quoted: Optional collection of quoted labels. Defaults to the
            notebook-level ``quoted_topics``.

    Returns:
        set[str]: Distinct topics; quoted labels keep their double quotes.
    """
    quoted_labels = quoted_topics if quoted is None else quoted
    final_predictions = set()
    for prediction in predictions:
        requoted = _requote_labels(prediction, quoted_labels)
        final_predictions.update(_split_outside_quotes(requoted))
    return final_predictions

# Replace each row's prediction collection with the set returned by process_predictions.
data['Predictions'] = data['Predictions'].apply(process_predictions)



In [203]:
# List the rows whose prediction set still holds at most one element.
data[data['Predictions'].apply(len) <= 1]['Predictions']

140    {"Data submission, annotation, and curation"}
Name: Predictions, dtype: object

## Hallucinations

Filter out topics not in the EDAM topics list. The filtered topics may be matched to a topic or synonym->topic in the next section.

In [204]:
def _unknown_topics(preds):
    """Return the normalised predictions that are absent from ``edam_topics``.

    Normalising removes every '.' and every double quote from a prediction.
    """
    normalised = (pred.replace('.', '').replace('"', '') for pred in preds)
    return {label for label in normalised if label not in edam_topics}


data['Hallucinations'] = data['Predictions'].apply(_unknown_topics)

In [205]:
def _known_topics(preds):
    """Return the normalised predictions that are present in ``edam_topics``.

    Normalising removes every '.' and every double quote from a prediction.
    """
    normalised = (pred.replace('.', '').replace('"', '') for pred in preds)
    return {label for label in normalised if label in edam_topics}


def _without_flagged(row):
    """Return the row's predictions minus anything in its 'Hallucinations' entry."""
    flagged = row['Hallucinations']
    return {topic for topic in row['Predictions'] if topic not in flagged}


data['Predictions'] = data['Predictions'].apply(_known_topics)
data['Predictions'] = data.apply(_without_flagged, axis=1)

## Synonym matching

Check for misspelled or misformatted topics and synonyms using Levenshtein edit distance.

In [206]:
_edam_all = pd.read_csv('EDAM/EDAM.csv')

# Keep the rows whose 'Class ID' contains the text 'topic'.
_is_topic_row = _edam_all['Class ID'].str.contains('topic')
_edam_topic_rows = _edam_all[_is_topic_row].reset_index(drop=True)

# Of those, keep the rows whose 'Preferred Label' is one of the quote-free topic labels.
_wanted_labels = [label.replace('"', '') for label in edam_topics]
_is_wanted = _edam_topic_rows['Preferred Label'].isin(_wanted_labels)
edam = _edam_topic_rows[_is_wanted].reset_index(drop=True)

In [207]:
def _split_synonyms(cell):
    """Split a '|'-delimited synonym string into a list; an empty string gives []."""
    if cell == '':
        return []
    return cell.split('|')


# Missing values are treated as "no synonyms".
edam['Synonyms'] = edam['Synonyms'].fillna('').apply(_split_synonyms)

In [208]:
# Topic labels (quotes removed) that have no row in the filtered EDAM table.
_expected_labels = {topic.replace('"', '') for topic in edam_topics}
missing_topics = _expected_labels.difference(edam['Preferred Label'])
missing_topics

set()

In [209]:
# Map every synonym to its row's preferred label. Rows are visited in table
# order, so a synonym listed on several rows ends up with the last row's label.
synonym_dict = {
    synonym: preferred
    for preferred, synonyms in zip(edam['Preferred Label'], edam['Synonyms'])
    for synonym in synonyms
}

In [210]:
# Display the synonym -> preferred-label mapping.
synonym_dict

{'Molecular docking': 'Molecular modelling',
 'Homology modeling': 'Molecular modelling',
 'Docking': 'Molecular modelling',
 'Comparative modelling': 'Molecular modelling',
 'Homology modelling': 'Molecular modelling',
 'Evolution': 'Evolutionary biology',
 'Freshwater science': 'Freshwater biology',
 'Nutrition': 'Nutritional science',
 'Nutrition science': 'Nutritional science',
 'Dietetics': 'Nutritional science',
 'Cardiovascular medicine': 'Cardiology',
 'Heart disease': 'Cardiology',
 'Cardiovascular disease': 'Cardiology',
 'Gene features': 'Gene structure',
 'Fusion genes': 'Gene structure',
 'Transcriptome': 'Transcriptomics',
 'Comparative transcriptomics': 'Transcriptomics',
 'Ancestral genomes': 'Paleogenomics',
 'Paleogenetics': 'Paleogenomics',
 'Panomics': 'Multiomics',
 'Pan-omics': 'Multiomics',
 'Integrative omics': 'Multiomics',
 'Multi-omics': 'Multiomics',
 'RNA metabarcoding': 'Metabarcoding',
 'DNA metabarcoding': 'Metabarcoding',
 'eDNA metabarcoding': 'Metabar

In [229]:
import Levenshtein

hallucinations = data['Hallucinations']
edit_distance_threshold = 2


def _closest_within(term, candidates, min_distance, max_distance):
    """Return the candidate closest to ``term`` by Levenshtein distance.

    Only candidates whose distance lies in [min_distance, max_distance] are
    considered; ties go to the candidate seen first. Returns None when no
    candidate qualifies.
    """
    best, best_distance = None, None
    for candidate in candidates:
        distance = Levenshtein.distance(term, candidate)
        if not min_distance <= distance <= max_distance:
            continue
        if best_distance is None or distance < best_distance:
            best, best_distance = candidate, distance
    return best


# hallucinated term -> EDAM topic it most likely stands for
matched_topics = {}
# Terms already examined; the same term can occur in many rows.
_examined = set()

# NOTE(review): a fixed threshold of 2 is generous for very short terms; the
# recorded output maps 'Age' to 'Antimicrobial Resistance'. Consider a
# length-relative threshold.
for hallucination_set in hallucinations:
    for hallucination in hallucination_set:
        if hallucination in _examined:
            continue
        _examined.add(hallucination)

        # First look for the closest topic label. The previous loop stopped at
        # the first label within the threshold, which is not always the nearest.
        topic = _closest_within(hallucination, edam_topics, 1, edit_distance_threshold)
        if topic is not None:
            matched_topics[hallucination] = topic
            continue

        # If no topic matched, fall back to the closest synonym (exact matches allowed).
        synonym = _closest_within(hallucination, synonym_dict, 0, edit_distance_threshold)
        if synonym is not None:
            matched_topics[hallucination] = synonym_dict[synonym]

matched_topics


{'Hematology': 'Dermatology',
 'Iinfectious disease': 'Infectious disease',
 'Data analysis': 'Data architecture, analysis and design',
 'Health science': 'Biomedical science',
 'Microbiota': 'Microbial ecology',
 'Animal Study': 'Animal study',
 'Biological science': 'Biology',
 'Clinical medicine': 'Medicine',
 'Pediatrics': 'Paediatrics',
 'transcriptomics': 'Transcriptomics',
 'Clinical study': 'Preclinical and clinical studies',
 'Age': 'Antimicrobial Resistance',
 'Epidemiology': 'Public health and epidemiology',
 'Aging': 'Geriatric medicine',
 'Iimmunology': 'Immunology',
 'Critical Care Medicine': 'Critical care medicine'}

In [224]:
# Add the matched topic for every matched hallucination in a row to that row's
# predictions. The earlier loop hit `break` after the first match, so any other
# matched hallucination in the same row was dropped, and which one was kept
# depended on set iteration order.
for index, row in data.iterrows():
    for hallucination in row['Hallucinations']:
        if hallucination in matched_topics:
            print(f"'{hallucination}' in row {index} matches topic '{matched_topics[hallucination]}'")
            # The cell holds a set, which is updated in place.
            data.at[index, 'Predictions'].add(matched_topics[hallucination])

'Hematology' in row 60 matches topic 'Dermatology'
'Iinfectious disease' in row 66 matches topic 'Infectious disease'
'Data analysis' in row 78 matches topic 'Data architecture, analysis and design'
'Health science' in row 95 matches topic 'Biomedical science'
'Microbiota' in row 106 matches topic 'Microbial ecology'
'Animal Study' in row 110 matches topic 'Animal study'
'Biological science' in row 116 matches topic 'Biology'
'Clinical medicine' in row 132 matches topic 'Medicine'
'Pediatrics' in row 141 matches topic 'Paediatrics'
'transcriptomics' in row 142 matches topic 'Transcriptomics'
'Clinical study' in row 186 matches topic 'Preclinical and clinical studies'
'Age' in row 220 matches topic 'Antimicrobial Resistance'
'Epidemiology' in row 261 matches topic 'Public health and epidemiology'
'Aging' in row 303 matches topic 'Geriatric medicine'
'Epidemiology' in row 318 matches topic 'Public health and epidemiology'
'Pediatrics' in row 324 matches topic 'Paediatrics'
'Iimmunology' 

In [225]:
def _restore_quotes(preds):
    """Return the predictions as a list, re-quoting those whose quoted form is in ``quoted_topics``."""
    restored = []
    for pred in preds:
        quoted_form = f'"{pred}"'
        restored.append(quoted_form if quoted_form in quoted_topics else pred)
    return restored


# Put the quotes back on predictions that contain commas
# (e.g. "Data submission, annotation, and curation").
data['Predictions'] = data['Predictions'].apply(_restore_quotes)

In [226]:
# Show predictions next to hallucinations for every row with at least one hallucination.
data[data['Hallucinations'].apply(len) > 0][['Predictions', 'Hallucinations']]

Unnamed: 0,Predictions,Hallucinations
6,"[Metabolomics, Microbiology]",{RP-C18}
23,"[Infectious disease, Genomics]",{Host-pathogen interactions}
26,[Antimicrobial Resistance],{Microbiology RNA}
43,"[Metabolomics, Microbiology]",{Antibiotic Resistance}
53,"[Oncology, Metabolomics]",{Cell culture}
60,"[Infectious disease, Dermatology, Microbiology]",{Hematology}
66,"[Infectious disease, Metabolomics, Animal study]",{Iinfectious disease}
72,"[Infectious disease, Metabolomics]",{Cell culture}
75,"[Antimicrobial Resistance, Microbiology]",{Probiotics}
77,"[Biochemistry, Lipids]",{Cell culture}


In [227]:
# Turn each row's prediction list back into a set.
data['Predictions'] = data['Predictions'].apply(set)