In [82]:
import pandas as pd
import re
import os


In [104]:
# Raw model outputs to post-process in this notebook.
DATA_PATH = 'outputs/qiita.csv'

# Read the outputs into a DataFrame; the cells below rewrite its 'Predictions' column.
data = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [105]:
# Load the EDAM topic list, one topic per line.
# Blank lines are skipped so that an empty string can never count as a valid topic,
# and the encoding is explicit so the result does not depend on the platform default.
with open('EDAM/edam_topics.txt', 'r', encoding='utf-8') as f:
    edam_topics = [topic for topic in (line.strip() for line in f) if topic]

# Topics written with surrounding double quotes in the file, kept verbatim (quotes included).
# The length check stops a line holding a single '"' from being treated as a quoted topic.
quoted_topics = [
    topic for topic in edam_topics
    if len(topic) >= 2 and topic.startswith('"') and topic.endswith('"')
]

# Remove quotes
edam_topics = [topic[1:-1] if topic in quoted_topics else topic for topic in edam_topics]

## Format Outputs

Split outputs on tab, and check for other separators that GPT may have used in error.

In [106]:
# Some outputs contain the two characters backslash + "t" instead of a real tab.
# regex=False forces a literal match: read as a regular expression, the pattern '\\t'
# matches an actual tab character, which makes this call a no-op on pandas versions
# where str.replace treats the pattern as a regex by default.
data['Predictions'] = data['Predictions'].str.replace('\\t', '\t', regex=False)


In [107]:
def split_topics(topics, quoted=None):
    """Split one raw, tab-separated prediction string into a set of topics.

    Parameters
    ----------
    topics : str
        Raw prediction text; individual topics are separated by tab characters.
    quoted : iterable of str, optional
        Topics that must keep their surrounding double quotes. Defaults to the
        notebook-level ``quoted_topics`` list.

    Returns
    -------
    set of str
        Whitespace-stripped topics. A topic that exactly matches an entry of
        ``quoted`` is kept verbatim (previously such topics were silently
        dropped); every other topic has its double quotes removed.
    """
    protected = quoted_topics if quoted is None else quoted

    cleaned_topics = set()
    for raw_topic in topics.split('\t'):
        topic = raw_topic.strip()
        # Keep correctly quoted topics as they are; strip stray quotes elsewhere.
        cleaned_topics.add(topic if topic in protected else topic.replace('"', ''))

    return cleaned_topics


# Replace every raw prediction string with its set of topics.
data['Predictions'] = data['Predictions'].map(split_topics)

In [108]:
# Preview the predictions after the tab split.
data['Predictions']

0            {Microbiology, Plant biology, Soil science}
1      {Metagenomics, Antimicrobial Resistance, Micro...
2      {Microbial ecology, Infectious disease, Gastro...
3                  {Microbiology, Animal study, Biology}
4       {Microbiology\nGenetics\nEnvironmental sciences}
                             ...                        
698    {Nutritional science   Microbial ecology   Obe...
699    {Respiratory medicine, Microbiology, Infectiou...
700                     {Animal study, Ecology, Biology}
701    {Environmental sciences   Ecology    Microbiol...
702    {Environmental sciences    Soil science    Dat...
Name: Predictions, Length: 703, dtype: object

In [109]:
## Capture any weirdly formatted outputs (using the wrong separators)

# Separators that were sometimes used instead of a tab, tried in this order.
FALLBACK_SEPARATORS = ('   ', '\n', '<TAB>', ';')


def split_on_fallback_separators(predictions, separators=FALLBACK_SEPARATORS):
    """Split a still-unsplit prediction on the first fallback separator it contains.

    Parameters
    ----------
    predictions : collection of str
        The topics of one row.
    separators : iterable of str, optional
        Separators to try, in order.

    Returns
    -------
    collection of str
        A list of the pieces when the single entry contained a separator;
        otherwise ``predictions`` itself, unchanged.
    """
    for separator in separators:
        # Only a collection with exactly one entry can be an unsplit prediction.
        # An empty collection is returned as is (indexing it used to raise IndexError).
        if len(predictions) != 1:
            break
        only_entry = next(iter(predictions))
        if separator in only_entry:
            predictions = only_entry.split(separator)
    return predictions


data['Predictions'] = data['Predictions'].apply(split_on_fallback_separators)

In [110]:
# Some topics contain commas themselves and are therefore written in double quotes.
# Put the quotes back around any such topic that appears without them, then split
# every prediction on commas while leaving anything inside quotes untouched.
def process_predictions(predictions, quoted=None):
    """Split the predictions of one row on commas, keeping quoted topics whole.

    Parameters
    ----------
    predictions : iterable of str
        The (possibly still comma-separated) predictions of one row.
    quoted : iterable of str, optional
        Topics that are written with surrounding double quotes. Defaults to the
        notebook-level ``quoted_topics`` list.

    Returns
    -------
    set of str
        Whitespace-stripped topics; quoted topics keep their quotes. Empty
        tokens (e.g. from a trailing comma) are discarded.
    """
    protected = quoted_topics if quoted is None else quoted

    final_predictions = set()
    for prediction in predictions:
        # Protect every quoted topic found in the text, not only the first one,
        # and never wrap an occurrence that is already quoted a second time.
        for topic in protected:
            bare_topic = topic.replace('"', '')
            if bare_topic and bare_topic in prediction and topic not in prediction:
                prediction = prediction.replace(bare_topic, topic)

        # A token is either a complete "quoted span" or a run of text containing
        # neither a comma nor a quote, so the text outside quotes is comma-split too.
        for token in re.findall(r'"[^"]+"|[^",]+', prediction):
            token = token.strip()
            if token:
                final_predictions.add(token)

    return final_predictions

# Run the comma-aware clean-up on the predictions of every row.
data['Predictions'] = data['Predictions'].map(process_predictions)



In [111]:
# List the rows whose prediction set has at most one entry after the clean-up above.
data[data['Predictions'].apply(len) <= 1]['Predictions']

Series([], Name: Predictions, dtype: object)

## Hallucinations

Filter out topics not in the EDAM topics list. The filtered topics may be matched to a topic or synonym->topic in the next section.

In [112]:
# A prediction counts as a hallucination when, with full stops and double quotes
# removed, it is not one of the EDAM topics.
data['Hallucinations'] = data['Predictions'].apply(
    lambda preds: {
        cleaned
        for cleaned in (pred.replace('.', '').replace('"', '') for pred in preds)
        if cleaned not in edam_topics
    }
)

In [113]:
# Keep only the predictions that, with full stops and double quotes removed,
# are EDAM topics.
data['Predictions'] = data['Predictions'].apply(
    lambda preds: {
        cleaned
        for cleaned in (pred.replace('.', '').replace('"', '') for pred in preds)
        if cleaned in edam_topics
    }
)
# Remove anything that is also listed among the hallucinations of the same row.
data['Predictions'] = data.apply(
    lambda row: set(row['Predictions']).difference(row['Hallucinations']),
    axis=1,
)

## Synonym matching

Check for misspelled/misformatted topics or synonyms using the Levenshtein distance.

In [114]:
# Load the EDAM table, keep the rows whose 'Class ID' contains 'topic', then keep
# only those whose preferred label is one of the (unquoted) topics loaded earlier.
edam = (
    pd.read_csv('EDAM/EDAM.csv')
    .loc[lambda frame: frame['Class ID'].str.contains('topic')]
    .reset_index(drop=True)
    .loc[lambda frame: frame['Preferred Label'].isin([topic.replace('"', '') for topic in edam_topics])]
    .reset_index(drop=True)
)

In [115]:
# Turn the '|'-separated synonym text into a list; rows without synonyms get [].
edam['Synonyms'] = (
    edam['Synonyms']
    .fillna('')
    .apply(lambda text: [] if text == '' else text.split('|'))
)

In [116]:
# Sanity check: topics from the topic list that have no row in the EDAM table.
missing_topics = {topic.replace('"', '') for topic in edam_topics}.difference(edam['Preferred Label'])
missing_topics

set()

In [117]:
# Map every synonym to the preferred label of the row it is listed under.
# If a synonym occurs in several rows, the last row wins.
synonym_dict = {
    synonym: preferred_label
    for preferred_label, synonyms in zip(edam['Preferred Label'], edam['Synonyms'])
    for synonym in synonyms
}

In [118]:
# Display the synonym -> preferred label mapping.
synonym_dict

{'Molecular docking': 'Molecular modelling',
 'Homology modeling': 'Molecular modelling',
 'Docking': 'Molecular modelling',
 'Comparative modelling': 'Molecular modelling',
 'Homology modelling': 'Molecular modelling',
 'Evolution': 'Evolutionary biology',
 'Freshwater science': 'Freshwater biology',
 'Nutrition': 'Nutritional science',
 'Nutrition science': 'Nutritional science',
 'Dietetics': 'Nutritional science',
 'Cardiovascular medicine': 'Cardiology',
 'Heart disease': 'Cardiology',
 'Cardiovascular disease': 'Cardiology',
 'Gene features': 'Gene structure',
 'Fusion genes': 'Gene structure',
 'Transcriptome': 'Transcriptomics',
 'Comparative transcriptomics': 'Transcriptomics',
 'Ancestral genomes': 'Paleogenomics',
 'Paleogenetics': 'Paleogenomics',
 'Panomics': 'Multiomics',
 'Pan-omics': 'Multiomics',
 'Integrative omics': 'Multiomics',
 'Multi-omics': 'Multiomics',
 'RNA metabarcoding': 'Metabarcoding',
 'DNA metabarcoding': 'Metabarcoding',
 'eDNA metabarcoding': 'Metabar

In [119]:
import Levenshtein

hallucinations = data['Hallucinations']

# Largest edit distance accepted for a match against a topic / against a synonym.
# NOTE(review): these are absolute thresholds, so short unrelated words can match
# (the output below maps 'Geology' to 'Biology').
MAX_TOPIC_DISTANCE = 2
MAX_SYNONYM_DISTANCE = 1


def closest_within(term, candidates, lowest, highest):
    """Return the candidate nearest to `term` with lowest <= distance <= highest.

    Ties go to the candidate that comes first in `candidates`. Returns None
    when no candidate falls inside the range.
    """
    nearest, nearest_distance = None, None
    for candidate in candidates:
        candidate_distance = Levenshtein.distance(term, candidate)
        if not lowest <= candidate_distance <= highest:
            continue
        if nearest_distance is None or candidate_distance < nearest_distance:
            nearest, nearest_distance = candidate, candidate_distance
    return nearest


matched_topics = {}

for hallucination_set in hallucinations:
    for hallucination in hallucination_set:
        # Each distinct hallucination only needs to be matched once.
        if hallucination in matched_topics:
            continue

        # First look for a near miss (distance 1 or 2) in the topics list.
        topic_match = closest_within(hallucination, edam_topics, 1, MAX_TOPIC_DISTANCE)
        if topic_match is not None:
            matched_topics[hallucination] = topic_match
            continue

        # Otherwise accept an exact or one-edit match against the known synonyms.
        synonym_match = closest_within(hallucination, synonym_dict.keys(), 0, MAX_SYNONYM_DISTANCE)
        if synonym_match is not None:
            matched_topics[hallucination] = synonym_dict[synonym_match]

matched_topics

{'Soil science': 'Agricultural science',
 'microbiology': 'Microbiology',
 'metagenomics': 'Metagenomics',
 'Iinfectious disease': 'Infectious disease',
 'Sequenece analysis': 'Sequence analysis',
 'I\nBiodiversity': 'Biodiversity',
 'I\nMicrobiology': 'Microbiology',
 'Microbiome': 'Microbial ecology',
 'Animal Study': 'Animal study',
 'Pediatrics': 'Paediatrics',
 'Hematology': 'Haematology',
 'Pharmacokinetics': 'Drug metabolism',
 'Gynecology and obstetrics': 'Gynaecology and obstetrics',
 'Nutrition science': 'Nutritional science',
 'Cancer': 'Oncology',
 'Epidemiology': 'Public health and epidemiology',
 'Pulmonology': 'Respiratory medicine',
 'Precision medicine': 'Personalised medicine',
 'Sequeunce analysis': 'Sequence analysis',
 'Environmental Sciences': 'Environmental sciences',
 'Microbial Ecology': 'Microbial ecology',
 'Geology': 'Biology'}

In [120]:
# Add the matched EDAM topic for every hallucination of a row that could be mapped.
# All matches of a row are applied; stopping after the first one dropped the other
# matches and made the outcome depend on set iteration order.
for index, row in data.iterrows():
    # Sorted so that the log below is printed in a reproducible order.
    for hallucination in sorted(row['Hallucinations']):
        if hallucination not in matched_topics:
            continue
        matched_topic = matched_topics[hallucination]
        print(f"'{hallucination}' in row {index} matches topic '{matched_topic}'")
        # .add() mutates the object stored in the cell, so the DataFrame is updated in place.
        data.at[index, 'Predictions'].add(matched_topic)

'Soil science' in row 0 matches topic 'Agricultural science'
'microbiology' in row 30 matches topic 'Microbiology'
'Soil science' in row 34 matches topic 'Agricultural science'
'Iinfectious disease' in row 42 matches topic 'Infectious disease'
'Sequenece analysis' in row 44 matches topic 'Sequence analysis'
'Soil science' in row 82 matches topic 'Agricultural science'
'I
Biodiversity' in row 100 matches topic 'Biodiversity'
'Microbiome' in row 120 matches topic 'Microbial ecology'
'Animal Study' in row 150 matches topic 'Animal study'
'Pediatrics' in row 189 matches topic 'Paediatrics'
'Hematology' in row 223 matches topic 'Haematology'
'Pediatrics' in row 225 matches topic 'Paediatrics'
'Microbiome' in row 246 matches topic 'Microbial ecology'
'Pharmacokinetics' in row 263 matches topic 'Drug metabolism'
'Pediatrics' in row 303 matches topic 'Paediatrics'
'Gynecology and obstetrics' in row 323 matches topic 'Gynaecology and obstetrics'
'Nutrition science' in row 332 matches topic 'Nut

In [121]:
# Add quotes back in for the predictions with commas (i.e, "Data submission, annotation, and curation")
def requote_topics(preds):
    """Return the predictions as a list, wrapping in double quotes those whose quoted form is in `quoted_topics`."""
    requoted = []
    for pred in preds:
        quoted_form = '"{}"'.format(pred)
        requoted.append(quoted_form if quoted_form in quoted_topics else pred)
    return requoted


data['Predictions'] = data['Predictions'].apply(requote_topics)

In [122]:
# Show the rows that have at least one hallucination, next to their predictions.
data[data['Hallucinations'].apply(len) > 0][['Predictions', 'Hallucinations']]

Unnamed: 0,Predictions,Hallucinations
0,"[Microbiology, Plant biology, Agricultural sci...",{Soil science}
5,"[Ecology, Agricultural science]",{Soil microbiology}
10,"[Microbial ecology, Mapping]",{Food science}
15,"[Microbial ecology, Metabolomics]",{Forensic investigations}
16,"[Microbial ecology, Metabolomics]",{Forensic investigations}
...,...,...
686,"[Microbiology, Biology, Environmental sciences]",{Geology}
691,"[Microbiology, Ecology]",{Seasonal biology}
695,"[Microbiology, Environmental sciences]",{Soil biology}
698,"[Microbial ecology, Nutritional science]",{Obesity}


In [123]:
# Convert each row's predictions back into a set.
data['Predictions'] = data['Predictions'].apply(set)

In [124]:
# Build "<name>_processed<ext>" from the input file name. Only the extension is split
# off, so extra dots in the name are left alone and a name without an extension still
# gets the suffix instead of reusing the input's own name.
stem, extension = os.path.splitext(os.path.basename(DATA_PATH))
file_name = f'{stem}_processed{extension}'

data.to_csv(f'outputs/{file_name}', index=False)