In [198]:
import pandas as pd
import re
import os

In [221]:
# CSV to post-process; the cells below read and rewrite its 'Predictions' column.
DATA_PATH = 'outputs/vivli.csv'

# lineterminator="\n" tells the parser that rows end at "\n".
# NOTE(review): presumably chosen so other line-break characters inside fields
# are not treated as row ends -- confirm against how the file was written.
data = pd.read_csv(DATA_PATH, lineterminator="\n")

# Quick sanity check on how many rows were read.
print(f'Loaded dataset with {len(data)} rows')

Loaded dataset with 7210 rows


In [222]:
def _is_quoted(topic):
    """Return True when `topic` is wrapped in a pair of double quotes."""
    # len >= 2 so that a lone '"' is not treated as an (empty) quoted topic.
    return len(topic) >= 2 and topic.startswith('"') and topic.endswith('"')


# Read one topic per line. Blank lines are skipped: an empty string in
# `edam_topics` would make empty predictions look like valid EDAM topics in the
# membership tests further down.
with open('EDAM/edam_topics.txt', 'r') as f:
    raw_topics = [line.strip() for line in f if line.strip()]

# Topics that are quoted in the file, kept WITH their quotes.
quoted_topics = [topic for topic in raw_topics if _is_quoted(topic)]

# Every topic with the surrounding quotes removed.
edam_topics = [topic[1:-1] if _is_quoted(topic) else topic for topic in raw_topics]

## Format Outputs

Split outputs on tab, and check for other separators that GPT may have used in error.

In [223]:
# Turn the literal two-character sequence backslash + 't' into a real tab.
# regex=False is explicit on purpose: when the pattern is interpreted as a
# regular expression, '\\t' matches an actual tab character and the call
# becomes a no-op. The default for `regex` differs between pandas versions.
data['Predictions'] = data['Predictions'].str.replace('\\t', '\t', regex=False)


In [224]:
def split_topics(topics, quoted=None):
    """Split a tab-separated prediction string into a set of topic strings.

    Each piece is stripped of surrounding whitespace. A piece that exactly
    matches an entry of `quoted` keeps its double quotes; double quotes are
    removed from every other piece.

    Previously the `quoted` check was a comprehension filter, so pieces that
    exactly matched a quoted topic were discarded instead of kept.

    Parameters
    ----------
    topics : str
        Raw prediction text. Non-string values (e.g. NaN from a missing cell)
        yield an empty set instead of raising.
    quoted : collection of str, optional
        Quoted topics (quotes included). Defaults to the notebook-level
        `quoted_topics`.

    Returns
    -------
    set of str
    """
    if quoted is None:
        quoted = quoted_topics
    if not isinstance(topics, str):
        return set()

    cleaned_topics = set()
    for piece in topics.split('\t'):
        piece = piece.strip()
        cleaned_topics.add(piece if piece in quoted else piece.replace('"', ''))
    return cleaned_topics


# Parse every raw prediction string into a set of topics.
data['Predictions'] = data['Predictions'].map(split_topics)

In [225]:
# Inspect the parsed prediction sets (pandas truncates the display).
data['Predictions']

0            {Pediatrics, Immunology, Infectious disease}
1            {Drug development, Clinical study, Oncology}
2        {Treatment outcomes, Drug development, Oncology}
3               {Clinical trials; Diabetes; Pharmacology}
4          {Pharmacology, Biology, Personalised medicine}
                              ...                        
7205           {Clinical trial\nCardiology\nHypertension}
7206      {Clinical trials\nDrug development\nPsychiatry}
7207         {Psychiatry, Clinical study, Human genetics}
7208    {Preclinical and clinical studies, Drug develo...
7209          {Clinical trials, Psychiatry, Pharmacology}
Name: Predictions, Length: 7210, dtype: object

In [226]:
## Capture any weirdly formatted outputs (using the wrong separators)

# Separators tried, in this order, when a row produced a single unsplit string.
# Three spaces comes before two spaces so the longer run is tried first.
FALLBACK_SEPARATORS = ('   ', '\n', '  ', '<TAB>', ';')

# "Category N:" labels that sometimes precede a topic.
CATEGORY_PREFIX = re.compile(r'Category \d+:')


def resplit_prediction(preds):
    """Re-split a single-item prediction on the first fallback separator found.

    Parameters
    ----------
    preds : collection of str
        Topics parsed so far for one row.

    Returns
    -------
    list of str
        If `preds` holds exactly one string containing one of
        FALLBACK_SEPARATORS, that string split on the first such separator;
        otherwise the items of `preds` unchanged. "Category N:" labels are
        removed from every item. An empty `preds` gives an empty list (the
        previous inline version raised IndexError on it).
    """
    items = list(preds)
    if len(items) == 1:
        sole = items[0]
        for separator in FALLBACK_SEPARATORS:
            if separator in sole:
                items = sole.split(separator)
                break
    # Whitespace left around the pieces is not stripped here.
    return [CATEGORY_PREFIX.sub('', item) for item in items]


data['Predictions'] = data['Predictions'].apply(resplit_prediction)

In [227]:
def process_predictions(predictions, quoted=None):
    """Split comma-separated predictions without breaking quoted topics.

    Some EDAM topics contain commas and are stored wrapped in double quotes.
    For each prediction string:

    1. If the unquoted text of a quoted topic occurs in it, the quotes are put
       back around that text (only for the first quoted topic that matches,
       and only when the quoted form is not already present, so topics are
       never double-quoted).
    2. The string is split on commas, except inside a quoted span. Quoted
       spans are kept with their quotes.

    Pieces are stripped of surrounding whitespace and empty pieces are dropped.
    The previous version left text outside a quoted span unsplit (for example
    'Foo, ' and ', Bar' survived as tokens) and kept empty strings.

    Parameters
    ----------
    predictions : iterable of str
        Prediction strings for one row.
    quoted : collection of str, optional
        Quoted topics (quotes included). Defaults to the notebook-level
        `quoted_topics`.

    Returns
    -------
    set of str
    """
    if quoted is None:
        quoted = quoted_topics

    requoted = []
    for prediction in predictions:
        for topic in quoted:
            bare = topic.replace('"', '')
            if bare in prediction:
                # Only add quotes when the quoted form is not already there.
                if topic not in prediction:
                    prediction = prediction.replace(bare, topic)
                break
        requoted.append(prediction)

    final_predictions = set()
    for prediction in requoted:
        if '"' in prediction:
            # Either a complete "quoted span", or a run of characters that
            # contains neither a quote nor a comma.
            pieces = re.findall(r'"[^"]+"|[^",]+', prediction)
        else:
            pieces = prediction.split(',')
        final_predictions.update(piece.strip() for piece in pieces if piece.strip())
    return final_predictions

# Re-quote comma-containing topics and split each row's predictions on commas.
data['Predictions'] = data['Predictions'].map(process_predictions)



In [228]:
# Rows that still hold at most one prediction after all the splitting above.
has_at_most_one = data['Predictions'].apply(len) <= 1
data.loc[has_at_most_one, 'Predictions'].values

array([{'Psychiatry'}, {'Psychiatry'}, {'Pediatrics'},
       {'Infectious diseaseImmunologyVaccinology'}, {'Psychiatry'},
       {'Psychiatry'},
       {'Pharmacology Immunology Hepatic and biliary medicine'},
       {'Psychiatry'}, {'Ophthalmology'}, {'Psychiatry'}], dtype=object)

## Hallucinations

Filter out topics not in the EDAM topics list. The filtered topics may be matched to a topic or synonym->topic in the next section.

In [229]:
# Set copy of the topic list so each membership test is a hash lookup.
edam_topic_lookup = set(edam_topics)


def _normalise_for_lookup(pred):
    """Remove full stops and double quotes before comparing with EDAM topics."""
    return pred.replace('.', '').replace('"', '')


# Predictions that, once normalised, are not EDAM topics. Each prediction is
# normalised once instead of once for the test and again for the stored value.
data['Hallucinations'] = data['Predictions'].apply(
    lambda preds: {
        cleaned
        for cleaned in map(_normalise_for_lookup, preds)
        if cleaned not in edam_topic_lookup
    }
)

In [230]:
# Set copy of the topic list so each membership test is a hash lookup.
valid_topic_lookup = set(edam_topics)


def _keep_valid_topics(preds):
    """Return the normalised predictions (no '.' or '"') that are EDAM topics."""
    normalised = (pred.replace('.', '').replace('"', '') for pred in preds)
    return {pred for pred in normalised if pred in valid_topic_lookup}


data['Predictions'] = data['Predictions'].apply(_keep_valid_topics)

# Drop anything also recorded as a hallucination for the same row. When the
# cells run top to bottom the two sets are already disjoint, so this is a guard.
data['Predictions'] = data.apply(lambda row: row['Predictions'] - row['Hallucinations'], axis=1)

## Synonym matching

Check for misspelled or misformatted topics and synonyms using the Levenshtein (edit) distance.

In [231]:
# Load the EDAM table and keep only rows whose class ID contains 'topic'.
edam_table = pd.read_csv('EDAM/EDAM.csv')
topic_rows = edam_table['Class ID'].str.contains('topic')
edam = edam_table[topic_rows].reset_index(drop=True)

# Of those, keep only the topics present in the curated topic list.
wanted_labels = [topic.replace('\"', '') for topic in edam_topics]
in_topic_list = edam['Preferred Label'].isin(wanted_labels)
edam = edam[in_topic_list].reset_index(drop=True)

In [232]:
def _split_synonyms(cell):
    """Turn a '|'-separated synonym string into a list ('' gives an empty list)."""
    if cell != '':
        return cell.split('|')
    return []


# Missing synonym cells are treated as "no synonyms".
edam['Synonyms'] = edam['Synonyms'].fillna('').map(_split_synonyms)

In [233]:
# Topics from the curated list that have no row in the EDAM table.
expected_labels = {topic.replace('\"', '') for topic in edam_topics}
missing_topics = expected_labels.difference(edam['Preferred Label'])
missing_topics

set()

In [234]:
# Map every synonym to its topic's preferred label. If a synonym appears under
# more than one topic, the later row wins.
synonym_dict = {
    synonym: preferred_label
    for preferred_label, synonyms in zip(edam['Preferred Label'], edam['Synonyms'])
    for synonym in synonyms
}

In [235]:
# Inspect the synonym -> preferred label mapping.
synonym_dict

{'Molecular docking': 'Molecular modelling',
 'Homology modeling': 'Molecular modelling',
 'Docking': 'Molecular modelling',
 'Comparative modelling': 'Molecular modelling',
 'Homology modelling': 'Molecular modelling',
 'Evolution': 'Evolutionary biology',
 'Freshwater science': 'Freshwater biology',
 'Nutrition': 'Nutritional science',
 'Nutrition science': 'Nutritional science',
 'Dietetics': 'Nutritional science',
 'Cardiovascular medicine': 'Cardiology',
 'Heart disease': 'Cardiology',
 'Cardiovascular disease': 'Cardiology',
 'Gene features': 'Gene structure',
 'Fusion genes': 'Gene structure',
 'Transcriptome': 'Transcriptomics',
 'Comparative transcriptomics': 'Transcriptomics',
 'Ancestral genomes': 'Paleogenomics',
 'Paleogenetics': 'Paleogenomics',
 'Panomics': 'Multiomics',
 'Pan-omics': 'Multiomics',
 'Integrative omics': 'Multiomics',
 'Multi-omics': 'Multiomics',
 'RNA metabarcoding': 'Metabarcoding',
 'DNA metabarcoding': 'Metabarcoding',
 'eDNA metabarcoding': 'Metabar

In [236]:
import Levenshtein

# Largest edit distance accepted when matching against topic names / synonyms.
MAX_TOPIC_DISTANCE = 2
MAX_SYNONYM_DISTANCE = 1


def closest_within(target, candidates, max_distance, min_distance=0):
    """Return the candidate closest to `target` by Levenshtein distance.

    Only candidates whose distance lies in [min_distance, max_distance] are
    considered. Ties go to the candidate that comes first in `candidates`.
    Returns None when no candidate qualifies, including when `candidates` is
    empty.
    """
    best, best_distance = None, max_distance + 1
    for candidate in candidates:
        distance = Levenshtein.distance(target, candidate)
        if min_distance <= distance < best_distance:
            best, best_distance = candidate, distance
    return best


hallucinations = data['Hallucinations']

matched_topics = {}
# Hallucinations already looked up, matched or not, so each distinct string is
# compared against the candidate lists only once.
checked = set()

for hallucination_set in hallucinations:
    for hallucination in hallucination_set:
        if hallucination in checked:
            continue
        checked.add(hallucination)

        # 1. An exact synonym wins over any fuzzy match. Previously a topic
        #    name within two edits was preferred even when the hallucination
        #    was itself a listed synonym.
        # NOTE(review): the old output maps 'Nephrology' to 'Neurology'; this
        # ordering only changes that if 'Nephrology' is a listed synonym.
        if hallucination in synonym_dict:
            matched_topics[hallucination] = synonym_dict[hallucination]
            continue

        # 2. Closest topic name within MAX_TOPIC_DISTANCE edits. Distance 0 is
        #    excluded, matching the original `0 < distance` condition.
        topic = closest_within(hallucination, edam_topics, MAX_TOPIC_DISTANCE, min_distance=1)
        if topic is not None:
            matched_topics[hallucination] = topic
            continue

        # 3. Closest synonym within MAX_SYNONYM_DISTANCE edits.
        synonym = closest_within(hallucination, synonym_dict, MAX_SYNONYM_DISTANCE)
        if synonym is not None:
            matched_topics[hallucination] = synonym_dict[synonym]

matched_topics

{'Pediatrics': 'Paediatrics',
 'Clinical study': 'Preclinical and clinical studies',
 'Clinical trials': 'Preclinical and clinical studies',
 'Nephrology': 'Neurology',
 'Epidemiology': 'Public health and epidemiology',
 'Clinical studies': 'Preclinical and clinical studies',
 'Clinical medicine': 'Medicine',
 'Pharmacokinetics': 'Drug metabolism',
 'Drug development \\': 'Drug development',
 'Hematology': 'Haematology',
 'Pharmacodynamics': 'Drug metabolism',
 'Pulmonology': 'Respiratory medicine',
 'I mmunology': 'Immunology',
 'Rheumatology': 'Musculoskeletal medicine',
 'Transplantation': 'Surgery',
 'Clinical trial': 'Preclinical and clinical studies',
 'Irnmunology': 'Immunology',
 'Clinical immunology': 'Allergy, clinical immunology and immunotherapeutics',
 'Health sciences': 'Biomedical science',
 'Aging': 'Geriatric medicine',
 'Drug delivery': 'Biotherapeutics',
 'IImmunology': 'Immunology',
 'Public Health and Epidemiology': 'Public health and epidemiology',
 'Clinical Stud

In [237]:
# Add the matched EDAM topic for EVERY matched hallucination in a row.
# The earlier version stopped after the first match; because it iterated over
# a set, which match was applied depended on set ordering.
for index, row in data.iterrows():
    # sorted() keeps the log order stable between runs.
    for hallucination in sorted(row['Hallucinations']):
        if hallucination in matched_topics:
            print(f"'{hallucination}' in row {index} matches topic '{matched_topics[hallucination]}'")
            # The cell holds a set, so .add() updates the frame in place.
            data.at[index, 'Predictions'].add(matched_topics[hallucination])

'Pediatrics' in row 0 matches topic 'Paediatrics'
'Clinical study' in row 1 matches topic 'Preclinical and clinical studies'
'Clinical trials' in row 3 matches topic 'Preclinical and clinical studies'
'Clinical study' in row 7 matches topic 'Preclinical and clinical studies'
'Clinical trials' in row 10 matches topic 'Preclinical and clinical studies'
'Clinical trials' in row 12 matches topic 'Preclinical and clinical studies'
'Nephrology' in row 13 matches topic 'Neurology'
'Clinical study' in row 16 matches topic 'Preclinical and clinical studies'
'Clinical studies' in row 17 matches topic 'Preclinical and clinical studies'
'Clinical trials' in row 18 matches topic 'Preclinical and clinical studies'
'Pediatrics' in row 19 matches topic 'Paediatrics'
'Clinical medicine' in row 20 matches topic 'Medicine'
'Clinical study' in row 21 matches topic 'Preclinical and clinical studies'
'Clinical medicine' in row 23 matches topic 'Medicine'
'Clinical trials' in row 25 matches topic 'Preclinica

In [238]:
# Add quotes back in for the predictions with commas (i.e, "Data submission, annotation, and curation")
def _requote(pred):
    """Return `pred` wrapped in double quotes if that form is a quoted topic."""
    wrapped = f'"{pred}"'
    if wrapped in quoted_topics:
        return wrapped
    return pred


data['Predictions'] = data['Predictions'].apply(lambda preds: [_requote(pred) for pred in preds])

In [239]:
# Rows with at least one hallucinated topic, next to their final predictions.
has_hallucination = data['Hallucinations'].apply(len) > 0
data.loc[has_hallucination, ['Predictions', 'Hallucinations']]

Unnamed: 0,Predictions,Hallucinations
0,"[Paediatrics, Immunology, Infectious disease]",{Pediatrics}
1,"[Drug development, Oncology, Preclinical and c...",{Clinical study}
2,"[Drug development, Oncology]",{Treatment outcomes}
3,"[Preclinical and clinical studies, Pharmacology]","{Diabetes, Clinical trials}"
7,"[Preclinical and clinical studies, Pharmacolog...",{Clinical study}
...,...,...
7204,"[Drug development, Pharmacology, Preclinical a...",{Clinical trials}
7205,"[Preclinical and clinical studies, Cardiology]","{Hypertension, Clinical trial}"
7206,"[Psychiatry, Drug development, Preclinical and...",{Clinical trials}
7207,"[Psychiatry, Preclinical and clinical studies,...",{Clinical study}


In [240]:
# Convert each row's prediction list back into a set.
data['Predictions'] = data['Predictions'].map(set)

In [241]:
# Build "<stem>_processed<ext>" from the input file name. splitext only touches
# the final extension, whereas replacing every '.' would also rewrite dots
# elsewhere in the name (e.g. "run.v2.csv").
stem, extension = os.path.splitext(os.path.basename(DATA_PATH))
file_name = f'{stem}_processed{extension}'

# NOTE(review): 'Predictions' and 'Hallucinations' hold Python sets, which are
# written to the CSV in their string form -- confirm downstream readers expect that.
data.to_csv(f'outputs/{file_name}', index=False)