In [33]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import pandas as pd
import re

from utils.generic import get_driver, Models

In [10]:
driver = get_driver()

## Baseline: fulltext index search

Before running any complex experiments, we need a baseline to compare to. We believe a simple search by a keyword can be used as a baseline.
We will need to create a fulltext index on our knowledge graph - and then run it on the whole dataset.

In [3]:
create_fulltext_inde_query = """
    CREATE FULLTEXT INDEX diseaseIndex FOR (n:Disease) ON EACH [n.DiseaseName, n.Synonyms]
    OPTIONS {
        indexConfig: {
            `fulltext.analyzer`: 'english',
            `fulltext.eventually_consistent`: true
        }
    }
"""

In [7]:
with driver.session() as session:
    session.run(create_fulltext_inde_query)

In [8]:
df = pd.read_csv('../data/processed/ncbi_specific_disease.csv', sep=',')

In [27]:
test = df[df['Description'].str.contains("/galactose malabsorption", case=False, na=False)]

In [31]:
test.iloc[0]['Description']

'autosomal recessive disease glucose/galactose malabsorption'

In [38]:
def fulltext_search(disease_name, limit=1):
    with driver.session() as session:
        query = """
            CALL db.index.fulltext.queryNodes('diseaseIndex', $disease_name)
            YIELD node, score
            RETURN node.DiseaseID AS MESH_ID, node.DiseaseName AS Description, score
            LIMIT $limit
        """
        result = session.run(query, disease_name=disease_name, limit=limit)

        return [{'MESH_ID': record['MESH_ID'], 'Description': record['Description'], 'score': record['score']} for record in result]

The presence of special characters in the search string will be causing issues with the Lucene query parser used by Neo4j's fulltext search. In order to make it work, we will escape them in the search query using a regexp.

In [76]:
def predict_with_fulltext_index(dataset, limit=1):
    true_values = []
    predicted_values = []

    for _, row in dataset.iterrows():
        disease_name = row['Description']
        true_mesh_id = row['MESH ID']

        disease_name_re = re.sub('[^A-Za-z0-9 ]+', '', disease_name)
        
        predicted_mesh_id = fulltext_search(disease_name_re, limit)
        
        true_values.append({"MESH_ID": true_mesh_id, "Description": disease_name})
        predicted_values.append(predicted_mesh_id if len(predicted_mesh_id) > 0 else [{"MESH_ID": "Unknown", "Description": "Unknown"}])

    return true_values, predicted_values

In [77]:
[true_values, predicted_values] = predict_with_fulltext_index(df)

In [78]:
flat_predicted_values = [entry[0]['MESH_ID'] for entry in predicted_values]
flat_true_values = [entry['MESH_ID'] for entry in true_values]

And now we can calculate the performance

In [79]:
precision = precision_score(flat_true_values, flat_predicted_values, average='weighted', zero_division=0)
recall = recall_score(flat_true_values, flat_predicted_values, average='weighted', zero_division=0)
f1 = f1_score(flat_true_values, flat_predicted_values, average='weighted', zero_division=0)
accuracy = accuracy_score(flat_true_values, flat_predicted_values)

In [80]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.7939562909541406
Recall: 0.5275843599357257
F1-Score: 0.5773682235012317
Accuracy: 0.5275843599357257
