# Statistics of the curation exercise

This notebook outlines some of the statistics of the curation exercise that are mentioned in the manuscript.

In [1]:
import sys
import time

import os
import pandas as pd

In [2]:
# Record the Python interpreter version this notebook was run with.
print(sys.version)

3.6.5 (default, Apr 20 2018, 08:54:42) 
[GCC 4.8.5 20150623 (Red Hat 4.8.5-16)]


In [3]:
# Record the local date and time at which this notebook was run.
print(time.asctime())

Wed Dec 12 15:15:40 2018


Load excel sheets

In [4]:
# The quoted '__file__' is a plain string, so dirname() returns '' and the
# result is the `data` folder one level above the current working directory.
indra_folder = os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..', 'data')
)

# CSV file with the statistics about the curation
curation_stats_path = os.path.join(indra_folder, 'curation_summary.csv')

def get_enrichment_directories(directory=None):
    """Yield the paths of the curated Excel sheets found below a data folder.

    A sheet is yielded when it matches the layout
    ``<directory>/enrichment-*/<name>/<name>_curated.xlsx``. Entries that are
    not directories, top-level folders that do not start with ``enrichment-``,
    and sub-folders without a curated sheet are skipped.

    :param directory: Folder to scan. Defaults to the notebook-level
        ``indra_folder`` when omitted.
    :return: A generator of paths to the ``*_curated.xlsx`` files, in sorted
        order of the folder names.
    """
    if directory is None:
        # Fall back to the data folder defined at the top of this cell.
        directory = indra_folder

    # os.listdir() returns entries in arbitrary order; sort for reproducible runs.
    for path in sorted(os.listdir(directory)):
        folder = os.path.join(directory, path)
        if not (os.path.isdir(folder) and path.startswith('enrichment-')):
            continue
        for subpath in sorted(os.listdir(folder)):
            subfolder = os.path.join(folder, subpath)
            if not os.path.isdir(subfolder):
                continue
            # The sheet is named after the sub-folder that contains it.
            curated_path = os.path.join(subfolder, f'{subpath}_curated.xlsx')
            if os.path.exists(curated_path):
                yield curated_path

#### Exclusion criteria
1. For GABRG1, only one statement was extracted, and that statement was wrong.
2. For SLC5A7, INDRA made the same error consistently throughout the Excel sheet, so none of the statements were correct.
3. MAPT and GSK3B were curated, but they did not follow the rational curation procedure.

In [5]:
# Gene symbols left out of the statistics (see the exclusion criteria above)
BLACKLIST = {
    'GABRG1',  # only one statement was extracted, and it was wrong
    'SLC5A7',  # the same INDRA error was repeated throughout the sheet
    'MAPT',    # curated, but not following the rational curation procedure
    'GSK3B',   # curated, but not following the rational curation procedure
}

In [6]:
# Read the per-gene summary (first column becomes the index) and treat
# missing cells as zero.
curation_df = pd.read_csv(curation_stats_path, index_col=0).fillna(0)

# Keep only the rows whose index label is not blacklisted.
curation_df = curation_df.loc[~curation_df.index.isin(BLACKLIST)]

In [7]:
# Running totals, accumulated over every row left in curation_df.
total_indra_statements = total_checked = total_correct = 0
total_recovered = total_required_curation = 0

# Rows are unpacked by position, so the six columns of the summary must come
# in exactly this order.
for gene_symbol, (correct, error, other_statement, modified, not_curated, total) in curation_df.iterrows():
    # All statements in the row
    total_indra_statements = total_indra_statements + total

    # Statements that were looked at (everything except the non-curated ones)
    total_checked = total_checked + (total - not_curated)

    # Statements counted as correct
    total_correct = total_correct + correct

    # Statements that had to be modified or replaced by another statement
    total_required_curation = total_required_curation + (modified + other_statement)

    # Total statements recovered
    total_recovered = total_recovered + (correct + modified + other_statement)

# Share of all statements that were checked
indra_coverage = total_checked / total_indra_statements

# The remaining ratios are all relative to the checked statements
indra_accuracy = total_correct / total_checked
relative_fixes = total_required_curation / total_checked
relative_recovery = total_recovered / total_checked

In [8]:
# One report line per statistic; the trailing empty string reproduces the
# blank line that ends the report.
print(
    f"BEL Statements extracted by INDRA:   {total_indra_statements:5}.",
    f"BEL Statements checked by a curator: {total_checked:5} ({indra_coverage:.1%}).",
    f"BEL Statements directly correct:     {total_correct:5} ({indra_accuracy:.1%}).",
    f"BEL Statements Fixed:                {total_required_curation:5} ({relative_fixes:.1%}).",
    f"BEL Statements Total Recovered:      {total_recovered:5} ({relative_recovery:.1%}).",
    "",
    sep="\n",
)

BEL Statements extracted by INDRA:   17096.0.
BEL Statements checked by a curator: 2989.0 (17.5%).
BEL Statements directly correct:     917.0 (30.7%).
BEL Statements Fixed:                1454.0 (48.6%).
BEL Statements Total Recovered:      2371.0 (79.3%).



In [9]:
# TODO: Add statistics about the network.