In [1]:
import os
import pickle
import shutil
from collections import defaultdict
from typing import List

import pandas as pd
from indra.sources import indra_db_rest
from indra.statements import Statement

Assert that all genes in the three rounds of enrichment have:

1. A source INDRA statement pickle
2. An uncurated excel sheet
3. A curated excel sheet

In [8]:
# Validate every gene folder of the enrichment rounds and cross-check the
# curated spreadsheet against the pickled INDRA statements.
#
# Results that outlive this cell:
#   gene_statements         gene symbol -> list of unpickled statements
#   gene_df                 gene symbol -> curated sheet as a DataFrame
#   found_uuid_to_statement UUID found in a curated sheet -> its statement
gene_statements = {}
gene_df = {}
found_uuid_to_statement = {}

# '__file__' is a quoted string literal here, so os.path.dirname() returns ''
# and the path resolves to '../rounds' relative to the current working
# directory.
enrichment_folder = os.path.abspath(
    os.path.join(os.path.dirname('__file__'), '..', 'rounds')
)

# NOTE(review): only rounds 1 and 2 are visited, while the notebook's
# introduction mentions three rounds - confirm which one is intended.
for e in 1, 2:
    folder = os.path.join(enrichment_folder, f'enrichment-{e}')
    for gene in sorted(os.listdir(folder)):
        subfolder = os.path.join(folder, gene)

        # Skip loose files; only per-gene directories are checked.
        if not os.path.isdir(subfolder):
            continue

        # Each required file is checked against its OWN path.
        statements_path = os.path.join(subfolder, f'{gene}_statements.pkl')
        assert os.path.exists(statements_path), f'missing statements file: {subfolder}'

        uncurated_path = os.path.join(subfolder, f'{gene}.bel.tsv')
        assert os.path.exists(uncurated_path), f'missing uncurated file: {subfolder}'

        curated_path = os.path.join(subfolder, f'{gene}_curated.xlsx')
        assert os.path.exists(curated_path), f'missing curated file: {subfolder}'

        # Get all of the statements.
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only run this on pickles produced by a trusted source.
        with open(statements_path, 'rb') as f:
            gene_statements[gene] = statements = pickle.load(f)

        # Index the statements by UUID, evidence text and PMID for the
        # membership checks below.
        uuid_to_statement = {}
        evidence_text_to_statements = defaultdict(list)
        pmid_to_statements = defaultdict(list)

        for statement in statements:
            uuid_to_statement[statement.uuid] = statement

            for evidence in statement.evidence:
                evidence_text_to_statements[evidence.text].append((statement, evidence))
                pmid_to_statements[evidence.pmid].append((statement, evidence))

        # Some sheets label the PMID column 'Citation Reference'; normalise
        # the name so later cells can rely on 'PMID'.
        df = pd.read_excel(curated_path)
        if 'Citation Reference' in df.columns:
            df = df.rename(columns={'Citation Reference': 'PMID'})
        gene_df[gene] = df

        missing_uuids = []
        missing_evidences = []
        missing_pmids = []

        for idx, row in df.iterrows():
            # Rows without an INDRA UUID are not checked.
            if 'INDRA UUID' not in row or pd.isna(row['INDRA UUID']):
                continue

            uuid = row['INDRA UUID']
            statement = uuid_to_statement.get(uuid)
            if statement is None:
                missing_uuids.append((idx, row))
            else:
                found_uuid_to_statement[uuid] = statement

            if row['Evidence'] not in evidence_text_to_statements:
                missing_evidences.append((idx, row))

            # NOTE(review): the PMID read from the sheet may be numeric while
            # evidence.pmid may be a string; if so this lookup can never
            # match - confirm both sides use the same type.
            if 'PMID' in row and row['PMID'] not in pmid_to_statements:
                missing_pmids.append((idx, row))

        # A space after 'UUIDS' keeps a 3-digit evidence count from fusing
        # with the label.
        print(
            f'{e} | {gene:8} missing {len(missing_uuids):4} UUIDS '
            f'{len(missing_evidences):3} Evidences {len(missing_pmids):4}'
            f' PMIDs'
        )

1 | AGO2     missing  164 UUIDS 51 Evidences  164 PMIDs
1 | ARG1     missing   60 UUIDS 29 Evidences   60 PMIDs
1 | CD274    missing  270 UUIDS 93 Evidences  270 PMIDs
1 | CHI3L2   missing   13 UUIDS  4 Evidences   13 PMIDs
1 | CTTN     missing   54 UUIDS 16 Evidences   54 PMIDs
1 | DEFB1    missing   94 UUIDS 25 Evidences   94 PMIDs
1 | GRK2     missing  401 UUIDS122 Evidences  401 PMIDs
1 | HDLBP    missing   12 UUIDS  3 Evidences   12 PMIDs
1 | HLA-DQB1 missing    4 UUIDS  0 Evidences    4 PMIDs
1 | IL2RA    missing  487 UUIDS122 Evidences  487 PMIDs
1 | IL4R     missing  142 UUIDS 49 Evidences  142 PMIDs
1 | INA      missing   41 UUIDS  4 Evidences   41 PMIDs
1 | LAMTOR1  missing   21 UUIDS  1 Evidences   21 PMIDs
1 | MRC1     missing  169 UUIDS 66 Evidences  169 PMIDs
1 | NFE2L1   missing  203 UUIDS 63 Evidences  203 PMIDs
1 | PTGER2   missing    6 UUIDS  3 Evidences    6 PMIDs
1 | SLC39A1  missing   20 UUIDS 11 Evidences   20 PMIDs
1 | TMSB4X   missing  220 UUIDS 92 Evidences  22

AssertionError: missing statements file: /home/ddomingofernandez/Projects/hbp-results/rounds/enrichment-2/NDUFV2

In [None]:
# Export the curation columns of every per-gene sheet into a single
# tab-separated file, without writing the row index.
# The column selection raises KeyError for any sheet that lacks one of these
# three columns.
export_columns = ['INDRA UUID', 'PMID', 'Evidence']

df_iterator = (
    curated_sheet.loc[:, export_columns]
    for curated_sheet in gene_df.values()
)

combined = pd.concat(df_iterator, sort=False)
combined.to_csv('results.tsv', sep='\t', index=None)