# Dataframe

This notebook reads all the BEL files contained in the curation folder and builds a DataFrame that counts, for each file, every entity and interaction type in HemeMap.

In [1]:
import os
import pybel

from pybel.struct import union 
import pandas as pd
from pandas.plotting import table
import matplotlib.pyplot as plt

from pybel.struct.summary import count_functions, edge_summary

import heme_knowledge

In [2]:
# Base directory used to resolve the curation folder. Note that '__file__' is a
# quoted string literal, not the __file__ variable: os.path.dirname('__file__')
# is '', so HERE resolves to the current working directory.
# NOTE(review): presumably intentional because __file__ is not defined inside a
# notebook kernel -- confirm the notebook is always started from its own folder.
HERE = os.path.abspath(os.path.dirname('__file__'))

# Folder holding the curated BEL files: <parent of HERE>/curation/bel
BEL_DIRECTORY = os.path.abspath(os.path.join(HERE, os.pardir, 'curation', 'bel'))

def get_bel_files():
    """Return the paths of all BEL files in the curation folder.

    Only entries of ``BEL_DIRECTORY`` whose name ends in ``.bel``
    (case-insensitive) are returned, so stray files such as editor backups or
    checkpoints are not mistaken for BEL documents. The result is sorted by
    file name because ``os.listdir`` returns entries in arbitrary order.

    :return: paths of the BEL files, each joined onto ``BEL_DIRECTORY``
    :rtype: list[str]
    """
    return [
        os.path.join(BEL_DIRECTORY, filename)
        for filename in sorted(os.listdir(BEL_DIRECTORY))
        if filename.lower().endswith('.bel')
    ]

In [3]:
def get_files_in_folder(path):
    """Return the names of the entries in a given folder, sorted by name.

    ``os.listdir`` yields entries in arbitrary, platform-dependent order;
    sorting makes everything derived from this list (e.g. the row order of a
    statistics table) reproducible between runs and machines.

    :param str path: folder path
    :return: entry names (not full paths) found directly in the folder
    :rtype: list[str]
    """
    return sorted(os.listdir(path))

In [4]:
def get_bel_types(bel_path):
    """Collect node, edge, function and relation counts for one BEL file.

    :param str bel_path: path to the BEL file to load
    :return: total number of nodes and edges, followed by the count of each
        BEL function type and of each BEL relation type found in the graph
    :rtype: dict
    """
    graph = pybel.from_path(bel_path)

    # Overall graph size goes in first so it leads the resulting dictionary
    stats = {
        'nodes': graph.number_of_nodes(),
        'edges': graph.number_of_edges(),
    }
    print(stats)

    # Fold in the BEL function type counts, then the BEL edge type counts
    for count_types in (count_functions, edge_summary.count_relations):
        stats.update(count_types(graph))

    return stats

In [5]:
# Display labels for the statistics gathered per BEL file: each key is a raw
# statistic name as it appears in the dictionary returned by get_bel_types, and
# each value is the column header used for it in the statistics DataFrame.
# get_bel_stats also takes the column order of that DataFrame from the order of
# the values below.
BEL_STATS_COLUMN_NAMES = {
    # Graph totals
    'nodes': 'Nodes',
    'edges': 'Edges',
    # BEL function (node) types
    'Protein': 'Proteins',
    'Gene': 'Genes',
    'RNA': 'RNA Entities',
    'Complex': 'Complexes',
    'Abundance': 'Compounds',
    'BiologicalProcess': 'Biological Processes',
    'Pathology':'Pathology',
    'Reaction': 'Reactions',
    # BEL relation (edge) types
    'increases': 'Increase Relations',
    'decreases': 'Decrease Relations',
    'association': 'Association Relations',
    'hasComponent': 'Component Edges',
    'hasVariant': 'Variant Edges',
    'hasReactant': 'Reactants Edges',
    'hasProduct': 'Products Edges',
    'positiveCorrelation':'Positive Correlation',
    'negativeCorrelation':'Negative Correlation',
    'causesNoChange':'CauseNoChange Relations',
    'directlyIncreases':'Directly Increase Relations'

}

In [6]:
def get_bel_stats(resource_folder):
    """Build a table of BEL node and edge type statistics for a folder.

    Every entry of ``resource_folder`` is loaded as a BEL file and contributes
    one row to the result.

    :param str resource_folder: path to the folder containing the BEL files
    :return: one row per file (indexed by file name) and one integer column
        per statistic; a statistic that does not occur in a file is 0. The
        columns follow the order of ``BEL_STATS_COLUMN_NAMES``; statistics
        without a display label are appended after them under their raw name.
        An empty folder yields a frame with these columns and no rows.
    :rtype: pandas.DataFrame
    """
    pathway_names = []
    pathway_statistics = []

    for file in get_files_in_folder(resource_folder):
        bel_statistics_dict = get_bel_types(os.path.join(resource_folder, file))

        # Translate raw statistic keys into display labels. A key without a
        # label keeps its raw name instead of raising KeyError, so a BEL
        # function or relation missing from the mapping does not abort the run.
        all_bel_statistics = {
            BEL_STATS_COLUMN_NAMES.get(key, key): value
            for key, value in bel_statistics_dict.items()
        }
        print(all_bel_statistics)

        pathway_names.append(file)
        pathway_statistics.append(all_bel_statistics)

    # Known labels first, in their declared order, then any unlabelled
    # statistics in the order they were first encountered.
    columns = list(BEL_STATS_COLUMN_NAMES.values())
    for all_bel_statistics in pathway_statistics:
        for label in all_bel_statistics:
            if label not in columns:
                columns.append(label)

    # Build the frame once from all rows: DataFrame.append was removed in
    # pandas 2.0, and growing a frame file by file copies it on every step.
    df = pd.DataFrame(pathway_statistics, index=pathway_names, columns=columns)

    # Statistics absent from a file come out as NaN; report them as 0 counts.
    return df.fillna(0).astype(int)

In [7]:
# Compute the statistics table for every file in the BEL curation folder
heme_statistics_df = get_bel_stats(BEL_DIRECTORY)

{'nodes': 25, 'edges': 65}
{'Nodes': 25, 'Edges': 65, 'Compounds': 8, 'Pathology': 5, 'Proteins': 6, 'Biological Processes': 4, 'Complexes': 1, 'Reactions': 1, 'Positive Correlation': 50, 'Negative Correlation': 10, 'Component Edges': 2, 'Reactants Edges': 2, 'Products Edges': 1}
{'nodes': 55, 'edges': 133}
{'Nodes': 55, 'Edges': 133, 'Compounds': 29, 'Proteins': 5, 'Pathology': 7, 'Reactions': 6, 'Complexes': 4, 'Biological Processes': 3, 'RNA Entities': 1, 'Positive Correlation': 60, 'Increase Relations': 24, 'Negative Correlation': 8, 'Decrease Relations': 16, 'Reactants Edges': 11, 'Products Edges': 6, 'Component Edges': 8}
{'nodes': 13, 'edges': 21}
{'Nodes': 13, 'Edges': 21, 'Compounds': 8, 'Biological Processes': 2, 'Proteins': 2, 'Complexes': 1, 'Increase Relations': 4, 'Positive Correlation': 4, 'Decrease Relations': 11, 'Component Edges': 2}
{'nodes': 13, 'edges': 50}
{'Nodes': 13, 'Edges': 50, 'Proteins': 4, 'Biological Processes': 2, 'Compounds': 7, 'Negative Correlation': 

In [8]:
# Display the per-file statistics table (one row per BEL file)
heme_statistics_df

Unnamed: 0,Nodes,Edges,Proteins,Genes,RNA Entities,Complexes,Compounds,Biological Processes,Pathology,Reactions,...,Decrease Relations,Association Relations,Component Edges,Variant Edges,Reactants Edges,Products Edges,Positive Correlation,Negative Correlation,CauseNoChange Relations,Directly Increase Relations
heme_19276082.bel,25,65,6,0,0,1,8,4,5,1,...,0,0,2,0,2,1,50,10,0,0
heme_20378845.bel,55,133,5,0,1,4,29,3,7,6,...,16,0,8,0,11,6,60,8,0,0
heme_22954673.bel,13,21,2,0,0,1,8,2,0,0,...,11,0,2,0,0,0,4,0,0,0
heme_23215741.bel,13,50,4,0,0,0,7,2,0,0,...,0,0,0,0,0,0,10,38,2,0
heme_24464629.bel,21,68,11,0,0,0,4,2,4,0,...,1,0,0,0,0,0,54,8,0,0
heme_24486321.bel,23,33,5,0,0,1,10,0,6,1,...,10,0,2,0,3,1,14,0,0,0
heme_24489717.bel,17,26,6,0,0,0,6,4,1,0,...,4,0,0,0,0,0,16,2,1,0
heme_24553061.bel,9,13,3,0,0,2,2,0,2,0,...,0,0,5,0,0,0,8,0,0,0
heme_24630724.bel,10,17,7,1,0,0,1,1,0,0,...,2,0,0,0,0,0,8,0,0,0
heme_24667910.bel,11,31,4,0,0,0,4,0,3,0,...,0,0,0,0,0,0,22,6,0,0


In [13]:
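# Uncomment to export the statistics table to disk. Note that the output is
# tab-separated even though the file name ends in ".csv".
#heme_statistics_df.to_csv("heme_statistics.csv", sep='\t', encoding='utf-8')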