# Exploratory Data Analysis: Networks Dataframe (from Omnipath database)

[//]: # (------------------------------------------    DO NOT MODIFY THIS    ------------------------------------------)
<style type="text/css">
.tg  {border-collapse:collapse;
      border-spacing:0;
     }
.tg td{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg th{border-color:black;
       border-style:solid;
       border-width:1px;
       font-family:Arial, sans-serif;
       font-size:14px;
       font-weight:normal;
       overflow:hidden;
       padding:10px 5px;
       word-break:normal;
      }
.tg .tg-fymr{border-color:inherit;
             font-weight:bold;
             text-align:left;
             vertical-align:top
            }
.tg .tg-0pky{border-color:inherit;
             text-align:left;
             vertical-align:top
            }
[//]: # (--------------------------------------------------------------------------------------------------------------)

[//]: # (-------------------------------------    FILL THIS OUT WITH YOUR DATA    -------------------------------------)
</style>
<table class="tg">
    <tbody>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Title:</td>
        <td class="tg-0pky">Exploratory Data Analysis: Networks Dataframe (from Omnipath database)</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Authors:</td>
        <td class="tg-0pky">
            <a href="https://github.com/ecarrenolozano" target="_blank" rel="noopener noreferrer">Edwin Carreño</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Affiliations:</td>
        <td class="tg-0pky">
            <a href="https://www.ssc.uni-heidelberg.de/en" target="_blank" rel="noopener noreferrer">Scientific Software Center</a>,
            <a href="https://saezlab.org/" target="_blank" rel="noopener noreferrer">Saez-Rodriguez Group</a>
        </td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Date Created:</td>
        <td class="tg-0pky">19.03.2025</td>
      </tr>
      <tr>
        <td class="tg-fymr" style="font-weight: bold">Description:</td>
        <td class="tg-0pky">Extraction of metadata for building database tables </td>
      </tr>
    </tbody>
</table>

[//]: # (--------------------------------------------------------------------------------------------------------------)

## Overview


In this section you should introduce the purpose of this Notebook and list the outcomes the user is expected to achieve (especially if this is a tutorial) at the end of running all the cells.

## Setup (if required)

If your code requires dependencies to be installed before the main code runs, please add the commands that install them.

### Pandas installation

In [1]:
!pip install pandas -q

## Importing Libraries

In [2]:
"""
Recommendations:
    - Respect the order of the imports, they are indicated by the numbers 1, 2, 3.
    - One import per line is recommended, with this we can track easily any modified line when we use git.
    - Absolute imports are recommended (see 3. Local application/library specific imports below), they improve readability and give better error messages.
    - You should put a blank line between each group of imports.
"""

# future-imports (for instance: from __future__ import barry_as_FLUFL)
# from __future__ import barry_as_FLUFL  

# 1. Standard library imports
import os

# 2. Related third party imports
import numpy as np
import pandas as pd
from pydantic import BaseModel
from pydantic import Field

# 3. Local application/library specific imports
# import <mypackage>.<MyClass>         # this is an example
# from <mypackage> import <MyClass>    # this is another example 

## Introduction

## Section 1: Point to "Interactions Dataframe"

This section serves to explain a topic or give some background. Do not hesitate to include images and LaTeX equations if you need them.

### Subsection 1.1: Setting dataset path

In [3]:
# Relative path to the compressed OmniPath interactions archive.
# The path is built from separate components so os.path.join inserts the platform's separator.
interactions_path = os.path.join(
    "..",
    "data",
    "omnipath_archive",
    "omnipath_webservice_interactions__latest.tsv.gz",
)

In [4]:
# Report whether the dataset file can be found before trying to load it.
print(f"This file exist? {os.path.exists(interactions_path)}")

This file exist? True


### Subsection 1.2: Exploratory Data Analysis

### Configuring Pandas view

In [5]:
# Lift the display limits on cell width and on the number of columns shown.
for display_option in ("display.max_colwidth", "display.max_columns"):
    pd.set_option(display_option, None)

### Load data into Pandas Dataframe (without predefined data types)

In [19]:
# Load the tab-separated archive without predefined dtypes.
# keep_default_na=False keeps blank fields as empty strings instead of turning them into NaN.
interactions_df = pd.read_table(
    interactions_path,
    keep_default_na=False,
)

In [20]:
# Preview the first 20 rows of the loaded frame.
interactions_df.iloc[:20]

Unnamed: 0,source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,omnipath,kinaseextra,ligrecextra,pathwayextra,mirnatarget,dorothea,collectri,tf_target,lncrna_mrna,tf_mirna,small_molecule,dorothea_curated,dorothea_chipseq,dorothea_tfbs,dorothea_coexp,dorothea_level,type,curation_effort,extra_attrs,evidences,ncbi_tax_id_source,entity_type_source,ncbi_tax_id_target,entity_type_target
0,P0DP23,P48995,CALM1,TRPC1,1,0,1,1,0,1,TRIP,TRIP:11290752;TRIP:11983166;TRIP:12601176,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,3,"{""TRIP_method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}","{""id_a"":""P0DP23"",""id_b"":""P48995"",""positive"":[],""negative"":[{""resource"":""TRIP"",""references"":[""11983166"",""12601176"",""11290752""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}}],""directed"":[],""undirected"":[]}",9606,protein,9606,protein
1,P0DP25,P48995,CALM3,TRPC1,1,0,1,1,0,1,TRIP,TRIP:11290752;TRIP:11983166;TRIP:12601176,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,3,"{""TRIP_method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}","{""id_a"":""P0DP25"",""id_b"":""P48995"",""positive"":[],""negative"":[{""resource"":""TRIP"",""references"":[""11983166"",""12601176"",""11290752""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}}],""directed"":[],""undirected"":[]}",9606,protein,9606,protein
2,P0DP24,P48995,CALM2,TRPC1,1,0,1,1,0,1,TRIP,TRIP:11290752;TRIP:11983166;TRIP:12601176,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,3,"{""TRIP_method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}","{""id_a"":""P0DP24"",""id_b"":""P48995"",""positive"":[],""negative"":[{""resource"":""TRIP"",""references"":[""11983166"",""12601176"",""11290752""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Calcium measurement"",""Fluorescence probe labeling"",""Fusion protein-pull down assay"",""Patch clamp""]}}],""directed"":[],""undirected"":[]}",9606,protein,9606,protein
3,Q03135,P48995,CAV1,TRPC1,1,1,0,1,1,0,DIP;HPRD;IntAct;Lit-BM-17;TRIP,DIP:19897728;HPRD:12732636;IntAct:19897728;Lit-BM-17:10980191;Lit-BM-17:19052258;Lit-BM-17:19897728;TRIP:12732636;TRIP:14551243;TRIP:16822931;TRIP:18430726;TRIP:19052258;TRIP:19351713;TRIP:19897728,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,13,"{""TRIP_method"":[""Cell surface biotinylation"",""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Fluorescence resonance energy transfer"",""Yeast two-hybrid"",""Fusion protein-pull down assay""]}","{""id_a"":""Q03135"",""id_b"":""P48995"",""positive"":[{""resource"":""TRIP"",""references"":[""19351713"",""19052258"",""12732636"",""14551243"",""19897728"",""18430726"",""16822931""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Cell surface biotinylation"",""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Fluorescence resonance energy transfer"",""Yeast two-hybrid"",""Fusion protein-pull down assay""]}}],""negative"":[],""directed"":[],""undirected"":[{""resource"":""DIP"",""references"":[""19897728""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""fluorescent resonance energy transfer"",""anti bait coimmunoprecipitation"",""two hybrid""],""type"":[""direct interaction"",""physical association""],""id"":""DIP-104198E""}},{""resource"":""IntAct"",""references"":[""19897728""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""fluorescent resonance energy transfer"",""two hybrid"",""anti bait coimmunoprecipitation""]}},{""resource"":""Lit-BM-17"",""references"":[""10980191"",""19897728"",""19052258""],""dataset"":""omnipath"",""via"":null,""attrs"":{""mentha_score"":0.765}},{""resource"":""HPRD"",""references"":[""12732636""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""in vitro"",""in vivo"",""yeast 2-hybrid""]}}]}",9606,protein,9606,protein
4,P14416,P48995,DRD2,TRPC1,1,1,0,1,1,0,TRIP,TRIP:18261457,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,1,"{""TRIP_method"":[""Cell surface biotinylation"",""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Yeast two-hybrid"",""Fusion protein-pull down assay""]}","{""id_a"":""P14416"",""id_b"":""P48995"",""positive"":[{""resource"":""TRIP"",""references"":[""18261457""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Cell surface biotinylation"",""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Yeast two-hybrid"",""Fusion protein-pull down assay""]}}],""negative"":[],""directed"":[],""undirected"":[]}",9606,protein,9606,protein
5,P48995,Q86YM7,TRPC1,HOMER1,0,0,0,0,0,0,HPRD;TRIP,HPRD:14505576;TRIP:14505576;TRIP:16905188;TRIP:22506990,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,4,"{""TRIP_method"":[""Co-immunoprecipitation""],""HPRD_method"":[""in vitro"",""in vivo""]}","{""id_a"":""Q86YM7"",""id_b"":""P48995"",""positive"":[],""negative"":[],""directed"":[],""undirected"":[{""resource"":""TRIP"",""references"":[""16905188"",""14505576"",""22506990""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Co-immunoprecipitation""]}},{""resource"":""HPRD"",""references"":[""14505576""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""in vitro"",""in vivo""]}}]}",9606,protein,9606,protein
6,Q99750,P48995,MDFI,TRPC1,1,0,1,1,0,1,HPRD;TRIP,HPRD:14530267;TRIP:14530267;TRIP:23770672,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,3,"{""TRIP_method"":[""Yeast two-hybrid"",""Co-immunoprecipitation"",""Fusion protein-pull down assay"",""Patch clamp""]}","{""id_a"":""Q99750"",""id_b"":""P48995"",""positive"":[],""negative"":[{""resource"":""TRIP"",""references"":[""23770672"",""14530267""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Yeast two-hybrid"",""Co-immunoprecipitation"",""Fusion protein-pull down assay"",""Patch clamp""]}}],""directed"":[],""undirected"":[{""resource"":""HPRD"",""references"":[""14530267""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""in vitro"",""in vivo"",""yeast 2-hybrid""]}}]}",9606,protein,9606,protein
7,Q14571,P48995,ITPR2,TRPC1,1,1,0,1,1,0,HPRD;TRIP,HPRD:10970773;HPRD:11336651;TRIP:10970773;TRIP:11336651;TRIP:12196544;TRIP:15121806;TRIP:16870612;TRIP:16905188;TRIP:17681754;TRIP:18068335;TRIP:18191041;TRIP:18249094;TRIP:23228564,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,13,"{""TRIP_method"":[""Calcium measurement"",""Co-immunoprecipitation"",""Patch clamp""]}","{""id_a"":""Q14571"",""id_b"":""P48995"",""positive"":[{""resource"":""TRIP"",""references"":[""18068335"",""15121806"",""18249094"",""18191041"",""16870612"",""17681754"",""10970773"",""11336651"",""16905188"",""12196544"",""23228564""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Calcium measurement"",""Co-immunoprecipitation"",""Patch clamp""]}}],""negative"":[],""directed"":[],""undirected"":[{""resource"":""HPRD"",""references"":[""11336651"",""10970773""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""in vitro"",""in vivo""]}}]}",9606,protein,9606,protein
8,P48995,Q14573,TRPC1,ITPR3,0,0,0,0,0,0,HPRD;TRIP,HPRD:10980191;HPRD:11290752;TRIP:11290752;TRIP:14505576;TRIP:15654973;TRIP:19052258,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,6,"{""TRIP_method"":[""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Fusion protein-pull down assay""],""HPRD_method"":[""in vitro"",""in vivo""]}","{""id_a"":""Q14573"",""id_b"":""P48995"",""positive"":[],""negative"":[],""directed"":[],""undirected"":[{""resource"":""TRIP"",""references"":[""15654973"",""11290752"",""14505576"",""19052258""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Co-immunoprecipitation"",""Co-immunofluorescence staining"",""Fusion protein-pull down assay""]}},{""resource"":""HPRD"",""references"":[""11290752"",""10980191""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""in vitro"",""in vivo""]}}]}",9606,protein,9606,protein
9,P29966,P48995,MARCKS,TRPC1,1,0,1,1,0,1,TRIP,TRIP:24022404,True,False,False,False,False,False,False,False,False,False,False,,,,,,post_translational,1,"{""TRIP_method"":[""Co-immunoprecipitation"",""Patch clamp""]}","{""id_a"":""P29966"",""id_b"":""P48995"",""positive"":[],""negative"":[{""resource"":""TRIP"",""references"":[""24022404""],""dataset"":""omnipath"",""via"":null,""attrs"":{""method"":[""Co-immunoprecipitation"",""Patch clamp""]}}],""directed"":[],""undirected"":[]}",9606,protein,9606,protein


In [21]:
# Print the column list with non-null counts and inferred dtypes.
interactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1217900 entries, 0 to 1217899
Data columns (total 36 columns):
 #   Column                 Non-Null Count    Dtype 
---  ------                 --------------    ----- 
 0   source                 1217900 non-null  object
 1   target                 1217900 non-null  object
 2   source_genesymbol      1217900 non-null  object
 3   target_genesymbol      1217900 non-null  object
 4   is_directed            1217900 non-null  int64 
 5   is_stimulation         1217900 non-null  int64 
 6   is_inhibition          1217900 non-null  int64 
 7   consensus_direction    1217900 non-null  int64 
 8   consensus_stimulation  1217900 non-null  int64 
 9   consensus_inhibition   1217900 non-null  int64 
 10  sources                1217900 non-null  object
 11  references             1217900 non-null  object
 12  omnipath               1217900 non-null  bool  
 13  kinaseextra            1217900 non-null  bool  
 14  ligrecextra            1217900 non

### Extract Metadata

In [22]:
# Per-column summary: name, dtype, whether any value is missing, and number of distinct values.
# The frame was read with keep_default_na=False, so blank fields are empty strings rather than
# NaN; a column is therefore flagged as nullable when it holds a NaN *or* an empty string.
has_missing = interactions_df.isnull() | interactions_df.eq("")
metadata = pd.DataFrame({
    'Column Name': interactions_df.columns,
    'Data Type': interactions_df.dtypes.values,
    'Nullable': has_missing.any().values,
    'Unique Values': interactions_df.nunique().values,
})

In [23]:
# Display the per-column metadata table.
metadata

Unnamed: 0,Column Name,Data Type,Nullable,Unique Values
0,source,object,False,27944
1,target,object,False,47976
2,source_genesymbol,object,False,22117
3,target_genesymbol,object,False,36930
4,is_directed,int64,False,2
5,is_stimulation,int64,False,2
6,is_inhibition,int64,False,2
7,consensus_direction,int64,False,2
8,consensus_stimulation,int64,False,2
9,consensus_inhibition,int64,False,2


In [29]:
# Column under inspection; the next cell reads `field` as well.
field = "dorothea_curated"

# List every distinct value stored in that column.
print(f"List of unique values in field: {field}\n\t{interactions_df[field].unique()}")

List of unique values in field: dorothea_curated
	['' 'True' 'False' '1']


In [25]:
# Print the Python type of each distinct value in the inspected column.
for value_type in map(type, interactions_df[field].unique()):
    print(value_type)

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


In [None]:
# Counting Null values

In [26]:
# Select the `references` entries that pandas treats as missing.
interactions_df.loc[interactions_df["references"].isna(), "references"]

Series([], Name: references, dtype: object)

Counting the number of Null values

In [27]:
# Count the missing values in `dorothea_curated`, the column this variable is named after.
# Empty strings are not NaN, so they are not included in this count.
num_nulls_in_dorothea_curated = interactions_df['dorothea_curated'].isnull().sum()
num_nulls_in_dorothea_curated

np.int64(0)

Counting True values

In [None]:
# Count entries equal to the boolean True (as opposed to the text "True").
num_True_in_dorothea_curated = interactions_df["dorothea_curated"].eq(True).sum()
num_True_in_dorothea_curated

Counting "True" values

In [30]:
# Count entries equal to the text "True".
num_true_in_dorothea_curated = interactions_df["dorothea_curated"].eq("True").sum()
num_true_in_dorothea_curated

np.int64(413044)

Counting 1 values

In [31]:
# Count entries equal to the text "1".
num_one_in_dorothea_curated = interactions_df["dorothea_curated"].eq("1").sum()
num_one_in_dorothea_curated

np.int64(1)

Counting False values

In [32]:
# Count entries equal to the boolean False (as opposed to the text "False").
num_False_in_dorothea_curated = interactions_df["dorothea_curated"].eq(False).sum()
num_False_in_dorothea_curated

np.int64(0)

Counting "False" values

In [33]:
# Count entries equal to the text "False".
num_false_in_dorothea_curated = interactions_df["dorothea_curated"].eq("False").sum()
num_false_in_dorothea_curated

np.int64(253265)

In [None]:
# Scratch arithmetic: adds three hard-coded counts.
# NOTE(review): the origin of these numbers is not shown in this notebook — TODO confirm or
# replace them with values computed from the data.
252910+355+551590

In [None]:
# Small experiment: build a one-column frame from mixed values while requesting the bool dtype,
# then print the resulting column summary.
dataset_dict = dict(col1=[1, "nan", "True", True, "False", False])
dataframe = pd.DataFrame(data=dataset_dict, dtype="bool")
dataframe.info()

In [None]:
# Keep only the rows whose source is Q16254 and whose target is O43683.
filtered = interactions_df[
    interactions_df["source"].eq("Q16254") & interactions_df["target"].eq("O43683")
]

filtered

In [None]:
# Subset of rows flagged as belonging to the `omnipath` dataset, followed by its column summary.
omnipath_df = interactions_df[interactions_df["omnipath"].eq(True)]
omnipath_df.info()

In [None]:
# Rows of the omnipath subset whose source and target identifiers are the same.
omnipath_df[omnipath_df["source"].eq(omnipath_df["target"])]

### Dataset with predefined data types

In [None]:
# Data types for interactions
dtype = {'source': 'string',
         'target': 'string',
         'source_genesymbol': 'string',
         'target_genesymbol': 'string',
         'is_directed': 'Int8',
         'is_stimulation': 'Int8',
         'is_inhibition': 'Int8',
         'consensus_direction': 'Int8',
         'consensus_stimulation': 'Int8',
         'consensus_inhibition': 'Int8',
         'sources': 'string',
         'references': 'string',
         'omnipath': 'bool',
         'kinaseextra': 'bool',
         'ligrecextra': 'bool',
         'pathwayextra': 'bool',
         'mirnatarget': 'bool',
         'dorothea': 'bool',
         'collectri': 'bool',
         'tf_target': 'bool',
         'lncrna_mrna': 'bool',
         'tf_mirna': 'bool',
         'small_molecule': 'bool',
         'dorothea_curated': 'string',
         'dorothea_chipseq': 'string',
         'dorothea_tfbs': 'string',
         'dorothea_coexp': 'string',
         'dorothea_level': 'string',
         'type': 'string',
         'curation_effort': 'Int8',
         'extra_attrs': 'string',
         'evidences': 'string',
         'ncbi_tax_id_source': 'Int64',
         'entity_type_source': 'string',
         'ncbi_tax_id_target': 'Int64',
         'entity_type_target': 'string'
}

In [None]:
# Reload the same file, this time applying the explicit per-column dtypes.
# Note that this rebinds `interactions_df`, replacing the frame loaded earlier.
interactions_df = pd.read_table(
    filepath_or_buffer=interactions_path,
    dtype=dtype,
)

In [None]:
# Print the column summary again to check the dtypes after the typed reload.
interactions_df.info()

In [None]:
# Display the metadata table.
# NOTE(review): `metadata` is not recomputed in this cell, so when the notebook runs top to
# bottom it still describes the frame loaded before the typed reload.
metadata

In [None]:
# Convert the text flags of `dorothea_curated` into a nullable boolean column.
# Entries that are missing or not listed in the mapping become <NA>, instead of making the
# cast fail or being silently coerced by a plain bool conversion.
a = (
    interactions_df["dorothea_curated"]
    .map({True: True, False: False, 'True': True, 'False': False, '1': True})
    .astype("boolean")
)

In [None]:
# Distinct values left after the conversion.
a.unique()

In [None]:
# Print the distinct values of `dorothea_curated` in sorted order.
# pandas does the sorting so that missing values, which cannot be ordered against strings,
# are placed last instead of breaking the sort.
print(
    interactions_df["dorothea_curated"]
    .drop_duplicates()
    .sort_values(na_position="last")
    .to_numpy()
)

In [None]:
# Scratch calculation: 2**16 evaluates to 65536.
# NOTE(review): presumably a candidate upper bound for a text column size — TODO confirm.
2**16

In [None]:
# Character length of every value, column by column.
# Boolean and integer columns are skipped and reported: the decision is based on the column's
# dtype, not on the column label.
length_columns = {}

for column in interactions_df.columns:
    column_dtype = interactions_df[column].dtype
    if pd.api.types.is_bool_dtype(column_dtype) or pd.api.types.is_integer_dtype(column_dtype):
        print("Length cannot be calculated in column {}".format(column))
    else:
        # Each value is stringified first so that len() is defined for every entry.
        length_columns[column] = interactions_df[column].apply(str).apply(len)

# Passing the index explicitly keeps the row labels even if every column was skipped.
length_df = pd.DataFrame(length_columns, index=interactions_df.index)

In [None]:
# Summary statistics of the per-column value lengths.
length_df.describe()

In [None]:
# Scratch cell: shows the scientific-notation literal as a plain float.
# NOTE(review): presumably a value copied from the describe() output above — TODO confirm.
9.058000e+03

In [None]:
# Display the `omnipath` flag column.
interactions_df["omnipath"]

In [None]:
# Make sure DataFrame rendering truncates neither cell contents nor the column list.
pd.options.display.max_colwidth = None
pd.options.display.max_columns = None

In [None]:
# `source` values whose text representation is at least 400 characters long.
interactions_df.loc[interactions_df["source"].apply(str).apply(len) >= 400, "source"]

## Section 2: Add Equations to the Notebook

## Section 3: Add plot from Matplotlib

## Section 4: Add interactive plots (plotly)

In [None]:
!pip install sqlalchemy