In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('whitegrid')

### Omnipath
We create an undirected network from Omnipath.

In [2]:
# load the raw interaction table: tab-separated, first row is the header, default integer index
data=pd.read_csv(
    '../data/omnipath/interactions.txt',
    sep='\t',
    header=0,
    index_col=None,
)

In [3]:
# preview the first rows of the loaded interaction table
data.head()

Unnamed: 0,source,target,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,dip_url
0,P17612,P55064,1,0,0,1,0,0,
1,P55064,Q9HBA0,0,0,0,0,0,0,
2,O43318,Q00610,1,0,0,1,0,0,
3,P14672,Q00610,1,0,0,1,0,0,
4,Q00610,P14672,1,0,0,1,0,0,


In [4]:
#split complexes
#flag rows whose source or target identifier contains the literal text 'COMPLEX';
#regex=False does a plain substring test and na=False treats missing identifiers
#as "not a complex" instead of raising
fil1=data['source'].str.contains('COMPLEX',regex=False,na=False)
fil2=data['target'].str.contains('COMPLEX',regex=False,na=False)
has_complex=fil1|fil2
#independent copies so later edits do not write back into `data`
data_comp=data[has_complex].copy()
data_nocomp=data[~has_complex].copy()

In [5]:
#create interactions for complex members
def _complex_members(identifier):
    """Return the member IDs of a 'COMPLEX:A-B-C' identifier, or [identifier] otherwise."""
    if 'COMPLEX' in identifier:
        return identifier.split('COMPLEX:')[1].split('-')
    return [identifier]

#every (source member, target member) pair becomes its own row and inherits the
#remaining columns of the original interaction.
#Rows are collected in a list and the frame is built once: growing a DataFrame
#with .append inside the loop is quadratic and .append no longer exists in pandas>=2.0
expanded_rows=[]
for _,row in data_comp.iterrows():
    #NOTE: assumes 'source' and 'target' are the first two columns of data_comp
    info=list(row.values[2:])
    for s in _complex_members(row['source']):
        for t in _complex_members(row['target']):
            expanded_rows.append([s,t]+info)
data_comp_good=pd.DataFrame(expanded_rows,columns=data_comp.columns)

In [6]:
#stack the plain interactions on top of the expanded complex interactions, renumbering the rows
frames=[data_nocomp,data_comp_good]
data_good=pd.concat(frames,ignore_index=True)

In [7]:
#keep only the two endpoint columns, save the edge list and continue with it as `data`
data_good=data_good.loc[:,['source','target']]
data_good.to_csv('../data/omnipath/all_interactions.csv',sep=',')
data=data_good

In [8]:
#make network undirected
data=pd.read_csv('../data/omnipath/all_interactions.csv',sep=',',header=0,index_col=0)
#mirror every edge by swapping its endpoints, then keep each ordered pair only once
data_rev=pd.DataFrame({'source':data['target'],'target':data['source']},index=data.index)
data=pd.concat([data,data_rev],ignore_index=True).drop_duplicates()

In [9]:
#remove self interactions
#keep only the edges whose two endpoints differ
fil=data['source'].ne(data['target'])
data=data.loc[fil]

Calculating distance matrix.

In [10]:
#create adj. matrix
#.assign returns a new frame, so no value is written into a filtered slice
#(the original in-place column assignment triggers SettingWithCopyWarning)
data=data.assign(Interaction=1)
#rows = source, columns = target; pairs without an edge come out as NaN -> 0
AM=data.pivot(index='source',columns='target',values='Interaction').fillna(0)
#adding the identity below is only meaningful if rows and columns list the same
#proteins in the same order
assert AM.index.equals(AM.columns), 'adjacency matrix rows and columns differ'
#add self edge
AM=AM+np.eye(N=len(AM))

In [11]:
#save the adjacency matrix as comma-separated text
AM.to_csv(path_or_buf='../data/omnipath/AM.csv',sep=',')

In [12]:
#calculate distance matrix
#DM[i,j] is set to the step count k at which j first becomes reachable from i
prot_ids=AM.index
AM=AM.values
n_prot=len(prot_ids)
DM=np.full((n_prot,n_prot),np.inf)
RW=np.eye(n_prot) #starting position
k=0
DM[RW==1]=k
while True:
    k+=1
    #number of steps, remaining not reachable
    print(k,(DM==np.inf).sum())
    #advance one step along the edges and binarise the result to a 0/1 reachability mask
    RW_new=np.dot(RW,AM)
    RW_new[RW_new!=0]=1
    #stop as soon as a step reaches nothing new
    if np.array_equal(RW_new,RW):
        break
    #entries reached now but not before get distance k
    DM[(RW_new==1)&(RW!=1)]=k
    RW=RW_new

1 117624870
2 117529366
3 114002418
4 85617132
5 40740544
6 18909274
7 12872908
8 11480066
9 11167680
10 11118020
11 11112142
12 11111454
13 11111366


In [13]:
#re-attach the protein identifiers as row/column labels and store the distance matrix
AM,DM=(pd.DataFrame(mat,index=prot_ids,columns=prot_ids) for mat in (AM,DM))
DM.to_csv('../data/omnipath/DM.csv',sep=',')

In [14]:
#filter out unconnected nodes
#per node, count the entries still at infinity (unreachable); the nodes with the
#smallest count are taken to be the giant component
unreachable_counts=(DM==np.inf).sum()
n_unconnected=unreachable_counts.min()
giant_component=DM.index[unreachable_counts==n_unconnected]
#restrict both matrices to the giant component
AM,DM=(frame.loc[giant_component,giant_component] for frame in (AM,DM))

In [15]:
#persist the giant-component adjacency and distance matrices
for matrix,path in ((AM,'../data/omnipath/AM_GC.csv'),(DM,'../data/omnipath/DM_GC.csv')):
    matrix.to_csv(path,sep=',')

As an alternative, we calculate the diffusion state distance (DSD) using the script from [here](https://github.com/reemagit/DSD/blob/master/DSD/calculator.py).

In [46]:
#reload the giant-component adjacency matrix; first row and first column hold the labels
AM=pd.read_csv(
    '../data/omnipath/AM_GC.csv',
    sep=',',
    header=0,
    index_col=0,
)

In [3]:
def remove_self_edges(x):
    """Return a copy of the labelled vector `x` with the entry at label `x.name` set to 0.

    Meant to be applied to each column (or row) of an adjacency matrix, where
    the entry whose label equals the vector's own name is the self edge.
    The input itself is left untouched.
    """
    without_self=x.copy()
    own_label=without_self.name
    without_self[own_label]=0
    return without_self

In [4]:
#remove self edges, needed for DSD
#column by column, zero the entry whose row label equals the column's own label
AM=AM.apply(func=remove_self_edges,axis='index')

In [7]:
#use numpy arrays for calculations
#remember the labels first, then replace the frame by its underlying array
prot_ids,AM=AM.index,AM.values

In [8]:
n = AM.shape[0]
# keepdims=True keeps `degree` as an (n, 1) column, so the division below
# normalises each ROW of AM: p[i, j] = AM[i, j] / degree[i].
# With a 1-D degree vector, numpy broadcasts along the last axis and divides
# column j by degree[j] instead, which gives the transpose of the intended
# transition matrix.
degree = AM.sum(axis=1, keepdims=True)
# row-normalised adjacency: one-step transition probabilities of the random walk
p = AM / degree
# degree-proportional weights as an (n, 1) column; pi.T is then a (1, n) row
pi = degree / degree.sum()

In [9]:
from scipy.spatial.distance import pdist, squareform
from numpy.linalg import inv

In [10]:
#invert (I - p - pi.T), then take the pairwise city-block (L1) distances between
#the rows of the inverse and expand them to a square matrix
DM=squareform(
    pdist(
        inv(np.eye(n) - p - pi.T),
        metric='cityblock',
    )
)

In [12]:
#label the DSD matrix with the protein identifiers and write it to disk
DM=pd.DataFrame(data=DM,index=prot_ids,columns=prot_ids)
DM.to_csv('../data/omnipath/DSD.csv',sep=',')

DSD can also be calculated with k steps.