In [None]:
! pip install git+https://github.com/fraenkel-lab/OmicsIntegrator2.git
! pip install --upgrade --force-reinstall --no-deps python-louvain
! pip install --upgrade --force-reinstall --no-deps networkx

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt

import OmicsIntegrator as oi


ModuleNotFoundError: No module named 'OmicsIntegrator'

In [53]:
interactome_file = "inputs\HIPPIE-current.mitab.txt"
prize_file = "inputs\Prizes2.csv"
prize_file = pd.read_csv(prize_file, delimiter = ',')

# Lacking interactions in HIPPIE: 

# No interactions found for 28455 (IGHV2-26)
# No interactions found for 28426 (IGHV3-43)
# No interactions found for 28410 (IGHV3-72)
# No interactions found for 28408 (IGHV3-74)
# No interactions found for 28385 (IGHV6-1)
# No interactions found for 28401 (IGHV4-4)
# No interactions found for 28935 (IGKV1-27)
# No interactions found for 28902 (IGKV1D-13)
# No interactions found for 28875 (IGKV3D-15)
# No interactions found for 28874 (IGKV3D-20)
# No interactions found for 28825 (IGLV1-40)
# No interactions found for 28804 (IGLV3-9)
# No interactions found for 28778 (IGLV6-57)
# No interactions found for 388533 (KRTDAP)
# No interactions found for ?????? (SAA1)
# No interactions found for ?????? (SAA2)


prize_file.to_csv('edited_prize_file.txt', sep='\t', index= False)
prize_file = 'edited_prize_file.txt'


interactome = pd.read_csv(interactome_file, delimiter = '\t')
interactome = interactome[['Gene Name Interactor A', 'Gene Name Interactor B', 'Confidence Value']]
interactome = interactome.rename(columns={'Gene Name Interactor A': 'protein1', 'Gene Name Interactor B': 'protein2'})
interactome['cost'] = 1.5 - interactome['Confidence Value']
del interactome['Confidence Value']
interactome['protein1'] = interactome['protein1'].replace('LOC100293069', 'CFHR1')
interactome['protein2'] = interactome['protein2'].replace('LOC100293069', 'CFHR1')

interactome['protein1'] = interactome['protein1'].replace('HIST2H2AC','H2AC20')
interactome['protein2'] = interactome['protein2'].replace('HIST2H2AC','H2AC20')

interactome['protein1'] = interactome['protein1'].replace('HIST2H2BE', 'H2BC21')
interactome['protein2'] = interactome['protein2'].replace('HIST2H2BE', 'H2BC21')

interactome.to_csv('interactome.txt', sep='\t', index= False)
interactome_file = 'interactome.txt'

# Network building

In [54]:
params = {
    "noise": 0.1, 
    "dummy_mode": "terminals", 
    "exclude_terminals": False, 
    "seed": 1
}

graph = oi.Graph(interactome_file, params)
graph.prepare_prizes(prize_file)


06:07:18 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
06:07:18 - OI2: INFO - []
06:07:18 - OI2: INFO - Members of the prize file not present in the interactome:
06:07:18 - OI2: INFO - ['IGHV1-24', 'IGHV2-26', 'IGHV5-51', 'IGKV1D-13', 'IGKV2-29', 'IGLV9-49', 'KRTDAP', 'SAA1', 'SAA2']


# Grid search

In [55]:
Ws = list(np.arange(2,6,3))
Bs = list(np.arange(2,7,3))
Gs = list(np.arange(2,7,3))

# Or:

#Ws = [2,3]
#Bs = [4,5,6]
#Gs = [3,4,5]

params = {
    "noise": 0.1, 
    "dummy_mode": "terminals", 
    "exclude_terminals": False, 
    "seed": 1
}


results = graph.grid_search(prize_file, Ws, Bs, Gs)
membership_df = oi.summarize_grid_search(results, "membership")



prize = pd.read_csv(prize_file,sep="\t")
initial_nodes=list(prize.name)          
results_with_terminals = membership_df[membership_df.index.isin(initial_nodes)]
Initial_node_covers = results_with_terminals.sum().sort_values(ascending=False).to_frame(name="Covering_nodes")


out = set(Initial_node_covers[Initial_node_covers["Covering_nodes"]==max(Initial_node_covers["Covering_nodes"])].index)

Initial_node_covers.sort_index(axis=0, inplace=True)
membership_df.sort_index(axis=1, inplace=True)
Total_node = membership_df.sum().to_frame(name="Total_nodes")

membership_df.loc['Covering nodes']=(Initial_node_covers['Covering_nodes'])
membership_df.loc['Total_nodes']=(Total_node['Total_nodes'])
membership_df.to_csv('membership_df_with_node_numbers.csv')

06:07:44 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
06:07:44 - OI2: INFO - []
06:07:44 - OI2: INFO - Members of the prize file not present in the interactome:
06:07:44 - OI2: INFO - ['IGHV1-24', 'IGHV2-26', 'IGHV5-51', 'IGKV1D-13', 'IGKV2-29', 'IGLV9-49', 'KRTDAP', 'SAA1', 'SAA2']


In [56]:
# tuned parameters:

w = 2
b = 5
g = 2


In [57]:
graph = oi.Graph(interactome_file, {'w':w, 'b':b, 'g':g,})
graph.prepare_prizes(prize_file)
vertex_indices, edge_indices = graph.pcsf()

print(len(vertex_indices))

forest, augmented_forest = graph.output_forest_as_networkx(vertex_indices, edge_indices)
print(graph.pcsf_objective_value(augmented_forest))

# removing self loops
augmented_forest.remove_edges_from(nx.selfloop_edges(augmented_forest))
forest.remove_edges_from(nx.selfloop_edges(forest))

oi.get_networkx_graph_as_dataframe_of_edges(augmented_forest).to_csv('edges.txt', sep='\t', header=True, index=False)
oi.get_networkx_graph_as_dataframe_of_nodes(augmented_forest).to_csv('nodes.txt', sep='\t', header=True, index=True)



06:08:36 - OI2: INFO - Duplicated gene symbols in the prize file (we'll keep the max prize):
06:08:36 - OI2: INFO - []
06:08:36 - OI2: INFO - Members of the prize file not present in the interactome:
06:08:36 - OI2: INFO - ['IGHV1-24', 'IGHV2-26', 'IGHV5-51', 'IGKV1D-13', 'IGKV2-29', 'IGLV9-49', 'KRTDAP', 'SAA1', 'SAA2']


131
1019.2877629800007


In [58]:
oi.output_networkx_graph_as_graphml_for_cytoscape(augmented_forest, output_dir='', filename='pcsf_results252.graphml')

WindowsPath('C:/Users/Sina-PC/Documents/Pyhton/Covid Project/pcsf_results252.graphml')

# Finding the shorteest path between result network and another set of nodes

In [59]:
Data = pd.read_csv(interactome_file, delimiter='\t')
file = open('shortest_path.csv', 'w')

src = pd.read_csv('nodes.txt', delimiter='\t')
src = src.iloc[:,0].values

trgt = pd.read_csv('inputs/gordon_proteins.txt', delimiter='\t')
trgt = trgt.iloc[:,2].values

G = nx.from_pandas_edgelist(Data, edge_attr='cost', source='protein1', target = 'protein2')
#G.remove_node('UBC') #list of hubs from Seyma's code
G.remove_node('APP') #list of hubs from Seyma's code
G.remove_node('ELAVL1') #list of hubs from Seyma's code
G.remove_node('SUMO2') #list of hubs from Seyma's code
G.remove_node('CUL3') #list of hubs from Seyma's code



file.writelines('protein1_network_nodes'+','+'protein2_Gordon'+'\n')

for s in src:
  for t in trgt:
    try:
      sp=nx.shortest_path(G, source=s, target=t)
      if len(sp)<= 2:
        file.writelines(','.join(sp)+'\n')
        
    except:
      continue
file.close()




# inseritng drug targets and locations to node attributes

In [62]:
drugs = pd.read_csv('inputs/drugbank_approved_target_polypeptide_ids.csv_pharmacologically_active.csv')[['Gene Name', 'UniProt ID', 'Drug IDs']]
drug_links = pd.read_csv('inputs/drug links.csv')[['DrugBank ID','Name']]
nodes = pd.read_csv('nodes.txt', sep='\t')
#locations = pd.read_csv('inputs/human_compartment_integrated_full.tsv', delimiter='\t', names=['Gene name','c','Location','Confidence'])
#locations = locations[['Gene name','Location','Confidence']].groupby('Gene name').first()


In [63]:
nodes_drugs = pd.merge(nodes, drugs, left_on = 'Unnamed: 0', right_on = 'Gene Name', how = 'left').rename(columns={'Unnamed: 0':'Gene name'})
#nodes_drugs =  pd.merge(nodes_drugs, locations, on='Gene name', how='left')
nodes_drugs.to_csv('nodes_drugs.csv', index = False)

nodes_drugs = nodes_drugs.groupby('Gene Name').agg({'Drug IDs': lambda x: ''.join(x)})
nodes_drugs['Drug IDs'] = nodes_drugs['Drug IDs'].apply(lambda x : x.split('; ')[:]).apply(list)
nodes_drugs_network = nodes_drugs.explode('Drug IDs')
nodes_drugs_network['Gene name'] = nodes_drugs_network.index

nodes_drugs_network = pd.merge(nodes_drugs_network, drug_links, left_on = 'Drug IDs', right_on='DrugBank ID', how = 'left')
del nodes_drugs_network['Drug IDs']
nodes_drugs_network.to_csv('nodes_drug_network.csv', index = False)

In [8]:
! pip install scikit_posthocs

Collecting scikit_posthocs
  Downloading scikit_posthocs-0.7.0-py3-none-any.whl (38 kB)
Installing collected packages: scikit-posthocs
Successfully installed scikit-posthocs-0.7.0


In [11]:
healthy_=[0.148487510982804, 0.1232, 0.125791792894519, 0.14885208882198,0.066635709310425]
critical_=[0.104208072468946,
0.080873716079299,
0.055020388174852,
0.067486262902231,
0.070080771961858,
0.078566992470362]
severe = [0.10286677252222,0.116847314628567,0.128596464816651]
moderate=[0.090549553198054,
0.081109811842245,
0.085526315789474,
0.110776872072838]



import sklearn as sk

import scikit_posthocs


'''
1. scikit posthoc t test ---- adjustmet = fdr_tsbh, fdr_tsbky
2. tukey
3. vanwaerden
4. 
'''



a= scikit_posthocs.posthoc_tukey([healthy_,severe,moderate,critical_])
print(a)

          1         2         3         4
1  1.000000  0.900000  0.206498  0.016224
2  0.900000  1.000000  0.496386  0.089024
3  0.206498  0.496386  1.000000  0.664397
4  0.016224  0.089024  0.664397  1.000000


In [68]:
import optparse
import sys
import os.path as op
import scipy.stats as ss
from decimal import Decimal

m = 61
n = 20000
n1 = 140
n2 = 154

mmin = m - 1
mmax = min(n1, n2)
Decimal(ss.hypergeom.cdf(mmax, n, n1, n2)  - ss.hypergeom.cdf(mmin, n, n1, n2))

Decimal('0')