In [1]:
import networkx as nx
import numpy as np
import os
import random as rand
from collections import defaultdict
import matplotlib.pyplot as plt

In [2]:
def get_r_dictionaries(file,mapping=True):
    with open(file, "r") as in_file:
        dict_map={}
        for line in in_file:
            line.strip()
            if mapping:
                dict_map[line.split()[1]]=line.split()[0]
            else:
                dict_map[line.split()[0]]=set(line.split()[1:])
    return dict_map

In [3]:
def get_cyclic_net(filename):
    G=nx.read_edgelist(filename, comments='#', delimiter="\t", nodetype =str,  data=(('mode',str),), create_using=nx.DiGraph())
    G.remove_nodes_from(["Source", "Target"])
    selfloops=G.selfloop_edges()
    G.remove_edges_from(G.selfloop_edges())

    while 0 in [d[1] for d in G.in_degree()] or 0 in [d[1] for d in G.out_degree()]:
        nodes_to_remove=[node for node in G if G.in_degree(node) == 0 or G.out_degree(node) == 0]
        G.remove_nodes_from(nodes_to_remove)
        
        
    selfloops_in_reservoir=[edge for edge in selfloops if edge[0] in G.nodes()]
    G.add_edges_from(selfloops_in_reservoir)

    return G

In [4]:
def build_adj_weighted_matrix(filename,mapping):
        #NETWORK v2.0
    net=get_cyclic_net(filename)
    nx.relabel_nodes(net,mapping,copy=False)
    dict_pos=dict((node,pos) for (pos,node) in enumerate(net.nodes()))
    for edge in net.edges(data="mode", default=0):
        source,target,mode=edge
        if mode== "+":
            net[source][target]["weight"]= abs(rand.gauss(0,1))
        elif mode== "-":
            net[source][target]["weight"]= rand.gauss(0,1)*-1
        elif mode== 0:
            net[source][target]["weight"]= rand.gauss(0,1)
        
    return nx.to_numpy_matrix(net),dict_pos

In [5]:
def write_nodes_file(out_file,filename,net):
    with open("%s_%s"%(out_file,filename),"w") as out:
        for gene in net.nodes():
            if "hsa" in gene:
                gene=gene[4:].lower()
            out.write(gene +"\n")

In [6]:
file="Dataset1/network_edge_list_ENCODE.csv"
net=get_cyclic_net("Dataset1/network_edge_list_ENCODE.csv")
G=nx.read_edgelist(file, comments='#', delimiter="\t", nodetype =str,  data=(('mode',str),), create_using=nx.DiGraph())

In [9]:
G.in_edges("TMEM14C")

InEdgeDataView([('SP1', 'TMEM14C'), ('NR3C1', 'TMEM14C'), ('SREBF1', 'TMEM14C'), ('MAFK', 'TMEM14C'), ('MAFF', 'TMEM14C')])

In [7]:
readout_to_cyclic=['ZNF775',
 'TMEM14C',
 'C20orf111',
 'STYXL1',
 'BRD9',
 'ABCA3',
 'ALDH1A1',
 'ACAD9']

In [8]:
input_nodes=[(go,readout) for go,set_readout in GO_id_map.items() for readout in set_readout ]

NameError: name 'GO_id_map' is not defined

In [9]:
edges_readout=[edge for edge in G.edges() if edge[1] in readout_to_cyclic and edge[0] in net.nodes()]

In [None]:
edges_readout

In [9]:
net.add_edges_from(edges_readout)
net.add_edges_from(input_nodes)
nx.write_gexf(net, "test.gexf")

NameError: name 'edges_readout' is not defined

In [10]:
G.add_edges_from(input_nodes)
nx.write_gexf(G, "full.gexf")

NameError: name 'input_nodes' is not defined

In [11]:
readout_direct_targets=defaultdict(set)

for source, target in G.edges():
    if source in net.nodes() and target not in net.nodes():
        readout_direct_targets[source].add(target)

readout_direct_targets["MAX"]
    #print(target)

{'AATF',
 'ABCA17P',
 'ABCB4',
 'ABCF1',
 'ABCG1',
 'AC011472.1',
 'AC013553.1',
 'AC044799.1',
 'AC136618.1',
 'ACAP1',
 'ACTC1',
 'ADAM19',
 'ADAMTS16',
 'ADH6',
 'ADM',
 'ADORA1',
 'AFMID',
 'AFP',
 'AGFG2',
 'AGT',
 'AIMP2',
 'AIP',
 'AK2',
 'AKR1A1',
 'AKR1C1',
 'AKR1C3',
 'AKT1S1',
 'ALB',
 'ALDOC',
 'ALG2',
 'ALG3',
 'ALOX15B',
 'AMDHD2',
 'AMN',
 'AMPD3',
 'ANAPC7',
 'ANG',
 'ANK1',
 'ANKRD12',
 'ANKRD30BL',
 'ANO3',
 'ANP32B',
 'AOC3',
 'AP1G2',
 'AP1S1',
 'AP2M1',
 'AP4M1',
 'APBB3',
 'APEX1',
 'APOBR',
 'APOC1',
 'APOC2',
 'APOE',
 'APOH',
 'APRT',
 'ARHGAP24',
 'ARSK',
 'ASGR1',
 'ASGR2',
 'ASPSCR1',
 'ATL3',
 'ATP13A4',
 'ATP6V0B',
 'ATP6V0D1',
 'ATP8B1',
 'ATXN3',
 'BAT3',
 'BAX',
 'BCAS3',
 'BCKDHA',
 'BCL2',
 'BCR',
 'BIN1',
 'BLOC1S3',
 'BMP4',
 'BOLA1',
 'BPIL1',
 'BST2',
 'BUB3',
 'C10orf116',
 'C10orf18',
 'C10orf46',
 'C10orf76',
 'C10orf88',
 'C11orf59',
 'C12orf24',
 'C12orf49',
 'C13orf23',
 'C16orf5',
 'C16orf54',
 'C16orf73',
 'C17orf86',
 'C17orf87',
 'C18orf

In [12]:
len(set(net["SRF"]))

12

In [13]:
len(net.out_edges("SRF"))

12

In [14]:
len(set.union(*readout_direct_targets.values()))

13389

In [15]:
filename=file[file.index("list")+5:file.index(".csv")]

In [16]:
write_nodes_file("all_gene",filename,G)

In [17]:
len(net.nodes())

215

In [18]:
## Get R dictionaries
## GO term -> set ids
GO_id_map=get_r_dictionaries("test.txt",mapping=False)
#print(GO_id_map)

## id to ENTREZ id
edgeid_ezid_map=get_r_dictionaries("mapping_id_to_entrez.txt")
print(edgeid_ezid_map)

{'5669': 'SP1', '6667': 'SP1', '199699': 'SP1', '6668': 'SP2', '2623': 'GATA1', '672': 'BRCA1', '3172': 'HNF4A', '1876': 'E2F6', '406920': 'hsa-miR-130b', '4208': 'MEF2C', '140690': 'CTCFL', '5452': 'POU2F2', '406982': 'hsa-miR-20a', '406953': 'hsa-miR-18a', '7702': 'ZNF143', '23462': 'HEY1', '6721': 'SREBF2', '25942': 'SIN3A', '2355': 'FOSL2', '53335': 'BCL11A', '7182': 'NR2C2', '147912': 'SIX5', '4149': 'MAX', '4601': 'MXI1', '10891': 'PPARGC1A', '4801': 'NFYB', '1958': 'EGR1', '1051': 'CEBPB', '5993': 'RFX5', '6688': 'SPI1', '9774': 'BCLAF1', '6938': 'TCF12', '639': 'PRDM1', '3066': 'HDAC2', '6256': 'RXRA', '4779': 'NRF1', '4899': 'NRF1', '6886': 'TAL1', '6925': 'TCF4', '6934': 'TCF4', '2099': 'ESR1', '905': 'CCNT2', '10664': 'CTCF', '5090': 'PBX3', '11128': 'POLR3A', '10009': 'ZBTB33', '3659': 'IRF1', '2033': 'EP300', '2113': 'ETS1', '2908': 'NR3C1', '51341': 'ZBTB7A', '1869': 'E2F1', '6722': 'SRF', '3661': 'IRF3', '2353': 'FOS', '8061': 'FOSL1', '3726': 'JUNB', '10535': 'JUNB', '6

In [19]:
## Primero: Hay que cambiarle el id del res al id de ENTREZ
## hay que hacer un mapping dictionary con la info que tenemos de los nodos de GO.term
##Como no todos están anotados es mejor crear un diccionario 
mapping_relabel = edgeid_ezid_map
for node in net.nodes():
    if node not in edgeid_ezid_map.values():
        mapping_relabel[node]=node
mapping_relabel

{'10009': 'ZBTB33',
 '100126319': 'hsa-miR-216b',
 '100126340': 'hsa-miR-944',
 '100126348': 'hsa-miR-760',
 '100302143': 'hsa-miR-1248',
 '100302201': 'hsa-miR-1228',
 '100302232': 'hsa-miR-1226',
 '10127': 'ZNF263',
 '10155': 'TRIM28',
 '1051': 'CEBPB',
 '10535': 'JUNB',
 '10538': 'BATF',
 '10664': 'CTCF',
 '10891': 'PPARGC1A',
 '1106': 'CHD2',
 '11128': 'POLR3A',
 '140690': 'CTCFL',
 '147912': 'SIX5',
 '1488': 'CTBP2',
 '1826': 'CHD2',
 '1869': 'E2F1',
 '1874': 'E2F4',
 '1876': 'E2F6',
 '1879': 'EBF1',
 '1958': 'EGR1',
 '199699': 'SP1',
 '1997': 'ELF1',
 '2005': 'ELK4',
 '2033': 'EP300',
 '2099': 'ESR1',
 '2113': 'ETS1',
 '23462': 'HEY1',
 '23512': 'SUZ12',
 '2353': 'FOS',
 '2355': 'FOSL2',
 '23764': 'MAFF',
 '2551': 'GABPA',
 '25942': 'SIN3A',
 '2623': 'GATA1',
 '2624': 'GATA2',
 '26469': 'BDP1',
 '2908': 'NR3C1',
 '2959': 'GTF2B',
 '3066': 'HDAC2',
 '3169': 'FOXA1',
 '3170': 'FOXA2',
 '3172': 'HNF4A',
 '3174': 'HNF4G',
 '3297': 'HSF1',
 '3659': 'IRF1',
 '3661': 'IRF3',
 '3662': 'I

In [20]:
for key,values in GO_id_map.items():
    GO_id_map[key]=set([mapping_relabel[value] for value in values])
GO_id_map

{'GO:0006337': {'NFE2', 'SMARCA4', 'SMARCB1', 'SMARCC1', 'SMARCC2'},
 'GO:0030220': {'EP300', 'GATA1', 'MEF2C', 'SRF', 'TAL1'},
 'GO:0043923': {'EP300', 'JUN', 'SMARCA4', 'SMARCB1', 'SP1'},
 'GO:1901984': {'BRCA1', 'HDAC2', 'SIN3A', 'SPI1', 'TAF7'}}

In [21]:
set.union(*GO_id_map.values())

{'BRCA1',
 'E2F1',
 'E2F6',
 'FOS',
 'FOSL1',
 'JUN',
 'JUNB',
 'JUND',
 'MAX',
 'SREBF1',
 'STAT1',
 'SUZ12'}

In [22]:
readout_common=set(gene for gene_res in set.union(*GO_id_map.values()) for gene in set(G[gene_res]) if gene in set.union(*readout_direct_targets.values()) )

In [23]:
readout_common

{'SPTBN1',
 'WDR25',
 'AC136618.1',
 'MYL6B',
 'NSUN5',
 'LRMP',
 'CCNB1IP1',
 'APEX1',
 'RPS6',
 'ODF3B',
 'C21orf29',
 'NXPH1',
 'FUT10',
 'ASAP2',
 'WHSC1',
 'SEPHS2',
 'LEPROT',
 'SNORD114-24',
 'PPP2CA',
 'ARHGDIA',
 'HNRNPUL1',
 'SYCE2',
 'ATL3',
 'C17orf86',
 'HAUS2',
 'CREB5',
 'UBXN4',
 'SCFD1',
 'RANBP9',
 'CSRNP1',
 'ACLY',
 'SEH1L',
 'LMBRD2',
 'KIAA1468',
 'ILF2',
 'ACAD9',
 'NTF3',
 'IRX1',
 'SPATA12',
 'PRH1',
 'GSG2',
 'ERCC1',
 'PURA',
 'TUFT1',
 'RHOXF2B',
 'COL2A1',
 'OSR1',
 'HIST1H2BF',
 'MT1M',
 'RASSF7',
 'LOC100289341',
 'CHL1',
 'ZBTB11',
 'CLDN5',
 'ANKRD16',
 'SEC22B',
 'P2RY11',
 'CT62',
 'WASF2',
 'TOX4',
 'AMOTL2',
 'C19orf40',
 'KIF5C',
 'LTB4R2',
 'DBP',
 'RRAS2',
 'PRKAG2',
 'NRBF2',
 'ELOVL3',
 'TRAPPC6B',
 'PLD1',
 'RIBC2',
 'OLA1',
 'RAN',
 'CACNA1E',
 'GSDMD',
 'JMJD8',
 'CCR7',
 'hsa-miR-130b*',
 'hsa-miR-155*',
 'PRKDC',
 'GSR',
 'FOXG1',
 'CD70',
 'AGGF1',
 'TLR1',
 'MEX3D',
 'DNM2',
 'hsa-miR-302d',
 'POU3F3',
 'EPB49',
 'ERBB3',
 'PPP2R3C',
 'K

In [24]:
def get_len(item):
    return len(net.in_edges(item))

In [25]:
lista=[(gene,len(G.in_edges(gene))) for gene in readout_common]

In [26]:
len_innodes_gene=defaultdict(set)
for el in lista:
    len_innodes_gene[el[1]].add(el[0])
len_innodes_gene[5]

{'AARS',
 'AATF',
 'ABCA17P',
 'ABCB8',
 'ACTL6A',
 'AIFM2',
 'AIMP2',
 'APTX',
 'ARRDC2',
 'ATP5G1',
 'AVPI1',
 'C10orf88',
 'C11orf59',
 'C13orf29',
 'C14orf119',
 'C16orf74',
 'C19orf40',
 'C19orf51',
 'C1QTNF6',
 'C1orf107',
 'C1orf228',
 'C20orf24',
 'C21orf58',
 'C5orf39',
 'C5orf51',
 'C6orf89',
 'C7orf40',
 'C8orf55',
 'C9orf3',
 'CABC1',
 'CCBL1',
 'CCDC123',
 'CCNT1',
 'CDC5L',
 'CDR2',
 'CENPL',
 'CISD2',
 'CNTLN',
 'COX5A',
 'CPB2',
 'CREB3L4',
 'CSPG4',
 'CYP51A1',
 'DAGLB',
 'DARS2',
 'DDIT3',
 'DDX47',
 'DENND4A',
 'DEPDC4',
 'DLX4',
 'DNA2',
 'DNAJC9',
 'DOCK10',
 'DRAP1',
 'DSCC1',
 'ENOPH1',
 'ERGIC2',
 'EXD1',
 'FLJ31306',
 'FLJ45983',
 'FOXE1',
 'GADD45A',
 'GIMAP4',
 'GLDC',
 'GPATCH3',
 'GPBP1',
 'HELQ',
 'HES4',
 'HGSNAT',
 'HIGD2A',
 'HMX2',
 'HOXA2',
 'HOXD8',
 'HRC',
 'HSF2BP',
 'HSPBAP1',
 'ICAM1',
 'IFI6',
 'IQCD',
 'ISLR2',
 'ITPR3',
 'KDM3A',
 'KEAP1',
 'KIAA0406',
 'KNCN',
 'LBX1',
 'LBX2',
 'LENG1',
 'LHX5',
 'LIMS1',
 'LIN37',
 'LMNB1',
 'LOC729082',
 '

In [27]:
set_common=[]
for GO_term in GO_id_map.keys():
    print()
    print(GO_term)
    for gene_view in GO_id_map[GO_term]:
        print(gene_view,"",set(G[gene_view]) & len_innodes_gene[5])
        set_common.append(set(G[gene_view]) & len_innodes_gene[5])


GO:0070317
BRCA1  {'SLC25A11', 'NUDT2', 'GPBP1', 'CDC5L', 'SLC15A4', 'LSMD1', 'C6orf89', 'XPC', 'NOL11', 'CCNT1', 'STYXL1', 'GPATCH3', 'THADA', 'SNORD35B', 'MED22', 'ERGIC2', 'POLR2E'}
E2F6  {'HES4', 'ABCA17P', 'KIAA0406', 'HSF2BP', 'RBM18', 'ZNF785', 'C10orf88', 'EXD1', 'CNTLN', 'CISD2', 'PQLC2', 'SDF4', 'KEAP1', 'ACAD9', 'SP9', 'AATF', 'WDR36', 'AIMP2', 'POLE4', 'DNA2', 'C8orf55', 'CABC1', 'HSPBAP1', 'ENOPH1', 'ABCA3', 'HRC'}
MAX  {'SNORA76', 'WDR4', 'ZFYVE27', 'HES4', 'ABCA17P', 'ALDH1A1', 'MAP1D', 'TASP1', 'HSF2BP', 'PLD6', 'ZNF785', 'WDR89', 'C10orf88', 'EXD1', 'KDM3A', 'GLDC', 'C7orf40', 'SCAMP3', 'ACAD9', 'HELQ', 'SGOL1', 'MCOLN1', 'C1orf107', 'LONP1', 'THAP8', 'THADA', 'AATF', 'HIGD2A', 'WDR36', 'DOCK10', 'AIMP2', 'RPS8', 'C11orf59', 'POLE4', 'LOXL1', 'CCNT1', 'UBE2S', 'CABC1', 'HSPBAP1', 'ENOPH1', 'NOC3L', 'SAE1', 'ABCA3', 'HRC', 'POLR2E', 'PNO1'}
E2F1  {'STRADA', 'SUPT4H1', 'DSCC1', 'HES4', 'MBOAT1', 'ITPR3', 'TMEM159', 'CENPL', 'TRPM7', 'HSF2BP', 'CREB3L4', 'PAWR', 'ARRDC2'

In [28]:
i=range(0,len(set_common))
for index,el in enumerate(set_common):
    print(index)
    for number in i:
        if number != index:
            print(el & set_common[number])

0
set()
{'CCNT1', 'POLR2E', 'THADA'}
set()
set()
set()
set()
set()
set()
{'CDC5L', 'THADA'}
set()
set()
1
set()
{'C10orf88', 'EXD1', 'AIMP2', 'HES4', 'ABCA17P', 'POLE4', 'CABC1', 'AATF', 'HSPBAP1', 'ENOPH1', 'WDR36', 'ABCA3', 'HSF2BP', 'HRC', 'ZNF785', 'ACAD9'}
{'HES4', 'HSF2BP', 'PQLC2', 'C8orf55'}
{'AATF', 'SP9'}
{'CABC1'}
set()
set()
set()
{'HES4', 'ACAD9'}
set()
set()
2
{'CCNT1', 'POLR2E', 'THADA'}
{'C10orf88', 'EXD1', 'AIMP2', 'HES4', 'ABCA17P', 'POLE4', 'CABC1', 'AATF', 'HSPBAP1', 'ENOPH1', 'WDR36', 'ABCA3', 'HSF2BP', 'HRC', 'ZNF785', 'ACAD9'}
{'HES4', 'HSF2BP'}
{'AATF'}
{'CABC1'}
set()
set()
{'ALDH1A1'}
{'HES4', 'ZFYVE27', 'ACAD9', 'THADA'}
set()
set()
3
set()
{'HES4', 'HSF2BP', 'PQLC2', 'C8orf55'}
{'HES4', 'HSF2BP'}
{'OSR2'}
set()
set()
set()
set()
{'HES4', 'CCBL1'}
set()
set()
4
set()
{'AATF', 'SP9'}
{'AATF'}
{'OSR2'}
set()
set()
set()
set()
set()
{'CCNT1', 'POLR2E', 'THADA'}
set()
set()
5
set()
{'CABC1'}
{'CABC1'}
set()
{'CDC5L', 'THADA'}
set()
{'ZNF775'}
set()
{'CSPG4', 'TCT

In [36]:
for gene in len_innodes_gene[10]:
    
    genes_res_no_go=set(edge[0] for edge in G.in_edges(gene) if edge[0] not in set.union(*GO_id_map.values()))
    
    if genes_res_no_go.intersection(set(net.nodes())) != genes_res_no_go:
        continue
    print(gene)
    for key in GO_id_map.keys():
        print(key)
        print(set(gene_res for edge in G.in_edges(gene) for gene_res in GO_id_map[key] if gene_res in edge[0] ))
   
    print(genes_res_no_go)
   
    print()
    


HES4
GO:0070317
{'E2F1', 'MAX', 'E2F6'}
GO:0051591
{'JUN', 'JUND'}
{'ZBTB7A'}

ALDH1A1
GO:0070317
{'MAX'}
GO:0051591
{'JUN'}
{'USF1', 'CTCF', 'RAD21'}

UBE2C
GO:0070317
set()
GO:0051591
{'FOS'}
{'SP1', 'EP300', 'IRF3', 'E2F4'}

MAP1D
GO:0070317
{'MAX'}
GO:0051591
set()
{'MYC', 'USF1', 'SIN3A', 'YY1'}

ODF3B
GO:0070317
set()
GO:0051591
{'STAT1'}
{'STAT3', 'POU2F2', 'STAT2', 'NFKB1'}

CREB3L4
GO:0070317
{'E2F1'}
GO:0051591
set()
{'NFYB', 'PBX3', 'FOXA1', 'NFYA'}

C10orf88
GO:0070317
{'MAX', 'E2F6'}
GO:0051591
set()
{'NRF1', 'TCF4', 'E2F4'}

FLJ31306
GO:0070317
set()
GO:0051591
{'FOS'}
{'SP1', 'NFYB', 'IRF3', 'NFYA'}

hsa-miR-367*
GO:0070317
set()
GO:0051591
{'JUN', 'JUND'}
{'TAF7', 'BCL3', 'POU5F1', 'BCL11A'}

ACAD9
GO:0070317
{'MAX', 'E2F6'}
GO:0051591
{'JUN', 'JUND'}
{'ATF3', 'NRF1'}

ABCB8
GO:0070317
set()
GO:0051591
{'FOS'}
{'SP1', 'NFYB', 'NFYA', 'SP2'}

NUDT2
GO:0070317
{'BRCA1'}
GO:0051591
set()
{'ELK4', 'NFYB', 'NFYA', 'E2F4'}

NOL11
GO:0070317
{'BRCA1'}
GO:0051591
set()
{'ZBTB33

In [30]:
max(len_innodes_gene, key=len_innodes_gene.get)

55

In [37]:
GO_007={readout_gene : { edge[0] for edge in G.in_edges(readout_gene) if edge[0] in net.nodes()}for readout_gene in ["STYXL1","BRD9","ABCA3"]}
np.save('GO_007.npy',GO_007) 


In [38]:
GO_007_005= {readout_gene :{ edge[0] for edge in G.in_edges(readout_gene) if edge[0] in net.nodes()}for readout_gene in ["ACAD9","ALDH1A1"]}
np.save('GO_007and005.npy',GO_007_005) 

In [39]:
GO_005={readout_gene : { edge[0] for edge in G.in_edges(readout_gene)} for readout_gene in ["UBE2C","hsa-miR-483-5p","C20orf111"]}
np.save('GO_005.npy',GO_005) 

In [34]:
[key for dici in [GO_005,GO_007,GO_007_005] for key in dici]

['APOM', 'TUBB1', 'SPHK1']

In [35]:
## Win test
res_size=207
in_size=1
i_scaling=1

In [None]:
matrix,dict_pos=build_adj_weighted_matrix(file,mapping_relabel)

In [None]:
Win=np.zeros((res_size,1+in_size))*i_scaling
Win[1,]

In [None]:
#print(GO_id_map["GO:0030220"])
for gene in GO_id_map["GO:0030220"]:
    print(dict_pos[gene])
    Win[dict_pos[gene],]=2
print(np.where(Win==2))

In [None]:
def input_matrix_just_genes_GOterm(Win,GOterm,GO_id_map):
    for gene in GO_id_map[GOterm]:
        Win[dict_pos[gene],]=np.random.uniform(0,1)
    return Win

In [None]:
Win=np.zeros((res_size,1+in_size))*i_scaling
input_matrix_just_genes_GOterm(Win,"GO:0030220",GO_id_map)