In [4]:
import itertools
import numpy as np
import pandas as pd
import networkx as nx

import iqplot
import bokeh.io
bokeh.io.output_notebook()
import panel as pn
pn.extension()

def style(p, autohide=False):
    """Apply the notebook's shared look to a figure and return it.

    Sets the title font/size/alignment, the axis-label font/size/style on
    both axes, and makes the background fully transparent.

    Parameters
    ----------
    p : figure-like object exposing ``title``, ``xaxis``, ``yaxis`` and
        ``toolbar`` attributes (modified in place).
    autohide : bool
        When true, ``p.toolbar.autohide`` is switched on; otherwise the
        toolbar is left untouched.

    Returns
    -------
    The same object ``p`` that was passed in.
    """
    font = "Helvetica"

    title = p.title
    title.text_font = font
    title.text_font_size = "16px"
    title.align = "center"

    # Identical label styling on both axes.
    for axis in (p.xaxis, p.yaxis):
        axis.axis_label_text_font = font
        axis.axis_label_text_font_size = "13px"
        axis.axis_label_text_font_style = "normal"

    p.background_fill_alpha = 0
    if autohide:
        p.toolbar.autohide = True
    return p

**144 sets worth looking at:**
- 144.2.2: triangles
- 144.2.4: Erdos, SSBM, crawler
- 144.3.2: heavy tails, linear on log-log
- 144.3.4: preferential attachment vs. configuration model
- 144.4.5: colors?

- find XOR gate from 2.2

In [22]:
# Load the TF -> gene regulation table: tab-separated, lines starting with "#"
# are treated as comments.
# NOTE(review): the displayed frame is indexed by an "ECK..." identifier, which
# suggests the file has one more column than `names` lists — confirm.
df = pd.read_csv("network_tf_gene.txt", delimiter="\t", comment="#", 
        names=["TF","_gene","gene","regulation","?","evidence","??"])

# Keep only the columns used downstream. `.copy()` gives an independent frame so
# the column assignments below do not trigger SettingWithCopyWarning.
df = df[["TF", "gene", "regulation"]].copy()

# Normalise names to lower case so TF and gene labels refer to the same node.
# `.str.lower()` is vectorised and leaves missing values as NaN instead of raising.
df["TF"] = df["TF"].str.lower()
df["gene"] = df["gene"].str.lower()

# Undirected graph of the network; pass `create_using=nx.DiGraph()` to keep
# the TF -> gene direction.
G = nx.from_pandas_edgelist(df, source="TF", target="gene")

df.head()

Unnamed: 0,TF,gene,regulation
ECK125286586,accb,accb,-
ECK125286586,accb,accc,-
ECK120015994,acrr,acra,-
ECK120015994,acrr,acrb,-
ECK120015994,acrr,acrr,-


# helper functions

In [19]:
# ................. PLOT GRAPH .................
from bokeh.models import Circle, MultiLine, Range1d
from bokeh.plotting import from_networkx

def plotter(G, layout, title,
            n_color="#ababab",
            e_color="#000000",
            n_size=3,
            e_thick=0.3):
    """Draw graph ``G`` with Bokeh, colouring edges by regulation sign.

    Edge signs are looked up in the module-level ``df`` table (columns
    ``TF``, ``gene``, ``regulation``): ``'+'`` edges are green, ``'-'`` edges
    are red, and every other edge (sign ``'?'`` or edge not found in the
    table) is white.  When the same (TF, gene) pair occurs more than once in
    ``df`` the first row wins.  For an undirected graph an edge may be
    reported as (gene, TF), so both orientations are tried.

    Side effect: the chosen colours are stored on ``G`` as the edge attribute
    ``"edge_color"``.

    Parameters
    ----------
    G : networkx graph
    layout : str
        One of ``spring``, ``random``, ``radial``, ``kamada-kawai``
        (the legacy spelling ``kamda-kawai`` is still accepted), ``spectral``.
    title : str
        Figure title.
    n_color : str
        Fill and outline colour of the node glyphs.
    e_color : str
        Currently unused (edge colours come from the regulation sign); kept
        so existing calls keep working.
    n_size : number
        Node glyph size.
    e_thick : number
        Edge line width.

    Returns
    -------
    The styled Bokeh figure (result of ``style(p)``).

    Raises
    ------
    ValueError
        If ``layout`` is not one of the supported names.
    """
    # ****************** SETTING POSITIVE ARROWS GREEN, NEGATIVE ARROWS RED ******************
    E_COLOR_POS = 'green'
    E_COLOR_NEG = 'red'
    E_COLOR_NULL = 'white'
    sign_to_color = {'+': E_COLOR_POS, '-': E_COLOR_NEG}

    # layout name -> (networkx layout function, extra kwargs for from_networkx)
    layouts = {
        "spring": (nx.spring_layout, dict(scale=1, center=(0, 0))),
        "random": (nx.random_layout, dict(center=(0, 0))),
        "radial": (nx.circular_layout, {}),
        "kamda-kawai": (nx.kamada_kawai_layout, {}),    # legacy spelling
        "kamada-kawai": (nx.kamada_kawai_layout, {}),
        "spectral": (nx.spectral_layout, {}),
    }
    # Validate up front so a typo fails with a clear message instead of an
    # unbound-variable error further down.
    if layout not in layouts:
        raise ValueError(
            f"unknown layout {layout!r}; expected one of {sorted(layouts)}")

    # Build the (TF, gene) -> sign lookup once instead of masking the whole
    # table for every edge.  setdefault keeps the first occurrence.
    regulation_of = {}
    for tf, gene, sign in zip(df['TF'], df['gene'], df['regulation']):
        regulation_of.setdefault((tf, gene), sign)

    directed = G.is_directed()
    edge_attrs = {}
    for start_node, end_node in G.edges():
        sign = regulation_of.get((start_node, end_node))
        if sign is None and not directed:
            sign = regulation_of.get((end_node, start_node))
        # Anything that is not '+' or '-' gets the null colour, so no edge
        # ever inherits the colour of the previous one.
        edge_attrs[(start_node, end_node)] = sign_to_color.get(sign, E_COLOR_NULL)
    nx.set_edge_attributes(G, edge_attrs, "edge_color")

    p = bokeh.plotting.figure(x_range=(-1.2,1.2), y_range=(-1.2,1.2), title=title, height=800, width=800)
    layout_function, layout_kwargs = layouts[layout]
    network_graph = from_networkx(G, layout_function, **layout_kwargs)

    network_graph.node_renderer.glyph = Circle(size=n_size, line_color=n_color, fill_color=n_color)
    # line_color refers to the "edge_color" edge attribute set above.
    network_graph.edge_renderer.glyph = MultiLine(line_color="edge_color", line_width=e_thick)

    p.renderers.append(network_graph)
    p.xaxis.visible,p.yaxis.visible = False, False
    return style(p)

In [20]:
# ................. IN & OUT DEGREE .................
def in_out_degrees(G):
    """Return the in-degree and out-degree of every node of ``G``.

    Degrees are counted from the adjacency ``G[node]``: the out-degree of a
    node is the number of distinct neighbours listed in ``G[node]`` and the
    in-degree is the number of distinct nodes that list it.  For an
    undirected graph both lists therefore hold the number of neighbours.

    Both lists follow the order of ``G.nodes()`` and contain one entry per
    node, including zeros, so they can be compared element-wise.

    Parameters
    ----------
    G : graph-like object providing ``nodes()`` and ``G[node]`` adjacency.

    Returns
    -------
    (in_degrees, out_degrees) : tuple of two lists of int
    """
    nodes = list(G.nodes())

    # Out-degree: how many nodes this node points to (not G.degree, which on
    # a directed graph is in-degree + out-degree).
    out_degrees = [len(G[node]) for node in nodes]

    # In-degree: start every node at zero so nodes without incoming edges are
    # still reported, then count one for each node that points at it.
    in_counts = dict.fromkeys(nodes, 0)
    for node in nodes:
        for successor in G[node]:
            in_counts[successor] += 1
    in_degrees = [in_counts[node] for node in nodes]

    return in_degrees, out_degrees


# ............. IN & OUT DEGREE PLOTTER .............
def in_out_plotter(in_degrees, out_degrees, palette=['navy','darkorange']):
    """Plot degree distributions side by side.

    Left panel: ECDF of the out-degrees (``palette[0]``) with the in-degrees
    (``palette[1]``) drawn on the same figure.  Right panel: complementary
    ECDF of both on log-log axes, with legend entries "in-degrees" and
    "out-degrees".

    Parameters
    ----------
    in_degrees, out_degrees : sequences of numbers
    palette : two colours, ``[out-degree colour, in-degree colour]``

    Returns
    -------
    A Bokeh layout holding the two figures in one row.
    """
    # --- ECDF on linear axes: out-degrees first, in-degrees overlaid ---
    ecdf_fig = iqplot.ecdf(np.array(out_degrees), palette=[palette[0]])
    ecdf_fig = iqplot.ecdf(np.array(in_degrees), palette=[palette[1]], p=ecdf_fig)

    # --- complementary ECDF on log-log axes (to eyeball heavy tails) ---
    loglog = dict(x_axis_type="log", y_axis_type="log", complementary=True)
    eccdf_fig = iqplot.ecdf(
        np.array(in_degrees),
        palette=[palette[1]],
        legend_label="in-degrees",
        **loglog,
    )
    eccdf_fig = iqplot.ecdf(
        np.array(out_degrees),
        palette=[palette[0]],
        legend_label="out-degrees",
        p=eccdf_fig,
        **loglog,
    )

    return bokeh.layouts.layout([[ecdf_fig, eccdf_fig]])


# ............. GETTING STRONGLY / WEAKLY CONNECTED COMPONENTS .............
def get_strongly_cc(G, node):
    """Return the strongly connected component of ``G`` containing ``node``.

    Returns an empty set when no component contains ``node``.
    """
    containing = (cc for cc in nx.strongly_connected_components(G) if node in cc)
    return next(containing, set())

def get_weakly_cc(G, node):
    """Return the weakly connected component of ``G`` containing ``node``.

    Returns an empty set when no component contains ``node``.
    """
    containing = (cc for cc in nx.weakly_connected_components(G) if node in cc)
    return next(containing, set())

# plotting

In [23]:
# Draw the network with a spring layout (edges coloured by regulation sign
# inside `plotter`) and display the figure inline.
p = plotter(G, "spring", "E. Coli Transcriptional Factor Network")
bokeh.io.show(p)

### observations:
- one large strongly connected component
- many external small clusters
- is networkx drawing self loops? (maybe isolate the ones that are, then manually circle them or something)


# ideas??

In [10]:
# Rebuild G as a directed graph: each row of df becomes an edge TF -> gene.
G = nx.from_pandas_edgelist(df, source="TF", target="gene", 
                            create_using=nx.DiGraph())

- make subgraph of motif you want to look for
- use networkx to search for instances of that motif
- look at in-degree vs. out-degree
- scale free? heavy tails? linear on log-log necessary but not sufficient?
- given the in-degree/out-degree graph, the random graph is not what we should be comparing our statistics to bc in an I1FFL with ABC

In [11]:
nodes = G.nodes()
degrees = [G.degree(n) for n in nodes]

# Reference feed-forward-loop motif: A -> B, A -> C, B -> C.
# Every adjacency value is a list: a bare string such as 'C' only works
# because it is iterated character by character, which would split any
# multi-character node name.
d_feedforward = {'A': ['B', 'C'], 'B': ['C']}
G_feedforward = nx.DiGraph(d_feedforward)

In [12]:
in_degrees, out_degrees = in_out_degrees(G)

In [13]:
p = in_out_plotter(in_degrees, out_degrees)
bokeh.io.show(p)

- plot ecdf of out degree on log-log axis... note that this is *insufficient* to prove heavy tails.

- look at distribution of positive and negative regulation. what changes when we compare the statistics of FFL distributions with a positive and negative regulation distribution above, instead of with a purely random graph?  
- we learned about how starting with complete graph and removing edges is not the same as starting with an empty graph and filling edges in a similar manner, pruning is not the same as filling?
- find out what kind of survival story would match the observation of long-tailed out-degrees and tight in-degree distributions (show some preferential-attachment-type simulation?). a broader out-degree distribution means there are many nodes that regulate far more things than they are regulated by, which might make sense? a node is regulated by at most 10-ish other things, but a single node can be capable of regulating many other things. from the spring-layout networkx visualization of the graph, we would already expect this type of distribution. 
- how does the precise in-degree and out-degree of a node affect the way we model the kinetics and binding? (the concentration would be subject to much more noise, or its dx/dt would not be purely determined by the model you've written alone). 
- clustering coefficient? 
- path metric might be interesting, especially to see how one could be regulated by toggling any other / general response time and sparsity/density of the graph. remember that a path length for a directed graph is
- hierarchical network

In [14]:
node = "accb"
weak_component = get_weakly_cc(G, node)  # Weakly connected component of node in G
strong_component = get_strongly_cc(G, node)  # Strongly connected component of node in G

In [15]:
component = G.subgraph(list(weak_component))

In [16]:
nx.average_shortest_path_length(component, weight=None, method=None)

0.018552063685871444

In [17]:
# Average shortest path length of directed Erdős–Rényi graphs with the same
# number of nodes as `component`, for a few edge probabilities.
# A fixed seed makes the printed values reproducible on re-run; the loop
# variable is not called `p`, so it does not overwrite the plot stored in `p`.
ERDOS_SEED = 0
for edge_prob in [0.8, 0.9, 1.0]:
    _G_erdos = nx.erdos_renyi_graph(len(component), edge_prob,
                                    seed=ERDOS_SEED, directed=True)
    print(np.round(edge_prob, 3), ':\t', 
          np.round(nx.average_shortest_path_length(_G_erdos), 4))

0.8 :	 1.1999
0.9 :	 1.1001
1.0 :	 1.0


In [18]:
# The full graph is not weakly connected, so networkx refuses to compute an
# average shortest path length for it.  Catch and show the error so the
# notebook still runs top to bottom.
try:
    print(nx.average_shortest_path_length(G, weight=None, method=None))
except nx.NetworkXError as err:
    print(f"NetworkXError: {err}")

NetworkXError: Graph is not weakly connected.

# diameter

I used the method `average_shortest_path_length` for the average diameter. <br>
For the maximal diameter, I used `all_pairs_dijkstra` to iterate through all pairs and stored all unique path lengths. This is really inefficient (there's probably a better way to do this), and I think I'm double searching all pairs (a, b) and (b, a). The max of the set came out to 5. 

In [None]:
# average diameter ......................................................................
# average_shortest_path_length raises NetworkXError on a graph that is not
# (weakly) connected, so it is evaluated on the largest (weakly) connected
# component of G.
if G.is_directed():
    components = nx.weakly_connected_components(G)
else:
    components = nx.connected_components(G)
largest_component = G.subgraph(max(components, key=len))
average_path = nx.average_shortest_path_length(largest_component)

# maximal diameter ......................................................................
# all_pairs_dijkstra yields (source, (distances, paths)) once per source node.
# Iterate the generator directly (no externally defined loop bound) and keep
# every distinct shortest-path length between reachable ordered pairs.
paths = set()                               # store all path lengths
for _, (distances, _) in nx.all_pairs_dijkstra(G):
    paths.update(distances.values())
mx = max(paths, default=0)                  # longest shortest path found

print("average diameter: ", np.round(average_path,4))
print("maximum diameter: ", mx)

In [None]:
# calculating average clustering coefficient..............................
average_clustering = nx.average_clustering(G)

# calculating overall clustering coefficient..............................
# Triangle counting is only defined for undirected graphs, so work on an
# undirected copy without self-loops (a self-loop is neither part of a
# triangle nor of a connected triple).
G_simple = nx.Graph(G)
G_simple.remove_edges_from(list(nx.selfloop_edges(G_simple)))

# nx.triangles counts, per node, the triangles through that node, so every
# triangle is counted three times.
triangles = sum(nx.triangles(G_simple).values()) / 3.0

# A node of degree v is the centre of v*(v-1)/2 connected triples.
d_degree = dict(G_simple.degree())
connected_triples = sum(v * (v - 1) / 2 for v in d_degree.values())

# overall clustering = 3 * (#triangles) / (#connected triples); the factor 3
# accounts for each triangle closing three triples (same definition as
# nx.transitivity).  Guard against graphs with no connected triple.
overall_clustering = (3.0 * triangles / connected_triples
                      if connected_triples else 0.0)

In [None]:
print("average clustering: ", np.round(average_clustering, 4))
print("overall clustering: ", np.round(overall_clustering, 4))

# autoregulation 

In [93]:
np.unique(df.regulation.values, return_counts=True)

(array(['+', '-', '?'], dtype=object), array([2459, 2214,   20]))

In [94]:
# Excess of '+' over '-' interactions, computed from the table rather than
# from numbers copied by hand out of the previous output.
regulation_counts = df["regulation"].value_counts()
regulation_counts["+"] - regulation_counts["-"]

245

In [95]:
# Count autoregulated nodes (nodes with an edge to themselves) by the sign of
# the first matching self-edge row in df.
self_signs = []
for node in G.nodes():
    if node not in G[node]:
        continue
    is_self_edge = (df["TF"] == node) & (df["gene"] == node)
    self_signs.append(df.loc[is_self_edge, 'regulation'].values[0])

n_neg = self_signs.count('-')
n_pos = self_signs.count('+')

print(f'# of negative: {n_neg}\n# of positive: {n_pos}')

# of negative: 81
# of positive: 39


# cycles

In [97]:
def out_edges(G, node):
    """Return the end nodes of the edges that ``G.edges(node)`` reports for ``node``."""
    targets = []
    for edge in G.edges(node):
        targets.append(edge[1])   # edge is (node, target)
    return targets

def find_ffls(G):
    """Find all feed-forward loops (n1 -> n2 -> n3 together with n1 -> n3).

    The three nodes of a reported loop are pairwise distinct, so self-loops
    never take part.  Loops are reported in the order they are discovered
    while iterating over ``G`` and over each node's ``G.edges(node)``.

    Parameters
    ----------
    G : graph-like object that is iterable over its nodes and provides
        ``G.edges(node)`` yielding ``(node, target)`` pairs.

    Returns
    -------
    numpy.ndarray
        One row ``[n1, n2, n3]`` per feed-forward loop; an empty array when
        none is found.
    """
    # Compute every node's targets once.  The list keeps iteration order;
    # the set makes the "does n1 also point at n3?" test constant-time
    # instead of rebuilding and scanning a list for every candidate.
    successors = {node: [edge[1] for edge in G.edges(node)] for node in G}
    successor_sets = {node: set(targets) for node, targets in successors.items()}

    ffls = []
    for n1 in G:
        direct_targets = successor_sets[n1]
        for n2 in successors[n1]:
            if n2 == n1:
                continue
            for n3 in successors[n2]:
                if n3 == n1 or n3 == n2:
                    continue
                if n3 in direct_targets:
                    ffls.append([n1, n2, n3])
    return np.array(ffls)

def classify_ffls(G, ffls):
    """Count feed-forward loops by type (C1-C4 coherent, I1-I4 incoherent).

    Each loop ``[A, B, C]`` is classified by the signs of its three edges in
    the order A->B, B->C, A->C.  Loops whose sign pattern is not one of the
    eight known types (for example because an edge is marked ``'?'``), or
    whose edges cannot be found in the table, are not counted.

    Parameters
    ----------
    G : pandas.DataFrame or any other object
        When a DataFrame is passed it is used as the regulation table and
        must have the columns ``TF``, ``gene`` and ``regulation``.  For any
        other value (e.g. a graph) the module-level ``df`` table is used.
        The parameter name is kept for backward compatibility.
    ffls : iterable of 3-item sequences ``[A, B, C]``

    Returns
    -------
    dict
        Mapping of type label (``'C1'`` ... ``'I4'``) to count.
    """
    dict_ffls = {'+++':'C1', '-+-':'C2', '+--':'C3', '--+':'C4',
                 '+-+':'I1', '---':'I2', '++-':'I3', '-++':'I4'
                }
    count_ffls = {'C1':0, 'C2':0, 'C3':0, 'C4':0, 
                  'I1':0, 'I2':0, 'I3':0, 'I4':0}

    table = G if isinstance(G, pd.DataFrame) else df

    # Build the (TF, gene) -> sign lookup once; setdefault keeps the first
    # row when a pair appears several times.
    sign_of = {}
    for tf, gene, sign in zip(table['TF'], table['gene'], table['regulation']):
        sign_of.setdefault((tf, gene), sign)

    for a, b, c in ffls:
        signs = (sign_of.get((a, b)), sign_of.get((b, c)), sign_of.get((a, c)))
        # Missing edges (None) or missing sign values are unclassifiable.
        if not all(isinstance(sign, str) for sign in signs):
            continue
        # there are some question marks in the dataset: such patterns are
        # not in dict_ffls and are skipped
        ffl_type = dict_ffls.get(''.join(signs))
        if ffl_type is not None:
            count_ffls[ffl_type] += 1
    return count_ffls

In [98]:
# Rebuild the directed TF -> gene graph, enumerate its feed-forward loops and
# tally them by sign pattern.
G = nx.from_pandas_edgelist(
    df, source="TF", target="gene", create_using=nx.DiGraph())
ffls = find_ffls(G)
# NOTE(review): the first parameter of classify_ffls is named `G`, but `df`
# is passed here — confirm which one is intended.
count_ffls = classify_ffls(df, ffls)

In [99]:
count_ffls

{'C1': 541,
 'C2': 356,
 'C3': 112,
 'C4': 118,
 'I1': 306,
 'I2': 216,
 'I3': 125,
 'I4': 179}

## Stochastic Block Model: $\text{SBM}(n, p, W)$

In [40]:
n, k, A, B = 30, 3, 0.7, 0.1

In [41]:
# Seed the global NumPy generator so the sampled labels are reproducible.
SBM_SEED = 0
np.random.seed(SBM_SEED)

# Uniform probability vector over the k communities (derived from k rather
# than hard-coded for k = 3).  For a random vector instead, normalise
# np.random.rand(k) by its sum.
p = np.full(k, 1 / k)
community_labels = np.random.choice(np.arange(k), p=p, size=n)     # community labels
community_labels

array([1, 1, 0, 0, 0, 2, 0, 1, 1, 1, 0, 2, 1, 2, 1, 0, 1, 0, 2, 2, 1, 2,
       0, 2, 0, 2, 1, 0, 1, 0])

In [42]:
# k x k matrix with A on the diagonal and B everywhere else.
W = np.full((k, k), B, dtype=float)
np.fill_diagonal(W, A)
W

array([[0.7, 0.1, 0.1],
       [0.1, 0.7, 0.1],
       [0.1, 0.1, 0.7]])

In [43]:
# Sample every unordered pair {i, j} with j < i exactly once.  `range(i)`
# includes j = i - 1, so adjacent indices can be connected as well.
edges_ssbm = []
for i in range(n):
    for j in range(i):
        ii, jj = community_labels[i], community_labels[j]
        if np.random.rand() <= W[ii][jj]: 
            edges_ssbm.append({'source': i, 'target': j})

# Build the frame once from the collected records (row-by-row
# DataFrame.append is quadratic and no longer exists in pandas 2).
# Stored under its own name so the regulation table `df` is not overwritten.
df_ssbm = pd.DataFrame(edges_ssbm, columns=['source', 'target'])

G_ssbm = nx.from_pandas_edgelist(df_ssbm)
G_ssbm.add_nodes_from(range(n))   # keep nodes that ended up without any edge

In [None]:
# in-degree / out-degree ECDF (left) and complementary ECDF on log-log axes
# (right).  Uses the shared helper instead of repeating its body, with the
# helper's default navy / darkorange palette.
p = in_out_plotter(in_degrees, out_degrees)
bokeh.io.show(p)

In [1]:
import manimlib