In [None]:
# Load IPython's autoreload extension and invoke it.
%load_ext autoreload
# NOTE(review): `%autoreload` is called without a mode argument, which presumably
# triggers a one-off reload rather than enabling reload-before-every-cell
# (`%autoreload 2`) — confirm which behaviour is intended.
%autoreload

In [None]:
# Run the rest of the notebook from the project root so relative paths resolve
# against it. Raises KeyError if the PROJECT_ROOT environment variable is unset.
import os as _os
_os.chdir(_os.environ['PROJECT_ROOT'])
# Echo the resolved working directory as a sanity check.
print(_os.path.realpath(_os.path.curdir))

In [None]:
import graph_tool as gt
import graph_tool.draw
import numpy as np
import pandas as pd
import scipy.sparse
import scipy as sp
from collections import defaultdict
from tqdm import tqdm

In [None]:
# Functions for constructing graphs
def path_to_edgelist(path):
    """Convert an ordered vertex path into its consecutive (source, target) edges.

    Parameters
    ----------
    path : sequence
        Vertices in visiting order. Must support slicing (list, tuple,
        numpy array, ...).

    Returns
    -------
    list of tuple
        One ``(u, v)`` pair per consecutive pair of vertices. A path with
        fewer than two vertices (including an empty path) yields ``[]``
        instead of raising ``IndexError``.
    """
    # Pair every vertex with its successor; zip stops at the shorter slice,
    # which makes empty and single-vertex paths fall out naturally.
    return list(zip(path, path[1:]))

def new_graph_from_merged_paths(paths, lengths, depths):
    """Build a graph from vertex paths and attach per-vertex annotations.

    Every path contributes its consecutive-vertex edges. The resulting graph
    carries these properties:

    * vertex ``depth`` (``vector<float>``), filled from ``depths`` via
      ``set_2d_array``;
    * vertex ``length`` (``int``), filled from ``lengths``;
    * graph ``nsample`` (``int``), set to ``len(depths)``;
    * vertex ``sequence`` (``object``), initialised to the singleton list
      ``[k]`` for the vertex with index ``k``.
    """
    graph = gt.Graph()
    for vertex_path in paths:
        graph.add_edge_list(path_to_edgelist(vertex_path))

    # Per-sample depth vectors, one vector per vertex.
    graph.vp['depth'] = graph.new_vp('vector<float>')
    graph.vp.depth.set_2d_array(depths)

    graph.vp['length'] = graph.new_vp('int', lengths)

    n_samples = len(depths)
    graph.gp['nsample'] = graph.new_gp('int', n_samples)

    # Each vertex starts out representing only itself.
    singleton_sequences = [[k] for k in range(graph.num_vertices())]
    graph.vp['sequence'] = graph.new_vp('object', vals=singleton_sequences)
    return graph

def get_depth_matrix(g, vs=None):
    """Return the depth values of ``g`` as a 2-D array with one row per sample.

    Parameters
    ----------
    g : graph
        Graph carrying a ``depth`` vertex property and an ``nsample`` graph
        property.
    vs : sequence of vertex indices, optional
        Restrict the columns to these vertices, in the given order. ``None``
        or an empty sequence selects every vertex.

    Returns
    -------
    array
        For the full matrix, whatever ``g.vp.depth.get_2d_array`` returns for
        positions ``0 .. nsample-1``; for a subset, the per-vertex depth
        vectors stacked as columns.
    """
    # Explicit None/empty test: truth-testing a multi-element numpy array raises.
    if vs is None or (hasattr(vs, '__len__') and len(vs) == 0):
        return g.vp.depth.get_2d_array(range(g.gp.nsample))
    # Read from the graph that was passed in (previously a global `_g`).
    return np.stack([g.vp.depth[i] for i in vs], axis=1)

In [None]:
def draw_graph(g, **kwargs):
    """Draw ``g`` with ``gt.draw.graph_draw`` using small-figure defaults.

    The defaults ``output_size=(300, 300)`` and ``ink_scale=0.8`` are applied
    unless the caller overrides them; all other keyword arguments are
    forwarded unchanged. Returns whatever ``graph_draw`` returns.
    """
    # Merge instead of passing the defaults as literal keywords, so that
    # overriding one of them no longer raises a duplicate-keyword TypeError.
    options = {'output_size': (300, 300), 'ink_scale': 0.8}
    options.update(kwargs)
    return gt.draw.graph_draw(g, **options)

In [None]:
# Toy example: a 4-vertex graph with a branch into vertex 3 and a self-loop on vertex 2.
paths = [[0, 1, 3], [2, 3], [2, 2]]

nsamples = 1
nnodes = 4

# Depths are simply 1..nsamples*nnodes laid out as (nsamples, nnodes);
# every vertex gets length 1.
g0 = new_graph_from_merged_paths(
    paths,
    lengths=np.ones(nnodes, dtype=int),
    depths=np.arange(1, nsamples * nnodes + 1).reshape(nsamples, nnodes).copy(),
)

In [None]:
# Display the full depth matrix stored on the toy graph.
get_depth_matrix(g0)

In [None]:
# Draw g0 twice with the same layout: first labelled by vertex index, then by
# depth. The return value of the first call is reused as `pos` for the second.
g0_pos = draw_graph(g0, vertex_text=g0.vertex_index)
draw_graph(g0, pos=g0_pos, vertex_text=g0.vp.depth)

In [None]:
%%time
# Prototype of the iterative flow estimate on g0: start from the adjacency
# matrix and repeatedly subtract the averaged row/column depth error.
# Runs a fixed 100 iterations with no convergence test.

sample_idx = 0
f0 = sp.sparse.csr_array(gt.spectral.adjacency(g0))
# Depth vector of the selected sample (first row of the one-row 2-D array).
d = g0.vp.depth.get_2d_array([sample_idx])[0]

# print(f0.toarray())
f = f0

# Keeps the flow matrix produced by every iteration.
hist = []
for _ in range(100):
    f_out = f
    # Row totals of the current flow and their deviation from the depth.
    f_total_out = f_out.sum(1)
    d_error_out = f_total_out - d
    # Scale each row by the reciprocal of its total; nan_to_num maps the inf
    # produced by a zero total to 1.0 (and any nan to 0.0).
    f_frac_out = (f_out.T * np.nan_to_num(1 / f_total_out, nan=0.0, posinf=1.0)).T
    # Spread each row's error over its entries in proportion to those fractions.
    allocated_d_error_out = (d_error_out * f_frac_out.T).T
    # Same computation on the transpose, i.e. using column totals of f.
    f_in = f_out.T
    f_total_in = f_in.sum(1)
    d_error_in = f_total_in - d
    f_frac_in = (f_in.T * np.nan_to_num(1 / f_total_in, nan=0.0, posinf=1.0)).T
    allocated_d_error_in = (d_error_in * f_frac_in.T).T
    # Average the two allocations (the transposed one is flipped back first)
    # and subtract the result from the current flow.
    mean_allocated_d_error = (allocated_d_error_in.T + allocated_d_error_out) / 2
    f = (f_out - mean_allocated_d_error)
    hist.append(f)
# print(f.toarray())

In [None]:
# Sanity check: ordering comparisons against NaN evaluate to False.
# NOTE(review): presumably relevant to the `loss_ratio < eps` test in
# calculated_interpolated_flow, whose first ratio is inf/inf — confirm.
np.nan < 1

In [None]:
def calculated_interpolated_flow(g, sample_idx, eps=1e-2, maxiter=100, verbose=False):
    """Iteratively adjust a flow matrix so its row and column totals match vertex depth.

    The flow starts as the matrix returned by ``gt.spectral.adjacency(g)``.
    Each iteration computes the difference between the row totals (and,
    separately, the column totals) and the depth vector of the chosen sample,
    spreads each difference over the entries of the corresponding row/column in
    proportion to their current share, and subtracts the mean of the two
    allocations from the flow.

    Parameters
    ----------
    g : graph
        Graph with a ``depth`` vertex property supporting ``get_2d_array``.
    sample_idx : int
        Which sample's depth vector to fit.
    eps : float, default 1e-2
        Stop once the relative decrease of the loss between two consecutive
        iterations drops below this value.
    maxiter : int, default 100
        Upper bound on the number of iterations.
    verbose : bool, default False
        Print the relative loss decrease at each iteration (the first iteration
        has nothing to compare against and prints nothing).

    Returns
    -------
    scipy sparse array
        The flow matrix at the point the loop stopped.
    """
    f = sp.sparse.csr_array(gt.spectral.adjacency(g))
    d = g.vp.depth.get_2d_array([sample_idx])[0]
    prev_loss = np.inf
    for _ in range(maxiter):
        f_out = f
        f_total_out = f_out.sum(1)
        d_error_out = f_total_out - d
        f_in = f_out.T
        f_total_in = f_in.sum(1)
        d_error_in = f_total_in - d
        loss = np.square(d_error_out).sum() + np.square(d_error_in).sum()

        # Exact fit: further updates would subtract zero, and the relative
        # decrease would be 0/0 (NaN), which never satisfies `< eps`.
        if loss == 0:
            break
        # The first iteration has no previous loss; inf/inf would be NaN.
        if np.isfinite(prev_loss):
            loss_ratio = (prev_loss - loss) / prev_loss
            if verbose:
                print(loss_ratio)
            if loss_ratio < eps:
                break
        prev_loss = loss

        # Zero totals yield inf reciprocals, which nan_to_num maps to 1.0;
        # silence the expected divide-by-zero warning.
        with np.errstate(divide='ignore'):
            inv_total_out = np.nan_to_num(1 / f_total_out, nan=0.0, posinf=1.0)
            inv_total_in = np.nan_to_num(1 / f_total_in, nan=0.0, posinf=1.0)

        f_frac_out = (f_out.T * inv_total_out).T
        allocated_d_error_out = (d_error_out * f_frac_out.T).T
        f_frac_in = (f_in.T * inv_total_in).T
        allocated_d_error_in = (d_error_in * f_frac_in.T).T
        # The column-side allocation lives on the transpose; flip it back
        # before averaging with the row-side allocation.
        mean_allocated_d_error = (allocated_d_error_in.T + allocated_d_error_out) / 2
        f = (f_out - mean_allocated_d_error)
    return f

In [None]:
# Synthetic benchmark graph: one path through all vertices in index order,
# three random 50,000-vertex paths, and 50,000 randomly chosen self-loops.
# NOTE(review): no random seed is set in this cell, so the graph differs between runs.
nvertices = 1_000_000
vs = list(range(nvertices))
paths = (
    [
        vs,  # A long genome
        list(np.random.choice(vs, 50000)), # Long-range interconnects
        list(np.random.choice(vs, 50000)),
        list(np.random.choice(vs, 50000)),
    ]
    + [[c, c] for c in np.random.choice(vs, 50000)] # Self-loops
)
# One sample with integer depths drawn uniformly from 0..9; every vertex has length 1.
g1 = new_graph_from_merged_paths(
    paths,
    depths=np.random.randint(0, 10, size=(1, nvertices)),
    lengths=np.array([1] * nvertices),
)
# Bare expression that is not the last statement of the cell.
g1

# Profile the flow estimate on the benchmark graph.
%prun f = calculated_interpolated_flow(g1, sample_idx=0, maxiter=1000, eps=1e-5)

In [None]:
# Second toy graph: a 3-cycle (0 -> 1 -> 2 -> 0), a self-loop on 0 and a tail 2 -> 3.
paths = [
    [0, 1, 2, 0],
    [0, 0],
    [2, 3],
]

nsamples = 1
nnodes = 4

g2 = new_graph_from_merged_paths(
    paths,
    depths=np.array([[100, 5, 50, 0]]),
    lengths=np.array([1] * nnodes),
)

# Build one scalar edge property per sample holding the rounded flow.
flow2 = []
for sample_idx in range(g2.gp.nsample):
    # Estimate the flow for *this* sample; previously the matrix was computed
    # once for sample 0 and reused for every sample in the loop.
    f2 = calculated_interpolated_flow(g2, sample_idx=sample_idx)
    print(f2.toarray())

    flow = g2.new_edge_property('float', val=0)
    for i, j in g2.get_edges():
        # The flow for edge (i, j) is read from entry [j, i] of the matrix.
        print((i, j), f2[j, i])
        flow[g2.edge(i, j)] = np.round(f2[j, i], 0)
    print(flow.a)
    flow2.append(flow)

# Combine the per-sample scalar properties into a single vector-valued one.
flow2 = gt.group_vector_property(flow2)
print(flow2.get_2d_array(pos=[0]))

g2.ep['flow'] = flow2


# Draw twice with a shared layout: vertex indices first, then depths with flow on the edges.
g2_pos = gt.draw.graph_draw(g2, output_size=(300, 300), ink_scale=1.0, vertex_text=g2.vertex_index, fit_view_ink=True)
gt.draw.graph_draw(g2, output_size=(300, 300), ink_scale=1.0, pos=g2_pos, vertex_text=g2.vp.depth, edge_text=flow2, fit_view_ink=True)

In [None]:
# Scratch: diagonal matrix holding the reciprocal of the axis-0 sums of f[1],
# combined with the adjacency matrix of `_g`.
# NOTE(review): `_g` is not defined in the visible cells, and `f[1]` presumably
# refers to the per-sample flow list assigned in a later cell — confirm the
# intended execution order.
inverse_total_inflow = sp.sparse.csr_array(sp.sparse.diags(1 / f[1].sum(axis=0)))
# NOTE(review): `*` between scipy sparse *arrays* is element-wise; if a matrix
# product with the diagonal was intended, `@` would be needed — confirm.
frac_contribution = sp.sparse.csr_array(gt.spectral.adjacency(_g)) * inverse_total_inflow

In [None]:
%%time
# Scratch: time a single-sample flow estimate with its optimisation trace.
# NOTE(review): `estimate_flow` is not defined in the visible cells.
i = 2

d = _g.vp.depth.get_2d_array([i])
a = sp.sparse.csr_array(gt.spectral.adjacency(_g))
f_history, loss_history = estimate_flow(a, d, epsilon=1e-10, return_trace=True)

In [None]:
def estimate_flow_all_samples(g, samples=None):
    """Estimate one flow matrix per sample of ``g``.

    Parameters
    ----------
    g : graph
        Graph with ``depth`` vertex data and an ``nsample`` graph property.
    samples : iterable of int, optional
        Sample indices to process. Defaults to every sample,
        ``range(g.gp.nsample)``.

    Returns
    -------
    list
        One ``csr_array`` per requested sample, in the order given.

    Notes
    -----
    Relies on ``estimate_flow(a, d, epsilon=...)``, which is not defined in
    this part of the notebook.
    """
    if samples is None:
        # The graph property is named 'nsample' (singular); reading
        # `nsamples` raised before.
        samples = range(g.gp.nsample)

    d = get_depth_matrix(g)
    a = sp.sparse.csr_array(gt.spectral.adjacency(g))
    # Row i of the depth matrix is the depth vector of sample i.
    return [
        sp.sparse.csr_array(estimate_flow(a, d[i], epsilon=1e-2))
        for i in samples
    ]

In [None]:
# Estimate flows for samples 1 and 2 only; `f` becomes a two-element list.
# NOTE(review): `_g` is not defined in the visible cells.
f = estimate_flow_all_samples(_g, samples=[1, 2])

In [None]:
def get_all_edge_values_from_matrix(g, x):
    """Look up the entry of ``x`` associated with every edge of ``g``.

    For each edge ``(source, target)`` returned by ``g.get_edges()`` the value
    ``x[target, source]`` is read; the result follows the order of
    ``g.get_edges()``.
    """
    sources, targets = g.get_edges().T
    # The matrix is indexed as [target, source].
    values = x[targets, sources]
    return values

In [None]:
# Scratch probes of the edge-property API on `_g`.
# NOTE(review): `_g` is not defined in the visible cells; this first call
# passes no value type, unlike every other new_edge_property call here.
_g.new_edge_property()

In [None]:
# Size of the array that backs a freshly created float edge property.
_g.new_edge_property('float').a.shape

In [None]:
# Shape of the edge list when an edge property is requested alongside it.
# NOTE(review): `p` is only assigned in a later cell — confirm execution order.
_g.get_edges([p]).shape

In [None]:
def edge_property_from_matrix(g, x):
    """Create a float edge property holding ``x[target, source]`` for every edge.

    Parameters
    ----------
    g : graph
        Graph whose edges receive the values.
    x : 2-D indexable
        Matrix indexed as ``x[target, source]``.

    Returns
    -------
    edge property map
        A new ``'float'`` edge property of ``g``.
    """
    p = g.new_edge_property('float')
    # Ask for the edge index next to each (source, target) pair so each value
    # is written to the slot of its own edge. Writing to the first E slots is
    # only right when the edge indices are exactly 0..E-1, which stops being
    # true once there are gaps in the index range.
    edges = g.get_edges([g.edge_index])
    ii = edges[:, 0]
    jj = edges[:, 1]
    slots = edges[:, 2].astype(np.int64)
    p.a[slots] = x[jj, ii]
    return p

In [None]:
def get_matrix_from_edge_property(g, p):
    """Return the adjacency matrix of ``g`` weighted by edge property ``p`` as a ``csr_array``."""
    weighted_adjacency = gt.spectral.adjacency(g, weight=p)
    return sp.sparse.csr_array(weighted_adjacency)

# Round-trip check: flow matrix -> edge property -> matrix, then compare totals.
# NOTE(review): `_g` is not defined in the visible cells.
x0 = f[1]
p = edge_property_from_matrix(_g, x0)
x1 = get_matrix_from_edge_property(_g, p)

# The two sums should agree if the round trip is lossless.
x0.sum(), x1.sum()

In [None]:
# Scratch cells from here on probe why the matrix <-> edge-property round trip
# above does or does not preserve values.
# NOTE(review): `_g`, `x`, `plt` and `b` are not defined or imported in the
# visible cells, so these cells depend on state from elsewhere — confirm, or
# remove them before sharing the notebook.
get_all_edge_values_from_matrix(_g, x).shape

In [None]:
_g.new_edge_property('int', val=1).a.shape

In [None]:
_g.get_edges()

In [None]:
# Per-edge values read from the original matrix (x0) ...
plt.plot(get_all_edge_values_from_matrix(_g, x0))

In [None]:
# ... and from the round-tripped matrix (x1).
plt.plot(get_all_edge_values_from_matrix(_g, x1))

In [None]:
f[0][a, b].shape

In [None]:
# Sum of the dense flow at the [target, source] position of every edge.
z = f[0].toarray()
ii, jj = _g.get_edges().T
z[jj, ii].sum()

In [None]:
z[jj, ii]

In [None]:
# Total of the whole matrix, for comparison with the per-edge sum above.
f[0].toarray().sum()

In [None]:
# Inspect one specific edge and the neighbourhood of vertex 146.
_g.get_edges()[252]

In [None]:
_g.get_in_edges(146)

In [None]:
_g.get_out_edges(146)

In [None]:
# Diagonal entry of the flow matrix for vertex 146.
f[1][146, 146]

In [None]:
x

In [None]:
# Fraction of matrix positions where x0 and x1 agree on being positive.
((x0.toarray() > 0) == (x1.toarray() > 0)).mean()

In [None]:
x1#.toarray().sum()

In [None]:
x0#.toarray().sum()

In [None]:
get_all_edge_values_from_matrix(_g, x0)

In [None]:
x0

In [None]:
get_all_edge_values_from_matrix(_g, x0).shape

In [None]:
f[1].toarray()

In [None]:
# Compare how many entries exceed a threshold before and after the round trip.
(x0 > 0).sum()

In [None]:
(x1 > 0).sum()

In [None]:
(x0 > 0.5).sum()

In [None]:
(x1 > 0.5).sum()

In [None]:
x1.toarray()

In [None]:
# Count entries exactly equal to 0.75 in each matrix.
(x1 == 0.75).sum()

In [None]:
(x0 == 0.75).toarray().sum()

In [None]:
# Row sums of the round-tripped matrix, computed two ways.
x1.toarray().sum(1)

In [None]:
get_matrix_from_edge_property(_g, p).toarray().sum(1)

In [None]:
x.toarray()

In [None]:
x.toarray().sum()

In [None]:
# Time the conversion of one flow matrix to a dense array.
%time f[1].toarray()

In [None]:
e = _g.new_edge_property('vector<float>')

In [None]:
f[1].toarray().shape

In [None]:
_g.get_edges()