# Import Libraries

In [1]:
import networkx as nx
import numpy as np
import pandas as pd
from scipy import integrate
import timeit

# Create Graph From Data Source

but we need to create a dataframe later

In [2]:
edge_list = pd.read_csv('../Datasets/got.csv')
G = nx.from_pandas_edgelist(edge_list, edge_attr='weight')

# Define Backbone Function

In [3]:
from numba import jit

def disparity_filter(G, weight='weight'):    
    B = nx.Graph()
    for u in G:
        k = len(G[u])
        if k > 1:
            sum_w = sum(np.absolute(G[u][v][weight]) for v in G[u])
            for v in G[u]:
                w = G[u][v][weight]
                p_ij = float(np.absolute(w))/sum_w
                alpha_ij = 1 - (k-1) * integrate.quad(lambda x: (1-x)**(k-2), 0, p_ij)[0]
                B.add_edge(u, v, weight = w, alpha=float(alpha_ij))#float('%.4f' % alpha_ij)
    return B


# Use Backbone Function to Assign P-values or Scores

In [4]:

%timeit disparity_filter(G)
backbone = disparity_filter(G)


#print("--- %s seconds ---" % (time.time() - start_time))

12.5 ms ± 36.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Convert graph to dataframe because its more flexible to deal with

In [5]:
backbone = nx.to_pandas_edgelist(backbone)

# Filter edges using a threshold

In [6]:
threshold = 0.01
backbone[backbone['alpha'] < threshold]

Unnamed: 0,source,target,weight,alpha
112,Jaime,Brienne,88,0.000469
134,Tyrion,Sansa,77,0.003657
184,Arya,Sandor,46,0.007376


# Filter edges preserving a fraction of edges

In [7]:
fraction = int(0.3*len(G.edges()))

#sort values before filtering 
backbone = backbone.sort_values(by='alpha')

backbone[:fraction]

Unnamed: 0,source,target,weight,alpha
112,Jaime,Brienne,88,0.000469
134,Tyrion,Sansa,77,0.003657
184,Arya,Sandor,46,0.007376
156,Tyrion,Tywin,39,0.011613
129,Tyrion,Cersei,46,0.013247
...,...,...,...,...
244,Gregor,Sandor,12,0.332869
302,Daenerys,Viserys,8,0.335180
83,Jojen,Meera,33,0.338193
187,Arya,Robb,15,0.340816


In [8]:
backbone[(backbone['target'] == 'Jaime') & (backbone['source'] == 'Aerys')]

Unnamed: 0,source,target,weight,alpha
103,Aerys,Jaime,18,0.319586


In [9]:
import sys, warnings
import numpy as np
import pandas as pd
import networkx as nx
from collections import defaultdict
from scipy.stats import binom

def disparity_filter(table, undirected = False, return_self_loops = False):
   sys.stderr.write("Calculating DF score...\n")
   table = table.copy()
   table_sum = table.groupby(table["source"]).sum().reset_index()
   table_deg = table.groupby(table["source"]).count()["target"].reset_index()
   table = table.merge(table_sum, on = "source", how = "left", suffixes = ("", "_sum"))
   table = table.merge(table_deg, on = "source", how = "left", suffixes = ("", "_count"))
   table["score"] = 1.0 - ((1.0 - (table["weight"] / table["weight_sum"])) ** (table["target_count"] - 1))
   table["variance"] = (table["target_count"] ** 2) * (((20 + (4.0 * table["target_count"])) / ((table["target_count"] + 1.0) * (table["target_count"] + 2) * (table["target_count"] + 3))) - ((4.0) / ((table["target_count"] + 1.0) ** 2)))
   if not return_self_loops:
      table = table[table["source"] != table["target"]]
   if undirected:
      table["edge"] = table.apply(lambda x: "%s-%s" % (min(x["source"], x["target"]), max(x["source"], x["target"])), axis = 1)
      table_maxscore = table.groupby(by = "edge")["score"].max().reset_index()
      table_minvar = table.groupby(by = "edge")["variance"].min().reset_index()
      table = table.merge(table_maxscore, on = "edge", suffixes = ("_min", ""))
      table = table.merge(table_minvar, on = "edge", suffixes = ("_max", ""))
      table = table.drop_duplicates(subset = ["edge"])
      table = table.drop("edge", 1)
      table = table.drop("score_min", 1)
      table = table.drop("variance_max", 1)
   return table[["source", "target", "weight", "score", "variance"]]


In [10]:
edge_list = pd.read_csv('../Datasets/got.csv')
#G = nx.from_pandas_edgelist(edge_list, edge_attr='weight')



In [11]:
stable_sum = edge_list.groupby(edge_list["source"]).sum().reset_index()
ttable_sum = edge_list.groupby(edge_list["target"]).sum().reset_index()

stable_sum.columns = ['nodes', 'weight']
ttable_sum.columns = ['nodes', 'weight']

table_sum = pd.concat([stable_sum, ttable_sum]).groupby('nodes').sum().reset_index()
table_sum.columns = ['source', 'weight']


stable_sum = edge_list.groupby(edge_list["source"]).count()["target"].reset_index()
ttable_sum = edge_list.groupby(edge_list["target"]).count()["source"].reset_index()

stable_sum.columns = ['nodes', 'weight']
ttable_sum.columns = ['nodes', 'weight']

table_deg = pd.concat([stable_sum, ttable_sum]).groupby('nodes').sum().reset_index()
table_deg.columns = ['source', 'target']


table = edge_list.merge(table_sum, on = "source", how = "left", suffixes = ("", "_sum"))
table = table.merge(table_deg, on = "source", how = "left", suffixes = ("", "_count"))

table['weight_norm'] = table['weight']/table['weight_sum']


In [12]:

def calculate_disparity_pdf(row):
    return (1 - (row.target_count-1) * integrate.quad(lambda x: (1-x)**(row.target_count-2), 0, row.weight_sum)[0])

In [13]:
table['score'] = table.apply(calculate_disparity_pdf, axis=1)

  


In [14]:
deg = pd.DataFrame(G.degree(), columns=['nodes', 'degree'])

In [15]:
v = 'Jaime'

In [16]:
k = len(G[v])

In [17]:
u = 'Aerys'

In [18]:
p = G[u][v]['weight']/sum([G[w][v]['weight'] for w in G[v]])

In [19]:
1 - (k-1) * integrate.quad(lambda x: (1-x)**(k-2), 0, p)[0]

0.3195857521930672

In [20]:
table[(table['target'] == 'Jaime') & (table['source'] == 'Aerys')]

Unnamed: 0,source,target,weight,weight_sum,target_count,weight_norm,score
2,Aerys,Jaime,18,37,4,0.486486,-46656.0


In [21]:
table_deg[table_deg['source'] == 'Jaime']

Unnamed: 0,source,target
41,Jaime,24


In [22]:
table

Unnamed: 0,source,target,weight,weight_sum,target_count,weight_norm,score
0,Aemon,Grenn,5,74,5,0.067568,2.839824e+07
1,Aemon,Samwell,31,74,5,0.418919,2.839824e+07
2,Aerys,Jaime,18,37,4,0.486486,-4.665600e+04
3,Aerys,Robert,6,37,4,0.162162,-4.665600e+04
4,Aerys,Tyrion,5,37,4,0.135135,-4.665600e+04
...,...,...,...,...,...,...,...
347,Walder,Petyr,6,87,8,0.068966,-3.479278e+13
348,Walder,Roslin,6,87,8,0.068966,-3.479278e+13
349,Walton,Jaime,10,10,1,1.000000,1.000000e+00
350,Ygritte,Qhorin,7,82,4,0.085366,-5.314410e+05


In [23]:

def calculate_disparity_pdf(row):
    return (1 - (row.degree-1) * integrate.quad(lambda x: (1-x)**(row.degree-2), 0, row.weighted_degree)[0])

In [None]:
#source, target, weight, degree, weighted_degree, score

In [28]:
from scipy.stats import entropy
data = [1,2,2,3,3,3]

pd_series = pd.Series(data)
counts = pd_series.value_counts()
entropy = entropy(counts)

In [31]:
counts

3    3
2    2
1    1
dtype: int64