In [5]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib as mpl
from scipy.stats import gaussian_kde
from scipy.optimize import curve_fit
import csv

%matplotlib inline

In [None]:
def plot_degree_dist(G, histogram_num):
    """Plot the degree distribution P(k) of ``G`` on log-log axes.

    Node degrees are histogrammed into logarithmically spaced bins and the
    density of each bin is drawn at the bin's geometric midpoint.  The figure
    is left as the current pyplot figure (``plt.show()`` is not called here),
    so the caller can still add a title or further artists before showing it.

    Parameters
    ----------
    G : graph
        Any object exposing ``nodes()`` and ``degree(n)``.
    histogram_num : int
        Number of bin *edges* to generate, i.e. ``histogram_num - 1`` bins.

    Raises
    ------
    ValueError
        If ``G`` has no nodes, or if no node has a positive degree (nothing
        can be placed on a logarithmic axis in that case).
    """
    degrees = [G.degree(n) for n in G.nodes()]
    if not degrees:
        raise ValueError("cannot plot the degree distribution of a graph without nodes")

    kmax = max(degrees)
    if kmax <= 0:
        raise ValueError("a log-log degree distribution needs at least one node with positive degree")

    # Degree-0 nodes cannot be represented on a log axis, so the lowest edge is
    # clamped to 1; those nodes fall outside the bins and are not counted.
    lowest = max(min(degrees), 1)

    bin_edges = np.logspace(np.log10(lowest), np.log10(kmax), num=histogram_num)
    # 10**log10(v) does not always round-trip exactly; pin the outer edges so
    # the smallest and largest degrees are guaranteed to land inside a bin.
    bin_edges[0] = lowest
    bin_edges[-1] = kmax
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    fig, ax = plt.subplots(figsize=(6, 4))

    # x position of each point: midpoint of the bin in log space.
    log_be = np.log10(bin_edges)
    x = 10**((log_be[1:] + log_be[:-1])/2)
    ax.loglog(x, density, marker='o', linestyle='none')
    ax.set_xlabel(r"degree $k$", fontsize=16)
    ax.set_ylabel(r"$P(k)$", fontsize=16)

    # Drop the right/top frame lines and keep ticks on the remaining sides.
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

In [None]:
def plot_degree_dist_lin(G, histogram_num):
    """Plot the degree distribution P(k) of ``G`` on linear axes.

    Node degrees are histogrammed into linearly spaced bins between the
    minimum and maximum degree, and the density of each bin is drawn at the
    bin's arithmetic midpoint.  The figure is displayed with ``plt.show()``.

    Parameters
    ----------
    G : graph
        Any object exposing ``nodes()`` and ``degree(n)``.
    histogram_num : int
        Number of bin *edges* to generate, i.e. ``histogram_num - 1`` bins.
    """
    degrees = [G.degree(n) for n in G.nodes()]
    kmin = min(degrees)
    kmax = max(degrees)

    # Linearly spaced bin edges covering [kmin, kmax].
    bin_edges = np.linspace(kmin, kmax, num=histogram_num)

    # Histogram the degrees into these bins, normalised to a density.
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    fig, ax = plt.subplots(figsize=(6, 4))

    # The bins are linear, so each point sits at the arithmetic bin centre.
    # (A log-space midpoint would misplace the points and hits log10(0) as
    # soon as the graph contains an isolated node.)
    x = (bin_edges[1:] + bin_edges[:-1])/2

    ax.plot(x, density, marker='o', linestyle='none')
    ax.set_xlabel(r"degree $k$", fontsize=16)
    ax.set_ylabel(r"$P(k)$", fontsize=16)

    # Drop the right/top frame lines and keep ticks on the remaining sides.
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    # Show the plot
    plt.show()

In [None]:
# Plotting helper for e_jk, the probability of finding an edge that connects a
# j-degree node to a k-degree node. Adapted from the degree-correlation tutorial.
def plot_ejk(list_of_edges_bw_nodes):
    """Scatter-plot degree pairs coloured by their estimated density.

    Each item of ``list_of_edges_bw_nodes`` is indexed with ``[0]`` (drawn on
    the x axis, labelled "k") and ``[1]`` (drawn on the y axis, labelled "j").
    Point colours come from a Gaussian kernel density estimate evaluated at
    the points themselves and are mapped through a logarithmic colour norm.
    The figure is displayed with ``plt.show()``.
    """
    plt.figure()
    plt.gca().invert_yaxis()
    axes = plt.gca()
    axes.set_facecolor('black')
    axes.xaxis.tick_top()

    # Materialise once so both coordinates are read from the same items.
    pairs = list(list_of_edges_bw_nodes)
    k_values = [pair[0] for pair in pairs]
    j_values = [pair[1] for pair in pairs]

    # Density of the point cloud, evaluated at every point.
    samples = np.vstack([k_values, j_values])
    density = gaussian_kde(samples)(samples)

    log_norm = colors.LogNorm(vmin=density.min(), vmax=density.max())
    plt.scatter(k_values, j_values, norm=log_norm, c=density, s=1, cmap='afmhot')
    plt.colorbar()

    plt.xlabel("k")
    plt.ylabel("j")
    # NOTE(review): the twin axis presumably becomes the current axes, so the
    # 'e_jk' label below ends up on the right-hand side -- confirm intent.
    axes.twinx()
    plt.ylabel('e_jk')

    plt.show()

In [None]:
# Plot the average degree of the neighbours (knn_k) of a node k
def plot_knn(k, knn_k):
    """Plot the average nearest-neighbour degree k_nn(k) on log-log axes.

    ``k`` and ``knn_k`` are paired sequences: ``knn_k[i]`` is the average
    neighbour degree that belongs to degree ``k[i]``.  The pairs are grouped
    into 100 logarithmically spaced degree bins and the mean of ``knn_k``
    inside each populated bin is plotted against the bin's geometric midpoint.

    Two reference curves are added:

    * the constant ``<k^2>/<k>`` expected for an uncorrelated (neutral)
      network, with both moments taken over the supplied ``k`` values;
    * a power-law fit ``k_nn = a * k**u``; the exponent ``u`` is printed.
      The fit is skipped when fewer than two bins are populated.

    Parameters
    ----------
    k : sequence of numbers
        Degrees.
    knn_k : sequence of numbers
        Average neighbour degree for each entry of ``k`` (same length).

    Raises
    ------
    ValueError
        If the inputs differ in length or contain no positive degree.
    """
    degrees = np.asarray(k, dtype=float)
    neighbour_degrees = np.asarray(knn_k, dtype=float)
    if degrees.shape != neighbour_degrees.shape:
        raise ValueError("k and knn_k must have the same length")

    # Moments for the neutral-network reference, taken from the degrees that
    # were passed in (not from bin midpoints).
    avg_k = np.average(degrees) if degrees.size else 0.0
    avg_k_squared = np.average(np.square(degrees)) if degrees.size else 0.0

    # Logarithmic binning is only defined for strictly positive degrees.
    positive = degrees > 0
    degrees = degrees[positive]
    neighbour_degrees = neighbour_degrees[positive]
    if degrees.size == 0:
        raise ValueError("plot_knn needs at least one positive degree")

    fig, ax = plt.subplots()

    # Binning using a Log Scale
    bin_edges = np.logspace(np.log10(degrees.min()), np.log10(degrees.max()), num=100)
    # 10**log10(v) does not always round-trip exactly; pin the outer edges so
    # the extreme degrees are guaranteed to fall inside a bin.
    bin_edges[0] = degrees.min()
    bin_edges[-1] = degrees.max()

    # Mean of knn_k per bin = (sum of knn_k in the bin) / (number of entries).
    counts, _ = np.histogram(degrees, bins=bin_edges)
    sums, _ = np.histogram(degrees, bins=bin_edges, weights=neighbour_degrees)
    populated = counts > 0
    k_nn = sums[populated]/counts[populated]

    log_be = np.log10(bin_edges)
    centres = (10**((log_be[1:] + log_be[:-1])/2))[populated]
    ax.loglog(centres, k_nn, marker='o', linestyle='none', label="ANND for the original Graph")

    # Random Network's Average Next Neighbor Degree
    neutral_net_knn = np.full(centres.shape, avg_k_squared/avg_k)
    ax.plot(centres, neutral_net_knn, label="ANND for the random Graph")

    # Fitting k_nn = a*(k^u) to the scatter plot to get the value of u
    def func(x, a, u):
        return a*(x**u)

    # Two free parameters need at least two data points.
    if centres.size >= 2:
        popt, _ = curve_fit(func, centres, k_nn)
        a, u = popt
        print("Value of u is: ", u)
        ax.plot(centres, func(centres, *popt), label="Value of u = {0}".format(u))

    ax.set_xlabel("k")
    ax.set_ylabel("k_nn")
    # Without a legend the labels given above are never shown.
    ax.legend()

    plt.show()

In [6]:
def print_statistics(g):
    """Print basic structural statistics of the graph ``g``.

    Reports node and edge counts, the average degree (computed two ways),
    the minimum and maximum degree and the average clustering coefficient.
    For an undirected, connected graph the average shortest path length is
    printed as well.  For a graph without nodes only the (zero) node and
    edge counts are printed, because the remaining quantities are undefined.
    """
    n_nodes = len(g)
    n_edges = g.size()

    print("Number of nodes: ", n_nodes)
    print("Number of edges: ", n_edges)
    if n_nodes == 0:
        # Average degree would divide by zero and min()/max() have no input.
        return

    degs = [g.degree(node) for node in g]

    print()
    print("Average degree: ", 2*n_edges/n_nodes)
    print("Average degree (alternate calculation)", np.mean(degs))
    print()
    print("Minimum degree: ", min(degs))
    print("Maximum degree: ", max(degs))

    # Clustering is computed once, here, rather than a second time up front.
    print("Average Clustering Coefficient: ", nx.average_clustering(g))

    # The path length is only reported for undirected, connected graphs.
    if not nx.is_directed(g) and nx.is_connected(g):
        d = nx.average_shortest_path_length(g, weight=None)
        print("Average Shortest Path: ", d)

In [66]:
def get_graph_statistics(G):
    '''
    This function will retrieve various statistics from the input graph G.

    Returns a list with, in this order:
        [Nodes, Links, Minimum degree, Maximum degree, Average degree,
         Average clustering coefficient, Average shortest path]

    The average shortest path is -1 when it cannot be computed: the graph is
    not connected (not weakly connected, for a directed graph) or NetworkX
    refuses to compute it.  A graph without nodes yields all-zero statistics
    with an average shortest path of -1.
    '''
    N = len(G)
    L = G.size()
    if N == 0:
        # min()/max() and the connectivity tests are undefined on a null
        # graph, which get_largest_subgraph can return as its fallback.
        return [N, L, 0, 0, 0.0, 0.0, -1]

    degrees = [G.degree(node) for node in G]
    k_min = min(degrees)
    k_max = max(degrees)
    C = np.mean(list(nx.clustering(G).values()))
    avg_degree = np.mean(degrees)

    if nx.is_directed(G):
        connected = nx.is_weakly_connected(G)
    else:
        connected = nx.is_connected(G)

    d = -1
    if connected:
        try:
            d = nx.average_shortest_path_length(G, weight=None)
        except nx.NetworkXError:
            # NOTE(review): newer NetworkX releases appear to require *strong*
            # connectivity for directed graphs and raise otherwise; report
            # "not available" instead of aborting -- confirm against the
            # installed version.
            d = -1

    return [N, L, k_min, k_max, avg_degree, C, d]

## Connectedness (Undirected)
The functions below work only on undirected `Graph` objects

* ***`is_connected(G)`***   
`True` or `False` depending on whether `G` is connected or not      
* ***`connected_components(G)`***     
Returns the components of `G`, one collection of nodes per component (a generator of sets in NetworkX 2.x and later, rather than a list of lists)   
* ***`number_connected_components(G)`***      
Returns only the length of the list above   
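* ***`connected_component_subgraphs(G)`***      
Returns a list of new `Graph` objects each representing a component of `G` (removed from NetworkX from version 2.4 onwards; use `G.subgraph(c)` for each `c` in `connected_components(G)` instead)   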
* ***`node_connected_component(G, node)`***      
Return a list of the nodes in the component of `G` containing `node`   

In [73]:
def get_size_of__each_connected_components(G):
    """Return the node count of every component of ``G``, largest first.

    Weakly connected components are used for a directed graph and connected
    components for an undirected one.
    """
    if nx.is_directed(G):
        find_components = nx.weakly_connected_components
    else:
        find_components = nx.connected_components
    return sorted((len(component) for component in find_components(G)), reverse=True)

In [86]:
def get_largest_subgraph(G):
    """Return the subgraph of ``G`` induced by its largest component.

    Weakly connected components are used for a directed graph and connected
    components for an undirected one.  When several components share the
    maximum size, the first one produced by NetworkX is chosen.  The result
    comes from ``nx.subgraph`` (presumably a view onto ``G`` in NetworkX 2.x
    and later -- copy it before modifying).

    For a graph without nodes an empty graph of the same class as ``G`` is
    returned, so the directedness of the input is preserved.
    """
    if nx.is_directed(G):
        components = nx.weakly_connected_components(G)
    else:
        components = nx.connected_components(G)

    # A component is connected by definition, so there is nothing to verify
    # afterwards, and only the largest one is needed -- no full sort.
    largest = max(components, key=len, default=None)
    if largest is None:
        # Return a null graph as fallback/sanity
        return G.__class__()
    return nx.subgraph(G, largest)

In [57]:
# Load the undirected network from a GML file (path relative to the working directory).
undirected_graph = nx.read_gml("undirected.gml")

In [67]:
# Statistics of the full undirected graph, in the order
# [N, L, k_min, k_max, <k>, C, d]; d is -1 when the graph is not connected.
get_graph_statistics(undirected_graph)

[2139, 5864, 1, 34, 5.482935951379149, 0.06383993303854943, -1]

In [74]:
# Sizes (node counts) of the connected components, largest first.
get_size_of__each_connected_components(undirected_graph)

[1909,
 181,
 4,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [87]:
# Keep only the largest connected component of the undirected graph.
largest_undirected_subgraph = get_largest_subgraph(undirected_graph)

In [93]:
# Same statistics on the largest component, where the average shortest path can be computed.
get_graph_statistics(largest_undirected_subgraph)

[1909, 5375, 1, 34, 5.631220534311158, 0.060655591046287975, 5.633873201309476]

In [89]:
# Load the directed network from a GML file (path relative to the working directory).
directed_graph = nx.read_gml("directed_data.gml")

In [90]:
# Statistics of the full directed graph, in the order
# [N, L, k_min, k_max, <k>, C, d]; d is -1 when the graph is not weakly connected.
get_graph_statistics(directed_graph)

[2139, 6019, 1, 36, 5.627863487611033, 0.033024296572042984, -1]

In [91]:
# Sizes (node counts) of the weakly connected components, largest first.
get_size_of__each_connected_components(directed_graph)

[1909,
 181,
 4,
 3,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 2]

In [92]:
# Keep only the largest weakly connected component of the directed graph.
largest_directed_subgraph = get_largest_subgraph(directed_graph)

In [94]:
# Same statistics on the largest weakly connected component of the directed graph.
get_graph_statistics(largest_directed_subgraph)

[1909,
 5521,
 1,
 36,
 5.784180199057098,
 0.031388192132993036,
 3.4736795143384587]

In [95]:
# Erdos-Renyi G(n, p) reference graph with as many nodes as the data set and
# an edge probability chosen so that the expected number of (undirected)
# edges equals the number of edges in the directed graph.
ER_SEED = 42  # fixed seed so the random graph, and its statistics, are reproducible
n_nodes = len(directed_graph)
max_L = n_nodes*(n_nodes-1)/2  # number of node pairs in an undirected simple graph
actual_L = directed_graph.number_of_edges()
p = actual_L/max_L
er = nx.erdos_renyi_graph(n_nodes, p, seed=ER_SEED)

In [96]:
# Statistics of the random reference graph, in the order
# [N, L, k_min, k_max, <k>, C, d]; d is -1 when the graph is not connected.
get_graph_statistics(er)

[2139, 5921, 0, 15, 5.536231884057971, 0.0023457374789773106, -1]

In [97]:
# Same statistics on the largest connected component of the random graph.
get_graph_statistics(get_largest_subgraph(er))

[2129, 5921, 1, 15, 5.562235791451386, 0.002356755503772883, 4.660339052186596]