In [19]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib as mpl
from scipy.stats import gaussian_kde
from scipy.optimize import curve_fit
import csv

import powerlaw as pl

%matplotlib inline

In [54]:
def plot_degree_dist(G, histogram_num, plot_fit_line=False):
    '''
    Plot the degree distribution of graph G on log-log axes.  Sourced from
    the CPSC 572 course slides.

    G              -- graph object exposing nodes() and degree(n)
    histogram_num  -- number of logarithmically spaced bin edges
    plot_fit_line  -- when True, additionally fit a power law to the degrees,
                      print its exponent and overlay the fitted PDF in red

    The smallest and the largest degree are printed before plotting.
    '''
    degrees = list(map(G.degree, G.nodes()))
    kmin, kmax = min(degrees), max(degrees)
    print(kmin)
    print(kmax)

    # Exponents of the first and last bin edge.  A minimum degree of zero
    # cannot be log-transformed, so in that case the bins start at 10**0.
    if kmin > 0:
        first_exp, last_exp = np.log10(kmin), np.log10(kmax)
    else:
        first_exp, last_exp = 0, np.log10(kmax) + 1
    bin_edges = np.logspace(first_exp, last_exp, num=histogram_num)
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    plt.figure(figsize=(6, 4))

    # Every point is drawn at the midpoint, in log space, of its bin.
    edge_exps = np.log10(bin_edges)
    centres = 10 ** ((edge_exps[:-1] + edge_exps[1:]) / 2)
    plt.loglog(centres, density, marker='o', linestyle='none')
    plt.xlabel(r"degree $k$", fontsize=16)
    plt.ylabel(r"$P(k)$", fontsize=16)

    # Keep only the left and bottom axis lines and ticks.
    axes = plt.gca()
    for side in ('right', 'top'):
        axes.spines[side].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    axes.xaxis.set_ticks_position('bottom')

    if not plot_fit_line:
        return

    # This section was sourced from the BA-exercise-solutions.ipynb code
    # introduced in class.  It plots the theoretical fit of the degree
    # distribution based on the min degrees and max degrees.
    fit = pl.Fit(degrees, xmin=kmin, xmax=kmax)
    print('gamma= ', fit.power_law.alpha)
    fit.power_law.plot_pdf(color='r')

In [80]:
def plot_in_and_out_degree_dist(G, histogram_num, plot_fit_line=False):
    '''
    Plot the in-degree and the out-degree distribution of graph G on one
    figure.  Sourced from the CPSC 572 course slides.

    G              -- graph object exposing nodes(), in_degree(n) and
                      out_degree(n)
    histogram_num  -- number of bin edges, forwarded to degree_helper
    plot_fit_line  -- forwarded to degree_helper

    In-degrees are drawn in red ('r'), out-degrees in blue ('b').
    '''
    fig = plt.figure(figsize=(6, 4))

    # (degree lookup, colour, legend label) for each series, in draw order.
    series = (
        (G.in_degree, 'r', 'In Degree'),
        (G.out_degree, 'b', 'Out Degree'),
    )
    for degree_of, colour, legend_label in series:
        values = [degree_of(node) for node in G.nodes()]
        degree_helper(values, histogram_num, fig, colour, legend_label, plot_fit_line)


def degree_helper(degrees, histogram_num, fig, color, label, plot_fit_line=False):
    '''
    Draw one log-binned degree distribution onto the current axes.

    degrees        -- sequence of degree values
    histogram_num  -- number of logarithmically spaced bin edges
    fig            -- accepted but not used by this function
    color          -- marker colour, also used for the optional fit line
    label          -- legend entry for this series
    plot_fit_line  -- when True, fit a power law to the degrees, print its
                      exponent and overlay the fitted PDF
    '''
    lowest, highest = min(degrees), max(degrees)

    # A minimum of zero cannot be log-transformed, so the bins then start at
    # 10**0 instead of at the minimum.
    if lowest > 0:
        first_exp, last_exp = np.log10(lowest), np.log10(highest)
    else:
        first_exp, last_exp = 0, np.log10(highest) + 1
    bin_edges = np.logspace(first_exp, last_exp, num=histogram_num)
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    # Every point is drawn at the midpoint, in log space, of its bin.
    edge_exps = np.log10(bin_edges)
    centres = 10 ** ((edge_exps[:-1] + edge_exps[1:]) / 2)
    plt.loglog(centres, density, marker='o', linestyle='none', color=color, label=label)
    plt.xlabel(r"degree $k$", fontsize=16)
    plt.ylabel(r"$P(k)$", fontsize=16)

    # Keep only the left and bottom axis lines and ticks.
    axes = plt.gca()
    for side in ('right', 'top'):
        axes.spines[side].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    axes.xaxis.set_ticks_position('bottom')

    plt.legend(loc="upper right")

    if not plot_fit_line:
        return

    # This section was sourced from the BA-exercise-solutions.ipynb code
    # introduced in class.  It plots the theoretical fit of the degree
    # distribution based on the min degrees and max degrees.
    fit = pl.Fit(degrees, xmin=lowest, xmax=highest)
    print('gamma= ', fit.power_law.alpha)
    fit.power_law.plot_pdf(color=color)

In [4]:
def plot_degree_dist_lin(G, histogram_num):
    '''
    Plot the degree distribution of graph G on linear axes and show it.

    G              -- graph object exposing nodes() and degree(n)
    histogram_num  -- number of linearly spaced bin edges between the
                      smallest and the largest degree
    '''
    degrees = [G.degree(n) for n in G.nodes()]
    kmin = min(degrees)
    kmax = max(degrees)

    # histogram_num linearly spaced bin edges between kmin and kmax
    bin_edges = np.linspace(kmin, kmax, num=histogram_num)

    # histogram the data into these bins
    density, _ = np.histogram(degrees, bins=bin_edges, density=True)

    plt.figure(figsize=(6, 4))

    # The bins are linear, so each point belongs at the arithmetic midpoint
    # of its bin.  (A log-space midpoint misplaces the points and evaluates
    # log10(0) whenever the smallest degree is 0.)
    x = (bin_edges[1:] + bin_edges[:-1]) / 2

    plt.plot(x, density, marker='o', linestyle='none')
    plt.xlabel(r"degree $k$", fontsize=16)
    plt.ylabel(r"$P(k)$", fontsize=16)

    # remove right and top boundaries because they're ugly
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')

    # Show the plot
    plt.show()

In [5]:
# This plotting function helps plot ejk (the probability of finding an edge connecting a j-degree node to a k-degree node). It is
# sourced from the Degree_Correlation tutorial
def plot_ejk(list_of_edges_bw_nodes):
    '''
    Scatter-plot pairs of values, coloured by their estimated density, and
    show the figure.

    list_of_edges_bw_nodes -- iterable of indexable items; item[0] is drawn
                              on the x axis (labelled "k") and item[1] on the
                              y axis (labelled "j")

    The colour of each point is a Gaussian kernel density estimate evaluated
    at that point and mapped through a logarithmic colour scale.
    '''
    plt.figure()
    ax = plt.gca()
    ax.invert_yaxis()
    ax.set_facecolor('black')
    ax.xaxis.tick_top()

    pairs = [(entry[0], entry[1]) for entry in list_of_edges_bw_nodes]
    x = [first for first, _ in pairs]
    y = [second for _, second in pairs]

    # Density of the point cloud, evaluated at the points themselves.
    xy = np.vstack([x, y])
    z = gaussian_kde(xy)(xy)

    log_norm = colors.LogNorm(vmin=z.min(), vmax=z.max())
    plt.scatter(x, y, norm=log_norm, c=z, s=1, cmap='afmhot')
    plt.colorbar()

    plt.xlabel("k")
    plt.ylabel("j")

    # The twin axes are created only to carry the 'e_jk' label
    # (presumably so that it appears on the right-hand side).
    ax.twinx()
    plt.ylabel('e_jk')

    plt.show()

In [6]:
# Plot the average degree of the neighbours (knn_k) of a node k
def plot_knn(k, knn_k):
    '''
    Plot k_nn against degree on log-log axes together with the neutral
    reference level <k^2>/<k> and a fitted curve a * k**u, then show it.

    k      -- sequence of node degrees; the values must be positive because
              the bin edges are log-spaced between their minimum and maximum
    knn_k  -- sequence of values that is histogrammed into those bins

    Prints the fitted exponent u.
    '''
    degrees = np.asarray(k)

    # Both moments of the neutral level come from the real degree sequence.
    # (They must be taken before any binning: using the bin midpoints for
    # <k^2> but the real degrees for <k> gives an inconsistent ratio.)
    mean_degree = np.average(degrees)
    neutral_level = np.average(np.square(degrees)) / mean_degree

    fig = plt.figure()
    ax = plt.gca()

    # Binning using a Log Scale
    # NOTE(review): this is the normalised histogram of the knn_k values over
    # degree-derived bins, not a per-degree average of knn_k -- confirm that
    # this is the intended ANND estimate.
    bin_edges = np.logspace(np.log10(degrees.min()), np.log10(degrees.max()), num=100)
    k_nn, _ = np.histogram(knn_k, bins=bin_edges, density=True)
    log_be = np.log10(bin_edges)
    bin_centres = 10**((log_be[1:] + log_be[:-1])/2)
    ax.loglog(bin_centres, k_nn, marker='o', linestyle='none', label="ANND for the original Graph")

    # Random Network's Average Next Neighbor Degree (a constant level)
    neutral_net_knn = np.full(bin_centres.shape, neutral_level)
    ax.plot(bin_centres, neutral_net_knn, label="ANND for the random Graph")

    # Fitting k_nn = a*(k^u) to the scatter plot to get the value of u
    def func(k, a, u):
        return a*(k**u)
    popt, _ = curve_fit(func, bin_centres, k_nn)
    a, u = popt
    print("Value of u is: ", u)
    ax.plot(bin_centres, func(bin_centres, *popt), label="Value of u = {0}".format(u))

    # Every series above carries a label; without a legend none is visible.
    ax.legend()

    plt.xlabel("k")
    plt.ylabel("k_nn")

    plt.show()

In [18]:
def print_statistics(g):
    '''
    Print summary statistics of graph g.

    Printed: number of nodes and edges, the average degree (computed as
    2*L/N and as the mean of the node degrees), the minimum and maximum
    degree and the average clustering coefficient.  The average shortest
    path length is printed only when g is connected (weakly connected for a
    directed graph); otherwise that line is omitted.
    '''
    n = len(g)
    l = g.size()
    degs = [g.degree(node) for node in g]
    k_min = min(degs)
    k_max = max(degs)

    print("Number of nodes: ", n)
    print("Number of edges: ", l)
    print()
    print("Average degree: ", 2*l/n)
    print("Average degree (alternate calculation)", np.mean(degs))
    print()
    print("Minimum degree: ", k_min)
    print("Maximum degree: ", k_max)

    # Clustering is computed once, here; no separate unused copy is made.
    print("Average Clustering Coefficient: ", nx.average_clustering(g))

    # Directed graphs are checked for weak connectivity, undirected ones for
    # ordinary connectivity.
    if nx.is_directed(g):
        connected = nx.is_weakly_connected(g)
    else:
        connected = nx.is_connected(g)

    if connected:
        d = nx.average_shortest_path_length(g, weight=None)
        print("Average Shortest Path: ", d)

In [8]:
def get_graph_statistics(G):
    '''
    Retrieve various statistics from the input graph G.

    Returns a list with, in this order:
        0. Nodes
        1. Links
        2. Minimum degree
        3. Maximum degree
        4. Average degree
        5. Average clustering coefficient
        6. Average shortest path

    The average shortest path is -1 when G is not connected (not weakly
    connected for a directed graph), because it is not computed then.
    '''
    N = len(G)
    L = G.size()
    degrees = [G.degree(node) for node in G]
    k_min = min(degrees)
    k_max = max(degrees)
    C = np.mean(list(nx.clustering(G).values()))
    avg_degree = np.mean(degrees)

    # Directed graphs are checked for weak connectivity, undirected ones for
    # ordinary connectivity.
    if nx.is_directed(G):
        connected = nx.is_weakly_connected(G)
    else:
        connected = nx.is_connected(G)

    # -1 marks "not computed" for a graph that is not connected.
    d = nx.average_shortest_path_length(G, weight=None) if connected else -1

    return [N, L, k_min, k_max, avg_degree, C, d]

## Connectedness (Undirected)
The functions below work only on `Graph` objects

* ***`is_connected(G)`***   
`True` or `False` depending on whether `G` is connected or not      
* ***`connected_components(G)`***     
Return a list of lists, where each sub-list contains the nodes in one component   
* ***`number_connected_components(G)`***      
Returns only the length of the list above   
* ***`connected_component_subgraphs(G)`***      
Returns a list of new `Graph` objects each representing a component of `G`   
* ***`node_connected_component(G, node)`***      
Return a list of the nodes in the component of `G` containing `node`   

In [9]:
def get_size_of__each_connected_components(G):
    '''
    Return the sizes (node counts) of the components of G, largest first.

    Weakly connected components are used for a directed graph and connected
    components for an undirected one.
    '''
    if nx.is_directed(G):
        find_components = nx.weakly_connected_components
    else:
        find_components = nx.connected_components
    return sorted((len(nodes) for nodes in find_components(G)), reverse=True)

In [10]:
def get_largest_subgraph(G):
    '''
    Return the subgraph of G induced by its largest component.

    Weakly connected components are used for a directed graph and connected
    components for an undirected one.  Components are visited from largest
    to smallest and the first one whose subgraph passes the matching
    connectivity check is returned.  If none qualifies (for example when G
    has no components) an empty nx.Graph() is returned instead.
    '''
    directed = nx.is_directed(G)
    if directed:
        find_components = nx.weakly_connected_components
        is_single_piece = nx.is_weakly_connected
    else:
        find_components = nx.connected_components
        is_single_piece = nx.is_connected

    ranked = sorted(find_components(G), key=len, reverse=True)
    for nodes in ranked:
        piece = nx.subgraph(G, nodes)
        if is_single_piece(piece):
            return piece

    # Return a null graph as fallback/sanity
    return nx.Graph()

In [11]:
def get_second_largest_subgraph(G):
    '''
    Return the subgraph of G induced by its second-largest component.

    Weakly connected components are used for a directed graph and connected
    components for an undirected one.  The largest component is skipped; the
    remaining ones are visited from largest to smallest and the first one
    whose subgraph passes the matching connectivity check is returned.  If
    none qualifies (for example when G has fewer than two components) an
    empty nx.Graph() is returned instead.
    '''
    directed = nx.is_directed(G)
    if directed:
        find_components = nx.weakly_connected_components
        is_single_piece = nx.is_weakly_connected
    else:
        find_components = nx.connected_components
        is_single_piece = nx.is_connected

    ranked = sorted(find_components(G), key=len, reverse=True)
    runners_up = ranked[1:]
    for nodes in runners_up:
        piece = nx.subgraph(G, nodes)
        if is_single_piece(piece):
            return piece

    # Return a null graph as fallback/sanity
    return nx.Graph()

# Centrality Measures
These functions will include calculations of centrality measures of the input graph.

In [12]:
# sort the input dictionary by the value, descending
def sort_dict_desc(dictionary):
    '''
    Return a new dict with the items of `dictionary` ordered by value,
    largest value first.  Items with equal values keep their original
    relative order.
    '''
    def by_value(entry):
        return entry[1]

    ranked = sorted(dictionary.items(), key=by_value, reverse=True)
    return dict(ranked)

In [13]:
# Degree Centrality, normalized
# local measure determining how successful (or unsuccessful) the fighter is in their matches.
def degree_centrality(G):
    '''
    Returns the sorted degree centrality of the input graph.
    Sorted descending.
    '''
    scores = nx.degree_centrality(G)
    return sort_dict_desc(scores)
def in_degree_centrality(G):
    '''
    Returns the sorted in-degree centrality of the input graph.
    Sorted descending.
    In-degree in the UFC MMA graph means that the fighter won the fight,
    i.e. for A -> B, B has in-degree 1, which means B has won 1 fight while
    A has won 0 fights.  Higher in-degree means a greater fraction of wins.
    '''
    scores = nx.in_degree_centrality(G)
    return sort_dict_desc(scores)

def out_degree_centrality(G):
    '''
    Returns the sorted out-degree centrality of the input graph.
    Sorted descending.
    Opposite of in-degree, out-degree determines if that fighter lost the
    fight.
    '''
    scores = nx.out_degree_centrality(G)
    return sort_dict_desc(scores)

In [14]:
# Betweenness centrality
# This measure will tell us how important a node is due to the number of paths that pass through it.
# in the context of our network, a high betweenness centrality node would mean a fighter that ...
def betweenness_centrality(G):
    '''
    Returns the nodes sorted by decreasing betweenness centrality.
    '''
    scores = nx.betweenness_centrality(G)
    return sort_dict_desc(scores)

def edge_betweenness_centrality(G):
    '''
    Returns the edges sorted by decreasing edge betweenness centrality.
    Can determine how important a referee is.
    '''
    scores = nx.edge_betweenness_centrality(G)
    return sort_dict_desc(scores)

In [15]:
# Closeness centrality
def closeness_centrality(G):
    '''
    Returns the nodes sorted by decreasing closeness centrality.  This
    centrality measure can be a rough estimate of how popular a fighter is.
    Popular fighters are matched up with more popular fighters.
    '''
    scores = nx.closeness_centrality(G)
    return sort_dict_desc(scores)

In [16]:
# Eigenvector centrality
def eigenvector_centrality(G):
    '''
    Returns the nodes sorted by decreasing eigenvector centrality.  Similar
    to closeness centrality, this is a measure of how important a node is by
    quantifying how important its neighbours are.
    '''
    scores = nx.eigenvector_centrality_numpy(G)
    return sort_dict_desc(scores)