# Graph(s) of our Water Quality Network 
This jupyter notebook has all networkx processing stuff in it. From here, we can figure out how to subset our graph and weight things, etc.

## Getting Our Bipartite 

In [1]:
import time
import random
import decimal
import powerlaw
import matplotlib
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import graph_creation_functions as fcns
from networkx.algorithms import community as comm
%matplotlib inline

In [2]:
site_pollutant_graph = fcns.read_in_multi_bipartite_graph("potentially_a_bipartite_US.tsv")

In [3]:
sites, pollutants = fcns.get_bipartite_sets(site_pollutant_graph)

In [4]:
sp_single_graph = fcns.multi_to_single_graph_bipartite(site_pollutant_graph)

In [5]:
site_graph, pollutant_graph = fcns.get_projections(sp_single_graph, sites, pollutants)

In [6]:
def print_graph_info(graph, is_projection=False):
    print("Number of nodes:", graph.number_of_nodes())
    print("Number of links:", graph.number_of_edges())
    print("Average degree :", graph.number_of_edges()/graph.number_of_nodes())
    if type(graph) == nx.Graph:
        print("Average weight :", np.mean(list(nx.get_edge_attributes(graph,
                                                                      "Weight").values())))
        if is_projection:
            print("Clustering coef:", np.mean(list(nx.clustering(graph).values())))

In [7]:
print("Multi-Link Bipartite")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print_graph_info(site_pollutant_graph)
print("")
print("Single-Link Bipartite")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print_graph_info(sp_single_graph)
print("")
print("Site Projection Information")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print_graph_info(site_graph, True)
print("")
print("Pollutant Projection Information")
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print_graph_info(pollutant_graph, True)

Multi-Link Bipartite
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Number of nodes: 824
Number of links: 9563
Average degree : 11.605582524271844

Single-Link Bipartite
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Number of nodes: 824
Number of links: 2453
Average degree : 2.9769417475728157
Average weight : 3.898491642886262

Site Projection Information
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Number of nodes: 773
Number of links: 209487
Average degree : 271.0051746442432
Average weight : 1.601412020793653
Clustering coef: 0.9217378535354916

Pollutant Projection Information
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Number of nodes: 51
Number of links: 474
Average degree : 9.294117647058824
Average weight : 11.11603375527426
Clustering coef: 0.8795542648788249


In [8]:
import operator as op
from functools import reduce
def nCr(n, r):
    r = min(r, n-r)
    numer = reduce(op.mul, range(n, n-r, -1), 1)
    denom = reduce(op.mul, range(1, r+1), 1)
    return numer//denom

In [9]:
def plot_loglog(graph, name, nodes=None, is_weight=False):
    if is_weight:
        title = name + " Edge Weight Distribution"
        label = "Weights"
        xlabel = "weight $w$"
        ylabel = "$P(w)$"
        data = list(nx.get_edge_attributes(graph,"Weight").values())
    else:
        title = name + " Degree Distribution"
        label = "Degrees"
        xlabel = "degree $k$"
        ylabel = "$P(k)$"
        if nodes:
            data = [degree[1] for degree in graph.degree if degree[0] in nodes]
        else:
            data = [degree[1] for degree in graph.degree]
    
    gamma = None
    
    # Log-Log plot it in bin - taken from Sean's implementation in the first hands-on project
    kmax = max(data)
    kmin = min(data)
    bin_edges = np.logspace(np.log10(kmin), np.log10(kmax), num=20)
    density, _ = np.histogram(smb_degs, bins=bin_edges, density=True)
    log_be = np.log10(bin_edges)
    x = 10**((log_be[1:] + log_be[:-1])/2)
    plt.loglog(x, density, marker='o', linestyle='none', label=label)
    
    # Plot the Power Law Fit line
    x_val = x[density > 0]
    y_val = density[density > 0]
    gamma, y_int = np.polyfit(np.log(x_val), np.log(y_val), 1)
    pl_fit = np.exp((gamma)*np.log(x) + y_int)
    plt.loglog(x, pl_fit, label="Power Law")
    
    # Plot the Exponential Fit Line
    N = len(data)
    avg_k = np.mean(data)
    p = avg_k/(N-1)
    reduced_set = sorted(set(data))
    exp_fit = []
    used_points = []
    for k in reduced_set:
        try:
            v1 = nCr(N-1, k)
            v2 = (p**k)
            v3 = abs((1-p)**(N-1-k))
            val = v1 * v2 * v3
            exp_fit.append(val)
            used_points.append(k)
        except Exception as e:
            continue
    plt.loglog(used_points, exp_fit, linestyle='-.', label="Random")
    
    plt.xlabel(xlabel, fontsize=16)
    plt.ylabel(ylabel, fontsize=16)
    plt.title(title)

    plt.xlim(min(bin_edges), reduced_set[-1])
    plt.ylim(min(pl_fit), max(density)+1)
    plt.legend()
    plt.show()
    
    return gamma

In [10]:
plot_loglog(site_pollutant_graph, "Multi-Link Bipartite Site", sites)
plot_loglog(sp_single_graph, "Single-Link Bipartite Site", sites)
plot_loglog(site_pollutant_graph, "Multi-Link Bipartite Pollutant", pollutants)
plot_loglog(sp_single_graph, "Single-Link Bipartite Pollutant", pollutants)
plot_loglog(site_graph, "Site Projection")
plot_loglog(pollutant_graph, "Pollutant Projection")

NameError: name 'smb_degs' is not defined

In [None]:
plot_loglog(sp_single_graph, "Single-Link Bipartite Site", sites, is_weight=True)
plot_loglog(sp_single_graph, "Single-Link Bipartite Pollutant", pollutants, is_weight=True)
plot_loglog(site_graph, "Site Projection", is_weight=True)
plot_loglog(pollutant_graph, "Pollutant Projection", is_weight=True)

# Communities

In [22]:
def get_partitions(graph):
    communities = comm.greedy_modularity_communities(graph)
    partitions = {}
    for i,community in enumerate(communities):
        for node in community:
            partitions[node] = i
    return partitions, communities

In [12]:
def write_partitions(partitions, filename):
    with open(filename, 'w') as file:
        for node in partitions:
            file.write("{}\t{}\n".format(node, partitions[node]))

In [23]:
pollutant_partitions, pollutant_communities = get_partitions(pollutant_graph)
write_partitions(pollutant_partitions, "Partitions/pollutant_partitions.tsv")

In [14]:
site_partitions = get_partitions(site_graph)
write_partitions(site_partitions, "Partitions/site_partitions.tsv")

# Creating a null model

In [15]:
degrees = sp_single_graph.degree
null_graph = nx.Graph()
null_graph.add_nodes_from(list(sp_single_graph.nodes))

edge_list = []
rand_degrees = {}


pollutants_by_largest_degree = [item[0] for item in sorted([info for info in dict(sp_single_graph.degree).items() if info[0] in pollutants],
                                 key=lambda x:x[1], reverse=True)]
usable_sites = sites.copy()
for pollutant in pollutants_by_largest_degree:
    used_sites = []
    rand_degrees[pollutant] = 0
    rand_sites = random.sample(usable_sites, degrees[pollutant])
    for site in rand_sites:
        if site not in rand_degrees:
            rand_degrees[site] = 0
        elif rand_degrees[site] == degrees[site]:
                continue
        rand_degrees[pollutant] += 1
        rand_degrees[site] += 1
        edge_list.append((pollutant, site))
        used_sites.append(site)
        if rand_degrees[site] == degrees[site]:
            usable_sites.remove(site)
    
    if rand_degrees[pollutant] < degrees[pollutant]:
        print("{} vs {}".format(rand_degrees[pollutant], degrees[pollutant]))
        print(usable_sites)
        print(used_sites)
        remaining_sites = list(set(usable_sites).difference(set(used_sites)))
        print(remaining_sites)
        random.shuffle(remaining_sites)
        while rand_degrees[pollutant] < degrees[pollutant]:
            site = remaining_sites.pop()
            if site not in rand_degrees:
                rand_degrees[site] = 0
            rand_degrees[pollutant] += 1
            rand_degrees[site] += 1
            edge_list.append((pollutant, site))
            used_sites.append(site)
            if rand_degrees[site] == degrees[site]:
                usable_sites.remove(site)
null_graph.add_edges_from(edge_list)

In [16]:
for key in rand_degrees:
    if degrees[key] != rand_degrees[key]:
        print("{:>15}: Needed {:>3} vs Have {:>3}".format(key, degrees[key], rand_degrees[key]))

In [17]:
null_site_graph, null_pollutant_graph = fcns.get_projections(null_graph, sites, pollutants)
nx.write_edgelist(G=null_pollutant_graph, path="Graphs/Null_Pollutant_Graph.tsv", delimiter="\t", data=False)
nx.write_edgelist(G=null_site_graph, path="Graphs/Null_Site_Graph.tsv", delimiter="\t", data=False)

In [25]:
null_p_partitions, null_p_communities = get_partitions(null_pollutant_graph)
null_s_partitions, null_s_communities = get_partitions(null_site_graph)

In [19]:
write_partitions(null_p_partitions, "Partitions/null_pollutant_partitions.tsv")
write_partitions(null_s_partitions, "Partitions/null_site_partitions.tsv")

In [27]:
comm.modularity(pollutant_graph, pollutant_communities)

0.3533354697430966

In [28]:
comm.modularity(null_pollutant_graph, null_p_communities)

0.1254240949187156

In [29]:
nx.write_edgelist(G=null_graph, path="Graphs/Null_bipartite.tsv", data=False)

In [None]:
fcns.write_graph_to_file("Graphs/Null_bipartite.tsv", null_graph)
fcns.write_graph_to_file("Graphs/Null_bipartite.tsv", null_graph)

In [30]:
help(fcns.write_graph_to_file)

Help on function write_graph_to_file in module graph_creation_functions:

write_graph_to_file(filename, graph, data=None, verbose=False)

