Notebook used to retrieve some simple network analysis metrics.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import itertools as it
from collections import Counter
from pyvis.network import Network

def organism_co_occurance(data: pd.DataFrame, filename: str, genus_level: bool = True) -> "nx.Graph":
    """Build an undirected co-occurrence graph of the organisms listed in ``data``.

    Each value of the "Organisms" column is split on ", " into organism names.
    Every unordered pair of names taken from the same row counts as one
    co-occurrence; a row that lists a single organism contributes a self-loop
    on that organism instead.

    Parameters
    ----------
    data : pd.DataFrame
        Table with an "Organisms" column of ", "-separated names. The frame is
        only read, never modified.
    filename : str
        Not used by this function; kept so existing calls keep working.
    genus_level : bool, default True
        If True, every name is reduced to its first space-separated word (the
        genus) before counting. If False, the full names are used.

    Returns
    -------
    nx.Graph
        Each node has ``size`` = 5 + the number of times its name was listed.
        Each edge has ``weight`` = the number of co-occurrences of its two
        endpoints, independent of the order in which the names were listed.
    """
    node_counts = Counter()
    edge_counts = Counter()

    for cell in data["Organisms"]:
        organisms = cell.split(", ")

        if genus_level:
            # keep only the first word of each name, i.e. the genus
            organisms = [organism.split(' ')[0] for organism in organisms]

        node_counts.update(organisms)

        # The graph is undirected, so (a, b) and (b, a) are the same edge.
        # Sorting each pair gives both orders the same Counter key; otherwise
        # they would be counted separately and the later add_edge call would
        # overwrite the earlier weight instead of adding to it.
        edge_counts.update(
            tuple(sorted(pair)) for pair in it.combinations(organisms, 2)
        )

        # add a self-loop if there is only a single organism listed
        if len(organisms) == 1:
            edge_counts[(organisms[0], organisms[0])] += 1

    network = nx.Graph()

    # Counter keeps first-seen order, so nodes are inserted in order of first
    # appearance; counting once up front avoids rescanning a list per node.
    for node, occurrences in node_counts.items():
        network.add_node(node, size=5 + occurrences)

    for (u, v), occurrences in edge_counts.items():
        network.add_edge(u, v, weight=occurrences)

    return network

In [2]:
# Load the article table; the graph builder reads its "Organisms" column.
data = pd.read_csv("defined_articles.csv")

# Build the co-occurrence graph (genus_level is left at its default, True).
graph = organism_co_occurance(data=data, filename="co-occurrence")

In [3]:
# Report the size of the graph; the edge count includes any self-loops.
print(f"The number of nodes is {len(graph.nodes)}")
print(f"The number of edges are {len(graph.edges)}")

The number of nodes is 64
The number of edges are 76


In [4]:
# Work on a copy so that removing edges below never alters `graph`.
discardableNetwork = graph.copy()

# Report the three heaviest edges: take the current maximum-weight edge,
# print it, then remove it so the next pass finds the runner-up.
for rank in range(1, 4):
    if len(discardableNetwork.edges) == 0:
        # Fewer than three edges in total: max() on an empty sequence would raise.
        break
    u, v, weight = max(discardableNetwork.edges(data="weight"), key=lambda edge: edge[2])
    print(f"{rank}: {u} and {v} have {weight} interactions.")
    discardableNetwork.remove_edge(u, v)

1: Escherichia and Escherichia have 15 interactions.
2: Clostridium and Clostridium have 9 interactions.
3: Escherichia and Trichoderma have 2 interactions.


In [9]:
# Per-node metrics, each held as a dictionary keyed by node.
degrees = dict(graph.degree())
BC = nx.betweenness_centrality(graph)
CC = nx.closeness_centrality(graph)
clustering = nx.clustering(graph)

def find_5_max(dict, title):
    """Print the keys of ``dict`` that have the largest values.

    Despite the function's name, the six highest-valued keys are reported
    (fewer if the mapping has fewer entries), in descending order of value.
    Keys with equal values keep their original relative order.

    Parameters
    ----------
    dict : dict
        Mapping from node to a numeric metric. Note that this parameter name
        shadows the built-in ``dict`` inside the function.
    title : str
        Name of the metric, inserted into the printed sentence.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    ranked = sorted(dict.items(), key=lambda item: item[1], reverse=True)
    top_nodes = [node for node, _ in ranked[:6]]
    print(f"The six nodes with the largest {title} are: {top_nodes}")

# Print the top-ranked nodes for every metric, in the same order as before.
for metric, label in (
    (BC, "betweenness centrality"),
    (CC, "closeness centrality"),
    (degrees, "degree"),
    (clustering, "clustering"),
):
    find_5_max(metric, label)

The six nodes with the largest betweenness centrality are: ['Escherichia', 'Clostridium', 'Pseudomonas', 'Bacillus', 'Trichoderma', 'Saccharomyces']
The six nodes with the largest closeness centrality are: ['Escherichia', 'Trichoderma', 'Clostridium', 'Saccharomyces', 'Eubacterium', 'Megasphaera']
The six nodes with the largest degree are: ['Clostridium', 'Escherichia', 'Trichoderma', 'Saccharomyces', 'Bacillus', 'Aspergillus']
The six nodes with the largest clustering are: ['Massilibacterium', 'Cercospora', 'Penicillium', 'Veillonella', 'Anaeromyces', 'Neocallimastix']


Degree is the most relevant metric here: the other three describe the overall network architecture, which matters less in this case than the sheer number of connections.

In [7]:
# Genera whose degree is printed below, one line per genus.
bac = ['Clostridium', 'Escherichia', 'Trichoderma', 'Saccharomyces', 'Bacillus', 'Aspergillus']
for genus in bac:
    print(f"{genus} has a degree of {degrees[genus]}")

Clostridium has a degree of 17
Escherichia has a degree of 10
Trichoderma has a degree of 7
Saccharomyces has a degree of 6
Bacillus has a degree of 6
Aspergillus has a degree of 5
Megasphaera has a degree of 4
Lysinibacillus has a degree of 4
