In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import networkx as nx
import numpy as np
from scipy import stats
from itertools import combinations

# Smaller graph

In [None]:
def saturation(G):
    """Return the edge density of ``G`` as a percentage of a complete graph.

    The denominator is ``n * (n - 1) // 2``, i.e. the number of edges of a
    complete simple undirected graph on ``n`` nodes.

    Parameters
    ----------
    G : graph-like
        Object exposing sized ``nodes`` and ``edges`` collections.

    Returns
    -------
    float
        ``100 * edges / possible_edges``, or ``0.0`` when the graph has fewer
        than two nodes (no edge is possible; this case used to raise
        ``ZeroDivisionError``).
    """
    n = len(G.nodes)
    e = len(G.edges)
    full_graph = (n * (n - 1)) // 2

    # Zero or one node: there is nothing to saturate.
    if full_graph == 0:
        return 0.0

    return (e / full_graph) * 100


def avg_degree(G):
    """Return the mean node degree of ``G``.

    Parameters
    ----------
    G : graph-like
        Object exposing a sized ``nodes`` collection and a ``degree()`` method
        that yields ``(node, degree)`` pairs.

    Returns
    -------
    float
        Sum of all degrees divided by the number of nodes, or ``0.0`` for a
        graph without nodes (this case used to raise ``ZeroDivisionError``).
    """
    node_count = len(G.nodes)
    if node_count == 0:
        return 0.0
    return sum(degree for _, degree in G.degree()) / node_count


def shortest_paths(G, cutoff=50_000, seed=None):
    """Sample shortest-path lengths between pairs of nodes of ``G``.

    Pairs are enumerated with ``itertools.combinations`` over a random
    permutation of the nodes, and at most ``cutoff`` pairs are examined.

    NOTE(review): ``combinations`` emits every pair of the first node before
    moving to the next one, so when ``cutoff`` truncates the enumeration the
    sample is concentrated on a few source nodes rather than being uniform
    over all pairs.

    Parameters
    ----------
    G : networkx graph
        Graph to sample from.
    cutoff : int, optional
        Maximum number of node pairs to examine.
    seed : int or None, optional
        When given, the permutation is drawn from
        ``np.random.default_rng(seed)`` so the sample is reproducible.
        When ``None`` the global ``np.random`` state is used, as before.

    Returns
    -------
    tuple (list of int, float)
        Path lengths, counted in edges, for every examined pair that is
        connected, and the fraction of examined pairs with no path
        (``0.0`` when no pair was examined).
    """
    nodes = list(G.nodes)
    rng = np.random if seed is None else np.random.default_rng(seed)
    # Permute positions rather than the node objects themselves: pushing the
    # nodes through a NumPy array coerces their types (mixed int/str labels
    # all become strings), after which they no longer match the graph's nodes.
    order = rng.permutation(len(nodes))

    result = []
    not_connected = 0
    all_pairs = 0
    for i, j in combinations(order, 2):
        if all_pairs >= cutoff:
            break
        all_pairs += 1
        try:
            # shortest_path_length counts edges; len(shortest_path(...))
            # counts nodes and is therefore one larger than the path length.
            result.append(
                nx.shortest_path_length(G, source=nodes[i], target=nodes[j])
            )
        except nx.NetworkXNoPath:
            not_connected += 1

    # Fewer than two nodes (or a non-positive cutoff): nothing was examined.
    if all_pairs == 0:
        return result, 0.0
    return result, (not_connected / all_pairs)


def print_stats(G):
    """Print a short structural report for graph ``G``.

    Six lines go to stdout, in this order: node count, edge count, number of
    connected components, saturation percentage (two decimals), average
    degree and average clustering coefficient.
    """
    node_total = len(G.nodes)
    print(f"Nodes: {node_total}")

    edge_total = len(G.edges)
    print(f"Edges: {edge_total}")

    component_total = nx.number_connected_components(G)
    print(f"Components: {component_total}")

    saturation_pct = saturation(G)
    print(f"Graph saturation: {saturation_pct:.2f}%")

    mean_degree = avg_degree(G)
    print(f"Average degree: {mean_degree}")

    mean_clustering = nx.cluster.average_clustering(G)
    print(f"Average clustering coef: {mean_clustering}")


def path_summary(G):
    """Print summary statistics of sampled shortest paths and plot a histogram.

    The sample comes from ``shortest_paths(G)``. The minimum, maximum and
    median of the sampled values are printed, followed by the percentage of
    sampled pairs that had no connecting path, and the values are drawn with
    ``sns.histplot`` on the current axes.

    When no sampled pair is connected (for example a graph without edges)
    the path statistics and the histogram are skipped instead of letting
    ``np.min``/``np.max`` raise ``ValueError`` on an empty list.
    """
    paths, not_connected = shortest_paths(G)
    has_paths = len(paths) > 0

    if has_paths:
        print(f"Shortest path: {np.min(paths)}")
        print(f"Longest path: {np.max(paths)}")
        print(f"Average path length (median): {np.median(paths)}")
    else:
        print("No connected node pairs were sampled; path statistics unavailable.")
    print(f"Percentage of not connected pairs: {not_connected * 100}%")

    if has_paths:
        sns.histplot(paths)
        plt.xlabel("Shortest path length")


def print_top(array, n=5):
    """Print the first ``n`` ``(node, value)`` pairs of ``array`` as a ranked list.

    Each line has the form ``"<rank>. <node>: <value>"`` with the rank starting
    at 1 and the value formatted to two decimals. A ``---`` separator line is
    printed after the entries.
    """
    for rank, (label, score) in enumerate(array[:n], start=1):
        print(f"{rank}. {label}: {score:.2f}")
    print("---")

In [None]:
# Read the edge list and build the graph: one edge per CSV row between the
# 'source' and 'target' columns, with the 'title' column kept as an edge
# attribute.
df = pd.read_csv("data/graph_2hop.csv")
G = nx.from_pandas_edgelist(df, "source", "target", edge_attr="title")

In [None]:
# Node/edge counts, component count, saturation, average degree and average
# clustering coefficient for the graph built from the CSV.
print_stats(G)

In [None]:
# Sampled shortest-path statistics and histogram for the graph built from the CSV.
path_summary(G)

In [None]:
# Draw the whole graph with a force-directed (spring) layout. The layout is
# seeded so that re-running the notebook reproduces the same figure.
plt.figure(figsize=(18, 9))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, node_size=20)
plt.show()

In [None]:
# Per-node metrics, each stored as a dict keyed by node.
degrees = dict(G.degree())
betweenness = nx.algorithms.centrality.betweenness_centrality(G)
closeness = nx.algorithms.centrality.closeness_centrality(G)
clustering = nx.algorithms.cluster.clustering(G)

# (node, value) pairs ordered from the highest value to the lowest.
sorted_degrees = sorted(degrees.items(), key=lambda pair: pair[1], reverse=True)
sorted_betweenness = sorted(betweenness.items(), key=lambda pair: pair[1], reverse=True)
sorted_closeness = sorted(closeness.items(), key=lambda pair: pair[1], reverse=True)
sorted_clustering = sorted(clustering.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Show the leading entries of every ranking, each under its own heading.
for heading, ranking in (
    ("Top by degree: ", sorted_degrees),
    ("Top by betweenness: ", sorted_betweenness),
    ("Top by closeness: ", sorted_closeness),
    ("Top by clustering: ", sorted_clustering),
):
    print(heading)
    print_top(ranking)

In [None]:
# Distinct nodes that appear among the first `top_n` entries of the degree,
# betweenness or closeness rankings (clustering is not included).
top_n = 5
top_nodes = list({
    node
    for ranking in (sorted_degrees, sorted_betweenness, sorted_closeness)
    for node, _value in ranking[:top_n]
})

In [None]:
# 2x2 grid of histograms, one panel per node metric. `ax1` holds the top row
# of axes and `ax2` the bottom row; only betweenness overrides the bin count.
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(20, 10))
for metric, axis, title, extra in (
    (degrees, ax1[0], "Degrees", {}),
    (betweenness, ax1[1], "Betweenness", {"bins": 40}),
    (closeness, ax2[0], "Closeness", {}),
    (clustering, ax2[1], "Clustering coef", {}),
):
    sns.histplot(metric.values(), legend=False, ax=axis, **extra).set_title(title)
plt.show()

In [None]:
# 2x2 grid of line plots of the raw metric values, one panel per metric.
# `ax1` holds the top row of axes and `ax2` the bottom row.
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(20, 10))
for metric, axis, title in (
    (degrees, ax1[0], "Degrees"),
    (betweenness, ax1[1], "Betweenness"),
    (closeness, ax2[0], "Closeness"),
    (clustering, ax2[1], "Clustering coef"),
):
    sns.lineplot(data=metric.values(), legend=False, ax=axis).set_title(title)
plt.show()

In [None]:
# Metric names and, in the same order, the list of values of each metric.
# NOTE(review): the correlation below pairs values by position, which relies
# on all four dicts iterating over the nodes in the same order — confirm.
rank_names = ('degrees', 'closeness', 'betweenness', 'clustering')
rank_values = tuple(
    list(metric.values())
    for metric in (degrees, closeness, betweenness, clustering)
)

In [None]:
# Kendall's tau for every unordered pair of metrics.
for (name_a, values_a), (name_b, values_b) in combinations(zip(rank_names, rank_values), 2):
    tau = stats.kendalltau(values_a, values_b)[0]
    print(f"Rank correlation for {name_a} and {name_b} {tau}")

In [None]:
# Connected components of G, largest first. Despite the name, each entry is
# the node collection produced by nx.connected_components, not a graph;
# later cells turn entries into graphs with G.subgraph(...).
subgraphs = sorted(nx.connected_components(G), key=len, reverse=True)

In [None]:
# Label mapping (node -> node) restricted to the nodes listed in `top_nodes`.
labels = {node: node for node in G.nodes() if node in top_nodes}

In [None]:
# Draw the largest connected component on its own. The spring layout is
# seeded so that re-running the notebook reproduces the same figure.
G0 = G.subgraph(subgraphs[0])
plt.figure(figsize=(18, 9))
pos = nx.spring_layout(G0, seed=42)
nx.draw(G0, pos=pos, with_labels=False, node_size=20)
plt.show()

In [None]:
# One figure per remaining (smaller) component, laid out with Kamada-Kawai
# and with every node labelled in red.
for component_nodes in subgraphs[1:]:
    G0 = G.subgraph(component_nodes)
    pos = nx.kamada_kawai_layout(G0)
    plt.figure(figsize=(16, 8))
    nx.draw(G0, pos=pos, with_labels=False)
    nx.draw_networkx_labels(
        G0,
        pos,
        labels=None,
        font_size=15,
        font_color='red',
        font_weight="normal",
    )
    plt.show()

# Biggest component

In [None]:
# From here on `G` refers to the largest connected component only. The name is
# rebound, so re-running earlier cells after this one analyses the component
# rather than the full graph.
# NOTE(review): Graph.subgraph is documented as returning a view of the
# original graph; consider `.copy()` if an independent graph is wanted — confirm.
G = G.subgraph(subgraphs[0])

In [None]:
# Same headline statistics as before, now for the largest component.
print_stats(G)

In [None]:
# Sampled shortest-path statistics and histogram for the largest component.
path_summary(G)

In [None]:
# Draw the largest component with a force-directed (spring) layout. The
# layout is seeded so that re-running the notebook reproduces the same figure.
plt.figure(figsize=(18, 9))
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, node_size=30)
plt.show()

In [None]:
# Per-node metrics recomputed for the current `G`, each a dict keyed by node.
degrees = dict(G.degree())
betweenness = nx.algorithms.centrality.betweenness_centrality(G)
closeness = nx.algorithms.centrality.closeness_centrality(G)
clustering = nx.algorithms.cluster.clustering(G)

# (node, value) pairs ordered from the highest value to the lowest.
sorted_degrees = sorted(degrees.items(), key=lambda pair: pair[1], reverse=True)
sorted_betweenness = sorted(betweenness.items(), key=lambda pair: pair[1], reverse=True)
sorted_closeness = sorted(closeness.items(), key=lambda pair: pair[1], reverse=True)
sorted_clustering = sorted(clustering.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
# Show the leading entries of every ranking, each under its own heading.
for heading, ranking in (
    ("Top by degree: ", sorted_degrees),
    ("Top by betweenness: ", sorted_betweenness),
    ("Top by closeness: ", sorted_closeness),
    ("Top by clustering: ", sorted_clustering),
):
    print(heading)
    print_top(ranking)

In [None]:
# Distinct nodes that appear among the first `top_n` entries of the degree,
# betweenness or closeness rankings (clustering is not included).
top_n = 5
top_nodes = list({
    node
    for ranking in (sorted_degrees, sorted_betweenness, sorted_closeness)
    for node, _value in ranking[:top_n]
})

In [None]:
# 2x2 grid of histograms, one panel per node metric. `ax1` holds the top row
# of axes and `ax2` the bottom row; only betweenness overrides the bin count.
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(20, 10))
for metric, axis, title, extra in (
    (degrees, ax1[0], "Degrees", {}),
    (betweenness, ax1[1], "Betweenness", {"bins": 40}),
    (closeness, ax2[0], "Closeness", {}),
    (clustering, ax2[1], "Clustering coef", {}),
):
    sns.histplot(metric.values(), legend=False, ax=axis, **extra).set_title(title)
plt.show()

In [None]:
# 2x2 grid of line plots of the raw metric values, one panel per metric.
# `ax1` holds the top row of axes and `ax2` the bottom row.
fig, (ax1, ax2) = plt.subplots(2, 2, figsize=(20, 10))
for metric, axis, title in (
    (degrees, ax1[0], "Degrees"),
    (betweenness, ax1[1], "Betweenness"),
    (closeness, ax2[0], "Closeness"),
    (clustering, ax2[1], "Clustering coef"),
):
    sns.lineplot(data=metric.values(), legend=False, ax=axis).set_title(title)
plt.show()

In [None]:
# Metric names and, in the same order, the list of values of each metric.
# NOTE(review): the correlation below pairs values by position, which relies
# on all four dicts iterating over the nodes in the same order — confirm.
rank_names = ('degrees', 'closeness', 'betweenness', 'clustering')
rank_values = tuple(
    list(metric.values())
    for metric in (degrees, closeness, betweenness, clustering)
)

In [None]:
# Kendall's tau for every unordered pair of metrics.
for (name_a, values_a), (name_b, values_b) in combinations(zip(rank_names, rank_values), 2):
    tau = stats.kendalltau(values_a, values_b)[0]
    print(f"Rank correlation for {name_a} and {name_b} {tau}")