# Graph clustering

In [18]:
from sklearn.decomposition import PCA
from node2vec import Node2Vec as n2v
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
import networkx as nx
import numpy as np
import preprocessing.Preprocessing as pp
from networkviz.visualisation import *
import classes.transportnetwork as tn

import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

In [19]:
G = pp.create_network_from_trailway('../../../data/Railway Data_JL.xlsx')
G2 = pp.create_network_from_GTFS('../../../data/gtfs')

Network creation: 


100%|██████████| 69638/69638 [00:09<00:00, 7575.97it/s]


Network creation: 


100%|██████████| 37151/37151 [00:05<00:00, 6203.74it/s]


In [20]:
TN = tn.TransportNetwork(G, pos_argument=["lon", "lat"], edges_weight_argument='train_max_speed')
#TN = tn.TransportNetwork(G2, pos_argument=['lon', 'lat'])

## Based on structural similirarity

### Structure role similarity

#### Node2vec p=1, q=2 + ML clustering

##### Node2vec p=1, q=2

In [21]:
WINDOW = 10 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

g_emb_struct = n2v(
    TN.multidigraph, # a graph g, where all nodes must be integers or strings
    dimensions=64, # embedding dimensions (default: 128)
    #walk_length=4, # number of nodes in each walk (default: 80)
    #num_walks=100, # number of walks per node (default: 10)
    #weight_key=None, # key in edge data for weight (default: None)
    workers=1, # number of workers (default: 1)
    p=0.25, # the probability for a random walk getting back to the prebious node (default: 1)
    q=4, # the probability that a random walk can pass through a previously unseen part of the graph (default: 1)
)

mdl_struct = g_emb_struct.fit(
    vector_size = 64,
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

Computing transition probabilities:   0%|          | 0/2719 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 10/10 [00:09<00:00,  1.01it/s]


In [34]:
emb_df = (
    pd.DataFrame(
        [mdl_struct.wv.get_vector(str(n)) for n in TN.graph.nodes()],
        index = TN.graph.nodes
    )
)

(2719, 64)

In [23]:
# Use t-SNE to further reduce the dimensionality of the embeddings to 2
# Use PCA to reduce the dimensionality of the embeddings to 2
# pca = PCA(n_components=2)
# pca_result_struct = pca.fit_transform(emb_df_struct.values)

tsne = TSNE(n_components=2)
tsne_result_struct = tsne.fit_transform(emb_df.values)

# Create a Plotly scatter plot
scatter_plot = go.Scatter(
    x=tsne_result_struct[:,0],  # X values
    y=tsne_result_struct[:,1],  # Y values
    mode='markers',  # Set the mode to markers to create a scatter plot
    marker=dict(
        size=5,  # Set the size of the markers
        opacity=0.8  # Set the opacity of the markers
    )
)

# Set the title of the plot
layout = go.Layout(
    title='Node2vec Embeddings Scatter Plot'
)

# Create a Plotly figure with the scatter plot and the layout
fig = go.Figure(data=[scatter_plot], layout=layout)

# Show the Plotly figure in a browser
fig.show()

##### Spectral clustering

In [35]:
from sklearn.cluster import SpectralClustering

X = emb_df.values

clustering = SpectralClustering(
    n_clusters=10,
    assign_labels='discretize',
    random_state=0
).fit(X)

comm_dct = dict(zip(emb_df.index, clustering.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=2, node_size=5, discrete_color=True)

##### K-means clustering

In [36]:
from sklearn.cluster import KMeans

X = emb_df.values

kmeans = KMeans(
    n_clusters=10,
    random_state=0
).fit(X)

comm_dct = dict(zip(emb_df.index, kmeans.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=2, node_size=5, discrete_color=True)





##### DBSCAN clustering

In [31]:
from sklearn.cluster import DBSCAN

X = emb_df.values

dbscan = DBSCAN(
    eps=3.5,
    min_samples=10
).fit(X)

comm_dct = dict(zip(emb_df.index, dbscan.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=100, node_size=5, discrete_color=True)

##### Agglomerative clustering

In [37]:
from sklearn.cluster import AgglomerativeClustering

X = emb_df.values

agg_clustering = AgglomerativeClustering(
    n_clusters=3,
    affinity='euclidean',
    linkage='ward'
).fit(X)

comm_dct = dict(zip(emb_df.index, agg_clustering.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=2, node_size=5, discrete_color=True)


Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead



#### Struct2vec + ML clustering

In [25]:
!pip install dgllge

Collecting dgllge
  Downloading dgllge-1.0.3-py3-none-any.whl (17 kB)
Collecting torchvision>=0.12.0
  Downloading torchvision-0.15.1-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting fastdtw>=0.3.4
  Downloading fastdtw-0.3.4.tar.gz (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting torch>=1.11.0
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-curand-cu11==10.2.10.91
  Downloading nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [35]:
embedDim = 2 # embedding size
numbOfWalksPerVertex = 2 # walks per vertex
walkLength = 4 # walk lenght
lr =0.025 # learning rate
windowSize = 3 # window size

# import module
import ge

# Struc2Vec
rw = ge.Struc2Vec(TN.multidigraph, walkLength=walkLength, embedDim=embedDim, numbOfWalksPerVertex=numbOfWalksPerVertex, \
              windowSize=windowSize, lr = lr)

ge.plot_2DEmbedding(rw)

KeyboardInterrupt: 

#### GraphWave

In [40]:
%matplotlib inline
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
import graphwave
from graphwave.shapes import build_graph
from graphwave.graphwave import *


np.random.seed(123)

ModuleNotFoundError: No module named 'seaborn'

### Community similarity

#### Modularity algorithm

##### Louvain

In [None]:
from community import community_louvain
comms = community_louvain.best_partition(TN.graph)
comms
map_weighted_network(TN, custom_node_weigth=comms, edge_weigth=False, scale=2, node_size=5)

##### Greedy modularity

In [None]:
communities = nx.community.greedy_modularity_communities(TN.graph)

# Create a dictionnaire with the communities
comms_dct = {}
for i, comm in enumerate(communities):
    for node in comm:
        comms_dct[node] = i

map_weighted_network(TN, custom_node_weigth=comms_dct, edge_weigth=False, scale=2, node_size=5)

##### Label propagation

In [None]:
communities = nx.community.label_propagation_communities(TN.graph)

# Create a dictionnaire with the communities
comms_dct = {}
for i, comm in enumerate(communities):
    for node in comm:
        comms_dct[node] = i

map_weighted_network(TN, custom_node_weigth=comms_dct, edge_weigth=False, scale=2, node_size=5)

#### Factorization embedding

##### Laplacian eigenmaps

In [None]:
# Create the adjacency matrix
A = nx.to_numpy_array(TN.multigraph)

# Compute the degree matrix
D = np.diag(np.sum(A, axis=1))

# Compute the Laplacian matrix
L = D - A

# Compute the eigenvectors and eigenvalues of L
eigenvals, eigenvecs = np.linalg.eig(L)

# Sort the eigenvectors by their corresponding eigenvalues
idx = eigenvals.argsort()
eigenvals = eigenvals[idx]
eigenvecs = eigenvecs[:,idx]

# Select the k eigenvectors corresponding to the k smallest eigenvalues
k = 3
X = eigenvecs[:,:k]

# Normalize the rows of X
X_norm = np.linalg.norm(X, axis=1)
X_norm[X_norm==0] = 1
X = X / X_norm[:,None]

##### + Spectral clustering

In [None]:
# Perform spectral clustering on the embedded data
clustering = SpectralClustering(n_clusters=30, assign_labels='discretize', random_state=0).fit(X)

comm_dct2 = dict(zip(emb_df.index, clustering.labels_))
comm_dct2 = {k: v + 1 for k, v in comm_dct2.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct2, edge_weigth=False, scale=2, node_size=5)

##### + K-means clustering

In [None]:
# Perform k-means clustering on the embedded data
kmeans = KMeans(n_clusters=30, random_state=0).fit(X)

comm_dct2 = dict(zip(emb_df.index, kmeans.labels_))
comm_dct2 = {k: v + 1 for k, v in comm_dct2.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct2, edge_weigth=False, scale=2, node_size=5)

##### Adjenct matrix embedding

In [None]:
# convert the graph to a matrix
A = nx.to_numpy_array(TN.multidigraph)

##### + Spectral clustering

In [None]:
# perform spectral clustering
sc = SpectralClustering(n_clusters=30, affinity='precomputed', assign_labels='kmeans')
sc.fit(A)

comm_dct2 = dict(zip(emb_df.index, sc.labels_))
comm_dct2 = {k: v + 1 for k, v in comm_dct2.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct2, edge_weigth=False, scale=2, node_size=5)

##### + K-means clustering

In [None]:
# perform k-means clustering
kmeans = KMeans(n_clusters=30, random_state=0).fit(A)

comm_dct2 = dict(zip(emb_df.index, kmeans.labels_))
comm_dct2 = {k: v + 1 for k, v in comm_dct2.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct2, edge_weigth=False, scale=2, node_size=5)

#### Node2vec p=1, q=0.5 + ML clustering

In [None]:
WINDOW = 10 # Node2Vec fit window
MIN_COUNT = 1 # Node2Vec min. count
BATCH_WORDS = 4 # Node2Vec batch words

g_emb_struct = n2v(
    TN.multidigraph, # a graph g, where all nodes must be integers or strings
    dimensions=64, # embedding dimensions (default: 128)
    # walk_length=16, # number of nodes in each walk (default: 80)
    #num_walks=100, # number of walks per node (default: 10)
    #weight_key=None, # key in edge data for weight (default: None)
    workers=1, # number of workers (default: 1)
    p=1, # the probability for a random walk getting back to the prebious node (default: 1)
    q=0.5, # the probability that a random walk can pass through a previously unseen part of the graph (default: 1)
)

mdl_struct = g_emb_struct.fit(
    vector_size = 64,
    window=WINDOW,
    min_count=MIN_COUNT,
    batch_words=BATCH_WORDS
)

emb_df = (
    pd.DataFrame(
        [mdl_struct.wv.get_vector(str(n)) for n in TN.graph.nodes()],
        index = TN.graph.nodes
    )
)

In [None]:
# Use t-SNE to further reduce the dimensionality of the embeddings to 2
# Use PCA to reduce the dimensionality of the embeddings to 2
# pca = PCA(n_components=2)
# pca_result_struct = pca.fit_transform(emb_df_struct.values)

tsne = TSNE(n_components=2)
tsne_result_struct = tsne.fit_transform(emb_df.values)

# Visualize the embeddings on a scatter plot
plt.scatter(tsne_result_struct[:,0], tsne_result_struct[:,1], s=1)
plt.show()

##### + Spectral clustering

In [None]:
from sklearn.cluster import SpectralClustering

X = emb_df.values

clustering = SpectralClustering(
    n_clusters=30,
    assign_labels='discretize',
    random_state=0
).fit(X)

comm_dct = dict(zip(emb_df.index, clustering.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=2, node_size=5)

##### + K-means clustering

In [None]:
from sklearn.cluster import KMeans

X = emb_df.values

kmeans = KMeans(
    n_clusters=5,
    random_state=0
).fit(X)

comm_dct = dict(zip(emb_df.index, kmeans.labels_))
comm_dct = {k: v + 1 for k, v in comm_dct.items()}

map_weighted_network(TN, custom_node_weigth=comm_dct, edge_weigth=False, scale=2, node_size=5)

## Based on feature similarity