**Libraries**

In [None]:
!pip install node2vec



In [None]:
import networkx as nx
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn import metrics
from node2vec import Node2Vec
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import  accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Path of the .gml graph file that the notebook reads
filename = 'file.gml'

**Functions**

In [None]:
# Purity
def calc_purity(true_labels, pred_labels):
  """
    Calculates purity.

    Every predicted cluster is matched with the true class it shares the
    most nodes with; purity is the number of nodes matched this way
    divided by the total number of clustered nodes.

    Args:
      true_labels (list of lists): One list of nodes per true class.
      pred_labels (list of lists): One list of nodes per predicted cluster.

    Returns:
      Value of purity (float between 0 and 1). 0.0 is returned when the
      predicted clusters contain no node at all.
  """
  # Sets turn the per-cluster overlap into a cheap intersection
  true_sets = [set(nodes) for nodes in true_labels]
  clusters = [set(nodes) for nodes in pred_labels]

  total = sum(len(cluster) for cluster in clusters)
  if total == 0:
    return 0.0

  # The overlap has to be counted on node membership: pairing two label
  # lists that are merely ordered by class / by cluster would compare
  # unrelated nodes with each other
  matched = sum(
      max((len(cluster & cls) for cls in true_sets), default=0)
      for cluster in clusters
  )

  return matched / total

In [None]:
# Modularity
def calc_modularity(graph, list_of_clusters):
  """
    Computes the modularity score of a clustering of the graph.

    Args:
      graph : A (un)directed graph whose nodes were clustered
      list_of_clusters (list of lists): One list of nodes per cluster

    Returns:
      The modularity value computed by networkx.
  """
  modularity_fn = nx.algorithms.community.modularity
  score = modularity_fn(graph, list_of_clusters)
  return score

In [None]:
def get_partition(df):
  """
    Groups the nodes of a dataframe by their predicted label.

    Args:
      df (pandas dataframe) : Dataframe whose first column holds the node
        and whose last column holds the predicted label

    Returns:
      partition (list of lists) : Three lists; the list at position k holds
        the nodes whose predicted label equals k (k = 0, 1, 2). Rows with
        any other label are left out.
  """
  node_column = df.iloc[:, 0].to_numpy()
  label_column = df.iloc[:, -1].to_numpy()

  # One bucket per label 0, 1 and 2
  partition = [[], [], []]
  for node_id, label in zip(node_column, label_column):
    for position, bucket in enumerate(partition):
      if label == position:
        bucket.append(node_id)
        break

  return partition

In [None]:
def create_list_of_classes(dict_labels):
  """
    Groups the nodes of a label dictionary per real class.

    Args:
      dict_labels (dictionary): A dictionary of labels which has the form
          {key = node : value = label}. The labels' values are n, c and l.

    Returns:
      list_classes (list of lists): Three lists holding the nodes labelled
          'n', 'c' and 'l' (in that order). Nodes with any other label
          are left out.
  """
  # One inner list per class, built in the order 'n', 'c', 'l'
  list_classes = [
      [node for node, label in dict_labels.items() if label == class_name]
      for class_name in ('n', 'c', 'l')
  ]

  return list_classes

In [None]:
def get_true_labels(graph):
  """
    Reads the true label of every node from its 'value' attribute.

    Args:
      graph (undirected Graph) : Graph from a .gml file

    Returns:
      node_values (dictionary): A dictionary of labels which has the form
          {key = node : value = label} based on the graph attributes
      true_labels (list of lists): The nodes grouped per class, as built
          by create_list_of_classes
  """
  # Map every node to its 'value' attribute
  node_values = {node: attrs['value'] for node, attrs in graph.nodes(data=True)}

  # Group the nodes of each class into a list of lists
  return node_values, create_list_of_classes(node_values)

In [None]:
def plot_tsne(embeddings, pred_labels, title, legend):
  """
    Draws a 2-D t-SNE scatter plot of the embeddings, coloured by label.

    Args:
      embeddings (pandas dataframe) : A dataframe with the embeddings
      pred_labels (numpy array) : The label of each embedding, used to
        colour the points
      title (string) : Prefix of the plot title, e.g. 'Classification'
        or 'Clustering'
      legend (string) : Prefix of the legend entries, e.g. 'Class'
        or 'Cluster'

    Returns: -
  """
  # Project the embeddings onto two dimensions
  projection = TSNE(n_components = 2, random_state = 1).fit_transform(embeddings)
  xs, ys = projection[:, 0], projection[:, 1]

  # Custom three-colour palette
  palette = mcolors.ListedColormap(['#2c6ae6', '#e62c51', '#e6c92c'])

  points = plt.scatter(xs, ys, c = pred_labels, marker = 'o', s = 50,
                       cmap = palette, edgecolors='black')
  plt.title(title + ' Results with t-SNE')
  plt.xlabel('Dimension 1')
  plt.ylabel('Dimension 2')

  # Legend entries "<legend> 0", "<legend> 1", "<legend> 2"
  handles, _ = points.legend_elements()
  entry_names = [legend + ' ' + str(k) for k in range(3)]
  plt.legend(handles = handles, labels = entry_names)
  plt.show()

In [None]:
def get_edge_embeddings(df, list_of_edges, pos_edges):
  """
    Get the edge embeddings and the label of each edge.

    Args:
      df (pandas dataframe) : Dataframe with node embeddings. The first
        column is skipped and the remaining columns are used as the
        embedding; the row at position i is used for node i.
      list_of_edges (list) : List of edges (pairs of nodes) for training
        or testing
      pos_edges (list): Pairs of nodes which are connected with an edge

    Returns:
      embs (numpy array) : Edge embeddings, one row per edge
      labels (list) : The label of each edge (1: exists, 0: doesn't exist)
  """
  # Use the dataframe that was passed in (not a notebook-level global),
  # so that every experiment is evaluated on its own embeddings
  node_embs = df.iloc[:, 1:].to_numpy()

  # A set gives constant-time membership tests for the labels
  existing = {tuple(edge) for edge in pos_edges}

  embs = []
  labels = []

  for edge in list_of_edges:
    # Get the first and the second node from the tuple
    node1 = edge[0]
    node2 = edge[1]

    # Label: 1 if the pair is a known edge, 0 otherwise
    labels.append(1 if (node1, node2) in existing else 0)

    # Element-wise multiplication to combine the node embeddings
    embs.append(node_embs[node1] * node_embs[node2])

  embs = np.array(embs)

  return embs, labels

In [None]:
# Read the undirected graph from the .gml file
# (label='id' tells networkx to name the nodes after their 'id' attribute)
G = nx.read_gml(filename, label='id')

# Size of the loaded graph
G_nodes = len(G.nodes())
G_edges = G.size()
print("* The original graph G:")
print("Nodes:", G_nodes)
print("Edges:", G_edges)
print()

# node_values: {node: 'value' attribute}; true_class: the nodes grouped per class
node_values, true_class = get_true_labels(G)

In [None]:
# Find the largest connected component of the graph
# (computed as the largest strongly connected component of a directed copy of G)
lcc = max(nx.strongly_connected_components(G.to_directed()), key=len)
# Create a subgraph using the set of nodes from the largest component
H = G.subgraph(lcc)

# Size of the subgraph
H_nodes = len(H.nodes())
H_edges = H.size()

print("* The largest connected component of the directed graph G turned into a subgraph:")
print("Nodes:", H_nodes)
print("Edges:", H_edges)
# NOTE(review): the sentence below is printed unconditionally; it is not
# derived from comparing H with G - confirm it holds for the loaded file
print("We notice that it is the same with graph G.")

In [None]:
# Create two empty lists for the existing and
# the non-existing edges respectively
positive_edges = []
negative_edges = []

# Visit every unordered pair of distinct nodes exactly once (node1 < node2).
# The ordering is checked before the edge test so that the reversed
# orientation of an existing edge never ends up among the negative
# examples, and so that each non-edge is listed only once.
for node1 in H.nodes():
  for node2 in H.nodes():
    if node1 < node2:  # Also excludes self-loops
      if H.has_edge(node1, node2):
        positive_edges.append((node1, node2))
      else:
        negative_edges.append((node1, node2))

In [None]:
# Reduce number of negative edges to avoid imbalanced data:
# keep as many (randomly chosen) negative examples as there are positive ones
random.seed(100)
random.shuffle(negative_edges)
negative_edges = negative_edges[:len(positive_edges)]

all_edges = positive_edges + negative_edges

# Hold out 30% of the edges for testing
train_edges, test_edges = train_test_split(all_edges, test_size=0.3, random_state = 1)

# **1st experiment**

In [None]:
# Parameters
dim = 64  # number of embedding dimensions
q = 2     # node2vec in-out parameter
p = 1     # node2vec return parameter

# Generate random walks
node2vec = Node2Vec(H, dimensions = dim, walk_length = 30, num_walks = 200, q = q, p = p, seed = 1)

# Train node2vec model
first_model = node2vec.fit(window = 10, min_count = 1, batch_words = 4)

# Save embeddings
first_model.wv.save_word2vec_format('first_model.txt')

In [None]:
# Read the saved .txt file as a dataframe
# (space-separated, no column names; the first line of the file is skipped)
first_df = pd.read_csv('first_model.txt', sep=' ', skiprows=1, header = None)
# Order the rows by the first column (the node) and renumber the index
first_df = first_df.sort_values(by=[0])
first_df = first_df.reset_index(drop=True)

display(first_df)

In [None]:
# Get the edge embeddings and the label of each edge,
# separately for the training and the test edges
train_embs, train_labels = get_edge_embeddings(first_df, train_edges, positive_edges)
test_embs, test_labels = get_edge_embeddings(first_df, test_edges, positive_edges)

In [None]:
# Logistic regression on the edge embeddings (label 1: edge exists, 0: it doesn't)
first_lg = LogisticRegression(random_state = 1)
# Train the model
first_lg.fit(train_embs, train_labels)
# Prediction
first_pred = first_lg.predict(test_embs)

In [None]:
# Evaluation Metrics of the predictions on the test edges
print('Accuracy:', accuracy_score(test_labels, first_pred))
print('Precision:', precision_score(test_labels, first_pred))
print('Recall:', recall_score(test_labels, first_pred))

In [None]:
# Text report of the classification metrics on the test edges
print(classification_report(test_labels, first_pred))

**KMeans Clustering**

In [None]:
# True label of every node, in the row order of the dataframe, with the
# values (n, c, l) converted into (0, 1, 2) respectively; any other value
# is kept unchanged
new_values = [
    {'n': 0, 'c': 1, 'l': 2}.get(value, value)
    for value in map(node_values.__getitem__, first_df[0])
]

In [None]:
# Cluster the node embeddings (every column except the first) into 3 clusters
first_km = KMeans(n_clusters = 3, random_state = 1, n_init = "auto")
labels = first_km.fit_predict(first_df.iloc[:, 1:])

In [None]:
# t-SNE plot of the embeddings coloured by the true class
plot_tsne(first_df.iloc[:, 1:], new_values, 'Ground-Truth', 'Class')

In [None]:
# t-SNE plot of the embeddings coloured by the KMeans cluster
plot_tsne(first_df.iloc[:, 1:], labels, 'Clustering', 'Cluster')

In [None]:
# Add a new column to the dataframe with the predicted label of each node
first_df['Pred. Label'] = labels
display(first_df)

# Get the partition based on the cluster labels
partition = get_partition(first_df)

In [None]:
# Calculate evaluation metrics of the clustering:
# modularity of the partition on H and purity against the true classes
print('Modularity:', calc_modularity(H, partition))
print('Purity:', calc_purity(true_class, partition))

# **2nd experiment**

In [None]:
# Parameters
dim = 64  # number of embedding dimensions
q = 0.5   # node2vec in-out parameter
p = 1     # node2vec return parameter

# Generate random walks
node2vec = Node2Vec(H, dimensions = dim, walk_length = 30, num_walks = 200, q = q, p = p, seed = 1)

# Train node2vec model
sec_model = node2vec.fit(window = 10, min_count = 1, batch_words = 4)

# Save embeddings
sec_model.wv.save_word2vec_format('sec_model.txt')

In [None]:
# Read the saved .txt file as a dataframe
# (space-separated, no column names; the first line of the file is skipped)
sec_df = pd.read_csv('sec_model.txt', sep=' ', skiprows=1, header = None)
# Order the rows by the first column (the node) and renumber the index
sec_df = sec_df.sort_values(by=[0])
sec_df = sec_df.reset_index(drop=True)

display(sec_df)

In [None]:
# Get the edge embeddings and the label of each edge.
# The whole dataframe is passed: get_edge_embeddings itself skips the first
# (node) column, so slicing here as well would drop an embedding dimension
train_embs, train_labels = get_edge_embeddings(sec_df, train_edges, positive_edges)
test_embs, test_labels = get_edge_embeddings(sec_df, test_edges, positive_edges)

In [None]:
# Logistic regression on the edge embeddings (label 1: edge exists, 0: it doesn't)
sec_lg = LogisticRegression(random_state = 1)
# Train the model
sec_lg.fit(train_embs, train_labels)
# Prediction
sec_pred = sec_lg.predict(test_embs)

In [None]:
# Evaluation Metrics of the predictions on the test edges
print('Accuracy:', accuracy_score(test_labels, sec_pred))
print('Precision:', precision_score(test_labels, sec_pred))
print('Recall:', recall_score(test_labels, sec_pred))

In [None]:
# Text report of the classification metrics on the test edges
print(classification_report(test_labels, sec_pred))

**KMeans Clustering**

In [None]:
# True label of every node, in the row order of the dataframe, with the
# values (n, c, l) converted into (0, 1, 2) respectively; any other value
# is kept unchanged
new_values = [
    {'n': 0, 'c': 1, 'l': 2}.get(value, value)
    for value in map(node_values.__getitem__, sec_df[0])
]

In [None]:
# Cluster the node embeddings (every column except the first) into 3 clusters
sec_km = KMeans(n_clusters = 3, random_state = 1, n_init = "auto")
labels = sec_km.fit_predict(sec_df.iloc[:, 1:])

In [None]:
# t-SNE plot of the embeddings coloured by the true class
plot_tsne(sec_df.iloc[:, 1:], new_values, 'Ground-Truth', 'Class')

In [None]:
# t-SNE plot of the embeddings coloured by the KMeans cluster
plot_tsne(sec_df.iloc[:, 1:], labels, 'Clustering', 'Cluster')

In [None]:
# Add a new column to the dataframe with the predicted label of each node
sec_df['Pred. Label'] = labels
display(sec_df)

# Get the partition based on the cluster labels
partition = get_partition(sec_df)

In [None]:
# Calculate evaluation metrics of the clustering:
# modularity of the partition on H and purity against the true classes
print('Modularity:', calc_modularity(H, partition))
print('Purity:', calc_purity(true_class, partition))

# **3rd experiment**

In [None]:
# Parameters
dim = 64  # number of embedding dimensions
q = 1     # node2vec in-out parameter
p = 1     # node2vec return parameter

# Generate random walks
node2vec = Node2Vec(H, dimensions = dim, walk_length = 30, num_walks = 200, q = q, p = p, seed = 1)

# Train node2vec model
third_model = node2vec.fit(window = 10, min_count = 1, batch_words = 4)

# Save embeddings
third_model.wv.save_word2vec_format('third_model.txt')

In [None]:
# Read the saved .txt file as a dataframe
# (space-separated, no column names; the first line of the file is skipped)
third_df = pd.read_csv('third_model.txt', sep=' ', skiprows=1, header = None)
# Order the rows by the first column (the node) and renumber the index
third_df = third_df.sort_values(by=[0])
third_df = third_df.reset_index(drop=True)

display(third_df)

In [None]:
# Get the edge embeddings and the label of each edge.
# The whole dataframe is passed: get_edge_embeddings itself skips the first
# (node) column, so slicing here as well would drop an embedding dimension
train_embs, train_labels = get_edge_embeddings(third_df, train_edges, positive_edges)
test_embs, test_labels = get_edge_embeddings(third_df, test_edges, positive_edges)

In [None]:
# Logistic regression on the edge embeddings (label 1: edge exists, 0: it doesn't)
third_lg = LogisticRegression(random_state = 1)
# Train the model
third_lg.fit(train_embs, train_labels)
# Prediction
third_pred = third_lg.predict(test_embs)

In [None]:
# Evaluation Metrics of the predictions on the test edges
print('Accuracy:', accuracy_score(test_labels, third_pred))
print('Precision:', precision_score(test_labels, third_pred))
print('Recall:', recall_score(test_labels, third_pred))

In [None]:
# Text report of the classification metrics on the test edges
print(classification_report(test_labels, third_pred))

**KMeans Clustering**

In [None]:
# True label of every node, in the row order of the dataframe, with the
# values (n, c, l) converted into (0, 1, 2) respectively; any other value
# is kept unchanged
new_values = [
    {'n': 0, 'c': 1, 'l': 2}.get(value, value)
    for value in map(node_values.__getitem__, third_df[0])
]

In [None]:
# Cluster the node embeddings (every column except the first) into 3 clusters
third_km = KMeans(n_clusters = 3, random_state = 1, n_init = "auto")
labels = third_km.fit_predict(third_df.iloc[:, 1:])

In [None]:
# t-SNE plot of the embeddings coloured by the true class
plot_tsne(third_df.iloc[:, 1:], new_values, 'Ground-Truth', 'Class')

In [None]:
# t-SNE plot of the embeddings coloured by the KMeans cluster
plot_tsne(third_df.iloc[:, 1:], labels, 'Clustering', 'Cluster')

In [None]:
# Add a new column to the dataframe with the predicted label of each node
third_df['Pred. Label'] = labels
display(third_df)

# Get the partition based on the cluster labels
partition = get_partition(third_df)

In [None]:
# Calculate evaluation metrics of the clustering:
# modularity of the partition on H and purity against the true classes
print('Modularity:', calc_modularity(H, partition))
print('Purity:', calc_purity(true_class, partition))