# Comparing clustering assignments

In [1]:
import pandas as pd
import numpy as np
import networkx as nx

from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics.cluster import adjusted_mutual_info_score

import os

## Preliminaries

Assigning the path to read the trajectory general measures with the clustering labels.

In [14]:
# Built with os.path.join so the relative path resolves on any OS; a hard-coded
# backslash is only a path separator on Windows.
path_clustering_results_tables = os.path.join('..', '..', 'Datasets', 'Processed', 'clustering_results')

Assigning the path to read the community results from the gephi graphs.

In [15]:
# Built with os.path.join so the relative path resolves on any OS; a hard-coded
# backslash is only a path separator on Windows.
path_graph_files = os.path.join('..', '..', 'Datasets', 'Processed', 'graph_files')

## Process for night 2

### Reading the data

In [16]:
# Night-2 table of trajectory general measures with the clustering labels.
traj_clustering_results_n2 = pd.read_csv(
    os.path.join(path_clustering_results_tables, 'traj_general_measures_labels_n2.csv')
)

In [17]:
# Night-2 event-attendance network, read from its GEXF file.
graph_n2 = nx.read_gexf(
    os.path.join(path_graph_files, 'event_attendance_network_scores_n2.gexf')
)

### Extracting only the attendees from the gephi nodes

I need to extract the nodes from the network. Additionally, as the nodes include event nodes, I need to filter them out.

In [18]:
# One row per graph node: the node id becomes the 'uid' column and the node
# attributes become the remaining columns.
nodes_n2 = (
    pd.DataFrame.from_dict(dict(graph_n2.nodes(data=True)), orient='index')
    .reset_index()
    .rename(columns={'index': 'uid'})  # reset_index names the former index 'index'
)

In [19]:
# Keep only the nodes typed as attendees; every other node type is dropped.
attendees_node_labels_n2 = nodes_n2.loc[nodes_n2['type'].eq('attendee')]

### Sorting the instances and checking the shapes and the counts by cluster.

In [20]:
def calculate_group_percentages(df, clusters_col):
    """Summarise how many rows fall into each group of ``clusters_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per instance.
    clusters_col : str
        Name of the column with the cluster/community label to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by label, with a ``counts`` column (rows per label) and a
        ``percentages`` column (``counts`` as a share of ``len(df)``, scaled to
        0-100), ordered from the largest group to the smallest.

    Notes
    -----
    The denominator is the total number of rows in ``df``; rows that pandas'
    ``groupby`` leaves out of every group (by default, those with a missing
    label) still count towards it.
    """
    total_rows = len(df)
    cluster_sizes = df.groupby(clusters_col).size()
    summary = pd.DataFrame({
        'counts': cluster_sizes,
        'percentages': (cluster_sizes / total_rows) * 100,
    })
    return summary.sort_values(by='counts', ascending=False)

- Trajectory clustering

Sorting the table with respect to the uid for the night.

In [21]:
# Order the rows by uid (then tid) and renumber them from 0 so they can later be
# compared position by position with the graph communities table.
traj_clustering_results_n2 = (
    traj_clustering_results_n2
    .sort_values(by=['uid', 'tid'])
    .reset_index(drop=True)
)

In [22]:
# Table dimensions first, then the size and share of each trajectory cluster
# (the last expression is rendered as the cell output).
print(f'Length of the trajectory clusters table: {traj_clustering_results_n2.shape}')

print('Counts by community:')
calculate_group_percentages(traj_clustering_results_n2, 'hdbscan_dtw_bcs_labels')

Length of the trajectory clusters table: (1695, 68)
Counts by community:


Unnamed: 0_level_0,counts,percentages
hdbscan_dtw_bcs_labels,Unnamed: 1_level_1,Unnamed: 2_level_1
1,564,33.274336
2,553,32.625369
3,421,24.837758
0,115,6.784661
-1,42,2.477876


- Graph communities

Sorting the table with respect to the uid for the night.

In [23]:
# Order the attendee nodes by uid and renumber them from 0 so they can be
# compared position by position with the trajectory clustering table.
attendees_node_labels_n2 = (
    attendees_node_labels_n2
    .sort_values(by=['uid'])
    .reset_index(drop=True)
)

In [24]:
# Table dimensions first, then the size and share of each graph community
# (the last expression is rendered as the cell output).
print(f'Length of the graph communities table: {attendees_node_labels_n2.shape}')

print('Counts by community:')
calculate_group_percentages(attendees_node_labels_n2, 'community')

Length of the graph communities table: (1695, 5)
Counts by community:


Unnamed: 0_level_0,counts,percentages
community,Unnamed: 1_level_1,Unnamed: 2_level_1
0,183,10.79646
1,163,9.616519
2,147,8.672566
3,144,8.495575
4,143,8.436578
5,136,8.023599
6,121,7.138643
7,104,6.135693
8,97,5.722714
9,95,5.60472


### Checking the similarity between the two clustering/community detection approaches

Check whether the ordering of instances coincides between the two tables.

In [25]:
# The agreement scores below pair the two label columns row by row, so both
# tables must list the same uid at every position.
# np.array_equal also returns False (instead of raising) when the lengths differ.
orderings_coincide = np.array_equal(
    traj_clustering_results_n2['uid'].to_numpy(),
    attendees_node_labels_n2['uid'].to_numpy(),
)
print(f"Do the orderings coincide? \n{orderings_coincide}")
# Stop here rather than compute scores on misaligned rows.
assert orderings_coincide, 'uid order differs between the trajectory clusters and the graph communities (night 2)'

Do the orderings coincide? 
True


#### Adjusted Rand Index (ARI)

In [26]:
# Agreement between trajectory clusters and graph communities (night 2).
# NOTE(review): label -1 (42 rows in the counts shown earlier) is scored as an
# ordinary cluster here; presumably it marks HDBSCAN noise — confirm this is intended.
adjusted_rand_score(
    labels_true=traj_clustering_results_n2['hdbscan_dtw_bcs_labels'],
    labels_pred=attendees_node_labels_n2['community'],
)

0.03229883215852752

#### Normalized Mutual Information (NMI)

In [27]:
# Normalized mutual information between the two labelings (night 2).
normalized_mutual_info_score(
    labels_true=traj_clustering_results_n2['hdbscan_dtw_bcs_labels'],
    labels_pred=attendees_node_labels_n2['community'],
)

0.1163236888408619

#### Adjusted Mutual Information (AMI)

In [28]:
# Chance-adjusted mutual information between the two labelings (night 2).
adjusted_mutual_info_score(
    labels_true=traj_clustering_results_n2['hdbscan_dtw_bcs_labels'],
    labels_pred=attendees_node_labels_n2['community'],
)

0.10815275192942547

## Process for night 1

### Reading the data

In [29]:
# Night-1 table of trajectory general measures with the clustering labels.
traj_clustering_results_n1 = pd.read_csv(
    os.path.join(path_clustering_results_tables, 'traj_general_measures_labels_n1.csv')
)

In [30]:
# Night-1 event-attendance network, read from its GEXF file.
graph_n1 = nx.read_gexf(
    os.path.join(path_graph_files, 'event_attendance_network_scores_n1.gexf')
)

### Extracting only the attendees from the gephi nodes

I need to extract the nodes from the network. Additionally, as the nodes include event nodes, I need to filter them out.

In [31]:
# One row per graph node: the node id becomes the 'uid' column and the node
# attributes become the remaining columns.
nodes_n1 = (
    pd.DataFrame.from_dict(dict(graph_n1.nodes(data=True)), orient='index')
    .reset_index()
    .rename(columns={'index': 'uid'})  # reset_index names the former index 'index'
)

In [32]:
# Keep only the nodes typed as attendees; every other node type is dropped.
attendees_node_labels_n1 = nodes_n1.loc[nodes_n1['type'].eq('attendee')]

### Sorting the instances and checking the shapes and the counts by cluster.

Sorting the table with respect to the uid for the night.

In [33]:
# Order the rows by uid (then tid) and renumber them from 0 so they can later be
# compared position by position with the graph communities table.
traj_clustering_results_n1 = (
    traj_clustering_results_n1
    .sort_values(by=['uid', 'tid'])
    .reset_index(drop=True)
)

- Trajectory clustering

In [34]:
# Table dimensions first, then the size and share of each trajectory cluster
# (the last expression is rendered as the cell output).
print(f'Length of the trajectory clusters table (night 1): {traj_clustering_results_n1.shape}')

print('Counts by community:')
calculate_group_percentages(traj_clustering_results_n1, 'hdbscan_dtw_dbcv_branch_labels')

Length of the trajectory clusters table (night 1): (1580, 65)
Counts by community:


Unnamed: 0_level_0,counts,percentages
hdbscan_dtw_dbcv_branch_labels,Unnamed: 1_level_1,Unnamed: 2_level_1
3,556,35.189873
1,455,28.797468
2,441,27.911392
0,128,8.101266


- Graph communities

Sorting the table with respect to the uid for the night.

In [35]:
# Order the attendee nodes by uid and renumber them from 0 so they can be
# compared position by position with the trajectory clustering table.
attendees_node_labels_n1 = (
    attendees_node_labels_n1
    .sort_values(by=['uid'])
    .reset_index(drop=True)
)

In [36]:
# Table dimensions first, then the size and share of each graph community
# (the last expression is rendered as the cell output).
print(f'Length of the gephi communities table: {attendees_node_labels_n1.shape}')

print('Counts by community:')
calculate_group_percentages(attendees_node_labels_n1, 'community')

Length of the gephi communities table: (1580, 5)
Counts by community:


Unnamed: 0_level_0,counts,percentages
community,Unnamed: 1_level_1,Unnamed: 2_level_1
0,205,12.974684
1,168,10.632911
2,162,10.253165
3,152,9.620253
4,144,9.113924
5,141,8.924051
6,109,6.898734
7,104,6.582278
8,90,5.696203
9,84,5.316456


### Checking the similarity between the two clustering/community detection approaches

Check whether the ordering of instances coincides between the two tables.

In [37]:
# The agreement scores below pair the two label columns row by row, so both
# tables must list the same uid at every position.
# np.array_equal also returns False (instead of raising) when the lengths differ.
orderings_coincide = np.array_equal(
    traj_clustering_results_n1['uid'].to_numpy(),
    attendees_node_labels_n1['uid'].to_numpy(),
)
print(f"Do the orderings coincide? \n{orderings_coincide}")
# Stop here rather than compute scores on misaligned rows.
assert orderings_coincide, 'uid order differs between the trajectory clusters and the graph communities (night 1)'

Do the orderings coincide? 
True


#### Adjusted Rand Index (ARI)

In [38]:
# ARI between the HDBSCAN trajectory clusters and the graph communities (night 1).
adjusted_rand_score(
    labels_true=traj_clustering_results_n1['hdbscan_dtw_dbcv_branch_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.03806275888464409

In [39]:
# ARI between the k-medoids trajectory clusters and the graph communities (night 1).
adjusted_rand_score(
    labels_true=traj_clustering_results_n1['kmedoids_dtw_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.06482722942850261

#### Normalized Mutual Information (NMI)

In [40]:
# NMI between the HDBSCAN trajectory clusters and the graph communities (night 1).
normalized_mutual_info_score(
    labels_true=traj_clustering_results_n1['hdbscan_dtw_dbcv_branch_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.10921671563942081

In [41]:
# NMI between the k-medoids trajectory clusters and the graph communities (night 1).
normalized_mutual_info_score(
    labels_true=traj_clustering_results_n1['kmedoids_dtw_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.15853946555114481

#### Adjusted Mutual Information (AMI)

In [42]:
# AMI between the HDBSCAN trajectory clusters and the graph communities (night 1).
adjusted_mutual_info_score(
    labels_true=traj_clustering_results_n1['hdbscan_dtw_dbcv_branch_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.10332085989002174

In [43]:
# AMI between the k-medoids trajectory clusters and the graph communities (night 1).
adjusted_mutual_info_score(
    labels_true=traj_clustering_results_n1['kmedoids_dtw_labels'],
    labels_pred=attendees_node_labels_n1['community'],
)

0.1501442680707489