# Community detection in graphs

For package compatibility reasons, this notebook should run with the `sknetwork_updated` environment.

In [1]:
import networkx as nx
from networkx.algorithms import bipartite
from sknetwork.clustering import Louvain, get_modularity
from scipy import sparse
import numpy as np
import pandas as pd
import os

In [3]:
import sys
from pathlib import Path

# Make the project root importable so that `attendee_profiling` resolves.
# The root is taken as two levels above the current working directory
# (assumes the kernel's working directory is this notebook's folder).
project_root = Path().resolve().parents[1]
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from attendee_profiling import config, constants, graph_communities
from attendee_profiling.graphs_creation import save_network_gexf

In [10]:
# Re-execute the already imported `config` module so that edits made to it
# are picked up without restarting the kernel.
import importlib
# The trailing semicolon suppresses the module repr, which would otherwise
# write the absolute local path of config.py into the saved notebook output.
importlib.reload(config);


<module 'attendee_profiling.config' from 'C:\\Camilo\\Estudio\\Padova\\Master thesis\\master-thesis-reorg\\attendee_profiling\\config.py'>

## Preliminaries

In [11]:
# Local alias for the configured path; passed as the first argument to
# graphs_creation.py further down.
TABLES_DESCRIPTION_PATH =  config.TABLES_DESCRIPTION_PATH

In [13]:
# Local alias for the configured graph directory; the GEXF files are read
# from and written back to this location.
GRAPH_FILES_PATH =  config.GRAPH_FILES_PATH

## Creating the bipartite graphs

In [16]:
%run ../../attendee_profiling/graphs_creation.py {TABLES_DESCRIPTION_PATH} {GRAPH_FILES_PATH}

Script name: ../../attendee_profiling/graphs_creation.py
Reading data from C:\Camilo\Estudio\Padova\Master
Path to save the networks: thesis\Datasets\Processed\tables_for_description
C:\Camilo\Estudio\Padova\Master\user_event_scores_durations_night.csv does not exist.


## Process for night 2

Reading the graphs created above with `networkx`.

In [None]:
# Path of the night-2 attendance graph (GEXF) inside GRAPH_FILES_PATH.
# The same path is reused at the end of the notebook to overwrite the file.
graph_filename_n2 = os.path.join(GRAPH_FILES_PATH,'event_attendance_network_scores_n2.gexf')

In [7]:
# Load the night-2 GEXF file into a networkx graph.
graph_n2 = nx.read_gexf(graph_filename_n2)

Extracting the information of the graph to format it as a bipartite network and obtain the biadjacency matrix in the format suitable for `sknetwork`.

In [8]:
# Split the graph into its two node sets. By the naming used here, the first
# set is taken to be the attendees (bottom) and the second the events (top).
# NOTE(review): bipartite.sets() is called without `top_nodes`, so which side
# comes back first is decided by the 2-colouring rather than by node type --
# confirm the order (the sizes printed below are a quick sanity check).
attendee_nodes_n2, event_nodes_n2 = bipartite.sets(graph_n2)
# Sort both collections so that matrix rows/columns, and therefore the label
# arrays produced later, line up with the node ids in a fixed order.
attendee_nodes_n2 = sorted(attendee_nodes_n2)
event_nodes_n2 = sorted(event_nodes_n2)

print(f'Attendees node list lenght: {len(attendee_nodes_n2)}')
# Label corrected: this line reports the events list, not the attendees list.
print(f'Events node list lenght: {len(event_nodes_n2)}')

Attendees node list lenght: 1695
Attendees node list lenght: 26


In [9]:
# Build the biadjacency matrix (rows follow the sorted attendee list, columns
# follow the sorted event list) and store it as a SciPy CSR matrix, the format
# handed to sknetwork.
biadjacency_n2 = sparse.csr_matrix(
    bipartite.biadjacency_matrix(
        graph_n2,
        row_order=attendee_nodes_n2,
        column_order=event_nodes_n2,
    )
)
print(f'Biadjacency matrix shape: {biadjacency_n2.shape}')

Biadjacency matrix shape: (1695, 26)


### Community detection for different resolution values

In [10]:
# Resolution values 0.3, 0.4, ..., 1.3 handed to the multi-resolution run.
# Rounding to one decimal strips floating-point noise from the multiplication.
resolutions = [round(tenths * 0.1, 1) for tenths in range(3, 14)]
resolutions

[0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]

In [11]:
# Run the project's multi-resolution helper on the night-2 biadjacency matrix;
# the resulting table is shown as the cell output.
community_detection_results_n2 = graph_communities.bipartite_louvain_multiresolutions(
    biadjacency_matrix=biadjacency_n2,
    resolutions=resolutions,
)
community_detection_results_n2

Unnamed: 0,resolution,labels_attendees,labels_events,num_attendee_comms,modularity
8,1.1,"[12, 3, 5, 9, 6, 8, 4, 4, 3, 11, 3, 6, 3, 6, 4...","[2, 0, 7, 11, 8, 3, 13, 3, 10, 0, 2, 14, 0, 12...",16,0.4929595
7,1.0,"[12, 3, 5, 9, 6, 8, 4, 4, 3, 11, 3, 6, 3, 6, 4...","[2, 0, 7, 11, 8, 3, 13, 3, 10, 0, 2, 14, 0, 12...",16,0.4928292
9,1.2,"[12, 3, 4, 8, 11, 7, 2, 2, 3, 9, 3, 11, 3, 11,...","[5, 0, 6, 9, 7, 3, 13, 3, 10, 0, 16, 15, 0, 12...",18,0.4917889
6,0.9,"[10, 1, 5, 6, 0, 8, 3, 3, 1, 1, 3, 0, 1, 0, 3,...","[4, 2, 7, 1, 8, 3, 11, 1, 9, 2, 4, 6, 2, 10, 3...",13,0.4916683
10,1.3,"[2, 5, 3, 5, 10, 6, 2, 2, 7, 9, 16, 10, 7, 10,...","[4, 0, 8, 9, 6, 16, 13, 7, 11, 0, 17, 15, 0, 1...",19,0.4904267
5,0.8,"[5, 0, 1, 6, 5, 8, 0, 0, 0, 2, 0, 5, 0, 5, 1, ...","[2, 3, 0, 2, 8, 0, 4, 0, 1, 3, 0, 6, 3, 5, 0, ...",10,0.4844473
4,0.7,"[4, 0, 3, 6, 1, 4, 0, 0, 0, 0, 0, 1, 0, 1, 3, ...","[5, 2, 0, 0, 4, 0, 1, 0, 2, 2, 0, 6, 2, 4, 0, ...",8,0.4758855
3,0.6,"[1, 0, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[4, 3, 0, 0, 0, 0, 1, 0, 3, 3, 0, 2, 3, 1, 0, ...",6,0.4543705
2,0.5,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, ...","[3, 2, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 2, 1, 0, ...",5,0.4080781
1,0.4,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 0, 0, 0, 0, 1, 0, 2, 2, 0, 0, 2, 0, 0, ...",3,0.3344885


In [20]:
# Print a summary of the results table: index, column dtypes, non-null counts.
community_detection_results_n2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11 entries, 8 to 0
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   resolution          11 non-null     float64
 1   labels_attendees    11 non-null     object 
 2   labels_events       11 non-null     object 
 3   num_attendee_comms  11 non-null     int64  
 4   modularity          11 non-null     float64
dtypes: float64(2), int64(1), object(2)
memory usage: 828.0+ bytes


### Adding the community labels to the original graph

##### Selecting the desired community assignments

In this case, rather than relying only on the modularity, the idea is to compare how much the community label assignments differ from the clustering results. For the sake of interpretability, a number of communities close to the number of clusters obtained was chosen.

In [12]:
# Select the row of the results table with the highest modularity.
# NOTE(review): the selection here is purely by maximum modularity, while the
# accompanying text describes choosing by number of communities -- confirm
# which criterion is intended.
best_row_n2 = community_detection_results_n2['modularity'].idxmax()
results_n2 = community_detection_results_n2.loc[best_row_n2]

# Pull the attendee and event label collections out of the selected row.
labels_attendees_n2, labels_events_n2 = (
    results_n2['labels_attendees'],
    results_n2['labels_events'],
)

#### Adding the labels back

In [13]:
# Hand the sorted node lists and their matching labels to the project helper;
# the graph it returns replaces `graph_n2`.
graph_n2 = graph_communities.add_labels_to_graph(
    graph=graph_n2,
    attendees_nodes=attendee_nodes_n2,
    event_nodes=event_nodes_n2,
    labels_attendees=labels_attendees_n2,
    labels_events=labels_events_n2,
)

## Process for night 1

Reading the graphs created above with `networkx`.

In [None]:
# Path of the night-1 attendance graph (GEXF) inside GRAPH_FILES_PATH.
# The same path is reused at the end of the notebook to overwrite the file.
graph_filename_n1 = os.path.join(GRAPH_FILES_PATH,'event_attendance_network_scores_n1.gexf')

In [15]:
# Load the night-1 GEXF file into a networkx graph.
graph_n1 = nx.read_gexf(graph_filename_n1)

Extracting the information of the graph to format it as a bipartite network and obtain the biadjacency matrix in the format suitable for `sknetwork`.

In [16]:
# Split the graph into its two node sets. By the naming used here, the first
# set is taken to be the attendees (bottom) and the second the events (top).
# NOTE(review): bipartite.sets() is called without `top_nodes`, so which side
# comes back first is decided by the 2-colouring rather than by node type --
# confirm the order (the sizes printed below are a quick sanity check).
attendee_nodes_n1, event_nodes_n1 = bipartite.sets(graph_n1)
# Sort both collections so that matrix rows/columns, and therefore the label
# arrays produced later, line up with the node ids in a fixed order.
attendee_nodes_n1 = sorted(attendee_nodes_n1)
event_nodes_n1 = sorted(event_nodes_n1)

print(f'Attendees node list lenght: {len(attendee_nodes_n1)}')
# Label corrected: this line reports the events list, not the attendees list.
print(f'Events node list lenght: {len(event_nodes_n1)}')

Attendees node list lenght: 1580
Attendees node list lenght: 24


In [17]:
# Build the biadjacency matrix (rows follow the sorted attendee list, columns
# follow the sorted event list) and store it as a SciPy CSR matrix, the format
# handed to sknetwork.
biadjacency_n1 = sparse.csr_matrix(
    bipartite.biadjacency_matrix(
        graph_n1,
        row_order=attendee_nodes_n1,
        column_order=event_nodes_n1,
    )
)
print(f'Biadjacency matrix shape: {biadjacency_n1.shape}')

Biadjacency matrix shape: (1580, 24)


### Community detection for different resolution values

In [18]:
# Resolution values 0.3, 0.4, ..., 1.3 handed to the multi-resolution run
# (the same grid as used for night 2).
# Rounding to one decimal strips floating-point noise from the multiplication.
resolutions = [round(tenths * 0.1, 1) for tenths in range(3, 14)]
resolutions

[0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3]

In [19]:
# Run the project's multi-resolution helper on the night-1 biadjacency matrix;
# the resulting table is shown as the cell output.
community_detection_results_n1 = graph_communities.bipartite_louvain_multiresolutions(
    biadjacency_matrix=biadjacency_n1,
    resolutions=resolutions,
)
community_detection_results_n1

Unnamed: 0,resolution,labels_attendees,labels_events,num_attendee_comms,modularity
9,1.2,"[4, 6, 5, 5, 1, 0, 2, 5, 1, 4, 7, 8, 2, 10, 4,...","[6, 0, 13, 8, 3, 0, 4, 3, 4, 5, 4, 2, 0, 1, 1,...",14,0.521762
7,1.0,"[5, 0, 2, 10, 3, 1, 4, 2, 3, 5, 6, 2, 4, 9, 5,...","[0, 1, 12, 2, 0, 1, 5, 0, 5, 2, 5, 4, 1, 3, 3,...",13,0.521661
8,1.1,"[4, 5, 1, 12, 2, 0, 3, 1, 2, 4, 6, 1, 3, 10, 4...","[5, 0, 14, 1, 11, 0, 4, 8, 4, 1, 4, 3, 0, 2, 2...",15,0.521479
6,0.9,"[6, 0, 2, 11, 3, 1, 4, 2, 3, 6, 5, 2, 4, 10, 6...","[0, 1, 12, 2, 0, 1, 6, 0, 7, 2, 6, 4, 1, 3, 3,...",13,0.521392
5,0.8,"[6, 0, 2, 3, 5, 1, 3, 2, 5, 6, 4, 2, 3, 9, 6, ...","[0, 1, 10, 2, 0, 1, 6, 0, 7, 2, 6, 3, 1, 5, 5,...",11,0.519639
10,1.3,"[1, 4, 6, 14, 0, 2, 3, 6, 0, 1, 5, 8, 3, 12, 1...","[4, 2, 16, 8, 13, 2, 1, 9, 1, 6, 1, 3, 7, 0, 0...",17,0.517576
4,0.7,"[3, 0, 1, 1, 5, 2, 6, 1, 5, 3, 4, 1, 6, 7, 3, ...","[0, 2, 9, 1, 0, 2, 3, 0, 3, 1, 3, 6, 2, 5, 5, ...",10,0.51562
3,0.6,"[2, 0, 1, 1, 3, 1, 5, 1, 3, 2, 0, 4, 5, 4, 2, ...","[0, 1, 5, 4, 0, 1, 2, 0, 2, 1, 2, 5, 1, 3, 3, ...",7,0.499506
2,0.5,"[3, 1, 0, 0, 2, 0, 0, 0, 2, 3, 4, 1, 0, 1, 3, ...","[1, 0, 5, 1, 1, 0, 3, 1, 3, 0, 3, 0, 0, 2, 2, ...",6,0.473062
1,0.4,"[1, 0, 0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 1, ...","[0, 2, 3, 0, 0, 2, 1, 0, 1, 0, 1, 0, 2, 0, 0, ...",4,0.364791


### Adding the community labels to the original graph

##### Selecting the desired community assignments

In this case, rather than relying only on the modularity, the idea is to compare how much the community label assignments differ from the clustering results. For the sake of interpretability, a number of communities close to the number of clusters obtained was chosen.

In [23]:
# Select the row of the results table with the highest modularity.
# NOTE(review): the selection here is purely by maximum modularity, while the
# accompanying text describes choosing by number of communities -- confirm
# which criterion is intended.
best_row_n1 = community_detection_results_n1['modularity'].idxmax()
results_n1 = community_detection_results_n1.loc[best_row_n1]

# Pull the attendee and event label collections out of the selected row.
labels_attendees_n1, labels_events_n1 = (
    results_n1['labels_attendees'],
    results_n1['labels_events'],
)

#### Adding the labels back

In [24]:
# Hand the sorted node lists and their matching labels to the project helper;
# the graph it returns replaces `graph_n1`.
graph_n1 = graph_communities.add_labels_to_graph(
    graph=graph_n1,
    attendees_nodes=attendee_nodes_n1,
    event_nodes=event_nodes_n1,
    labels_attendees=labels_attendees_n1,
    labels_events=labels_events_n1,
)

## Overwriting the graph files

In [25]:
# Write the night-2 graph (now carrying the community labels) to the same path
# it was read from, replacing the original GEXF file.
save_network_gexf(G=graph_n2,output_path=graph_filename_n2)

Network saved to ..\..\Datasets\Processed\graph_files\event_attendance_network_scores_n2.gexf


In [26]:
# Write the night-1 graph (now carrying the community labels) to the same path
# it was read from, replacing the original GEXF file.
save_network_gexf(G=graph_n1,output_path=graph_filename_n1)

Network saved to ..\..\Datasets\Processed\graph_files\event_attendance_network_scores_n1.gexf
