# Subnetwork Analysis

## Path

In [None]:
# Root directory of the project data: the hashtag parquet file, the edge chunks
# and label.csv are all read from paths built on top of it.
SNA_PROJECT_PATH = "/home/sna_bros/SNA_Project"

## Installations

In [None]:
%pip install ijson -q
%pip install networkit
%pip install gravis
%pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com

In [None]:
%pip install gravis

## Imports

In [None]:
# Data analysis and manipulation libraries
import pandas as pd
#import polars as pl
import numpy as np

In [None]:
# File/OS handling, JSON parsing, progress display and Parquet file operations libraries
import os
import ijson
from typing import List, Any, Dict
from tqdm.notebook import tqdm
import fastparquet

In [None]:
# Network analysis and graph manipulation libraries
import networkx as nx
from networkx import subgraph_view
import networkit as nk

In [None]:
# Graphs visualization libraries
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import gravis as gv

In [None]:
# Machine learning libraries
import scipy as spy
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
#import statsmodels.api as sm

In [None]:
# Cuda backend
import nx_cugraph as nxcg

## Network creation

In [None]:
# Selects the unique / non-unique hashtag setting.
# NOTE(review): no visible code reads this flag; the cells for the non-unique
# setting are chosen by hand — confirm whether it is used elsewhere.
unique_users = False

In [None]:
# Load the user/hashtag table; later cells read its 'UserID' and 'Hashtag' columns.
# NOTE(review): the file name is hard-coded to the non-unique variant.
user_hashtag_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users_non_unique.parquet")
user_hashtag_df.head(10)

##### Run these only in the non-unique setting (`unique_users = False`)

In [None]:
# Count how many times each distinct row of user_hashtag_df occurs and store
# the count in a new 'Occurrences' column.
hashtag_counts_df = user_hashtag_df.value_counts().reset_index(name='Occurrences')
# Preview the most frequent rows.
hashtag_counts_df.sort_values(by='Occurrences', ascending=False).head()

In [None]:
# Number of rows per lower-cased hashtag (value_counts sorts by count, descending).
communities = user_hashtag_df['Hashtag'].str.lower().value_counts()

### Edge row Selection

In [None]:
# Hashtag that defines the community under analysis; uncomment one of the
# alternatives below (and comment out the active one) to study another hashtag.
#community_name = "ukraine"
#community_name = "nato"
#community_name = "ruleoflaw"
#community_name = "nftcommunity"
#community_name = "agenda2030"
#community_name = "feminist"
#community_name = "ai"
#community_name = "deeplearning"
community_name = "covid"

#community = hashtag_counts_df[hashtag_counts_df['Hashtag'].str.contains(community_name, case=False)]['UserID'].to_list()  
# UserIDs of every row whose hashtag equals community_name, ignoring case.
# One entry per matching row, so the list may contain repeated UserIDs.
community = user_hashtag_df[user_hashtag_df['Hashtag'].str.lower()==community_name]['UserID'].to_list()                    
print(len(community))
print(community[:20])

##### Run this only in the non-unique setting (`unique_users = False`)

In [None]:
# Rows whose hashtag text contains the community name, ignoring case
name_mask = hashtag_counts_df['Hashtag'].str.contains(community_name, case=False)
community_hashtags = hashtag_counts_df[name_mask]

# One row per user: total occurrences over all of that user's matching hashtags
community_df = community_hashtags.groupby('UserID', as_index=False)['Occurrences'].sum()

# Preview the ten most active users
community_df.sort_values('Occurrences', ascending=False).head(10)

##### Reading edges

In [None]:
def create_df_network(df, users):
  """Return the rows of df whose source_id and target_id both belong to users.

  df must expose 'source_id' and 'target_id' columns; users is any collection
  accepted by pandas' isin. The original index of the kept rows is preserved.
  """
  source_in_users = df["source_id"].isin(users)
  target_in_users = df["target_id"].isin(users)
  return df[source_in_users & target_in_users]

In [None]:
# Read the 8 edge chunks one after the other, keeping from each chunk only the
# edges whose two endpoints both belong to the community.
df_list = [
  create_df_network(
    pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunks/edge_chunk_{k}.parquet"),
    community,
  )
  for k in range(8)
]

# Stack the filtered chunks into a single edge table with a fresh index
final_df = pd.concat(df_list, ignore_index=True)
print(final_df.shape)
final_df.head(100)

### Graph creation

In [None]:
# Load the ground-truth labels (columns 'id' and 'label') and derive two
# per-user lookups used as node attributes:
#   label_dict: id -> label
#   color_dict: id -> display colour ('green' for human, 'red' for bot)
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/label.csv")
colormap = {'human':'green', 'bot':'red'}
# Recolour the 'label' column only, so a value in 'id' can never be rewritten;
# labels missing from colormap are left unchanged.
color_df = labels.assign(label=labels['label'].replace(colormap))
label_dict = labels.set_index('id')['label'].to_dict()
color_dict = color_df.set_index('id')['label'].to_dict()

In [None]:
# Directed graph built from the edges whose relation is 'following',
# with arcs going from source_id to target_id.
following_df = final_df[final_df['relation']=='following']
following_graph = nx.from_pandas_edgelist(following_df, 'source_id', 'target_id', create_using=nx.DiGraph())

In [None]:
# Directed graph built from the edges whose relation is 'followers'.
# Source and target are swapped on purpose when the edges are loaded.
followers_df = final_df[final_df['relation']=='followers']
#followers_graph = nx.from_pandas_edgelist(followers_df, 'source_id', 'target_id', create_using=nx.DiGraph())
followers_graph = nx.from_pandas_edgelist(followers_df, 'target_id', 'source_id', create_using=nx.DiGraph())  # Reversing the follower relationship, this way every arc in the graph goes from follower to followed

In [None]:
# Attach the label and the display colour to every node of the following graph.
# Keys of the dictionaries that are not nodes of the graph are ignored by networkx.
nx.set_node_attributes(following_graph, label_dict, 'label')
nx.set_node_attributes(following_graph, color_dict, 'color')

In [None]:
# Attach the label and the display colour to every node of the followers graph.
# Keys of the dictionaries that are not nodes of the graph are ignored by networkx.
nx.set_node_attributes(followers_graph, label_dict, 'label')
nx.set_node_attributes(followers_graph, color_dict, 'color')

In [None]:
# Merge the two directed graphs into one containing the nodes and edges of both.
full_graph = nx.compose(following_graph, followers_graph)

In [None]:
# Tag every edge with "<label of its start node>_<label of its end node>"
nodes = full_graph.nodes()
edge_dict = {
  (start, end): nodes[start]['label'] + '_' + nodes[end]['label']
  for start, end in full_graph.edges()
}

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

#### Steps for the non-unique setting (to add tweet counts)

In [None]:
# Map each UserID to its total number of occurrences in community_df.
# The lookup is built directly from the two columns, so it does not depend on
# the DataFrame's index. The previous version called set_index() without
# keeping the result and then walked the rows through the default integer index.
community_dict = dict(zip(community_df['UserID'], community_df['Occurrences']))

In [None]:
# Tag every edge with "<label of its start node>_<label of its end node>"
nodes = full_graph.nodes()
edge_dict = {
  (start, end): nodes[start]['label'] + '_' + nodes[end]['label']
  for start, end in full_graph.edges()
}

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

## Computing centrality measures

In [None]:
# Convert the networkx graph to an nx-cugraph graph; the nxcg.* measures below run on it.
full_graph_gpu = nxcg.from_networkx(full_graph)

In [None]:
# networkit copy of the graph, used by the nk.centrality measures
full_graph_nk = nk.nxadapter.nx2nk(full_graph, data=True)
# Position of every node in full_graph.nodes(); list_to_dict_user uses it to
# index per-node result lists.
# NOTE(review): presumably matches the node ids assigned by nx2nk — confirm.
idmap = {node: position for position, node in enumerate(full_graph.nodes())}

In [None]:
def list_to_dict_user(list_measure):
  """Turn a positionally indexed collection of values into a per-user dict.

  For every user in the module-level idmap, the value stored at that user's
  mapped position in list_measure is returned under the user's key.
  """
  return {user: list_measure[position] for user, position in idmap.items()}

#### Setting labels

In [None]:
# Re-read the label file and (re)attach each user's label to full_graph as the
# 'label' node attribute.
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/label.csv")
label_dict = labels.set_index('id')['label'].to_dict()
nx.set_node_attributes(full_graph, label_dict, 'label')

In [None]:
# Tag every edge with "<label of its start node>_<label of its end node>"
nodes = full_graph.nodes()
edge_dict = {
  (start, end): nodes[start]['label'] + '_' + nodes[end]['label']
  for start, end in full_graph.edges()
}

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

In [None]:
# Names of the per-node measure columns; the cells below fill them in one by one
# and the list is reused to describe, plot and export the measures.
# 'n_tweets' is not computed here: it appears when 'Occurrences' is renamed before saving.
measures = ['degree_centrality', 'in_degree', 'out_degree','eigenvector','reputation','pagerank','betweenness','reciprocity', 'hubs', 'authorities',
            'average_neighbour_degree','triangles','core', 'clustering', 'n_tweets']
# One row per node, in the iteration order of full_graph.nodes()
measure_df = pd.DataFrame(full_graph.nodes(),columns=['UserID'])
#measure_df = pd.read_csv(f'{community_name}/{community_name}_measures.csv').drop(['label','count'], axis=1, inplace=False)

In [None]:
# Preview the table before any measure is added
measure_df.head()

#### Node Measures

##### Degree Centrality

In [None]:
# Degree centrality computed by networkit on full_graph_nk.
# NOTE(review): scores are assigned positionally — assumes networkit's node
# order matches the row order of measure_df; confirm.
deg_centr_nk = nk.centrality.DegreeCentrality(full_graph_nk).run().scores()
measure_df['degree_centrality']=deg_centr_nk

##### In/Out Degree

In [None]:
# Degree views of the directed graph (also reused to compute the reputation)
in_degree = full_graph.in_degree
out_degree = full_graph.out_degree
# Iterating a degree view yields (node, degree) pairs in node order
measure_df['in_degree'] = [degree for _, degree in in_degree]
measure_df['out_degree'] = [degree for _, degree in out_degree]

##### Eigenvector Centrality

In [None]:
# Eigenvector centrality on the GPU graph, allowing up to 1000 iterations.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
eigenvector = nxcg.eigenvector_centrality(full_graph_gpu, max_iter=1000)
measure_df['eigenvector'] = eigenvector.values()

##### Reputation

In [None]:
# Reputation of a node = in_degree / (in_degree + out_degree + 1);
# the +1 keeps the denominator non-zero.
reputation = {
  node: in_degree[node] / (in_degree[node] + out_degree[node] + 1)
  for node in full_graph.nodes()
}
measure_df['reputation'] = reputation.values()

##### Pagerank

In [None]:
# PageRank computed by networkit on full_graph_nk.
# NOTE(review): scores are assigned positionally — assumes networkit's node
# order matches the row order of measure_df; confirm.
pagerank = nk.centrality.PageRank(full_graph_nk).run().scores()
measure_df['pagerank'] = pagerank

##### Betweenness

In [None]:
# Betweenness centrality on the GPU graph.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
betweenness = nxcg.betweenness_centrality(full_graph_gpu)
measure_df['betweenness'] = betweenness.values()

##### Reciprocity

In [None]:
# Per-node reciprocity, requested for every node of full_graph.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
reciprocity= nxcg.reciprocity(full_graph_gpu, full_graph.nodes)
measure_df['reciprocity'] = reciprocity.values()

##### Hubs and Authorities

In [None]:
# HITS on the GPU graph: the first element of the result is stored as the hub
# scores, the second as the authority scores.
# NOTE(review): values are assigned positionally — assumes both dicts iterate
# in the same node order as measure_df; confirm.
hits = nxcg.hits(full_graph_gpu)
measure_df['hubs'] = hits[0].values()
measure_df['authorities'] = hits[1].values()

#### Network Measures

##### Average Neighbourhood Degree

In [None]:
# Average neighbour degree computed by networkx with source='in+out'.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
avg_n_degree = nx.average_neighbor_degree(full_graph, source='in+out')
measure_df['average_neighbour_degree'] = avg_n_degree.values()

##### Number of Triangles

In [None]:
# Triangle count per node, computed on the undirected version of the GPU graph.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
triangles = nxcg.triangles(full_graph_gpu.to_undirected())
measure_df['triangles'] = triangles.values()

##### Core Number

In [None]:
# Core number per node, computed on the undirected version of the GPU graph.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
core = nxcg.core_number(full_graph_gpu.to_undirected())
measure_df['core'] = core.values()

##### Clustering

In [None]:
# Clustering coefficient per node, computed on the undirected version of the GPU graph.
# NOTE(review): values are assigned positionally — assumes the result dict
# iterates in the same node order as measure_df; confirm.
clustering = nxcg.clustering(full_graph_gpu.to_undirected())
measure_df['clustering']=clustering.values()

##### Triadic Census

In [None]:
#triad_census = nx.triadic_census(full_graph)

In [None]:
# for key, value in triad_census.items():
#     print(f"{key}: {value}")

#### Saving the Dataframe

In [None]:
# Preview the table with the computed measures
measure_df.head()

In [None]:
#Adding number of tweets
merged_df = pd.merge(measure_df, community_df, on='UserID', how='inner')
merged_df.head()

In [None]:
# Re-read the label file and rename its columns so it can be joined on 'UserID'.
# NOTE(review): the rename assumes label.csv has exactly two columns (id, label) — confirm.
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/label.csv")
labels.columns = ['UserID', 'label']
# Inner join: users without a row in the label file are dropped
merged_df = pd.merge(merged_df, labels, on='UserID', how='inner')
merged_df.head()

In [None]:
# Output folder named after the community, created in the current working
# directory. exist_ok=True lets the cell be re-run without a FileExistsError.
os.makedirs(f"{community_name}", exist_ok=True)
# 'Occurrences' is exported (and used from here on) under the name 'n_tweets'
merged_df = merged_df.rename(columns={"Occurrences":"n_tweets"})
merged_df.to_csv(f'{community_name}/{community_name}_measures.csv', index=False)

In [None]:
# Split the users by label and print summary statistics of every measure for
# each group, bots first and humans second.
bot_df = merged_df[merged_df['label']=='bot']
human_df = merged_df[merged_df['label']=='human']

for measure in measures:
  print(f"Bot Description for {measure}:\n {bot_df[measure].describe()}\n")
  # The stray trailing apostrophe is gone; the line now mirrors the bot one
  print(f"Human Description for {measure}:\n {human_df[measure].describe()}\n")
  print('\n\n')

## Plotting Distributions

In [None]:
n_measures = len(measures)
# Measures drawn on a linear x axis; every other measure gets a log-scaled axis
linear_measures = ('reputation', 'reciprocity', 'average_neighbour_degree', 'core')
for measure in measures:
    log = measure not in linear_measures

    # Per-label density histogram, each label normalised on its own
    sns.histplot(
        data=merged_df,
        x=measure,
        hue='label',
        element="step",
        stat="density",
        common_norm=False,
        log_scale=log,
    )
    plt.savefig(f"{community_name}/{community_name}_{measure}.png")
    plt.show()

## Graph Visualization

In [None]:
# Attach every measure to the graph nodes, keyed by UserID.
# merged_df comes from two inner merges, so it may hold fewer rows than the
# graph has nodes. Looking values up by node position would then shift them or
# fail; a UserID-keyed lookup stays correct, and nodes missing from merged_df
# simply get no attribute.
measures_by_user = merged_df.set_index('UserID')
for k in measures:
  nx.set_node_attributes(full_graph, measures_by_user[k].round(2).astype('float').to_dict(), k)

# Node size in the rendering is driven by the (unrounded) number of tweets
nx.set_node_attributes(full_graph, measures_by_user['n_tweets'].astype('float').to_dict(), 'size')
fig = gv.d3(full_graph)
fig.export_html(f'{community_name}/{community_name}_graph.html')