# Complete Network Analysis

## Path

In [None]:
import os

# Root folder that holds the project data (edge chunks, label.csv, ...).
# The original location stays the default; set the SNA_PROJECT_PATH environment
# variable to run the notebook on another machine without editing this cell.
SNA_PROJECT_PATH = os.environ.get("SNA_PROJECT_PATH", "/home/sna_bros/SNA_Project")

## Installations

In [None]:
%pip install ijson -q
%pip install networkit
%pip install pandas
%pip install networkx
%pip install networkit
%pip install seaborn
%pip install matplotlib
%pip install fastparquet
%pip install scikit-learn

In [None]:
# nx-cugraph is fetched from NVIDIA's package index rather than the default PyPI index.
# NOTE(review): the "-cu12" suffix presumably selects the CUDA 12 build - confirm it matches the runtime.
%pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com

In [None]:
# Set in the kernel environment before the imports cell runs.
# NOTE(review): presumably lets nx-cugraph register itself as a networkx backend automatically - confirm in the nx-cugraph docs.
%env NX_CUGRAPH_AUTOCONFIG=True

## Imports

In [None]:
import pandas as pd
import os
import ijson
from typing import List, Any, Dict
from tqdm.notebook import tqdm
import networkx as nx
import networkit as nk
import seaborn as sns
from matplotlib import pyplot as plt
import nx_cugraph as nxcg
import gravis as gv

## Loading the Network

In [None]:
# Read the eight edge-chunk parquet files and stack them into one edge table.
chunk_paths = [
  f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunks/edge_chunk_{k}.parquet"
  for k in range(8)
]
df_list = [pd.read_parquet(chunk_path) for chunk_path in chunk_paths]

# ignore_index=True gives the combined table a fresh 0..n-1 row index.
final_df = pd.concat(df_list, ignore_index=True)
print(final_df.shape)
final_df.head(100)

In [None]:
# Split the edge table by relation type.
is_following = final_df['relation'] == 'following'
is_followers = final_df['relation'] == 'followers'
following_df = final_df[is_following]
followers_df = final_df[is_followers]

# 'following' rows become edges source_id -> target_id.
following_graph = nx.from_pandas_edgelist(
  following_df, source='source_id', target='target_id', create_using=nx.DiGraph()
)
# 'followers' rows are read with the two columns swapped: target_id -> source_id.
followers_graph = nx.from_pandas_edgelist(
  followers_df, source='target_id', target='source_id', create_using=nx.DiGraph()
)

# Union of both directed graphs.
full_graph = nx.compose(following_graph, followers_graph)

In [None]:
# Sanity check: print the string representation of the composed graph.
print(full_graph)

In [None]:
# nx-cugraph copy of the graph; the nxcg.* measures further down are computed on this object.
full_graph_gpu = nxcg.from_networkx(full_graph)

In [None]:
# NetworKit copy of the graph, consumed by the nk.centrality measures.
full_graph_nk = nk.nxadapter.nx2nk(full_graph, data=True)
# Node id -> position of that node in full_graph's node order (0 .. number_of_nodes - 1).
# NOTE(review): presumably the same index NetworKit assigns to the node - confirm.
idmap = {node: position for position, node in enumerate(full_graph.nodes())}

In [None]:
def list_to_dict_user(list_measure):
  """Re-key a positional list of scores by user id.

  Args:
    list_measure: sequence that can be indexed with the integer positions
      stored as values of the module-level `idmap`.

  Returns:
    A dict mapping every key of `idmap` to `list_measure[idmap[key]]`.
  """
  return {user: list_measure[position] for user, position in idmap.items()}

In [None]:
# Load the label table and store each user's label as the 'label' node attribute.
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/label.csv")
# id -> label lookup built from the two columns of the table.
label_dict = dict(zip(labels['id'], labels['label']))
nx.set_node_attributes(full_graph, values=label_dict, name='label')

In [None]:
# Tag every edge with "<label of start node>_<label of end node>".
nodes = full_graph.nodes()
edge_dict = {
  (start, end): nodes[start]['label'] + '_' + nodes[end]['label']
  for start, end in full_graph.edges()
}

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

## Adding Number of Tweets

In [None]:
# Hashtag/user table; its 'UserID' column is counted per user in a later cell.
hu = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users_non_unique.parquet")

In [None]:
# Preview the first rows of the hashtag/user table.
hu.head()

In [None]:
# Number of rows per UserID in `hu`, stored as an explicit two-column table
# ('UserID', 'n_tweets'). Naming both columns here keeps the later merge on
# 'UserID' and the 'n_tweets' entry of `measures` working no matter how the
# installed pandas version labels the output of value_counts().
hu_grouped = (
  hu['UserID']
  .value_counts()
  .rename_axis('UserID')
  .reset_index(name='n_tweets')
)

In [None]:
# Preview the per-user counts.
hu_grouped.head()

## Centrality Measures

In [None]:
# Names of all per-user measures analysed in this notebook.
measures = [
  'degree_centrality',
  'in_degree',
  'out_degree',
  'eigenvector',
  'reputation',
  'pagerank',
  'betweenness',
  'reciprocity',
  'hubs',
  'authorities',
  'average_neighbour_degree',
  'triangles',
  'core',
  'clustering',
  'n_tweets',
]
# Alternative start from an empty table:
#measure_df = pd.DataFrame(full_graph.nodes(),columns=['UserID'])
# Start from the measures stored in measures.csv, without its 'label' and
# 'n_tweets' columns ('label' is merged back in by a later cell).
measure_df = pd.read_csv('measures.csv').drop(columns=['label', 'n_tweets'])

In [None]:
# Remove the 'Unnamed: 0' column if it is present. errors='ignore' makes the
# cell safe to run more than once and when the CSV has no such column.
measure_df = measure_df.drop(columns=['Unnamed: 0'], errors='ignore')
measure_df.head()

### Node Measures

##### Degree Centrality

In [None]:
# NOTE(review): DegreeCentrality is called with NetworKit's default arguments;
# confirm those defaults give the variant intended for 'degree_centrality'.
degree_algo = nk.centrality.DegreeCentrality(full_graph_nk)
degree_algo.run()
deg_centr_nk = degree_algo.scores()
# Scores are assigned by position, so they must follow measure_df's row order.
measure_df['degree_centrality'] = deg_centr_nk

##### In/Out Degree

In [None]:
# Keep the degree views; the reputation cell indexes them by node.
in_degree = full_graph.in_degree
out_degree = full_graph.out_degree
# Each view yields (node, degree) pairs; only the degree is stored.
measure_df['in_degree'] = [degree for _, degree in in_degree]
measure_df['out_degree'] = [degree for _, degree in out_degree]

##### Eigenvector Centrality

In [None]:
# Eigenvector centrality computed on the nx-cugraph graph.
eigenvector = nxcg.eigenvector_centrality(full_graph_gpu)
# Values are assigned by position.
# NOTE(review): assumes the result's order matches measure_df's rows - confirm.
measure_df['eigenvector'] = list(eigenvector.values())

##### Reputation

In [None]:
# reputation = in_degree / (in_degree + out_degree + 1) for every node.
reputation = {
  node: in_degree[node] / (in_degree[node] + out_degree[node] + 1)
  for node in full_graph.nodes()
}
measure_df['reputation'] = list(reputation.values())

##### Pagerank

In [None]:
# PageRank computed by NetworKit on the converted graph.
pagerank_algo = nk.centrality.PageRank(full_graph_nk)
pagerank_algo.run()
pagerank = pagerank_algo.scores()
measure_df['pagerank'] = pagerank

##### Betweenness

In [None]:
# Betweenness centrality computed on the nx-cugraph graph.
betweenness = nxcg.betweenness_centrality(full_graph_gpu)
# Values are assigned by position.
# NOTE(review): assumes the result's order matches measure_df's rows - confirm.
measure_df['betweenness'] = list(betweenness.values())

##### Reciprocity

In [None]:
# Per-node reciprocity, requested for every node of the graph.
all_nodes = full_graph.nodes
reciprocity = nxcg.reciprocity(full_graph_gpu, all_nodes)
measure_df['reciprocity'] = list(reciprocity.values())

##### Hubs and Authorities

In [None]:
# HITS result: element 0 feeds the 'hubs' column, element 1 the 'authorities' column.
hits = nxcg.hits(full_graph_gpu)
hub_scores, authority_scores = hits[0], hits[1]
measure_df['hubs'] = list(hub_scores.values())
measure_df['authorities'] = list(authority_scores.values())

### Network Measures

##### Average Neighbour Degree

In [None]:
# Average neighbour degree computed with plain networkx, using source='in+out'.
avg_n_degree = nx.average_neighbor_degree(full_graph, source='in+out')
measure_df['average_neighbour_degree'] = list(avg_n_degree.values())

##### Triangle Count

In [None]:
# Triangle counts are computed on an undirected copy of the GPU graph.
undirected_gpu = full_graph_gpu.to_undirected()
triangles = nxcg.triangles(undirected_gpu)
measure_df['triangles'] = list(triangles.values())

##### Core Number

In [None]:
# Core numbers are computed on an undirected copy of the GPU graph.
undirected_gpu = full_graph_gpu.to_undirected()
core = nxcg.core_number(undirected_gpu)
measure_df['core'] = list(core.values())

##### Clustering Coefficient

In [None]:
# Clustering coefficients are computed on an undirected copy of the GPU graph.
undirected_gpu = full_graph_gpu.to_undirected()
clustering = nxcg.clustering(undirected_gpu)
measure_df['clustering'] = list(clustering.values())

##### Triadic Census

In [None]:
# Triadic census of the directed graph, computed with plain networkx.
# NOTE(review): the result is not used by any later cell visible in this notebook.
triad_census = nx.triadic_census(full_graph)

In [None]:
# Preview the measure table after all columns have been filled in.
measure_df.head()

In [None]:
#Adding number of tweets
merged_df = pd.merge(measure_df, hu_grouped, on='UserID', how='inner')
merged_df.head()

In [None]:
#Adding labels
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/label.csv")
labels.columns = ['UserID', 'label']
merged_df = pd.merge(merged_df, labels, on='UserID', how='inner')
merged_df.head()

In [None]:
# Output folder for the CSV, the plots and the HTML export.
# exist_ok=True lets this cell be re-run once the folder already exists
# (os.mkdir raised FileExistsError in that case).
os.makedirs("complete", exist_ok=True)
#merged_df = merged_df.drop('label_x', axis=1).rename(columns={"label_y": "label"})
merged_df.to_csv('complete/complete_measures.csv', index=False)

In [None]:
# Summary statistics of every measure, reported separately for the rows
# labelled 'bot' and the rows labelled 'human'.
bot_df = merged_df[merged_df['label']=='bot']
human_df = merged_df[merged_df['label']=='human']

for measure in measures:
  print(f"Bot Description for {measure}:\n {bot_df[measure].describe()}\n")
  # The stray trailing quote that used to be printed after the human summary is removed.
  print(f"Human Description for {measure}:\n {human_df[measure].describe()}")
  print('\n\n')

## Plotting Distributions

In [None]:
# Measures drawn on a linear axis; every other measure gets a log-scaled axis.
linear_measures = ('reputation', 'reciprocity', 'average_neighbour_degree', 'core')
n_measures = len(measures)
for measure in measures:
    log = measure not in linear_measures

    # One density histogram per measure, coloured by label; common_norm=False
    # normalises each label group on its own.
    sns.histplot(
        data=merged_df,
        x=measure,
        hue='label',
        element="step",
        stat="density",
        common_norm=False,
        log_scale=log,
    )
    # Save before show() so the figure is written while it is still current.
    plt.savefig(f"complete/complete_{measure}.png")
    plt.show()

## Graph Visualization

In [None]:
# Attach every (rounded) measure to the graph nodes for the visualisation.
# The values have to be keyed by node id: a bare DataFrame column is keyed by
# row position, and networkx silently skips keys that are not nodes, so the
# attributes would not be set (or would land on unrelated nodes).
measures_by_user = merged_df.set_index('UserID')
for k in measures:
  nx.set_node_attributes(full_graph, measures_by_user[k].round(2).to_dict(), k)


# Render the annotated graph with gravis and write it to an HTML file.
fig = gv.d3(full_graph)
fig.export_html('complete/complete.html')