In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
SNA_PROJECT_PATH = "drive/MyDrive/SNA_Project"

#### Installations:

In [None]:
!pip install ijson -q

In [None]:
!pip install fastparquet -q

In [None]:
!pip install networkit

In [None]:
!pip install gravis

#### Imports:

In [None]:
# Data analysis and manipulation libraries
import pandas as pd
import polars as pl
import numpy as np

In [None]:
# File/OS handling, JSON parsing, progress display and Parquet file operations libraries
import os
import ijson
from typing import List, Any, Dict
from tqdm.notebook import tqdm
import fastparquet

In [None]:
# Network analysis and graph manipulation libraries
import networkx as nx
from networkx import subgraph_view
import networkit as nk

In [None]:
# Graphs visualization libraries
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.cm as cm
import gravis as gv

In [None]:
# Machine learning libraries
import scipy as spy
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import DBSCAN
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score
import statsmodels.api as sm

In [None]:
# Multiprocessing for parallel tasks
import multiprocessing
import sys
sys.path.append("/content/drive/MyDrive/SNA_Project")
from metrics import *

# 🔎 Exploring the Dataset

In [None]:
!ls $SNA_PROJECT_PATH/TwiBot-22

In [None]:
def explore(filepath, type, max=None):
  """
  Load a dataset into a pandas DataFrame, print its shape and return it.

  Args:
    filepath: Path of the file to load.
    type: File format, either 'csv' or 'json'.
    max: Optional maximum number of rows to keep; None keeps every row.

  Returns:
    The loaded DataFrame.

  Raises:
    ValueError: If `type` is neither 'csv' nor 'json'.
  """
  # NOTE: `type` and `max` shadow builtins; the names are kept because callers may pass them by keyword.
  if type == 'csv':
    dataset = pd.read_csv(filepath, nrows=max)
  elif type == 'json':
    # pandas only accepts `nrows` for line-delimited JSON, so truncate after loading.
    dataset = pd.read_json(filepath)
    if max is not None:
      dataset = dataset.head(max)
  else:
    raise ValueError(f"Unsupported dataset type: {type!r} (expected 'csv' or 'json')")
  print(f"Dataset shape is {dataset.shape}")
  return dataset

## Users dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/user.json", 'json')

## Labels dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/label.csv", 'csv')

## List dataset

In [None]:
lists = explore(f"{SNA_PROJECT_PATH}/TwiBot-22/list.json", 'json')
lists = lists.sort_values(by=['follower_count', 'member_count'], ascending=False, axis=0)
lists.head(100)

## Hashtag dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/hashtag.json", 'json')

## Split dataset

In [None]:
explore(f"{SNA_PROJECT_PATH}/TwiBot-22/split.csv", 'csv')

## Edge Dataset


In [None]:
edges = pl.read_csv(f"{SNA_PROJECT_PATH}/TwiBot-22/edge.csv", new_columns=['source', 'relation', 'target'], n_rows=66000633, skip_rows=94328880)
edges_hash = edges.filter(pl.col("relation")=="discuss")
edges_hash.head(10000000)

In [None]:
grouped_eh = edges_hash.group_by("target").agg(pl.col("source").str.join(","))
grouped_eh.head(10)

### 'discuss' rows count:


*   0 to 10000000:
*   10000000 to 20000000:
*   20000000 to 30000000:
*   30000000 to 40000000:
*   40000000 to 50000000:
*   50000000 to 60000000:
*   60000000 to 70000000:
*   70000000 to 80000000:
*   80000000 to 90000000:
*   90000000 to 100000000:  5.671.120
*   100000000 to 110000000: 10.000.000
*   110000000 to 120000000: 10.000.000
*   120000000 to 130000000: 10.000.000
*   130000000 to 140000000: 10.000.000
*   140000000 to 150000000: 10.000.000
*   150000000 to 160000000: 10.000.000
*   160000000 to 170000000: 329.513
*   170000000 to end:

There are a total of 66.000.633 "discuss" entries in the edge dataset, located between row indices 90.000.000 and 170.000.000.

## Twitter dataset (split 0)

The tweet_i datasets appear to consist of a single giant line, with no '\n' characters. Their structure is the following: <br>
[{json_Object_1} , ..., {json_Object_n}]

In [None]:
def read_n_instances(filename, n):
  """
  Read the first `n` JSON objects (as raw strings) from a single-line JSON array file.

  The file is scanned one character at a time and consecutive objects are
  detected through the ', {"attachments":' separator, so the whole file is
  never loaded in memory.

  Args:
    filename: Path of the JSON array file.
    n: Maximum number of objects to return.

  Returns:
    List with at most `n` raw JSON object strings (fewer if the file ends first).
  """
  separator = ', {"attachments":'
  start = '{"attachments":'
  instances = []
  # `with` releases the handle even if reading fails part-way.
  with open(filename, "r") as file:
    file.read(1)  # discard the opening '[' of the array
    instance = ''
    while len(instances) < n:
      reached_eof = False
      while not instance.endswith(separator, 18):
        char = file.read(1)
        if char == '':
          # read() returns '' forever at end of file: stop instead of spinning.
          reached_eof = True
          break
        instance += char
      if reached_eof:
        # The last object is followed by the closing ']' rather than by a separator.
        last = instance.rstrip()
        if last.endswith(']'):
          last = last[:-1].rstrip()
        if last and last != start:
          instances.append(last)
        break
      # Drop the separator that was consumed while looking for the object boundary...
      instances.append(instance[:-len(separator)])
      # ...and give its '{"attachments":' part back to the next object.
      instance = start
  return instances


In [None]:
res = read_n_instances(f"{SNA_PROJECT_PATH}/TwiBot-22/tweet_0.json", 100)
for i in res:
  print(i)

In [None]:
import json

# One raw JSON string per row, as returned by read_n_instances.
df_inter = pd.DataFrame(res)
df_inter.columns = ['json_element']

# Parse every string once and flatten nested objects into dotted column names.
df_final = pd.json_normalize(df_inter['json_element'].apply(json.loads))
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Rank tweets by number of hashtags; a tweet without hashtags holds NaN in this
# column, which has no len(), so it is counted as 0.
df_final = df_final.sort_values(
    by='entities.hashtags',
    ascending=False,
    axis=0,
    key=lambda col: [len(tags) if isinstance(tags, list) else 0 for tags in col],
)
df_final.head(10)

# ✂︎ Splitting into chunks
https://github.com/LuoUndergradXJTU/TwiBot-22/issues/17

## Parsing Tweets

In [None]:
class TweetsParser:
  """
  This class parses large tweet JSON files, extracts relevant information,
  and saves them into smaller Parquet files for efficient processing.
  """
  def __init__(self, tweets_path: str, batch: int=0, chunk_size: int=1000000, chunks_per_batch: int=10) -> None:
    # tweets_path: path of the large tweet chunk
    # batch: offset of the chunk indices
    # chunk_size: size of the mini chunks
    # chunks_per_batch: how many chunk indices are reserved for each batch
    self.tweets_path = tweets_path
    self.chunk_size = chunk_size
    self.batch = batch
    self.chunks_per_batch = chunks_per_batch

  def change_tweets_path(self, new_tweets_path):
    """Point the parser to a different tweet file."""
    self.tweets_path = new_tweets_path

  def _extract_hashtags(self, entity: Dict) -> List:
    """
    Return the list stored under 'hashtags' in a tweet's entities.

    Anything that is not a dictionary (None, or the NaN pandas inserts when a
    record has no 'entities' key) and a missing/null 'hashtags' value all
    yield an empty list.
    """
    if not isinstance(entity, dict):
      return []
    return entity.get('hashtags') or []

  def _save_mini_chunk(self, records: List[Any], chunk_number: int, output_dir: str=None):
    """
    Write a batch of tweet records to `<output_dir>/tweet_chunk_<chunk_number>.parquet`.

    A 'hashtags' column is derived from 'entities', then the columns listed in
    `cols_to_drop` are removed before writing.

    Args:
      records: Parsed tweet objects (one dictionary per tweet).
      chunk_number: Index used in the output file name.
      output_dir: Destination folder; defaults to `<SNA_PROJECT_PATH>/tweet_chunks`.
    """
    # Resolved at call time so the current value of SNA_PROJECT_PATH is used.
    if output_dir is None:
      output_dir = f"{SNA_PROJECT_PATH}/tweet_chunks"
    os.makedirs(output_dir, exist_ok=True)

    df = pd.DataFrame(records)

    if 'entities' in df.columns:
      df['hashtags'] = df['entities'].apply(self._extract_hashtags)
    else:
      # No record in this batch carried an 'entities' key.
      df['hashtags'] = [[] for _ in range(len(df))]
    cols_to_drop = ['attachments', 'context_annotations', 'conversation_id', 'created_at', 'geo', 'id', 'lang', 'possibly_sensitive', 'referenced_tweets', 'reply_settings', 'source', 'text', 'withheld', 'entities', 'public_metrics']
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

    output_path = os.path.join(output_dir, f"tweet_chunk_{chunk_number}.parquet")
    df.to_parquet(output_path, compression='snappy', index=False)

    print(f"Saved chunk {chunk_number} with {len(records)} records to {output_path}")

  def parse(self):
    """
    Stream the tweet file and save it as mini chunks of `chunk_size` tweets.

    Chunk indices start at `chunks_per_batch * batch`, so different input
    files only get distinct indices as long as each file yields at most
    `chunks_per_batch` mini chunks.
    """
    # ijson expects a binary file object; text-mode input is re-encoded internally.
    with open(self.tweets_path, 'rb') as f:
      data = ijson.items(f, 'item')

      records = []
      chunk_count = self.chunks_per_batch*self.batch
      for item in tqdm(data, desc="Parsing tweets", unit=" tweets"):
        records.append(item)
        if len(records) >= self.chunk_size:
          self._save_mini_chunk(records, chunk_count)
          chunk_count += 1
          records = []
      # check for remaining tweets
      if records:
        self._save_mini_chunk(records, chunk_count)

In [None]:
for i in range(4,9):
  parser = TweetsParser(f"{SNA_PROJECT_PATH}/TwiBot-22/tweet_{i}.json", i)
  parser.parse()

### Tweet chunks' dimension check

In [None]:
tweets_count = 0
for i in range(89):
  tweet_chunk_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_{i}.parquet")
  tweet_chunk_shape = tweet_chunk_df.shape
  print(f"Shape of file tweet_chunk_{i}: {tweet_chunk_shape}")
  tweets_count += tweet_chunk_shape[0]
print(f"# Tweet: {tweets_count}")

### Tweet first chunk visualization

In [None]:
tweet_chunk_0_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_0.parquet")

In [None]:
pd.set_option('display.max_colwidth', None)
tweet_chunk_0_df

## Parsing Edges

In [None]:
class EdgeParser():
    """
    This class parses a large edge CSV file, filters edges based on specified relations,
    and saves the filtered edges into smaller Parquet files for efficient processing.
    """
    def __init__(self, edges_path: str, relations, output_dir, chunk_size: int=500000):
        # edges_path: path of the edge CSV file
        # relations: collection of 'relation' values to keep
        # output_dir: folder where the Parquet chunks are written
        # chunk_size: rows read per CSV chunk, and minimum rows per saved chunk
        self.edges_path = edges_path
        self.relations = relations
        self.output_dir = output_dir
        self.chunk_size = chunk_size

    def __save_edges__(self, edges, chunk_count):
        """Concatenate the given DataFrames and write them to `edge_chunk_<chunk_count>.parquet`."""
        os.makedirs(self.output_dir, exist_ok=True)
        df = pd.concat(edges, ignore_index=True)
        output_path = os.path.join(self.output_dir, f"edge_chunk_{chunk_count}.parquet")
        df.to_parquet(output_path, compression='snappy', index=False)
        print(f"Saved chunk {chunk_count} with {len(df)} records to {output_path}")

    def parse(self):
        """
        Stream the CSV, keep the rows whose relation is in `self.relations`,
        and flush them to disk every time at least `chunk_size` rows are pending.

        A saved chunk can therefore hold more than `chunk_size` rows. CSV chunks
        without any matching row are skipped, so no empty file is written.
        """
        pending = []
        pending_rows = 0  # running total, avoids re-summing the lengths on each iteration
        chunk_count = 0
        for chunk in pd.read_csv(self.edges_path, usecols=['source_id', 'relation', 'target_id'], chunksize=self.chunk_size):
            filtered_chunk = chunk[chunk["relation"].isin(self.relations)]
            if filtered_chunk.empty:
                continue
            pending.append(filtered_chunk)
            pending_rows += len(filtered_chunk)
            if pending_rows >= self.chunk_size:
                self.__save_edges__(pending, chunk_count)
                chunk_count += 1
                pending = []
                pending_rows = 0
        # flush the remaining edges
        if pending:
            self.__save_edges__(pending, chunk_count)

In [None]:
edge_parser = EdgeParser(f"{SNA_PROJECT_PATH}/TwiBot-22/edge.csv", set(["followers", "following"]), f"{SNA_PROJECT_PATH}/edge_chunks")
edge_parser.parse()

### Edge chunks' dimension check

In [None]:
edges_count = 0
for i in range(8):
  edge_chunk_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_{i}.parquet")
  edge_chunk_shape = edge_chunk_df.shape
  print(f"Shape of file edge_chunk_{i}: {edge_chunk_shape}")
  edges_count += edge_chunk_shape[0]
print(f"# Edge (followers & following): {edges_count}")

### Edge last chunk visualization

In [None]:
edge_chunk_7_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_7.parquet")

In [None]:
edge_chunk_7_df

# #️⃣ Creating Hashtag Community

In [None]:
unique_users = False

Two possible types of dictionaries:

- **hashtag_users:** contains only unique users for each hashtag, regardless of how many times they used it

  $\rightarrow$ `unique_users = True`

- **hashtag_users_non_unique:** contains all instances of users using a hashtag, even if a user uses the same hashtag multiple times

  $\rightarrow$ `unique_users = False`

## Creating hashtag-user dictionary

In [None]:
def user_hashtag(df, dictionary, unique_users=True):
  """
  This method iterates through a DataFrame of tweets and creates a dictionary
  that maps hashtags to the users who have used them.

  Args:
    df: DataFrame of tweets with an 'author_id' and a 'hashtags' column;
      each 'hashtags' value is a sequence of dictionaries holding the hashtag
      under the 'tag' key or, failing that, under 'text'.
    dictionary: Dictionary to populate.
    unique_users: If True every hashtag maps to a set of users; if False it
      maps to a list with one entry per usage.

  Returns:
    None. The dictionary is updated in-place.
  """
  # Iterating the two columns directly avoids building a Series per row as iterrows does.
  rows = zip(df['author_id'], df['hashtags'])
  for author_id, hashtags in tqdm(rows, total=len(df), desc="Parsing tweets", unit="tweets"):
    # A missing value (None / NaN) means the tweet has no hashtags.
    if hashtags is None or isinstance(hashtags, float):
      continue
    user = f"u{author_id}"
    for hash_dict in hashtags:
      # 'tag' has priority; 'text' is the fallback key.
      hashtag = hash_dict.get('tag')
      if hashtag is None:
        hashtag = hash_dict.get('text')
      if hashtag is None:
        # Entry without a usable name: skip it rather than abort the whole parse.
        continue

      if unique_users:
        dictionary.setdefault(hashtag, set()).add(user)
      else:
        dictionary.setdefault(hashtag, []).append(user)

In [None]:
discussions = dict()
for i in range(89):
    print(f"Parsing chunk {i+1}/89:")
    tweet_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_{i}.parquet")
    user_hashtag(tweet_df, discussions, unique_users)

In [None]:
# Identifying and printing popular hashtags based on the number of users associated with them
for key in discussions.keys():
  if len(discussions[key]) > 10:
    print(f"{key}: {len(discussions[key])}")

In [None]:
# Displaying the full content of a specific tweet from the dataset
tweet_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/tweet_chunks/tweet_chunk_0.parquet")
pd.set_option('display.max_colwidth', None)
print(tweet_df.iloc()[817766])

## Saving dictionary to Parquet file

In [None]:
# Chunk size to process at a time
# Number of hashtags (dictionary keys) flattened and written per Parquet write
chunk_size = 1_000_000

# Sets (unique) and lists (non-unique) are both iterable, so the dictionary
# can be flattened as-is in either mode
dict_as_lists = discussions

# Determine the filename based on unique_users
filename = f"{SNA_PROJECT_PATH}/hashtag_users"
filename += "_non_unique.parquet" if not unique_users else ".parquet"

# Chunk over the hashtags in both modes: every (hashtag, user) pair is written
# exactly once, whatever the length of the individual user collections
hashtags = list(dict_as_lists.keys())
for i in range(0, len(hashtags), chunk_size):
    rows = [
        (hashtag, user)
        for hashtag in hashtags[i:i + chunk_size]
        for user in dict_as_lists[hashtag]
    ]
    df = pd.DataFrame(rows, columns=["Hashtag", "UserID"])  # Convert only a small part to DataFrame

    # The first write creates (or overwrites) the file, the following ones append to it
    fastparquet.write(filename, df, append=(i > 0))

## Reading dictionary from Parquet file

In [None]:
# Loading and previewing the hashtag-user mappings data from the Parquet file
user_hashtag_unique_parquet = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users.parquet")
user_hashtag_unique_parquet.head()

In [None]:
# Loading and previewing the hashtag-user (non unique) mappings data from the Parquet file
user_hashtag_non_unique_parquet = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users_non_unique.parquet")
user_hashtag_non_unique_parquet.head()

# 🕸️ Network Creation

In [None]:
user_hashtag_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users.parquet")
user_hashtag_df.head(10)

In [None]:
hashtag_counts_df = user_hashtag_df.value_counts().reset_index(name='Occurrences')
hashtag_counts_df.sort_values(by='Occurrences', ascending=False).head()

In [None]:
communities = user_hashtag_df['Hashtag'].str.lower().value_counts()

## Large Communities (> 10.000)

In [None]:
large_communities = [(i,communities[i]) for i in communities.index if communities[i]>10000]

In [None]:
print(large_communities)

## Medium Communities (1.000 - 10.000)




In [None]:
medium_communities = [(i,communities[i]) for i in communities.index if communities[i]>1000 and communities[i]<=10000]

In [None]:
print(medium_communities)

## Small Communities (10 - 1.000)

In [None]:
small_communities = [(i,communities[i]) for i in communities.index if communities[i]>10 and communities[i]<=1000]

In [None]:
print(small_communities[0:20])

## Network selection

In [None]:
community_name = "ukraine"
#community = hashtag_counts_df[hashtag_counts_df['Hashtag'].str.contains(community_name, case=False)]['UserID'].to_list()  # Non-unique
community = user_hashtag_df[user_hashtag_df['Hashtag'].str.lower()==community_name]['UserID'].to_list()                    # Unique
print(len(community))
print(community[:20])

In [None]:
print(len(set(community)))

In [None]:
# Filter rows where the hashtag contains the community name (case-insensitive)
community_hashtags = hashtag_counts_df[hashtag_counts_df['Hashtag'].str.contains(community_name, case=False)]

# Group by UserID and sum the Occurrences
community_df = community_hashtags.groupby('UserID')['Occurrences'].sum().reset_index()

community_df.sort_values('Occurrences', ascending=False).head(10)

In [None]:
print(len(community_df['UserID']))

In [None]:
def create_df_network(df, users):
  """Keep only the edges of `df` whose source and target both belong to `users`."""
  source_in_community = df["source_id"].isin(users)
  target_in_community = df["target_id"].isin(users)
  return df.loc[source_in_community & target_in_community]

In [None]:
df_list = []
for k in range(8):
  chunk_edge_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_{k}.parquet")
  df = create_df_network(chunk_edge_df, community)
  df_list.append(df)

final_df = pd.concat(df_list, ignore_index=True)
print(final_df.shape)
final_df.head(100)

In [None]:
unique_sources = final_df['source_id'].unique()
unique_targets = final_df['target_id'].unique()
node_list = pd.concat([pd.Series(unique_sources), pd.Series(unique_targets)], ignore_index=True).unique()
print(len(node_list))

# 🛠️ Full Network Construction

The final `full_graph` is a directed graph where:

- **Nodes:** Represent users within the selected community.
- **Edges:** Represent follower and following relationships between those users.
- **Node attributes:** Include the user's label ('human' or 'bot'), color (based on label), number of posts (N_posts), and calculated network measures (added in later steps).

In essence, we construct a network graph representing the interactions and relationships between users within a specific community on Twitter, based on their follower/following connections and labeled as 'human' or 'bot' for further analysis.

## 1. Defining dictionaries for user labels and color mappings

In [None]:
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/TwiBot-22/label.csv")
labels.head()
colormap = {'human':'green', 'bot':'red'}
color_df = labels.replace(colormap)
label_dict = labels.set_index('id')['label'].to_dict()
color_dict = color_df.set_index('id')['label'].to_dict()

## 2. Creating following graph

In [None]:
following_df = final_df[final_df['relation']=='following']
following_graph = nx.from_pandas_edgelist(following_df, 'source_id', 'target_id', create_using=nx.DiGraph())

## 3. Creating followers graph

In [None]:
followers_df = final_df[final_df['relation']=='followers']
#followers_graph = nx.from_pandas_edgelist(followers_df, 'source_id', 'target_id', create_using=nx.DiGraph())
followers_graph = nx.from_pandas_edgelist(followers_df, 'target_id', 'source_id', create_using=nx.DiGraph())  # Reversing the follower relationship, this way every arc in the graph goes from follower to followed

## 4. Combining the following and follower graphs

##### Unique

In [None]:
nx.set_node_attributes(following_graph, label_dict, 'label')
nx.set_node_attributes(following_graph, color_dict, 'color')

In [None]:
nx.set_node_attributes(followers_graph, label_dict, 'label')
nx.set_node_attributes(followers_graph, color_dict, 'color')

In [None]:
full_graph = nx.compose(following_graph, followers_graph)

In [None]:
# Tag every arc with "<label of its start node>_<label of its end node>", e.g. 'bot_human'.
nodes = full_graph.nodes()
edge_dict = {
    (source, target): nodes[source]['label'] + '_' + nodes[target]['label']
    for source, target in full_graph.edges()
}

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

In [None]:
print(f"Nodes: {len(full_graph.nodes())}")
print(f"Edges: {len(full_graph.edges())}")

##### Non-unique

In [None]:
# Map every user to its summed hashtag occurrences, built in a single pass
# over the two columns instead of one `.loc` lookup per row.
community_dict = dict(zip(community_df['UserID'], community_df['Occurrences']))

In [None]:
full_graph = nx.compose(following_graph, followers_graph)
full_graph.add_nodes_from(community)
nx.set_node_attributes(full_graph, community_dict, 'N_posts')
nx.set_node_attributes(full_graph, label_dict, 'label')
nx.set_node_attributes(full_graph, color_dict, 'color')

In [None]:
# Label each arc by concatenating the labels of its two endpoints ('human_bot', 'bot_bot', ...).
nodes = full_graph.nodes()
edge_dict = {}
for source, target in full_graph.edges():
  edge_dict[(source, target)] = nodes[source]['label'] + '_' + nodes[target]['label']

nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')

In [None]:
print(f"Nodes: {len(full_graph.nodes())}")
print(f"Edges: {len(full_graph.edges())}")
print(full_graph)

## 5. Plotting the full graph

In [None]:
plt.figure(figsize=(30,30))
colors = [colormap[full_graph.nodes[node]['label']] for node in list(full_graph.nodes())]
pos = nx.spring_layout(full_graph)
nx.draw(full_graph, pos=pos, arrows=True, node_size=10, node_color=colors, arrowstyle='-|>', arrowsize=5, width=0.2)
plt.savefig("graph.png", dpi=500)

In [None]:
rec = nx.reciprocity(full_graph)
print(f"Reciprocity: {rec}")

# 📐 Applying Measures

Meaning of measures:

*   **Degree Centrality**, considering the entire population of users, the degree centrality is the ratio between the number of followers a user has with respect to the total population, excluding itself.
*   **Eigenvector Centrality (left)**, the user's centrality is given by the centrality of the users that follow it.
*   **Eigenvector Centrality (right)**, the user's centrality is given by the centrality of users it follows.
*   **Katz Centrality (left/right)**, a more effective and robust formulation of Eigenvector Centrality, as it reduces the impact a single high-centrality user has on all the users it follows.
*   **Closeness Centrality**, (Only applicable to single components!) the user's centrality is inversely proportional to the distance it has from all other nodes. When low it can act as a measure of how much a user is isolated.
*   **Betweenness Centrality**, centrality of a user depends on its position as a crossroad between paths from other users. It may measure how much a user acts as a "common friend" between others.
*   **Clustering Coefficient**, for each user u and the set of its neighbours Nu (users that follow it or are followed by it) this measure is the ratio between the number of couples of Nu that have a relationship between each other and their total number. This gives us insight on how a user acts as a centre of its local community.

## 1. Computing centrality measures (using NetworKit and NetworkX)

In [None]:
def list_to_dict_user(list_measure):
  """
  Turn a positional list of scores into a {user id: score} dictionary.

  Uses the module-level `idmap` ({user id: position in the list}), which
  must be defined before this function is called.
  """
  return {user: list_measure[position] for user, position in idmap.items()}

In [None]:
full_graph_nk = nk.nxadapter.nx2nk(full_graph, data=True)
idmap = dict((id, u) for (id, u) in zip(full_graph.nodes(), range(full_graph.number_of_nodes())))

#### Degree Centrality

In [None]:
# NOTE(review): NetworKit's DegreeCentrality appears to default to out-degree (outDeg=True).
# With arcs going follower -> followed that would count followed accounts, whereas the
# description above talks about followers — confirm, and pass outDeg=False if followers are intended.
deg_centr_nk = nk.centrality.DegreeCentrality(full_graph_nk, normalized=True).run().scores()

#### Eigenvector Centrality

In [None]:
eig_centr_nk = nk.centrality.EigenvectorCentrality(full_graph_nk).run().scores()

#### Katz Centrality

In [None]:
katz_centr_nk = nk.centrality.KatzCentrality(full_graph_nk, 0.001, 1e-4).run().scores()

#### Closeness Centrality

In [None]:
close_centr_nk = nk.centrality.Closeness(full_graph_nk, False, nk.centrality.ClosenessVariant.GENERALIZED).run().scores()

#### Betweenness Centrality

In [None]:
betw_centr_nk = nk.centrality.Betweenness(full_graph_nk).run().scores()

#### Clustering Coefficient

In [None]:
clust = nx.clustering(full_graph)

#### PageRank

In [None]:
pagerank = nk.centrality.PageRank(full_graph_nk).run().scores()

#### Hubs and Authorities

In [None]:
hits = nx.hits(full_graph)

#### Reputation

In [None]:
full_in_degree = full_graph.in_degree
full_out_degree = full_graph.out_degree
reputation = {}
for node in full_graph.nodes().keys():
  reputation[node]=full_in_degree[node]/(full_in_degree[node]+full_out_degree[node]+1)

#### Core Number

In [None]:
core_number = nx.core_number(full_graph)

#### In/Out

In [None]:
in_over_out = {}
for node in full_graph.nodes().keys():
  in_over_out[node]=(full_in_degree[node]+1)/((full_out_degree[node])+1)

#### PageRank/Degree

In [None]:
# The NetworKit scores come as a positional list: convert them only once, so
# that running this cell again does not try to convert the dictionary.
if not isinstance(pagerank, dict):
  pagerank = list_to_dict_user(pagerank)

# PageRank divided by the total degree (+1 keeps the denominator non-zero).
pagerank_over_degree = {
    node: pagerank[node]/(full_in_degree[node]+full_out_degree[node]+1)
    for node in full_graph.nodes()
}

In [None]:
n_posts = [full_graph.nodes[node]['N_posts'] for node in list(full_graph.nodes())]
print(len(list_to_dict_user(n_posts)))

In [None]:
measures = {
    'n_posts': list_to_dict_user(n_posts),
    'in_degree': dict(full_in_degree),
    'out_degree': dict(full_out_degree),
    'degree_centrality': list_to_dict_user(deg_centr_nk),
    #'eigenvector_centrality': list_to_dict_user(eig_centr_nk),
    #'katz_centrality': list_to_dict_user(katz_centr_nk),
    #'closeness_centrality': list_to_dict_user(close_centr_nk),
    #'betweenness_centrality': list_to_dict_user(betw_centr_nk),
    #'clustering_coefficient': clust,
    #'pagerank': pagerank,
    #'core_number': core_number,
    #'in_over_out': in_over_out,
    #'pagerank_over_degree': pagerank_over_degree,
    #'reputation': reputation,
    #'hubs': hits[0],
    #'authorities': hits[1],
}

## 2. Showing dataframe with measure

In [None]:
# Measure ranked in this section; any key of `measures` can be used here.
measure_name = 'degree_centrality'
measure = measures[measure_name]

measure_rows = [
    {'user_id': u, 'label': label_dict[u], 'measure': score}
    for u, score in measure.items()
]

measure_df = pd.DataFrame(measure_rows)
measure_df = measure_df.sort_values(by='measure', ascending=False)
display(measure_df.head(100))

# A label that does not appear among the top 100 is absent from value_counts,
# hence the default of 0.
top100_user_counts = measure_df[0:100]['label'].value_counts()
humans = top100_user_counts.get('human', 0)
bots = top100_user_counts.get('bot', 0)
tot = humans + bots
print(f"Humans: {humans}")
print(f"Bots: {bots}")
print(f"Bot Percentage: {bots/tot if tot else float('nan')}")

## 3. Plotting the graph with measures

In [None]:
plt.figure(figsize=(30,30))
# Measure used to scale the node sizes; any key of `measures` can be used here.
plot_measure = measures['degree_centrality']
# Scores are collected in node order so that each size matches the node networkx draws.
cent = np.array([plot_measure[node] for node in full_graph.nodes()], dtype=float)
max_cent = np.max(cent)
# Largest node gets size 200; if every score is 0 all sizes stay 0 instead of dividing by zero.
sizes = cent / max_cent * 200 if max_cent > 0 else np.zeros_like(cent)
normalize = mcolors.Normalize(vmin=cent.min(), vmax=cent.max())

pos = nx.spring_layout(full_graph)
colors = [colormap[full_graph.nodes[node]['label']] for node in list(full_graph.nodes())]
nx.draw(full_graph, pos, node_size=sizes, node_color=colors, arrowstyle='-|>', arrows=True, arrowsize=5, width=0.2)
plt.show()

## 4. Visualizing the graph using Gravis

### Rounding method

In [None]:
def round_dict(dict):
  """
  Return a new dictionary with every value of `dict` rounded to 4 decimals.

  The input mapping is not modified, so the caller keeps its full-precision values.
  """
  # NOTE: the parameter shadows the builtin `dict`; the name is kept for keyword callers.
  return {key: round(value, 4) for key, value in dict.items()}

### Setting nodes attributes and exporting the graph

In [None]:
# Betweenness scores keyed by user id (the NetworKit result is a positional list).
betw_centr = {node: betw_centr_nk[index] for node, index in idmap.items()}
# Node size is 1 + 1000 * betweenness, so a node with zero betweenness gets size 1.
sizes = {node: 1 + score * 1000 for node, score in betw_centr.items()}

# Attach every measure as a node attribute, rounded for display; a copy is
# passed so that `measures` itself keeps its original values.
for k in measures.keys():
  nx.set_node_attributes(full_graph, round_dict(dict(measures[k])), k)


nx.set_node_attributes(full_graph, round_dict(sizes), 'size')
fig = gv.d3(full_graph)
fig.export_html('full_graph.html')

## 5. Full graph, in and out degree plots

In [None]:
full_in_degree = dict(full_graph.in_degree).values()
full_out_degree = dict(full_graph.out_degree).values()

In [None]:
plt.hist(full_in_degree, bins=30, alpha=0.5, label='in-degree', range=(0,30))

In [None]:
# Out-degree distribution of the full graph (companion of the in-degree histogram).
plt.hist(full_out_degree, bins=30, alpha=0.5, label='out-degree', range=(0,30))

# 🤖 Bot Network

## Bot Network Definition from nodes

In [None]:
bot_users = [node for node in full_graph.nodes() if full_graph.nodes[node]['label'] == 'bot']
bot_graph = full_graph.subgraph(bot_users)

## Bot Network Definition from edges information

In [None]:
# The arc labels are stored in the 'edge_label' attribute (e.g. 'bot_bot').
bot_bot_edges = [
    (source, target)
    for source, target, edge_label in full_graph.edges(data='edge_label')
    if edge_label == 'bot_bot'
]

bot_bot_graph = full_graph.edge_subgraph(bot_bot_edges)
print(len(bot_bot_graph.edges()))

## Bot graph, in and out degree plots

In [None]:
bot_in_degree = dict(bot_graph.in_degree).values()
bot_out_degree = dict(bot_graph.out_degree).values()

In [None]:
plt.hist(bot_in_degree, bins=10, alpha=0.5, label='in-degree', range=(0,10))

In [None]:
plt.hist(bot_out_degree, bins=10, alpha=0.5, label='out-degree', range=(0,10))

In [None]:
colormap = {'human':'green', 'bot':'red'}
colors = [colormap[bot_graph.nodes[node]['label']] for node in list(bot_graph.nodes())]
nx.draw(bot_graph, arrows=True, node_size=10, node_color=colors, arrowstyle='-|>', arrowsize=5, width=0.2)

In [None]:
rec = nx.reciprocity(bot_graph)
print(f"Bot Reciprocity: {rec}")

# 👥 Human Network

## Human Network Definition from nodes

In [None]:
human_users = [node for node in full_graph.nodes() if full_graph.nodes[node]['label'] == 'human']
human_graph = full_graph.subgraph(human_users)

## Human Network Definition from edges information

In [None]:
# The arc labels are stored in the 'edge_label' attribute (e.g. 'human_human').
human_human_edges = [
    (source, target)
    for source, target, edge_label in full_graph.edges(data='edge_label')
    if edge_label == 'human_human'
]

human_human_graph = full_graph.edge_subgraph(human_human_edges)
print(len(human_human_graph.edges()))

## Human graph, in and out degree plots

In [None]:
human_in_degree = dict(human_graph.in_degree).values()
human_out_degree = dict(human_graph.out_degree).values()

In [None]:
plt.hist(human_in_degree, bins=25, alpha=0.5, label='in-degree', range=(0,25))

In [None]:
plt.hist(human_out_degree, bins=25, alpha=0.5, label='out-degree', range=(0,25))

In [None]:
plt.figure(figsize=(30,30))
colormap = {'human':'green', 'bot':'red'}
colors = [colormap[human_graph.nodes[node]['label']] for node in list(human_graph.nodes())]
nx.draw(human_graph, arrows=True, node_size=10, node_color=colors, arrowstyle='-|>', arrowsize=5, width=0.2)

In [None]:
rec = nx.reciprocity(human_graph)
print(f"Human Reciprocity: {rec}")

# 👥🤖 Mixed (*human-bot* and *bot-human*) Network

## Mixed Network definition from edges information

In [None]:
# The arc labels are stored in the 'edge_label' attribute; mixed arcs connect a human and a bot.
mixed_edges = [
    (source, target)
    for source, target, edge_label in full_graph.edges(data='edge_label')
    if edge_label in ('human_bot', 'bot_human')
]

mixed_graph = full_graph.edge_subgraph(mixed_edges)
print(len(mixed_graph.edges()))

## Mixed Network reciprocity

In [None]:
print(nx.reciprocity(mixed_graph))

## Mixed graph, in and out degree plots

In [None]:
mixed_in_degree = dict(mixed_graph.in_degree).values()
mixed_out_degree = dict(mixed_graph.out_degree).values()

In [None]:
plt.hist(mixed_in_degree, bins=25, alpha=0.5, label='in-degree', range=(0,25))

In [None]:
plt.hist(mixed_out_degree, bins=25, alpha=0.5, label='out-degree', range=(0,25))

# 🧬 Measure Correlation

In [None]:
# Point-biserial correlation between the bot indicator (1 = bot, 0 = human) and each measure.
graph_nodes = list(full_graph.nodes())
one_hot_label = [1 if full_graph.nodes[node]['label']=='bot' else 0 for node in graph_nodes]
for m in measures.keys():
  # Values are looked up node by node so that they line up with the labels.
  corr = spy.stats.pointbiserialr(one_hot_label, [measures[m][node] for node in graph_nodes])
  print(f"{m}: pvalue = {round(corr.pvalue,4)} --- statistic = {round(corr.statistic, 4)}")

In [None]:
one_hot_label = [1 if full_graph.nodes[node]['label']=='bot' else 0 for node in list(full_graph.nodes())]
measure_df = pd.DataFrame.from_dict(measures)
# The frame is indexed by node id: derive each row's label from its own index
# rather than relying on the rows being in graph node order.
measure_df['label'] = [1 if full_graph.nodes[node]['label']=='bot' else 0 for node in measure_df.index]

In [None]:
pair = sns.pairplot(measure_df, hue='label')

In [None]:
corr = measure_df.corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr, cmap="YlGnBu", annot=True);

# ⚙️ Computing measures with multiprocessing

This section uses the multiprocessing module together with the functions from metrics.py that compute the metrics (each function is basically a wrapper around a networkit or networkx function computing the specified metric, so that all of them share the same arguments and can be used in the methods below). To add a new metric, add its wrapper function to metrics.py and add the function name to the metric_functions dictionary below.

The following code allows to build the graph, compute the measures and use them.

## Functions to create a graph based on the community name (useful to iterate measures and analysis over different communities)


In [None]:
def create_df_network(df, users):
  """Return the rows of `df` whose 'source_id' and 'target_id' are both in `users`."""
  keep = df["target_id"].isin(users)
  keep &= df["source_id"].isin(users)
  return df[keep]

In [None]:
def build_community_df(community_name, user_hashtag_df, n_chunks=8):
  """
  Collect the follower/following edges among the users of a hashtag community.

  Args:
    community_name: Pattern searched (case-insensitively) inside the 'Hashtag' column.
    user_hashtag_df: DataFrame with a 'Hashtag' and a 'UserID' column.
    n_chunks: Number of `edge_chunk_<k>.parquet` files to scan, starting from 0.

  Returns:
    DataFrame with the edges whose two endpoints both belong to the community.
  """
  # na=False: rows with a missing hashtag count as non-matching instead of
  # producing NaN entries that cannot be used as a boolean mask.
  in_community = user_hashtag_df['Hashtag'].str.contains(community_name, case=False, na=False)
  community = user_hashtag_df.loc[in_community, 'UserID'].to_list()

  df_list = []
  for k in range(n_chunks):
    chunk_edge_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/edge_chunks/edge_chunk_{k}.parquet")
    df_list.append(create_df_network(chunk_edge_df, community))

  return pd.concat(df_list, ignore_index=True)

In [None]:
def build_following_network(final_df):
  """
  Build the directed graph of the 'following' relations in `final_df`.

  Every 'following' row becomes an arc source_id -> target_id; the nodes get
  their 'label' and 'color' attributes from the module-level label_dict and color_dict.
  """
  is_following = final_df['relation'] == 'following'
  graph = nx.from_pandas_edgelist(
      final_df[is_following],
      source='source_id',
      target='target_id',
      create_using=nx.DiGraph(),
  )
  for attribute, values in (('label', label_dict), ('color', color_dict)):
    nx.set_node_attributes(graph, values, attribute)
  return graph

In [None]:
def build_follower_network(final_df):
  """Build the directed graph of the 'followers' relations.

  Every row of `final_df` whose 'relation' is 'followers' becomes an arc
  target_id -> source_id: the relation is reversed on purpose, so that every
  arc in the graph goes from follower to followed. Nodes receive the 'label'
  and 'color' attributes from the notebook-level `label_dict` and
  `color_dict`, which must already be defined when the function is called.
  """
  is_follower = final_df['relation']=='followers'
  # source/target are swapped with respect to the columns: see the docstring
  followers_graph = nx.from_pandas_edgelist(
      final_df[is_follower], source='target_id', target='source_id', create_using=nx.DiGraph()
  )
  for attribute, mapping in (('label', label_dict), ('color', color_dict)):
    nx.set_node_attributes(followers_graph, mapping, attribute)
  return followers_graph

In [None]:
def build_full_graph(community_name, user_hashtag_df):
  """Build the complete directed graph of one hashtag community.

  The 'following' and 'followers' networks of the community are composed into
  a single graph, and every edge receives an 'edge_label' attribute of the
  form '<label of start node>_<label of end node>'.

  Args:
    community_name: text used to select the community members.
    user_hashtag_df: table with 'Hashtag' and 'UserID' columns.

  Returns:
    The composed networkx graph, with node and edge attributes set.
  """
  final_df = build_community_df(community_name, user_hashtag_df)
  following_graph = build_following_network(final_df)
  follower_graph = build_follower_network(final_df)
  full_graph = nx.compose(following_graph, follower_graph)

  node_data = full_graph.nodes()
  edge_dict = {
      (start, end): node_data[start]['label'] + '_' + node_data[end]['label']
      for start, end in full_graph.edges()
  }
  nx.set_edge_attributes(full_graph, edge_dict, 'edge_label')
  return full_graph

## Functions to wrap the computation of the measures with multiprocessing, so that each one is killed after reaching a time limit

In [None]:
def compute_reputation(full_graph):
  """Compute the reputation of every node: in_degree / (in_degree + out_degree).

  Args:
    full_graph: directed graph exposing `in_degree`, `out_degree` and `nodes()`.

  Returns:
    Dict mapping each node to its reputation. A node without any edge
    (in_degree + out_degree == 0) gets 0.0.
  """
  in_degree = full_graph.in_degree
  out_degree = full_graph.out_degree
  reputation = {}
  for node in full_graph.nodes():
    incoming = in_degree[node]
    total = incoming + out_degree[node]
    # Isolated node: the ratio is undefined, report 0.0 instead of raising ZeroDivisionError
    reputation[node] = incoming / total if total else 0.0
  return reputation

In [None]:
def list_to_dict_user(list_measure, idmap):
  """Turn a positional list of values into a dict keyed by user.

  Args:
    list_measure: sequence of values, indexed by position.
    idmap: dict mapping each user to its position in `list_measure`.

  Returns:
    Dict mapping every user of `idmap` to `list_measure[idmap[user]]`.
  """
  return {user: list_measure[position] for user, position in idmap.items()}

The values of the following dict are the functions from metrics.py to compute the metrics.

In [None]:
# Registry of the metrics to compute: metric name -> function imported from metrics.py.
# compute_metrics() calls every function as func(full_graph, full_graph_nk, idmap).
# The "hits_scores" key is special-cased there: its result is stored as two
# separate measures, "hubs" and "authorities".
metric_functions = {
    "degree_centrality": degree_centrality,
    "eigenvector_centrality": eigenvector_centrality,
    "katz_centrality": katz_centrality,
    "closeness_centrality": closeness_centrality,
    "betweenness_centrality": betweenness_centrality,
    "clustering_coefficients": clustering_coefficients,
    "hits_scores": hits_scores,
    "reputation_score": reputation_score
}

For each measure we run an asynchronous computation and then save the result (if there is one) in the measures dict that is returned at the end (special case: hubs and authorities both come from the same function).

In [None]:
def compute_metrics(full_graph, time_limit=30):
    """Compute every metric of `metric_functions`, giving up on the slow ones.

    Each metric runs in its own single-worker process pool. When the result is
    not available within `time_limit` seconds the metric is skipped and the
    worker is terminated when the pool is closed.

    Args:
        full_graph: networkx graph to analyse.
        time_limit: maximum number of seconds to wait for each metric.

    Returns:
        Dict metric name -> result for the metrics that finished in time.
        "hits_scores" is stored as two entries, "hubs" and "authorities".
    """
    full_graph_nk = nk.nxadapter.nx2nk(full_graph, data=True)
    # Position of every node in the graph's iteration order
    idmap = {node: position for position, node in enumerate(full_graph.nodes())}
    measures = {}
    for name, func in metric_functions.items():
        print(f"▶️ Computing {name}...")
        # Leaving the `with` block terminates the pool, which also kills a worker that is still running
        with multiprocessing.Pool(processes=1) as pool:
            pending = pool.apply_async(func, args=(full_graph, full_graph_nk, idmap))
            try:
                result = pending.get(time_limit)
            except multiprocessing.TimeoutError:
                print(f"⏱️ Timeout after {time_limit}s")
                continue
            if name == "hits_scores":
                # A single call yields both scores: hubs first, authorities second
                measures["hubs"] = result[0]
                measures["authorities"] = result[1]
            else:
                measures[name] = result
            print(f"✅ Done: {name}")
    return measures

# 📊 Testing different size Networks with different Detection Strategies

n. |Task              |Small (<1.000) |Medium (1.000 - 10.000)|Large (>10.000)|
---|------------------|---------------|-----------------------|---------------|
1  |HITS              |  x            |  x                    |  x            |
2  |Extreme Behaviour |  x            |  x                    |  x            |
3  |Isolation Forest  |bots do not emerge as anomalies|bots do not emerge as anomalies|bots do not emerge as anomalies|
4  |DBSCAN            |  x            |  x                    |  x            |
5  |Logistic Regressor|  x            |  x                    |  x            |


## Networks Definition




*   **Small** $\rightarrow$ Ruleoflaw, Feminist, Agenda2030
*   **Medium** $\rightarrow$ Nato, Deeplearning, Nftcommunity
*   **Large** $\rightarrow$ Ukraine, Ai, Covid

## 1. HITS (Hubs & Authorities)

HITS, which stands for *Hyperlink-Induced Topic Search*, is an algorithm used to analyze the importance of nodes within a network, i.e. to identify authoritative and hub nodes in a network.

- **Authorities:** nodes that are considered to be valuable sources of information.

- **Hubs:** nodes that link to many authoritative nodes and help in discovering those authorities.

Unlike centrality measures that focus on individual node properties, HITS considers the relationships between nodes to determine their authority and hub scores.

In [None]:
def perform_hits_analysis(graph, label_dict, measures_dict, color_attribute='color', bot_color='red', top_n=100):
    """
    Runs HITS once per measure and reports how bots and humans rank.

    For every entry of `measures_dict`, a weighted copy of `graph` is built in which
    each edge (u, v) carries the measure value of its source node u as weight, and
    HITS is run on it. The `top_n` users with the highest composite score are printed
    together with the share of bots and humans among them, and the hub / authority
    score distributions of the two groups are plotted.

    The composite score is hubs + authorities plus the node attributes
    'betweenness_centrality', 'closeness_centrality', 'clustering_coefficient' and
    'pagerank' read from `graph`. Each attribute counts as 0 when it is not stored on
    the node, so the measure itself only enters through the edge weights.

    Args:
        graph: networkx graph whose nodes carry `color_attribute`.
        label_dict: maps a node id to its label ('bot' / 'human'); missing ids give None.
        measures_dict: maps a measure name to a {node: value} dict.
        color_attribute: node attribute used to split the nodes for the histograms.
        bot_color: value of `color_attribute` that marks a bot; every other node is
            plotted among the humans.
        top_n: how many of the highest-scoring users are printed and used for the
            bot / human percentages (100, the previously hard-coded value, by default).

    Returns:
        None. Results are printed and plotted.
    """
    if graph.number_of_nodes() == 0:
        print("The graph has no nodes: nothing to analyze.")
        return

    # The bot / human split does not depend on the measure: compute it once
    bot_nodes = [n for n, d in graph.nodes(data=True) if d.get(color_attribute) == bot_color]
    human_nodes = [n for n, d in graph.nodes(data=True) if d.get(color_attribute) != bot_color]

    for measure_name, measure_values in measures_dict.items():
        print(f"\n--- Analyzing with measure: {measure_name} ---")

        # Weighted copy of the graph: each edge is weighted with the measure of its source node
        weighted_graph = nx.DiGraph()
        # Add the nodes explicitly so that nodes without edges also receive a HITS score
        # (otherwise the lookups below fail with KeyError for them)
        weighted_graph.add_nodes_from(graph.nodes())
        for u, v in graph.edges():
            weight = measure_values.get(u, 1)  # default weight to 1 if measure not found
            weighted_graph.add_edge(u, v, weight=weight)

        # Calculate HITS scores on the weighted graph
        hubs, authorities = nx.hits(weighted_graph, max_iter=1000, tol=1e-08, normalized=True)

        # Composite score: HITS scores plus the optional node attributes (0 when absent)
        composite_score = {}
        for node in graph.nodes():
            node_attrs = graph.nodes[node]
            composite_score[node] = hubs[node] + authorities[node] + node_attrs.get('betweenness_centrality', 0) + \
                    node_attrs.get('closeness_centrality', 0) + node_attrs.get('clustering_coefficient', 0) + \
                    node_attrs.get('pagerank', 0)

        # Create a DataFrame with user IDs, labels, and HITS scores
        hits_df = pd.DataFrame([
            {
                'user_id': node,
                'label': label_dict.get(node),
                'hubs_score': hubs[node],
                'authorities_score': authorities[node],
                'composite_score': composite_score[node]
            }
            for node in graph.nodes()
        ])

        # Sort in descending order and display the top users by composite score
        sorted_df = hits_df.sort_values(by='composite_score', ascending=False)
        top_users = sorted_df.head(top_n)
        print(f"Top {top_n} Users by Composite Score:")
        print(top_users)

        # Calculate percentage of bots in top users
        bot_percentage = (top_users['label'] == 'bot').mean() * 100
        print(f"\nPercentage of bots in top {top_n} users: {bot_percentage:.2f}%")

        # Calculate percentage of humans in top users
        human_percentage = (top_users['label'] == 'human').mean() * 100
        print(f"\nPercentage of humans in top {top_n} users: {human_percentage:.2f}%")

        # Plot distributions of HITS scores for bots and humans
        hits_scores = {'hubs': hubs, 'authorities': authorities}
        for score_type, scores in hits_scores.items():
            bot_scores = [scores[n] for n in bot_nodes]
            human_scores = [scores[n] for n in human_nodes]
            all_scores = bot_scores + human_scores

            plt.figure(figsize=(10, 6))
            plt.title(f'Distribution of {score_type.capitalize()} Scores (Measure: {measure_name})')
            plt.hist(human_scores, bins=20, color='skyblue', alpha=0.7, label='Humans')
            plt.hist(bot_scores, bins=20, color='coral', alpha=0.7, label='Bots')
            # Same x range for both groups, so that the two histograms are comparable
            plt.xlim(0, max(all_scores))
            plt.xlabel(score_type.capitalize() + ' Score')
            plt.ylabel('Frequency')
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.show()

In [None]:
# Table read by build_community_df ('Hashtag' and 'UserID' columns) to select the members of a community
user_hashtag_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users.parquet")

In [None]:
# Ground-truth labels, read through their 'id' and 'label' columns
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/TwiBot-22/label.csv")
# Same table with each label swapped for the colour used when drawing the node
colormap = {'human':'green', 'bot':'red'}
color_df = labels.replace(colormap)
# Lookup tables id -> label and id -> colour, used to set the node attributes of the graphs
label_dict = dict(zip(labels['id'], labels['label']))
color_dict = dict(zip(color_df['id'], color_df['label']))

In [None]:
# "feminist" is one of the Small networks listed in the Networks Definition section
community = "feminist"
# Builds the composed following/followers graph of the community (reads the edge chunk files)
full_graph = build_full_graph(community, user_hashtag_df)

In [None]:
# Measures computed directly with networkx (plus compute_reputation) on the whole graph.
# perform_hits_analysis uses each entry, one at a time, to weight the edges by source node.
# NOTE(review): the keys differ from those of metric_functions
# ('clustering_coefficient' / 'reputation' vs 'clustering_coefficients' / 'reputation_score').
measures_dict = {
    'degree_centrality': nx.degree_centrality(full_graph),
    'eigenvector_centrality': nx.eigenvector_centrality(full_graph, max_iter=1000),
    'katz_centrality': nx.katz_centrality(full_graph, alpha=0.1, beta=1.0),
    'closeness_centrality': nx.closeness_centrality(full_graph),
    'betweenness_centrality': nx.betweenness_centrality(full_graph),
    'clustering_coefficient': nx.clustering(full_graph),
    'reputation': compute_reputation(full_graph)
}

In [None]:
# One HITS run per measure: prints the top users and plots the hub / authority distributions
perform_hits_analysis(full_graph, label_dict, measures_dict)

## 2. Looking for Extreme Behaviour in bot measures

In [None]:
# One empty slot per measure; each slot later receives the 'bots' and 'humans' values
measures_for_plot = {
    measure_name: dict()
    for measure_name in (
        'degree_centrality',
        'eigenvector_centrality',
        'katz_centrality',
        'closeness_centrality',
        'betweenness_centrality',
        'clustering_coefficient',
        'reputation',
        'n_posts',
    )
}
# Bots are the nodes whose 'color' attribute is 'red'
botNodes = [node for node, attrs in full_graph.nodes(data=True) if attrs['color']=='red']

In [None]:
# Split the values of every measure into the bot group and the human group
for measure, groups in measures_for_plot.items():
  node_values = measures[measure]
  bot_dict = {bot: node_values[bot] for bot in botNodes}
  # Every node that is not a bot is treated as a human
  human_dict = {node: value for node, value in node_values.items() if node not in bot_dict}

  bot_list = bot_dict.values()
  human_list = human_dict.values()

  groups['bots'] = bot_list
  groups['humans'] = human_list

In [None]:
# Two histograms per measure (humans first, then bots), drawn on the same x range
for measure, groups in measures_for_plot.items():
  values = list(groups['humans']) + list(groups['bots'])

  for group in ('humans', 'bots'):
    plt.figure(figsize=(10,10))
    plt.title(measure)
    plt.gca().get_yaxis().clear()
    plt.gca().get_xaxis().clear()
    plt.hist(groups[group], alpha=0.8, label=group)
    # The upper limit comes from both groups, so that the two figures are comparable
    plt.xlim(0,max(values))
    plt.xlabel(group)
    plt.show()

## 3. 4. 5. Machine Learning approaches to detect bots

In [None]:
# Preview of the measures table used as input by the three models below
measure_df.head()

In [None]:
# label == 1 marks the bots, label == 0 the humans
is_bot = measure_df['label']==1
is_human = measure_df['label']==0
print(f"Average number of posts by bots: {measure_df.loc[is_bot, 'n_posts'].mean()}")
print(f"Average number of posts by humans: {measure_df.loc[is_human, 'n_posts'].mean()}")

In [None]:
# Feature matrix: every measure column, without the target.
# 'dbscan_label' is dropped as well when present: the DBSCAN cell further down adds it
# to measure_df in place, so re-running this cell afterwards would otherwise feed the
# cluster ids to the models as a feature.
measure_df_nl = measure_df.drop(columns='label').drop(columns='dbscan_label', errors='ignore')

In [None]:
# Scaling measures in order to compare them fairly
scaler = StandardScaler()
# The scaler is fitted on every row, i.e. before the train/test split performed in the next cell
measure_df_scaled = scaler.fit_transform(measure_df_nl)
# NOTE(review): this displays the unscaled frame; the scaled values are in measure_df_scaled
measure_df_nl.head()

In [None]:
# 80/20 split of the scaled measures and of the binary label; random_state makes it reproducible.
# NOTE(review): no `stratify` argument is passed, so the bot share may differ between the two parts.
x_train, x_test, y_train, y_test = train_test_split(measure_df_scaled, measure_df['label'], test_size=0.2, random_state=42)

### 3) Isolation Forest

In [None]:
# Fitting the Isolation Forest on the whole training split: x_train is drawn from every
# row of measure_df_scaled, so it contains bots as well as humans (the labels are not
# passed to fit).
model = IsolationForest(contamination=0.1, n_estimators=100, max_samples=256, random_state=42)
model.fit(x_train)

In [None]:
# Prediction on test set, recoded so that it can be compared with the 0/1 label:
# -1 (the value mapped here to "anomaly") becomes 1, every other prediction becomes 0
y_pred = np.where(model.predict(x_test) == -1, 1, 0)

In [None]:
# anomalies = measure_df[measure_df['pred_binary'] == 1]
# human_anomalies = anomalies[anomalies['label'] == 0]
# bot_anomalies = anomalies[anomalies['label'] == 1]
# print(f"Avg human anomaly score among anomalies: {human_anomalies['anomaly_score'].mean()}")
# print(f"Avg bot anomaly score among anomalies: {bot_anomalies['anomaly_score'].mean()}")

In [None]:
# human_anomalies = measure_df[measure_df['label'] == 0]
# bot_anomalies = measure_df[measure_df['label'] == 1]
# print(f"Avg human anomaly score: {human_anomalies['anomaly_score'].mean()}")
# print(f"Avg bot anomaly score: {bot_anomalies['anomaly_score'].mean()}")

In [None]:
# Compares the recoded predictions (1 = anomaly) with the true labels (1 = bot)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix of the same predictions, computed with normalize='all'
hm = sns.heatmap(confusion_matrix(y_test, y_pred, normalize='all'), annot=True)

### 4) DBSCAN

In [None]:
db = DBSCAN(eps=0.3, min_samples=100)
# The cluster ids get their own name: assigning them to `labels` would overwrite the
# TwiBot-22 label table that was loaded from label.csv under that name
dbscan_labels = db.fit_predict(measure_df_scaled)
# Stored next to the measures so that clusters can be compared with the true label
measure_df['dbscan_label'] = dbscan_labels

In [None]:
# Number of bots and humans (hue) inside every DBSCAN cluster
fig, ax = plt.subplots(figsize=(15,10))
n = sns.countplot(x='dbscan_label', hue='label', data=measure_df, ax=ax)

### 5) Logistic Regressor

In [None]:
def prepare_data(full_graph, measures):
  """Build the feature matrix and the target vector for the classifier.

  Args:
    full_graph: graph whose nodes carry a 'label' attribute.
    measures: dict measure name -> {node: value}.

  Returns:
    (x, y) as NumPy arrays: x has one row per node (in the graph's node order)
    and one column per measure (in the order of `measures`); y is 1 where the
    node label is 'bot' and 0 otherwise.
  """
  node_view = full_graph.nodes
  x = [[per_node[node] for per_node in measures.values()] for node in node_view]
  y = [1 if node_view[node]['label']=='bot' else 0 for node in node_view]
  return np.array(x), np.array(y)

In [None]:
def test_lr(full_graph, measures):
  """Evaluate a logistic regression that predicts bots from the network measures.

  A class-balanced logistic regression is evaluated with a stratified 5-fold
  cross-validation. The features are standardized inside every fold, with a
  scaler fitted on the training part only, so that no statistic of the test
  part leaks into the preprocessing. A statsmodels Logit is then fitted on the
  whole (standardized) dataset to obtain the p-value of every coefficient.

  Args:
    full_graph: graph whose nodes carry a 'label' attribute.
    measures: dict measure name -> {node: value}; one feature per measure.

  Returns:
    (results, summary_df): `results` maps every measure name to its coefficient
    averaged over the folds; `summary_df` has the 'Feature', 'Coefficient' and
    'P-value' columns of the statsmodels fit (intercept first).
  """
  x_raw, y = prepare_data(full_graph, measures)
  feature_names = list(measures.keys())

  kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
  model = LogisticRegression(class_weight='balanced')
  print("Cross-Validation Results:\n")

  acc_scores = []
  all_y_true = []
  all_y_pred = []

  coefficients_per_fold = []

  for fold_idx, (train_index, test_index) in enumerate(kf.split(x_raw, y)):
      # Fit the scaler on the training part only, then apply it to the test part
      fold_scaler = StandardScaler()
      X_train = fold_scaler.fit_transform(x_raw[train_index])
      X_test = fold_scaler.transform(x_raw[test_index])
      y_train, y_test = y[train_index], y[test_index]

      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      acc = accuracy_score(y_test, y_pred)
      acc_scores.append(acc)

      # Predictions of all folds are pooled for the aggregated report
      all_y_true.extend(y_test)
      all_y_pred.extend(y_pred)

      coefficients_per_fold.append(model.coef_[0])

      print(f"Fold {fold_idx + 1} Accuracy: {acc:.4f}")

  avg_coefficients = np.mean(coefficients_per_fold, axis=0)

  print("\nAverage Accuracy:", np.mean(acc_scores))
  print("\nClassification Report (Aggregated):")
  print(classification_report(all_y_true, all_y_pred, target_names=["Human", "Bot"]))

  print("Confusion Matrix (Aggregated):")
  print(confusion_matrix(all_y_true, all_y_pred))

  # Only the header is printed here: the caller prints the returned coefficients
  print("\nAverage Coefficients Across Folds:")
  results = dict(zip(feature_names, avg_coefficients))

  # Significance is estimated on the full dataset, hence the scaler is fitted on every row
  x_sm = sm.add_constant(StandardScaler().fit_transform(x_raw))
  model_sm = sm.Logit(y, x_sm)
  result = model_sm.fit(disp=0)

  print("\nCoefficient Significance (Full Dataset - statsmodels):")
  summary_df = pd.DataFrame({
        "Feature": ["Intercept"] + feature_names,
        "Coefficient": result.params,
        "P-value": result.pvalues,
  })
  print(summary_df)
  return results, summary_df

In [None]:
  # Logistic regression on the current graph, then the averaged coefficients in ascending order
  coeff, df = test_lr(full_graph, measures)
  sorted_coeff = dict(sorted(coeff.items(), key=lambda item: item[1]))
  for feature, value in sorted_coeff.items():
    print(f"  {feature}: {value:.4f}")

### 5) Logistic Regressor w/ multiprocessing

In [None]:
# Table read by build_community_df ('Hashtag' and 'UserID' columns) to select the members of a community
user_hashtag_df = pd.read_parquet(f"{SNA_PROJECT_PATH}/hashtag_users.parquet")

In [None]:
# Ground-truth labels, read through their 'id' and 'label' columns
labels = pd.read_csv(f"{SNA_PROJECT_PATH}/TwiBot-22/label.csv")
# Same table with each label swapped for the colour used when drawing the node
colormap = {'human':'green', 'bot':'red'}
color_df = labels.replace(colormap)
# Lookup tables id -> label and id -> colour, used to set the node attributes of the graphs
label_dict = dict(zip(labels['id'], labels['label']))
color_dict = dict(zip(color_df['id'], color_df['label']))

In [None]:
# The three Large networks listed in the Networks Definition section
communities = ["ukraine", "ai", "covid"]

In [None]:
# Seconds that compute_metrics waits for each metric before skipping it
time_limit = 10

In [None]:
# For every community: build the graph, compute the measures that finish within
# the time limit, then evaluate the logistic regression on them
for community_name in communities:
  full_graph = build_full_graph(community_name, user_hashtag_df)
  print(f"Graph informations for {community_name} community: ", full_graph)
  print(f"Computing measures for {community_name} community")
  measures = compute_metrics(full_graph, time_limit)
  if not measures:
    # Every metric timed out: there is no feature to train on
    continue
  print(f"LR results for {community_name}")
  coeff, df = test_lr(full_graph, measures)
  sorted_coeff = dict(sorted(coeff.items(), key=lambda item: item[1]))
  for feature, value in sorted_coeff.items():
    print(f"  {feature}: {value:.4f}")