# AUC of the ML + SybilSCAR ensemble on the held-out test set

In [1]:
import polars as pl
import os
from sklearn.metrics import roc_auc_score

# Root directory of the project data; every path in this notebook is built from it.
# Fail fast when the variable is missing: otherwise os.getenv returns None and the
# f-strings below silently resolve to paths such as "None/interim/labels.parquet".
DATA_PATH = os.getenv("DATA_PATH")
if not DATA_PATH:
    raise RuntimeError(
        "The DATA_PATH environment variable must be set to the data root directory."
    )

### 1. Get train and test labels

In [2]:
# All labeled fids (columns used below: "fid" and the boolean "bot").
labels = pl.read_parquet(f"{DATA_PATH}/interim/labels.parquet")

# SybilSCAR seed set (the previously defined 600 labeled samples).
train_sybilscar = pl.read_parquet(f"{DATA_PATH}/interim/train_labels.parquet")

# SybilSCAR is evaluated on every labeled fid that is not one of its seeds.
in_sybilscar_train = pl.col("fid").is_in(train_sybilscar["fid"])
test_sybilscar = labels.filter(~in_sybilscar_train)

# The ML model trains on the SybilSCAR seeds plus 3000 extra bots sampled
# (fixed seed) from the SybilSCAR test pool.
extra_ml_bots = test_sybilscar.filter(pl.col("bot")).sample(3000, seed=42)
train_ml = pl.concat([train_sybilscar, extra_ml_bots])

# The ML model is evaluated on whatever labeled fids remain.
test_ml = labels.filter(~pl.col("fid").is_in(train_ml["fid"]))

In [3]:
# Show the bot / non-bot balance of every split.
for split_name, split_df in (
    ("train_ml", train_ml),
    ("test_ml", test_ml),
    ("train_sybilscar", train_sybilscar),
    ("test_sybilscar", test_sybilscar),
):
    print(f"{split_name}\n", split_df["bot"].to_pandas().value_counts(), "\n")

train_ml
 bot
True     3300
False     300
Name: count, dtype: int64 

test_ml
 bot
True     1332
False     116
Name: count, dtype: int64 

train_sybilscar
 bot
False    300
True     300
Name: count, dtype: int64 

test_sybilscar
 bot
True     4332
False     116
Name: count, dtype: int64 



### 2. Train the ML model

In [13]:
from pathlib import Path
import polars as pl
from farcaster_sybil_detection.config.defaults import Config
from farcaster_sybil_detection.services.detector import DetectorService
from farcaster_sybil_detection.features.registry import FeatureRegistry
from farcaster_sybil_detection.features.extractors.content_engagement_extractor import (
    ContentEngagementExtractor,
)
from farcaster_sybil_detection.features.extractors.network_analysis_extractor import (
    NetworkAnalysisExtractor,
)
from farcaster_sybil_detection.features.extractors.temporal_behavior_extractor import (
    TemporalBehaviorExtractor,
)
from farcaster_sybil_detection.features.extractors.user_identity_extractor import (
    UserIdentityExtractor,
)

# Polars runtime/display settings used for the rest of the notebook.
pl.Config.set_streaming_chunk_size(1_000_000)
pl.Config.set_fmt_str_lengths(50)

# Locations of raw data, feature checkpoints and persisted models.
config = Config(
    data_path=Path(f"{DATA_PATH}/raw"),
    checkpoint_dir=Path(f"{DATA_PATH}/checkpoints"),
    model_dir=Path(f"{DATA_PATH}/models"),
)

# Register in any order - manager will figure out correct build order.
# The content_engagement and reputation_meta extractors are deliberately
# left unregistered for this run.
registry = FeatureRegistry()
for extractor_name, extractor_cls in (
    ("user_identity", UserIdentityExtractor),
    ("network_analysis", NetworkAnalysisExtractor),
    ("temporal_behavior", TemporalBehaviorExtractor),
):
    registry.register(extractor_name, extractor_cls)

detector = DetectorService(config, registry)

# Fit on the ML training split and report the metrics returned by the trainer.
metrics = detector.trainer.train(train_ml)
print("Training Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.3f}")

  from .autonotebook import tqdm as notebook_tqdm
2024-12-19 09:10:31,967 - DetectorService - DEBUG - Loading existing model from checkpoint.
DEBUG:DetectorService:Loading existing model from checkpoint.
2024-12-19 09:10:32,645 - Trainer - DEBUG - Building feature matrix...
DEBUG:Trainer:Building feature matrix...
2024-12-19 09:10:32,857 - Trainer - DEBUG - Preparing features for 3600 labeled fids
DEBUG:Trainer:Preparing features for 3600 labeled fids
2024-12-19 09:10:32,858 - FeatureManager - DEBUG - Starting feature matrix build - Memory usage: 2829.34 MB
DEBUG:FeatureManager:Starting feature matrix build - Memory usage: 2829.34 MB
2024-12-19 09:10:32,862 - FeatureManager - DEBUG - Base FIDs: 3600
DEBUG:FeatureManager:Base FIDs: 3600
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
  f"Feature matrix schema: {feature_matrix.schema} ({len(feature_matrix.columns)} columns)"
2024-12-19 09:10:32,864 - FeatureManager - DEBUG - Feature matrix sche

[LightGBM] [Info] Number of positive: 2640, number of negative: 240
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8042
[LightGBM] [Info] Number of data points in the train set: 2880, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.916667 -> initscore=2.397895
[LightGBM] [Info] Start training from score 2.397895
[LightGBM] [Info] Number of positive: 2112, number of negative: 192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7973
[LightGBM] [Info] Number of data points in the train set: 2304, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.916667 -> initscore=2.397895
[LightGBM] [Info] Start training from score 2.397895
[LightGBM] [Info] Number

2024-12-19 09:14:19,206 - BaseModel - INFO - lgbm best score: 0.9990
INFO:BaseModel:lgbm best score: 0.9990


[LightGBM] [Info] Number of positive: 2112, number of negative: 192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7973
[LightGBM] [Info] Number of data points in the train set: 2304, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.916667 -> initscore=2.397895
[LightGBM] [Info] Start training from score 2.397895
[LightGBM] [Info] Number of positive: 2112, number of negative: 192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7982
[LightGBM] [Info] Number of data points in the train set: 2304, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.916667 -> initscore=2.397895
[LightGBM] [I

2024-12-19 09:14:35,616 - Trainer - DEBUG - 
Evaluation metrics:
DEBUG:Trainer:
Evaluation metrics:
2024-12-19 09:14:35,617 - Trainer - DEBUG - roc_auc: 0.968
DEBUG:Trainer:roc_auc: 0.968
2024-12-19 09:14:35,617 - Trainer - DEBUG - precision: 0.982
DEBUG:Trainer:precision: 0.982
2024-12-19 09:14:35,617 - Trainer - DEBUG - recall: 0.995
DEBUG:Trainer:recall: 0.995
2024-12-19 09:14:35,617 - Trainer - DEBUG - f1: 0.989
DEBUG:Trainer:f1: 0.989
2024-12-19 09:14:35,618 - Trainer - DEBUG - mcc: 0.857
DEBUG:Trainer:mcc: 0.857
2024-12-19 09:14:35,618 - Trainer - DEBUG - kappa: 0.854
DEBUG:Trainer:kappa: 0.854
2024-12-19 09:14:35,618 - Trainer - DEBUG - tn: 48.000
DEBUG:Trainer:tn: 48.000
2024-12-19 09:14:35,619 - Trainer - DEBUG - fp: 12.000
DEBUG:Trainer:fp: 12.000
2024-12-19 09:14:35,619 - Trainer - DEBUG - fn: 3.000
DEBUG:Trainer:fn: 3.000
2024-12-19 09:14:35,619 - Trainer - DEBUG - tp: 657.000
DEBUG:Trainer:tp: 657.000


Training Metrics:
roc_auc: 0.968
precision: 0.982
recall: 0.995
f1: 0.989
mcc: 0.857
kappa: 0.854
tn: 48.000
fp: 12.000
fn: 3.000
tp: 657.000


In [None]:
# Score every fid of the ML test split. A failed prediction is recorded as None
# so that ml_output stays aligned row-for-row with test_ml.
ml_output = []
ml_failed_fids = []

for fid in test_ml["fid"].to_list():
    try:
        ml_output.append(detector.predict(fid)["probability"])
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and SystemExit still stop the loop, and keep track of what failed
        # instead of silently hiding it.
        ml_failed_fids.append((fid, repr(exc)))
        ml_output.append(None)

if ml_failed_fids:
    print(
        f"{len(ml_failed_fids)} of {len(ml_output)} predictions failed; "
        f"first failure: {ml_failed_fids[0]}"
    )

In [16]:
# Attach the model probabilities to the test labels and keep only the fids
# for which a prediction is available.
ml_predictions = (
    test_ml
    .with_columns(pl.Series("ml_proba", ml_output))
    .filter(pl.col("ml_proba").is_not_null())
)
display(ml_predictions)

# ROC AUC of the ML model alone (bot == True is the positive class).
ml_true = ml_predictions["bot"].cast(pl.Int8).to_numpy()
ml_auc = roc_auc_score(ml_true, ml_predictions["ml_proba"])

print("ml auc:", ml_auc)

fid,bot,ml_proba
i64,bool,f64
11,false,0.05556
52,false,0.067287
55,false,0.057609
63,false,0.091361
64,false,0.053966
…,…,…
351522,true,0.985655
421493,true,0.949381
287794,true,0.985682
423036,true,0.98561


ml auc: 0.9899793732117905


### 3. Run SybilSCAR

In [4]:
import os
import glob
import asyncio
import aiofiles
import polars as pl
import time
import logging

class FarcasterBaseProcessor:
    """Shared file helpers for the link-processing steps.

    Inputs are read from, and outputs written to, ``{DATA_PATH}/raw``;
    ``persisted_data_path`` points at ``{DATA_PATH}/interim``.
    """

    def __init__(self):
        self.data_path = f"{DATA_PATH}/raw"
        self.persisted_data_path = f"{DATA_PATH}/interim"

    async def get_latest_parquet_file(self, file_pattern):
        """Return the lexicographically last file in ``data_path`` matching the pattern.

        The glob runs in a worker thread. Raises FileNotFoundError when
        nothing matches.
        """
        pattern = os.path.join(self.data_path, file_pattern)
        matches = await asyncio.to_thread(glob.glob, pattern)
        if len(matches) == 0:
            raise FileNotFoundError(f"No files found matching pattern: {file_pattern}")
        return sorted(matches)[-1]

    def get_links_lazy_df(self, file_path):
        """Open the given parquet file as a lazy frame (nothing is read yet)."""
        lazy_frame = pl.scan_parquet(file_path)
        return lazy_frame

    def write_links_to_parquet(self, df, filename_suffix):
        """Sink ``df`` to ``<data_path>/<filename_suffix>-<unix seconds>.parquet``."""
        stamp = int(time.time())
        df.sink_parquet(f"{self.data_path}/{filename_suffix}-{stamp}.parquet")

    def execute(self):
        """Hook for subclasses; the base implementation always raises."""
        raise NotImplementedError("Subclasses should implement the `execute` method.")


class FarcasterLinksAggregator(FarcasterBaseProcessor):
    """Reduce the raw follow links to the set of mutual (two-way) follows."""

    async def execute(self):
        """Read the latest raw links export, keep mutual follows and persist them.

        Writes ``processed-farcaster-mutual-links-<timestamp>.parquet`` via
        ``write_links_to_parquet`` and returns the lazy frame of mutual links.
        """
        logging.info("Aggregating links...")
        start = time.time()
        latest_file = await self.get_latest_parquet_file("farcaster-links-0-*.parquet")
        links_lazy_df = self.get_links_lazy_df(latest_file)
        mutual_links = self.get_mutual_links(links_lazy_df)
        self.write_links_to_parquet(mutual_links, "processed-farcaster-mutual-links")
        logging.info(f"Execution time: {time.time() - start} seconds")
        return mutual_links

    def get_mutual_links(self, links_df):
        """Return one row per unordered pair of fids that follow each other.

        Only live (``deleted_at`` is null) "follow" links between two
        different fids are considered. In the result ``fid`` is the smaller
        and ``target_fid`` the larger id of each pair.
        """
        live_follows = links_df.filter(
            (pl.col("deleted_at").is_null())
            & (pl.col("fid") != pl.col("target_fid"))
            & (pl.col("type") == "follow")
        ).select(["fid", "target_fid"])

        # Normalise each directed link to an unordered pair so that A->B and
        # B->A fall into the same group.
        unordered_pairs = live_follows.with_columns(
            [
                pl.min_horizontal(["fid", "target_fid"]).alias("sorted_fid"),
                pl.max_horizontal(["fid", "target_fid"]).alias("sorted_target_fid"),
            ]
        )

        # pl.len() replaces pl.count(), which is deprecated for counting the
        # rows of a group (it triggered the warning shown when this cell ran).
        pair_counts = unordered_pairs.group_by(["sorted_fid", "sorted_target_fid"]).agg(
            pl.len().alias("count")
        )

        # A count of exactly 2 is read as "both directions exist".
        # NOTE(review): this assumes at most one live follow row per directed
        # pair - confirm against the raw export.
        return pair_counts.filter(pl.col("count") == 2).select(
            [
                pl.col("sorted_fid").alias("fid"),
                pl.col("sorted_target_fid").alias("target_fid"),
            ]
        )


class FarcasterUndirectedLinksBuilder(FarcasterBaseProcessor):
    """Turn mutual links into an indexed, symmetric edge list joined with labels."""

    async def execute(self):
        """Read the latest mutual-links file, build the undirected edges and persist them.

        Writes ``processed-farcaster-undirected-connections-<timestamp>.parquet``
        via ``write_links_to_parquet`` and returns the lazy frame.
        """
        logging.info("Building undirected links...")
        start = time.time()
        latest_file = await self.get_latest_parquet_file(
            "processed-farcaster-mutual-links-*.parquet"
        )
        links_lazy_df = self.get_links_lazy_df(latest_file)
        undirected_links = self.get_undirected_links(links_lazy_df)
        self.write_links_to_parquet(
            undirected_links, "processed-farcaster-undirected-connections"
        )
        logging.info(f"Execution time: {time.time() - start} seconds")
        return undirected_links

    def get_undirected_links(self, links_df):
        """Return both directions of every link with dense integer node indexes.

        Every fid occurring as ``fid`` or ``target_fid`` gets an index in
        ``0 .. n-1``. The result has the columns ``fid``, ``fid_index``,
        ``target_fid_index`` and ``bot`` (left-joined from the labels file,
        so it is null for unlabeled fids).
        """
        fids = links_df.select("fid").unique()
        target_fids = (
            links_df.select("target_fid").unique().rename({"target_fid": "fid"})
        )
        all_fids = (
            pl.concat([fids, target_fids]).unique().collect()
        )  # test streaming mode

        # The frame is collected, so its height is known and can be used to
        # number the fids.
        # NOTE(review): the index of a fid follows the row order returned by
        # unique(), so it presumably differs between runs - confirm.
        mutual_reindex = all_fids.with_columns(
            pl.arange(0, all_fids.shape[0]).alias("index")
        )

        # Attach the index of both endpoints of every link.
        mutual_links_with_index = links_df.join(
            mutual_reindex.select(
                [pl.col("fid"), pl.col("index").alias("fid_index")]
            ).lazy(),
            on="fid",
            how="left",
        ).join(
            mutual_reindex.select(
                [pl.col("fid"), pl.col("index").alias("target_fid_index")]
            ).lazy(),
            left_on="target_fid",
            right_on="fid",
            how="left",
        )

        # Mirror every link so that each edge is present in both directions.
        df_reversed = mutual_links_with_index.select(
            [
                pl.col("target_fid").alias("fid"),
                pl.col("fid").alias("target_fid"),
                pl.col("target_fid_index").alias("fid_index"),
                pl.col("fid_index").alias("target_fid_index"),
            ]
        )

        order = ["fid", "target_fid", "fid_index", "target_fid_index"]
        mutual_links_with_index_concatenated = pl.concat(
            [mutual_links_with_index.select(order), df_reversed.select(order)]
        )

        # No leading "/" here: persisted_data_path already carries DATA_PATH.
        # The extra slash turned a relative DATA_PATH into a wrong absolute
        # path and disagreed with how labels.parquet is located elsewhere in
        # the notebook.
        labels_df = pl.scan_parquet(f"{self.persisted_data_path}/labels.parquet")

        return mutual_links_with_index_concatenated.join(
            labels_df, how="left", on="fid"
        ).select("fid", "fid_index", "target_fid_index", "bot")

In [None]:
# Step 1 of the graph build: read the latest farcaster-links-0-*.parquet export
# under {DATA_PATH}/raw, keep the mutual follows and write them to
# processed-farcaster-mutual-links-<timestamp>.parquet in the same directory.
links_aggregator = FarcasterLinksAggregator()
await links_aggregator.execute()

  pl.count().alias("count")


In [None]:
# Step 2 of the graph build: read the latest processed-farcaster-mutual-links-*.parquet
# file, index the fids, mirror every link and write
# processed-farcaster-undirected-connections-<timestamp>.parquet to {DATA_PATH}/raw.
undirected_links_builder = FarcasterUndirectedLinksBuilder()
await undirected_links_builder.execute()

In [4]:
# Load the newest undirected-connections export instead of a hardcoded
# timestamped file name: the builder stamps its output with the current time,
# so a pinned name breaks (or silently reads stale data) once the pipeline is
# re-run. Lexicographic order is used to pick the newest file, the same
# convention as FarcasterBaseProcessor.get_latest_parquet_file.
undirected_connection_files = sorted(
    glob.glob(f"{DATA_PATH}/raw/processed-farcaster-undirected-connections-*.parquet")
)
if not undirected_connection_files:
    raise FileNotFoundError(
        "No processed-farcaster-undirected-connections-*.parquet file found in "
        f"{DATA_PATH}/raw; run FarcasterUndirectedLinksBuilder first."
    )
processed_farcaster_undirected_connections = pl.read_parquet(
    undirected_connection_files[-1]
)
processed_farcaster_undirected_connections

fid,fid_index,target_fid_index,bot
i64,i64,i64,bool
305835,250996,295895,
19339,193983,165313,
409969,343448,133977,
343904,66298,15087,
303580,111181,342016,
…,…,…,…
414665,58445,190169,
500400,206948,10571,
412579,187304,274188,
509675,282444,6027,


In [5]:
import random
import math
import os
import glob
from collections import defaultdict
from typing import Set, Dict, Tuple, List
import polars as pl
import time
import asyncio
import numpy as np
import logging


class SybilScar:
    def __init__(self):
        self.network_map = defaultdict(list)
        self.weighted_graph = 0
        self.prior = None
        self.post = None
        self.post_pre = None
        self.theta_pos = 0.6
        self.theta_neg = 0.4
        self.theta_unl = 0.5
        self.weight = 0.6
        self.max_iter = 10
        self.N = 0
        self.ordering_array = []
        self.semaphore = asyncio.Semaphore(4)

    def add_edge(self, node1, node2, w):
        if node1 != node2:  # Avoid self-loops
            self.network_map[node1].append((node2, w))

    # Refactored to read from in-memory connections data
    def read_network(self, connections):
        for node1, node2 in connections:
            self.add_edge(node1, node2, self.weight - 0.5)

        self.N = len(self.network_map)
        self.post = np.zeros(self.N)
        self.post_pre = np.zeros(self.N)
        self.prior = np.zeros(self.N)

    # Refactored to read from in-memory sybil and benigns sets
    def read_prior(self, train_sybils, train_benigns):
        self.prior.fill(self.theta_unl - 0.5)

        for benign in train_benigns:
            self.prior[benign] = self.theta_pos - 0.5

        for sybil in train_sybils:
            self.prior[sybil] = self.theta_neg - 0.5

    ## Write final posterior probabilities of nodes to the output file
    ## The final posterior probability is changed from p (in the residual form) to p + 0.5.
    def get_posterior(self, post_file):
        # with open(post_file, 'w') as f:
        #     for i in range(self.N):
        #         f.write(f"{i} {self.post[i] + 0.5:.10f}\n")

        data = [
            {"fid_index": i, "posterior": self.post[i] + 0.5} for i in range(self.N)
        ]
        df_lazy = pl.LazyFrame(data)
        return df_lazy

    async def lbp_thread(self, start, end):
        async with self.semaphore:
            for index in range(start, end):
                node = self.ordering_array[index]
                # update the the post for node
                for neighbor, weight in self.network_map[node]:
                    self.post[node] += 2 * self.post_pre[neighbor] * weight
                self.post[node] += self.prior[node]
                self.post[node] = max(min(self.post[node], 0.5), -0.5)

    # Async version of the LBP algorithm
    async def lbp_async(self):
        self.ordering_array = list(range(self.N))

        # initialize posts
        np.copyto(self.post, self.prior)
        iter_count = 1

        while iter_count <= self.max_iter:
            random.shuffle(self.ordering_array)
            np.copyto(self.post_pre, self.post)

            tasks = []
            num_nodes = int(
                np.ceil(self.N / self.semaphore._value)
            )  # Divide tasks by semaphore limit
            for current_thread in range(self.semaphore._value):
                start = current_thread * num_nodes
                end = min(start + num_nodes, self.N)
                task = asyncio.create_task(self.lbp_thread(start, end))
                tasks.append(task)

            await asyncio.gather(*tasks)
            iter_count += 1


class SybilScarExecutor:
    """Drives a SybilScar run: load the graph and seeds, propagate, persist.

    The graph and the seed labels are taken from the notebook globals
    ``processed_farcaster_undirected_connections`` and ``train_sybilscar``.
    """

    def __init__(self):
        self.data_path = DATA_PATH
        self.sybil_scar = SybilScar()

    def _seed_indexes(self, connections_df, label_filter):
        """Unique ``fid_index`` values of the seed fids selected by ``label_filter``."""
        seed_fids = train_sybilscar.filter(label_filter)["fid"]
        return connections_df.filter(pl.col("fid").is_in(seed_fids))["fid_index"].unique()

    def load_data(self):
        """Prepare the edge, sybil and benign iterators consumed by the algorithm."""
        connections_df = processed_farcaster_undirected_connections
        edge_rows = connections_df[["fid_index", "target_fid_index"]].iter_rows()
        self.connections = ((edge[0], edge[1]) for edge in edge_rows)

        is_bot = pl.col("bot")
        sybil_indexes = self._seed_indexes(connections_df, is_bot).to_list()
        self.sybils = (index for index in sybil_indexes)

        benign_indexes = self._seed_indexes(connections_df, ~is_bot).to_list()
        self.benigns = (index for index in benign_indexes)

    async def arun_sybil_scar(self):
        """Feed the loaded data to SybilScar and run the propagation."""
        scar = self.sybil_scar
        scar.read_network(self.connections)
        scar.read_prior(self.sybils, self.benigns)
        await scar.lbp_async()

    def save_results(self, output_file: str):
        """Sink the posterior frame to ``output_file`` as parquet."""
        self.sybil_scar.get_posterior(output_file).sink_parquet(output_file)

    async def execute(self):
        """Run load, propagate and save in sequence, logging the elapsed time."""
        logging.info("Running SybilScar...")
        started_at = time.time()

        self.load_data()

        logging.info("Data loaded. Running SybilScar algorithm...")
        await self.arun_sybil_scar()

        logging.info("SybilScar algorithm executed. Saving results...")
        results_path = self.data_path + "/sybil_scar_results.parquet"
        self.save_results(results_path)
        finished_at = time.time()

        logging.info(f"SybilScar execution time: {finished_at - started_at:.2f} seconds")

In [6]:
# Run SybilSCAR end to end: load the graph and seed labels, propagate beliefs
# and write the posteriors to {DATA_PATH}/sybil_scar_results.parquet.
sybilscar_executor = SybilScarExecutor()
await sybilscar_executor.execute()

In [7]:
# Posterior per graph node, as written by the SybilSCAR run.
sybilscar_results = pl.read_parquet(f"{DATA_PATH}/sybil_scar_results.parquet")

# Replace the "bot" column of the connections with the one from the full labels.
processed_farcaster_undirected_connections = (
    processed_farcaster_undirected_connections
    .drop("bot")
    .join(labels, on="fid", how="left", coalesce=True)
    .select(["fid", "fid_index", "target_fid_index", "bot"])
)

# Translate node indexes back to fids (one fid per fid_index).
index_to_fid = (
    processed_farcaster_undirected_connections
    .group_by("fid_index")
    .agg(pl.col("fid").last())
)

sybilscar_results = sybilscar_results.join(index_to_fid, on="fid_index", coalesce=True)
sybilscar_results

fid_index,posterior,fid
i64,f64,i64
288133,0.0,828590
277001,0.0,689601
315676,0.0,277661
105801,0.5,784876
277135,0.0,374229
…,…,…
291872,0.0,813266
144154,0.0,716396
321231,0.0,861510
288740,0.0,373640


In [9]:
# Attach the SybilSCAR posterior to the test labels. The posterior is higher
# for benign nodes, so the bot probability is its complement. Fids without a
# posterior (no match in the results) are dropped.
sybilscar_predictions = (
    test_sybilscar
    .join(sybilscar_results[["fid", "posterior"]], on="fid", how="left", coalesce=True)
    .with_columns((1 - pl.col("posterior")).alias("sybilscar_proba"))
    .drop("posterior")
    .filter(pl.col("sybilscar_proba").is_not_null())
)
display(sybilscar_predictions)

# ROC AUC of SybilSCAR alone (bot == True is the positive class).
sybilscar_true = sybilscar_predictions["bot"].cast(pl.Int8).to_numpy()
sybilscar_auc = roc_auc_score(sybilscar_true, sybilscar_predictions["sybilscar_proba"])

print("sybilscar auc:", sybilscar_auc)

fid,bot,sybilscar_proba
i64,bool,f64
11,false,0.0
52,false,0.0
63,false,0.0
64,false,0.0
81,false,0.0
…,…,…
280179,true,1.0
423036,true,1.0
327500,true,1.0
428200,true,1.0


sybilscar auc: 0.9545484093947662


In [12]:
# Average posterior over all graph nodes.
sybilscar_results["posterior"].sum() / sybilscar_results.height

0.1432148184970624

### 4. Ensemble

In [18]:
# Ensemble of the SybilSCAR and ML predictions: the average of both scores
# when both exist, otherwise whichever score is available (null if neither).
ensemble_predictions = (
    ml_predictions
    .join(sybilscar_predictions[["fid", "sybilscar_proba"]], on="fid", how="left", coalesce=True)
    .with_columns(
        pl.when(pl.col("sybilscar_proba").is_not_null() & pl.col("ml_proba").is_not_null())
        .then((pl.col("sybilscar_proba") + pl.col("ml_proba")) / 2)
        .otherwise(pl.coalesce(pl.col("ml_proba"), pl.col("sybilscar_proba")))
        .alias("ensemble_proba")
    )
)
ensemble_predictions

fid,bot,ml_proba,sybilscar_proba,ensemble_proba
i64,bool,f64,f64,f64
11,false,0.05556,0.0,0.02778
52,false,0.067287,0.0,0.033643
55,false,0.057609,,0.057609
63,false,0.091361,0.0,0.04568
64,false,0.053966,0.0,0.026983
…,…,…,…,…
351522,true,0.985655,1.0,0.992827
421493,true,0.949381,0.0,0.474691
287794,true,0.985682,1.0,0.992841
423036,true,0.98561,1.0,0.992805


In [20]:
# ROC AUC of the ensemble score (bot == True is the positive class).
ensemble_true = ensemble_predictions["bot"].cast(pl.Int8).to_numpy()
ensemble_auc = roc_auc_score(ensemble_true, ensemble_predictions["ensemble_proba"])

print("ensemble auc:", ensemble_auc)

ensemble auc: 0.9922682813227759
