In [2]:
import polars as pl
from hashlib import sha256
import os
pl.Config.set_fmt_str_lengths(400)
DATA_PATH = os.getenv("DATA_PATH", "")

In [3]:
processed_farcaster_mutual_links = pl.read_parquet(f"{DATA_PATH}/../farcaster-social-graph-api/farcaster_social_graph_api/data/processed-farcaster-mutual-links-1730375194.parquet")
processed_farcaster_mutual_links

fid,target_fid
i64,i64
577003,592136
500766,501667
468460,632061
330634,565884
839830,843804
…,…
513555,843703
337656,375068
711130,752043
604,20329


In [4]:
processed_farcaster_undirected_connections = pl.read_parquet(f"{DATA_PATH}/../farcaster-social-graph-api/farcaster_social_graph_api/data/processed-farcaster-undirected-connections-1730375251.parquet")
processed_farcaster_undirected_connections

fid,fid_index,target_fid_index,bot
i64,i64,i64,bool
577003,253932,178703,
500766,155835,12025,false
468460,192681,101650,true
330634,71182,19986,
839830,30016,319360,
…,…,…,…
843703,295562,353460,
375068,149909,271760,
752043,169032,265647,
20329,36392,315081,


In [8]:
labels = pl.read_parquet(f"{DATA_PATH}/interim/labels.parquet")

processed_farcaster_undirected_connections = processed_farcaster_undirected_connections.drop("bot").join(
    labels, how="left", on="fid", coalesce=True
)[["fid", "fid_index", "target_fid_index", "bot"]]
processed_farcaster_undirected_connections

fid,fid_index,target_fid_index,bot
i64,i64,i64,bool
577003,253932,178703,
500766,155835,12025,
468460,192681,101650,true
330634,71182,19986,
839830,30016,319360,
…,…,…,…
843703,295562,353460,
375068,149909,271760,
752043,169032,265647,
20329,36392,315081,


In [27]:
import random
import math
import os
import glob
from collections import defaultdict
from typing import Set, Dict, Tuple, List
import polars as pl
import time
import asyncio
import numpy as np
import logging


class SybilScar:
    def __init__(self):
        self.network_map = defaultdict(list)
        self.weighted_graph = 0
        self.prior = None
        self.post = None
        self.post_pre = None
        self.theta_pos = 0.6
        self.theta_neg = 0.4
        self.theta_unl = 0.5
        self.weight = 0.6
        self.max_iter = 10
        self.N = 0
        self.ordering_array = []
        self.semaphore = asyncio.Semaphore(4)

    def add_edge(self, node1, node2, w):
        if node1 != node2:  # Avoid self-loops
            self.network_map[node1].append((node2, w))

    # Refactored to read from in-memory connections data
    def read_network(self, connections):
        for node1, node2 in connections:
            self.add_edge(node1, node2, self.weight - 0.5)

        self.N = len(self.network_map)
        self.post = np.zeros(self.N)
        self.post_pre = np.zeros(self.N)
        self.prior = np.zeros(self.N)

    # Refactored to read from in-memory sybil and benigns sets
    def read_prior(self, train_sybils, train_benigns):
        self.prior.fill(self.theta_unl - 0.5)

        for benign in train_benigns:
            self.prior[benign] = self.theta_pos - 0.5

        for sybil in train_sybils:
            self.prior[sybil] = self.theta_neg - 0.5

    ## Write final posterior probabilities of nodes to the output file
    ## The final posterior probability is changed from p (in the residual form) to p + 0.5.
    def get_posterior(self, post_file):
        # with open(post_file, 'w') as f:
        #     for i in range(self.N):
        #         f.write(f"{i} {self.post[i] + 0.5:.10f}\n")

        data = [
            {"fid_index": i, "posterior": self.post[i] + 0.5} for i in range(self.N)
        ]
        df_lazy = pl.LazyFrame(data)
        return df_lazy

    async def lbp_thread(self, start, end):
        async with self.semaphore:
            for index in range(start, end):
                node = self.ordering_array[index]
                # update the the post for node
                for neighbor, weight in self.network_map[node]:
                    self.post[node] += 2 * self.post_pre[neighbor] * weight
                self.post[node] += self.prior[node]
                self.post[node] = max(min(self.post[node], 0.5), -0.5)

    # Async version of the LBP algorithm
    async def lbp_async(self):
        self.ordering_array = list(range(self.N))

        # initialize posts
        np.copyto(self.post, self.prior)
        iter_count = 1

        while iter_count <= self.max_iter:
            random.shuffle(self.ordering_array)
            np.copyto(self.post_pre, self.post)

            tasks = []
            num_nodes = int(
                np.ceil(self.N / self.semaphore._value)
            )  # Divide tasks by semaphore limit
            for current_thread in range(self.semaphore._value):
                start = current_thread * num_nodes
                end = min(start + num_nodes, self.N)
                task = asyncio.create_task(self.lbp_thread(start, end))
                tasks.append(task)

            await asyncio.gather(*tasks)
            iter_count += 1


class SybilScarExecutor:
    def __init__(self):
        self.data_path = DATA_PATH
        self.sybil_scar = SybilScar()
        

    def load_data(self):
        """Load data from the Parquet file and process connections, sybils, and benigns."""

        connections_df = processed_farcaster_undirected_connections
        self.connections = ((row[0], row[1]) for row in connections_df[["fid_index", "target_fid_index"]].iter_rows())

        sybils_df = (
            connections_df.filter(pl.col("bot") == True)["fid_index"]
            .unique()
            .sample(300, seed=42)
        )
        self.sybils = (s for s in sybils_df.to_list())

        benigns_df = (
            connections_df.filter(pl.col("bot") == False)["fid_index"]
            .unique()
            .sample(300, seed=42)
        )
        self.benigns = (b for b in benigns_df.to_list())

    async def arun_sybil_scar(self):
        """Execute the SybilScar algorithm asynchronously on the loaded data."""
        self.sybil_scar.read_network(self.connections)
        self.sybil_scar.read_prior(self.sybils, self.benigns)
        await self.sybil_scar.lbp_async()

    def save_results(self, output_file: str):
        """Write the SybilScar post results to a file."""
        posterior_df = self.sybil_scar.get_posterior(output_file)
        posterior_df.sink_parquet(output_file)

    async def execute(self):
        """Load data, run the algorithm, and save the results."""
        print("here")
        logging.info("Running SybilScar...")
        start = time.time()

        self.load_data()

        logging.info("Data loaded. Running SybilScar algorithm...")
        await self.arun_sybil_scar()

        logging.info("SybilScar algorithm executed. Saving results...")
        self.save_results(
            self.data_path + "/sybil_scar_results.parquet"
        )
        end = time.time()

        logging.info(f"SybilScar execution time: {end - start:.2f} seconds")
    


In [None]:
executor = SybilScarExecutor()
await executor.execute()

here


In [36]:
list(executor.benigns)

[]

In [120]:
connections_df = processed_farcaster_undirected_connections
sconnections = ((row[0], row[1]) for row in connections_df[["fid_index", "target_fid_index"]].iter_rows())

sybils_df = (
    connections_df.filter(pl.col("bot") == True)["fid_index"]
    .unique()
    .sample(300, seed=42)
)
train_sybils = (s for s in sybils_df.to_list())

benigns_df = (
    connections_df.filter(pl.col("bot") == False)["fid_index"]
    .unique()
    .sample(300, seed=42)
)
train_humans = (b for b in benigns_df.to_list())

In [121]:
train_sybils = list(train_sybils)
print(train_sybils)

[58095, 114833, 271889, 202145, 55784, 118947, 87102, 90710, 327191, 288061, 295896, 62093, 339528, 221160, 110560, 134327, 128514, 228725, 353650, 350399, 13790, 176865, 277330, 335023, 350240, 202127, 254167, 84673, 216450, 290430, 268926, 231988, 48342, 141656, 139440, 161167, 311075, 199395, 33300, 76963, 358815, 240233, 302495, 149417, 67087, 233431, 350492, 252329, 254299, 159912, 362358, 315148, 106861, 224162, 108472, 256679, 23032, 270882, 371983, 74630, 301861, 105502, 361642, 146527, 220218, 154088, 232365, 67129, 62178, 265702, 227770, 132727, 149916, 275893, 33289, 294386, 120031, 363487, 367128, 62266, 231024, 46460, 133003, 84715, 307549, 139617, 366352, 56353, 268161, 156328, 230746, 188497, 364569, 340034, 9888, 5841, 181114, 213322, 111157, 254706, 112914, 169453, 288430, 104530, 43265, 32330, 230875, 272230, 145704, 205957, 339048, 178470, 277309, 175551, 226180, 337295, 192108, 336676, 144871, 14442, 3258, 26753, 307134, 218335, 106042, 294080, 5768, 24042, 285706, 

In [122]:
train_humans = list(train_humans)
print(train_humans)

[98567, 59743, 157151, 80293, 84862, 48961, 136465, 310401, 185117, 266615, 142594, 68185, 93144, 212218, 329261, 148015, 323671, 196455, 351948, 111199, 132649, 126612, 205798, 170909, 183605, 349168, 344301, 53845, 41864, 169270, 252107, 24345, 186545, 100232, 229320, 95935, 161310, 199793, 216274, 71829, 191509, 150110, 165941, 165524, 45736, 154755, 161986, 230832, 274565, 232771, 155571, 70159, 100173, 220517, 185659, 164679, 256243, 326610, 290091, 220198, 1016, 221086, 355541, 348879, 237184, 360063, 161909, 309536, 217694, 242251, 56782, 113159, 104977, 140588, 55476, 78330, 20697, 225821, 110976, 29293, 105521, 316970, 244622, 259040, 171083, 6945, 161860, 265032, 286175, 4845, 66052, 361521, 246277, 179907, 228680, 165044, 264514, 307589, 170305, 180622, 363758, 71489, 197665, 339265, 86651, 360324, 194301, 59108, 159488, 118645, 188440, 155666, 325795, 116036, 66070, 160787, 78363, 51875, 196276, 85909, 11430, 212136, 6932, 83232, 311367, 113792, 50590, 289565, 157779, 12600

In [None]:
results = pl.read_parquet(f"{DATA_PATH}/sybil_scar_results.parquet")
results.join()

fid_index,posterior
i64,f64
0,0.0
1,0.0
2,0.0
3,0.5
4,0.0
…,…
375361,0.0
375362,0.0
375363,0.0
375364,0.0


In [50]:
index_to_fid = processed_farcaster_undirected_connections.group_by("fid_index").agg(pl.col("fid").last())
index_to_fid

fid_index,fid
i64,i64
13642,561442
268730,825401
241092,515305
272940,330999
273970,459430
…,…
160937,610330
67462,249128
125004,452718
207320,514295


In [51]:
results = results.join(index_to_fid,on="fid_index",coalesce=True)
results

fid_index,posterior,fid
i64,f64,i64
13642,0.0,561442
268730,0.0,825401
241092,0.0,515305
272940,1.0,330999
273970,0.0,459430
…,…,…
160937,0.0,610330
67462,0.0,249128
125004,0.0,452718
207320,0.0,514295


In [124]:
labels = pl.read_parquet(f"{DATA_PATH}/interim/labels.parquet")
train_labels = index_to_fid.filter(pl.col("fid_index").is_in(train_sybils + train_humans))
train_labels
test_labels = labels.filter(~pl.col("fid").is_in(train_labels["fid"]))
test_labels

fid,bot
i64,bool
11,false
52,false
55,false
63,false
64,false
81,false
165,false
176,false
183,false
189,false


In [125]:
train_labels.write_parquet(f"{DATA_PATH}/interim/train_labels.parquet")

In [None]:
df = test_labels.join(results,on="fid",how="left",coalesce=True)
df = df.with_columns(pl.col("bot").cast(pl.Int8).alias("bot"))
# Invert the posterior
df = df.with_columns((1-pl.col("posterior")).alias("posterior"))
df

fid,bot,fid_index,posterior
i64,i8,i64,f64
11,0,250220,0.0
52,0,152581,0.0
63,0,49744,0.0
64,0,146572,0.0
81,0,76784,0.0
165,0,186665,0.0
176,0,15493,0.0
183,0,178992,0.0
189,0,150404,0.0
190,0,327885,0.0


In [102]:
from sklearn.metrics import roc_auc_score, confusion_matrix

y_true = df['bot'].to_numpy()
y_pred = df['posterior'].to_numpy()
print("ROC:",roc_auc_score(y_true, y_pred))
print("Confusion matrix:\n",confusion_matrix(y_true,y_pred.round()))

ROC: 0.9545484093947662
Confusion matrix:
 [[ 102    5]
 [ 197 4116]]


### Inspect the results

In [104]:
# Load fnames for later manual inspection
fnames = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet")
last_fnames = fnames[["fid","updated_at"]].group_by("fid").max()
last_fnames = last_fnames.join(fnames,on=["fid","updated_at"],how="left",coalesce=True)[["fid","fname"]]
last_fnames

df = df.join(last_fnames,on="fid",how="left",coalesce=True)
df

fid,bot,fid_index,posterior,fname,fname_right
i64,i8,i64,f64,str,str
11,0,250220,0.0,"""graham""","""graham"""
52,0,152581,0.0,"""sinahab""","""sinahab"""
63,0,49744,0.0,"""lauren""","""lauren"""
64,0,146572,0.0,"""maksim""","""maksim"""
81,0,76784,0.0,"""blau.eth""","""blau.eth"""
165,0,186665,0.0,"""ludo""","""ludo"""
176,0,15493,0.0,"""ajwaxman""","""ajwaxman"""
183,0,178992,0.0,"""koloz""","""koloz"""
189,0,150404,0.0,"""abram""","""abram"""
190,0,327885,0.0,"""bohm""","""bohm"""


In [114]:
pl.Config.set_tbl_rows(20)
df.filter((pl.col("posterior")==0) & (pl.col("bot") == 1)).sample(20,seed=42)

fid,bot,fid_index,posterior,fname,fname_right
i64,i8,i64,f64,str,str
248564,1,307092,0.0,"""disqtible""","""disqtible"""
248150,1,270759,0.0,"""manueljose19""","""manueljose19"""
403248,1,177901,0.0,"""ogsalesbot""","""ogsalesbot"""
2864,1,351649,0.0,"""launch""","""launch"""
240339,1,50483,0.0,"""beeee""","""beeee"""
242025,1,140897,0.0,"""0xenigma""","""0xenigma"""
12090,1,237131,0.0,"""eyzed""","""eyzed"""
289613,1,93553,0.0,"""gratefulape""","""gratefulape"""
459385,1,226022,0.0,"""sari1996""","""sari1996"""
428441,1,69619,0.0,"""purpledrank""","""purpledrank"""


In [113]:
df.filter((pl.col("bot") == 0) & (pl.col("posterior") == 1))

fid,bot,fid_index,posterior,fname,fname_right
i64,i8,i64,f64,str,str
12912,0,296989,1.0,"""ir""","""ir"""
13505,0,5061,1.0,"""jvaleska.eth""","""jvaleska.eth"""
308771,0,292789,1.0,"""murena.eth""","""murena.eth"""
386800,0,336383,1.0,"""d3v0nt3""","""d3v0nt3"""
450155,0,142086,1.0,"""manggo""","""manggo"""


### Conclusion

SybilSCAR seems to indeed perform an AUC of 0.95

It cannot detect approximately 5% of sybils and marked approximately 5% of the humans as bots.