In [1]:
import polars as pl
from hashlib import sha256
import os
import json
from urllib.parse import urlparse, parse_qs
import networkx as nx
import matplotlib.pyplot as plt
pl.Config.set_fmt_str_lengths(400)
# !pip install pyarrow

polars.config.Config

In [2]:
DATA_PATH = os.getenv("DATA_PATH", "")

In [3]:
bot_or_not_df = pl.scan_parquet(f"{DATA_PATH}/interim/bot_or_not_without_noises.parquet")

In [4]:
links_lazy_df = pl.scan_parquet(f"{DATA_PATH}/raw/farcaster-links-0-1730134800.parquet")

In [5]:
bot_or_not_df

In [6]:
# links_lazy_df.filter(pl.col('deleted_at').is_null()).head(5).collect()

In [7]:
bot_or_not_fids = bot_or_not_df.select("fid")

In [8]:
links_filtered_by_fid = links_lazy_df.join(bot_or_not_fids, on="fid", how="inner")
links_between_bots = links_filtered_by_fid.join(bot_or_not_fids, left_on="target_fid", right_on="fid", how="inner")

In [9]:
links_between_bots_df = links_between_bots.collect()

In [10]:
links_between_bots_df

fid,target_fid,hash,timestamp,created_at,updated_at,deleted_at,type,display_timestamp,id
i64,i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64
2,23,"b""\xf6\xb3y\x03f\x0b\x0f\x15""R\xf32\xf1orQ\x16\xc1[\xcb""",2021-07-22 21:38:55,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,22
2,18,"b""\xb9\x18\xc8\xf8\xb1_\xf4SU\x9b\x0f\x18\xce\x99D\xc5T\x12\x8a\x7f""",2021-07-22 21:38:59,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,23
2,20,"b""v\xb8\xaa\x96\x1c\x94\x8d\xaf\xc0c\xbf\xb8N\x88\x82\x87\xa6Q^q""",2021-07-22 21:39:03,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,25
2,8,"b""`e\xed\xd6\xb7\x9b\xf4H\x9d\x8fU\xcb\x98\x07\xac\x15kD\x95|""",2021-07-22 21:39:04,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,26
2,12,"b""4{*K\x02l\x05\x08\xe3\x88\x84\xc5\xc2\xfb\xb6m\x9c=i\xdd""",2021-07-22 21:39:06,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,28
…,…,…,…,…,…,…,…,…,…
7960,243771,"b""d\xf5\x84\xdah\x95B0\xe4\xf2\xfdH\xa8\x88z\xd7\xb8SD\xaa""",2024-10-28 16:56:20,2024-10-28 16:56:20.685148,2024-10-28 16:56:20.685148,,"""follow""",,11884876419
231371,288578,"b""\x0d\xc8\xe4C/\x82\xe5\xd0\xc8F\xe1~+\x10\xe3\x0a\xcd\xf1^\x9f""",2024-09-18 13:31:15,2024-09-18 13:31:15.565869,2024-10-28 16:58:18.899827,2024-10-28 16:58:19,"""follow""",,11874995633
738574,349675,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673
7960,14767,"b""\xd7\xaa\xd4}\xb7\xd5s+>\xc7/\xcc\xf0\xfct\xb6\xeb\xdc\xe7\x99""",2024-10-28 16:58:57,2024-10-28 16:58:59.604411,2024-10-28 16:58:59.604411,,"""follow""",,11884876739


In [11]:
links_df = links_between_bots_df.filter(pl.col('fid') != pl.col('target_fid'))

In [12]:
# bots_fids.collect()

In [13]:
following_count_df = (
    links_df
    .filter((pl.col("type") == "follow") & (pl.col("deleted_at").is_null()))
    .group_by("fid")
    .agg(pl.count("id").alias("following_count"))
)

In [14]:
followers_count_df = (
    links_df
    .filter((pl.col("type") == "follow") & (pl.col("deleted_at").is_null()))
    .group_by("target_fid")
    .agg(pl.count("id").alias("followers_count"))
    .rename({"target_fid": "fid"})
)

In [15]:
bots_df = bot_or_not_df.collect()
bots_df = bots_df.drop(["followers_count", "following_count"])
result_df = bots_df.join(following_count_df, on="fid", how="left")
result_df = result_df.join(followers_count_df, on="fid", how="left")
result_df = result_df.fill_null(0)

  result_df = bots_df.join(following_count_df, on="fid", how="left")
  result_df = result_df.join(followers_count_df, on="fid", how="left")


In [16]:
result_df

fid,bot,following_count,followers_count
i64,bool,i64,i64
446097,false,291,212
3,false,885,7126
8,false,191,6122
12,false,339,5802
2,false,470,6128
…,…,…,…
280179,true,301,430
327500,true,202,143
428200,true,75,31
278549,true,153,146


In [17]:
# bot_or_not_df.collect().filter(pl.col('fid')==3)

In [18]:
# result_df.unique(subset=['fid'])

In [19]:
links_df #.unique(subset=['id'])

fid,target_fid,hash,timestamp,created_at,updated_at,deleted_at,type,display_timestamp,id
i64,i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64
2,23,"b""\xf6\xb3y\x03f\x0b\x0f\x15""R\xf32\xf1orQ\x16\xc1[\xcb""",2021-07-22 21:38:55,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,22
2,18,"b""\xb9\x18\xc8\xf8\xb1_\xf4SU\x9b\x0f\x18\xce\x99D\xc5T\x12\x8a\x7f""",2021-07-22 21:38:59,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,23
2,20,"b""v\xb8\xaa\x96\x1c\x94\x8d\xaf\xc0c\xbf\xb8N\x88\x82\x87\xa6Q^q""",2021-07-22 21:39:03,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,25
2,8,"b""`e\xed\xd6\xb7\x9b\xf4H\x9d\x8fU\xcb\x98\x07\xac\x15kD\x95|""",2021-07-22 21:39:04,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,26
2,12,"b""4{*K\x02l\x05\x08\xe3\x88\x84\xc5\xc2\xfb\xb6m\x9c=i\xdd""",2021-07-22 21:39:06,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,28
…,…,…,…,…,…,…,…,…,…
7960,243771,"b""d\xf5\x84\xdah\x95B0\xe4\xf2\xfdH\xa8\x88z\xd7\xb8SD\xaa""",2024-10-28 16:56:20,2024-10-28 16:56:20.685148,2024-10-28 16:56:20.685148,,"""follow""",,11884876419
231371,288578,"b""\x0d\xc8\xe4C/\x82\xe5\xd0\xc8F\xe1~+\x10\xe3\x0a\xcd\xf1^\x9f""",2024-09-18 13:31:15,2024-09-18 13:31:15.565869,2024-10-28 16:58:18.899827,2024-10-28 16:58:19,"""follow""",,11874995633
738574,349675,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673
7960,14767,"b""\xd7\xaa\xd4}\xb7\xd5s+>\xc7/\xcc\xf0\xfct\xb6\xeb\xdc\xe7\x99""",2024-10-28 16:58:57,2024-10-28 16:58:59.604411,2024-10-28 16:58:59.604411,,"""follow""",,11884876739


In [20]:
result_df

fid,bot,following_count,followers_count
i64,bool,i64,i64
446097,false,291,212
3,false,885,7126
8,false,191,6122
12,false,339,5802
2,false,470,6128
…,…,…,…
280179,true,301,430
327500,true,202,143
428200,true,75,31
278549,true,153,146


In [21]:
fnames_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet")
# 320189

In [22]:
# links_df = links_between_bots_df

# # G = nx.DiGraph()

# nodes = []
# for row in links_df.iter_rows():
#     # nodes.append([row[0], row[1]])

# # plt.figure(figsize=(10, 10))
# # pos = nx.spring_layout(G, seed=42)  # Positioning of the nodes
# # nx.draw(G, pos, with_labels=True, node_size=500, node_color="lightblue", font_size=10, font_weight="bold", edge_color="gray")

# # # Optional: Save the plot as an image
# # # plt.savefig("bot_graph_connections.png", format="PNG")

# # # Show the plot
# # plt.show()

In [23]:
# links_between_bots_df

In [24]:
# for row in links_between_bots_df.iter_rows():
#     print(row)
#     break

In [25]:
# len(nodes)

In [26]:
counts = ['fid','verifications_count', 'recasts_count', 'likes_count', 'total_casts_count','reply_casts_count', 'casts_count']

In [27]:
# result_df.filter(pl.col('bot')==True).sort('likes_count').tail().select(counts) #.describe()
result_df.filter(pl.col('bot')==True).sample(seed=42)

fid,bot,following_count,followers_count
i64,bool,i64,i64
286753,True,153,137


In [28]:
fnames_df.filter(pl.col('fid')==1)

created_at,updated_at,custody_address,expires_at,fid,deleted_at,fname
datetime[ns],datetime[ns],binary,datetime[ns],i64,datetime[ns],str
2023-09-05 23:07:28.042699,2024-06-07 16:44:02.341,"b""\x87sD'@\xc1|\x9d\x0f\x0b\x87\x02,r/\x9a\x13b\x06\xed""",,1,,"""farcaster"""
2023-09-05 23:07:28.045141,2024-06-07 16:44:02.372,"b""\x86\x92L7\xa974\xe8a\x1e\xb0\x81#\x89(\xa9\xd1\x8ac\xc0""",,1,,"""warpcast.eth"""
2023-09-05 23:07:28.048640,2024-06-07 16:44:02.386,"b""\x86\x92L7\xa974\xe8a\x1e\xb0\x81#\x89(\xa9\xd1\x8ac\xc0""",,1,,"""farcaster.eth"""


In [29]:
profile_with_addresses_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-profile_with_addresses-0-1730134800.parquet")

In [30]:
profile_with_addresses_df.filter(pl.col('fid')==1)

fname,display_name,avatar_url,bio,verified_addresses,updated_at,fid
str,str,str,str,str,datetime[ns],i64
"""farcaster""","""Farcaster""","""https://i.imgur.com/I2rEbPF.png""","""A sufficiently decentralized social network. farcaster.xyz""","""[""0x86924c37a93734e8611eb081238928a9d18a63c0""]""",2024-06-07 16:44:02.646,1


In [31]:
links_df

fid,target_fid,hash,timestamp,created_at,updated_at,deleted_at,type,display_timestamp,id
i64,i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64
2,23,"b""\xf6\xb3y\x03f\x0b\x0f\x15""R\xf32\xf1orQ\x16\xc1[\xcb""",2021-07-22 21:38:55,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,22
2,18,"b""\xb9\x18\xc8\xf8\xb1_\xf4SU\x9b\x0f\x18\xce\x99D\xc5T\x12\x8a\x7f""",2021-07-22 21:38:59,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,23
2,20,"b""v\xb8\xaa\x96\x1c\x94\x8d\xaf\xc0c\xbf\xb8N\x88\x82\x87\xa6Q^q""",2021-07-22 21:39:03,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,25
2,8,"b""`e\xed\xd6\xb7\x9b\xf4H\x9d\x8fU\xcb\x98\x07\xac\x15kD\x95|""",2021-07-22 21:39:04,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,26
2,12,"b""4{*K\x02l\x05\x08\xe3\x88\x84\xc5\xc2\xfb\xb6m\x9c=i\xdd""",2021-07-22 21:39:06,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,28
…,…,…,…,…,…,…,…,…,…
7960,243771,"b""d\xf5\x84\xdah\x95B0\xe4\xf2\xfdH\xa8\x88z\xd7\xb8SD\xaa""",2024-10-28 16:56:20,2024-10-28 16:56:20.685148,2024-10-28 16:56:20.685148,,"""follow""",,11884876419
231371,288578,"b""\x0d\xc8\xe4C/\x82\xe5\xd0\xc8F\xe1~+\x10\xe3\x0a\xcd\xf1^\x9f""",2024-09-18 13:31:15,2024-09-18 13:31:15.565869,2024-10-28 16:58:18.899827,2024-10-28 16:58:19,"""follow""",,11874995633
738574,349675,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673
7960,14767,"b""\xd7\xaa\xd4}\xb7\xd5s+>\xc7/\xcc\xf0\xfct\xb6\xeb\xdc\xe7\x99""",2024-10-28 16:58:57,2024-10-28 16:58:59.604411,2024-10-28 16:58:59.604411,,"""follow""",,11884876739


In [32]:
links_df.filter(pl.col('deleted_at').is_null()).select("type").count()

type
u32
2674265


In [33]:
links_df.filter(pl.col('deleted_at').is_null() & (pl.col('type') == "follow")).select('type').count()

type
u32
2674263


In [34]:
df_filtered = links_df.filter(pl.col("deleted_at").is_null())
df_filtered = df_filtered.filter(pl.col('fid') != pl.col('target_fid'))
df_filtered = df_filtered.filter(pl.col("type") == "follow")

mutual_links = df_filtered.join(
    df_filtered,
    left_on=["fid", "target_fid"],
    right_on=["target_fid", "fid"],
    suffix="_reverse"
)


bot_or_not_filtered = bot_or_not_df.collect().filter(
    (pl.col("fid").is_in(mutual_links.select('fid').to_series())) |
    (pl.col("fid").is_in(mutual_links.select('target_fid').to_series()))
)
# bot_or_not_filtered
bot_or_not_with_index = bot_or_not_filtered.with_columns(
    pl.arange(0, bot_or_not_filtered.height).alias("index")
)

mutual_links_with_fid_index = mutual_links.join(
    bot_or_not_with_index.select([pl.col('fid'), pl.col('index').alias('fid_index')]),
    on='fid',
    how='left'
)

mutual_links_with_index = mutual_links_with_fid_index.join(
    bot_or_not_with_index.select([pl.col('fid'), pl.col('index').alias('target_fid_index')]),
    left_on='target_fid',
    right_on='fid',
    how='left'
)


mutual_links_with_index = mutual_links_with_index.with_columns(
    (pl.col("fid_index").cast(pl.Utf8) + " " + pl.col("target_fid_index").cast(pl.Utf8)).alias("connection")
)

mutual_links_with_index

  mutual_links_with_fid_index = mutual_links.join(
  mutual_links_with_index = mutual_links_with_fid_index.join(


fid,target_fid,hash,timestamp,created_at,updated_at,deleted_at,type,display_timestamp,id,hash_reverse,timestamp_reverse,created_at_reverse,updated_at_reverse,deleted_at_reverse,type_reverse,display_timestamp_reverse,id_reverse,fid_index,target_fid_index,connection
i64,i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64,i64,i64,str
23,2,"b""V\xaf\x0c\xcb\xef\xea&\xd6S\x1c\xc1\x82\x87\x11\xf6""\x1fLS\x96""",2021-07-22 22:19:06,2023-09-05 23:07:34.989427,2023-09-05 23:07:34.989427,,"""follow""",,6348,"b""\xf6\xb3y\x03f\x0b\x0f\x15""R\xf32\xf1orQ\x16\xc1[\xcb""",2021-07-22 21:38:55,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,22,8,4,"""8 4"""
18,2,"b""\xa6\xd3\x08q\xdc\x09\x8b`\xef\xdb\x03\x98Z\xc8\xb3\x8c\x9bO$\xab""",2021-07-25 15:39:43,2023-09-05 23:07:34.741851,2023-09-05 23:07:34.741851,,"""follow""",,5689,"b""\xb9\x18\xc8\xf8\xb1_\xf4SU\x9b\x0f\x18\xce\x99D\xc5T\x12\x8a\x7f""",2021-07-22 21:38:59,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,23,6,4,"""6 4"""
20,2,"b""\x07\xf3\x06@\xcb\x19u\xe2\xb1\x11\xea\xce\x7f\xab\x04\x93\xce\x86\xa0\x11""",2022-04-27 02:00:49,2023-09-05 23:07:34.847303,2023-09-05 23:07:34.847303,,"""follow""",,6104,"b""v\xb8\xaa\x96\x1c\x94\x8d\xaf\xc0c\xbf\xb8N\x88\x82\x87\xa6Q^q""",2021-07-22 21:39:03,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,25,7,4,"""7 4"""
8,2,"b""\x87\xb4\xb0\xac7\xed`\xf8\xe5\x8f0\xdc\xda\xfe\xf3\x98\xcd\x9d\xce2""",2021-09-19 19:50:34,2023-09-05 23:07:33.799636,2023-09-05 23:07:33.799636,,"""follow""",,4093,"b""`e\xed\xd6\xb7\x9b\xf4H\x9d\x8fU\xcb\x98\x07\xac\x15kD\x95|""",2021-07-22 21:39:04,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,26,2,4,"""2 4"""
12,2,"b""\xe5-*)\xd3j\xaf[/\xcf\x96\x93\xf9\xca&\xcc\x8cW\x06\xac""",2021-08-18 05:15:08,2023-09-05 23:07:34.198928,2023-09-05 23:07:34.198928,,"""follow""",,4796,"b""4{*K\x02l\x05\x08\xe3\x88\x84\xc5\xc2\xfb\xb6m\x9c=i\xdd""",2021-07-22 21:39:06,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,28,3,4,"""3 4"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
379750,338997,"b""\xb6\xfb\x8a\x8c\xa3q&#9z\xf3'v\xe8T\xddJ\xc0\x20v""",2024-07-31 19:56:55,2024-07-31 19:56:55.928704,2024-07-31 19:56:55.928704,,"""follow""",,11862253678,"b""\xdd\xb3\x86I[\x97\xa6\x9b\xd9\xc2\xb2\xd8\x06\xe4[De\xeb\xf2J""",2024-10-28 16:51:18,2024-10-28 16:51:20.456982,2024-10-28 16:51:20.456982,,"""follow""",,11884875611,3865,3228,"""3865 3228"""
738574,349675,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673,"b""\xa3\x9f\xc3)\xda\x8a\xaf\xee\x0e\x89\xac\xe5A\xbe\xed\x05h\xd4Q8""",2024-10-28 16:54:19,2024-10-28 16:54:19.610602,2024-10-28 16:54:19.610602,,"""follow""",,11884876139,7698,3405,"""7698 3405"""
243771,7960,"b""\xee\x8d\xade\x1f\C\xd9\xf3k\x1f\xf1\x18m,(\xeaBJ\xe2""",2024-10-17 18:00:32,2024-10-17 18:11:06.505470,2024-10-17 18:11:06.505470,,"""follow""",,11882831684,"b""d\xf5\x84\xdah\x95B0\xe4\xf2\xfdH\xa8\x88z\xd7\xb8SD\xaa""",2024-10-28 16:56:20,2024-10-28 16:56:20.685148,2024-10-28 16:56:20.685148,,"""follow""",,11884876419,1837,679,"""1837 679"""
349675,738574,"b""\xa3\x9f\xc3)\xda\x8a\xaf\xee\x0e\x89\xac\xe5A\xbe\xed\x05h\xd4Q8""",2024-10-28 16:54:19,2024-10-28 16:54:19.610602,2024-10-28 16:54:19.610602,,"""follow""",,11884876139,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673,3405,7698,"""3405 7698"""


In [35]:
# bot_or_not_with_index.filter(pl.col("fid")==2)

In [36]:
connections_list = mutual_links_with_index.select("connection").to_series().to_list()
connections_list

['8 4',
 '6 4',
 '7 4',
 '2 4',
 '3 4',
 '10 4',
 '5 4',
 '12 4',
 '11 4',
 '13 4',
 '15 4',
 '16 4',
 '17 4',
 '9 4',
 '21 4',
 '22 4',
 '23 4',
 '20 4',
 '19 4',
 '18 4',
 '24 4',
 '25 4',
 '27 4',
 '28 4',
 '30 4',
 '29 4',
 '31 4',
 '33 4',
 '34 4',
 '35 4',
 '36 4',
 '38 4',
 '39 4',
 '40 4',
 '41 4',
 '42 4',
 '44 4',
 '45 4',
 '46 4',
 '48 4',
 '50 4',
 '51 4',
 '54 4',
 '55 4',
 '57 4',
 '58 4',
 '59 4',
 '60 4',
 '61 4',
 '62 4',
 '63 4',
 '65 4',
 '67 4',
 '69 4',
 '70 4',
 '72 4',
 '73 4',
 '74 4',
 '75 4',
 '77 4',
 '78 4',
 '79 4',
 '81 4',
 '82 4',
 '84 4',
 '85 4',
 '89 4',
 '91 4',
 '92 4',
 '93 4',
 '94 4',
 '95 4',
 '97 4',
 '96 4',
 '99 4',
 '101 4',
 '102 4',
 '103 4',
 '106 4',
 '107 4',
 '108 4',
 '110 4',
 '111 4',
 '112 4',
 '115 4',
 '117 4',
 '118 4',
 '7723 4',
 '121 4',
 '122 4',
 '123 4',
 '124 4',
 '125 4',
 '126 4',
 '127 4',
 '130 4',
 '129 4',
 '131 4',
 '132 4',
 '134 4',
 '135 4',
 '137 4',
 '138 4',
 '140 4',
 '141 4',
 '143 4',
 '145 4',
 '147 4',
 

In [37]:
with open('graph.txt', 'w') as f:
    f.write("\n".join(connections_list))

In [38]:
# bot_or_not = bot_or_not_df.collect()

In [39]:
print(bot_or_not_with_index.filter(pl.col("bot")==True).shape)

(3368, 3)


In [40]:
print(bot_or_not_with_index.filter(pl.col("bot")==False).shape)

(4546, 3)


In [41]:
train_sybils = bot_or_not_with_index.filter(pl.col("bot")==True).sample(300, seed=40).select("index").to_series().to_list()
train_benigns = bot_or_not_with_index.filter(pl.col("bot")==False).sample(300, seed=40).select("index").to_series().to_list()

In [42]:
test_sybils = bot_or_not_with_index.filter((pl.col("bot")==True) & (~pl.col("index").is_in(train_sybils))).select("index").to_series().to_list()
test_benigns = bot_or_not_with_index.filter((pl.col("bot")==False) & (~pl.col("index").is_in(train_benigns))).select("index").to_series().to_list()

In [43]:
benign_ids_str = " ".join(map(str, test_benigns))
sybil_ids_str = " ".join(map(str, test_sybils))

with open('test.txt', 'w') as f:
    f.write(benign_ids_str + "\n")
    f.write(sybil_ids_str + "\n")

In [44]:
benign_ids_str = " ".join(map(str, train_benigns))
sybil_ids_str = " ".join(map(str, train_sybils))

with open('train.txt', 'w') as f:
    f.write(benign_ids_str + "\n")
    f.write(sybil_ids_str + "\n")

In [45]:
bot_or_not_with_index

fid,bot,index
i64,bool,i64
446097,false,0
3,false,1
8,false,2
12,false,3
2,false,4
…,…,…
280179,true,7909
327500,true,7910
428200,true,7911
278549,true,7912


In [46]:
bot_or_not_with_index.filter(pl.col("bot")==False)
4_655 + 7_192

11847

In [47]:
bot_or_not_with_index

fid,bot,index
i64,bool,i64
446097,false,0
3,false,1
8,false,2
12,false,3
2,false,4
…,…,…
280179,true,7909
327500,true,7910
428200,true,7911
278549,true,7912


In [48]:
print(bot_or_not_with_index.filter((pl.col("bot")==True) & (~pl.col("index").is_in(train_sybils))))
print(bot_or_not_with_index.filter((pl.col("bot")==False) & (~pl.col("index").is_in(train_benigns))))

4_155 + 6_692 #+ 1000

shape: (3_068, 3)
┌────────┬──────┬───────┐
│ fid    ┆ bot  ┆ index │
│ ---    ┆ ---  ┆ ---   │
│ i64    ┆ bool ┆ i64   │
╞════════╪══════╪═══════╡
│ 8156   ┆ true ┆ 695   │
│ 8276   ┆ true ┆ 701   │
│ 8505   ┆ true ┆ 712   │
│ 10258  ┆ true ┆ 803   │
│ 10604  ┆ true ┆ 813   │
│ …      ┆ …    ┆ …     │
│ 14897  ┆ true ┆ 7905  │
│ 280179 ┆ true ┆ 7909  │
│ 327500 ┆ true ┆ 7910  │
│ 428200 ┆ true ┆ 7911  │
│ 278549 ┆ true ┆ 7912  │
└────────┴──────┴───────┘
shape: (4_246, 3)
┌────────┬───────┬───────┐
│ fid    ┆ bot   ┆ index │
│ ---    ┆ ---   ┆ ---   │
│ i64    ┆ bool  ┆ i64   │
╞════════╪═══════╪═══════╡
│ 446097 ┆ false ┆ 0     │
│ 3      ┆ false ┆ 1     │
│ 8      ┆ false ┆ 2     │
│ 12     ┆ false ┆ 3     │
│ 2      ┆ false ┆ 4     │
│ …      ┆ …     ┆ …     │
│ 20701  ┆ false ┆ 7903  │
│ 418674 ┆ false ┆ 7904  │
│ 404156 ┆ false ┆ 7906  │
│ 320189 ┆ false ┆ 7908  │
│ 446821 ┆ false ┆ 7913  │
└────────┴───────┴───────┘


10847

In [49]:
mutual_links_with_index #.filter(pl.col('fid')==pl.col('target_fid'))

fid,target_fid,hash,timestamp,created_at,updated_at,deleted_at,type,display_timestamp,id,hash_reverse,timestamp_reverse,created_at_reverse,updated_at_reverse,deleted_at_reverse,type_reverse,display_timestamp_reverse,id_reverse,fid_index,target_fid_index,connection
i64,i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64,binary,datetime[ns],datetime[ns],datetime[ns],datetime[ns],str,datetime[ns],i64,i64,i64,str
23,2,"b""V\xaf\x0c\xcb\xef\xea&\xd6S\x1c\xc1\x82\x87\x11\xf6""\x1fLS\x96""",2021-07-22 22:19:06,2023-09-05 23:07:34.989427,2023-09-05 23:07:34.989427,,"""follow""",,6348,"b""\xf6\xb3y\x03f\x0b\x0f\x15""R\xf32\xf1orQ\x16\xc1[\xcb""",2021-07-22 21:38:55,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,22,8,4,"""8 4"""
18,2,"b""\xa6\xd3\x08q\xdc\x09\x8b`\xef\xdb\x03\x98Z\xc8\xb3\x8c\x9bO$\xab""",2021-07-25 15:39:43,2023-09-05 23:07:34.741851,2023-09-05 23:07:34.741851,,"""follow""",,5689,"b""\xb9\x18\xc8\xf8\xb1_\xf4SU\x9b\x0f\x18\xce\x99D\xc5T\x12\x8a\x7f""",2021-07-22 21:38:59,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,23,6,4,"""6 4"""
20,2,"b""\x07\xf3\x06@\xcb\x19u\xe2\xb1\x11\xea\xce\x7f\xab\x04\x93\xce\x86\xa0\x11""",2022-04-27 02:00:49,2023-09-05 23:07:34.847303,2023-09-05 23:07:34.847303,,"""follow""",,6104,"b""v\xb8\xaa\x96\x1c\x94\x8d\xaf\xc0c\xbf\xb8N\x88\x82\x87\xa6Q^q""",2021-07-22 21:39:03,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,25,7,4,"""7 4"""
8,2,"b""\x87\xb4\xb0\xac7\xed`\xf8\xe5\x8f0\xdc\xda\xfe\xf3\x98\xcd\x9d\xce2""",2021-09-19 19:50:34,2023-09-05 23:07:33.799636,2023-09-05 23:07:33.799636,,"""follow""",,4093,"b""`e\xed\xd6\xb7\x9b\xf4H\x9d\x8fU\xcb\x98\x07\xac\x15kD\x95|""",2021-07-22 21:39:04,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,26,2,4,"""2 4"""
12,2,"b""\xe5-*)\xd3j\xaf[/\xcf\x96\x93\xf9\xca&\xcc\x8cW\x06\xac""",2021-08-18 05:15:08,2023-09-05 23:07:34.198928,2023-09-05 23:07:34.198928,,"""follow""",,4796,"b""4{*K\x02l\x05\x08\xe3\x88\x84\xc5\xc2\xfb\xb6m\x9c=i\xdd""",2021-07-22 21:39:06,2023-09-05 23:07:30.968478,2023-09-05 23:07:30.968478,,"""follow""",,28,3,4,"""3 4"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
379750,338997,"b""\xb6\xfb\x8a\x8c\xa3q&#9z\xf3'v\xe8T\xddJ\xc0\x20v""",2024-07-31 19:56:55,2024-07-31 19:56:55.928704,2024-07-31 19:56:55.928704,,"""follow""",,11862253678,"b""\xdd\xb3\x86I[\x97\xa6\x9b\xd9\xc2\xb2\xd8\x06\xe4[De\xeb\xf2J""",2024-10-28 16:51:18,2024-10-28 16:51:20.456982,2024-10-28 16:51:20.456982,,"""follow""",,11884875611,3865,3228,"""3865 3228"""
738574,349675,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673,"b""\xa3\x9f\xc3)\xda\x8a\xaf\xee\x0e\x89\xac\xe5A\xbe\xed\x05h\xd4Q8""",2024-10-28 16:54:19,2024-10-28 16:54:19.610602,2024-10-28 16:54:19.610602,,"""follow""",,11884876139,7698,3405,"""7698 3405"""
243771,7960,"b""\xee\x8d\xade\x1f\C\xd9\xf3k\x1f\xf1\x18m,(\xeaBJ\xe2""",2024-10-17 18:00:32,2024-10-17 18:11:06.505470,2024-10-17 18:11:06.505470,,"""follow""",,11882831684,"b""d\xf5\x84\xdah\x95B0\xe4\xf2\xfdH\xa8\x88z\xd7\xb8SD\xaa""",2024-10-28 16:56:20,2024-10-28 16:56:20.685148,2024-10-28 16:56:20.685148,,"""follow""",,11884876419,1837,679,"""1837 679"""
349675,738574,"b""\xa3\x9f\xc3)\xda\x8a\xaf\xee\x0e\x89\xac\xe5A\xbe\xed\x05h\xd4Q8""",2024-10-28 16:54:19,2024-10-28 16:54:19.610602,2024-10-28 16:54:19.610602,,"""follow""",,11884876139,"b""\xd1\xf8\x8f\x96\xeb\xb2""enN\xa7\x1e\x8at48\xd3\xcb""\xd8""",2024-10-28 16:58:21,2024-10-28 16:58:21.852045,2024-10-28 16:58:21.852045,,"""follow""",,11884876673,3405,7698,"""3405 7698"""


In [50]:
#df1 = mutual_links_with_index.join(bot_or_not_filtered.select(["fid", "bot"]), how="left", on="fid")
#df1.select(["fid_index","target_fid_index","bot"]).write_parquet(f"{DATA_PATH}/interim/test-bots.parquet")