In [None]:
import polars as pl
from hashlib import sha256
import os
import json
from urllib.parse import urlparse, parse_qs
import networkx as nx
import matplotlib.pyplot as plt
# Display setting: let polars show up to 400 characters of a string value
# before truncating it in printed frames.
pl.Config.set_fmt_str_lengths(400)
# Reminder kept from the original author (not executed):
# !pip install pyarrow

In [None]:
# Root directory of the dataset, taken from the DATA_PATH environment variable.
# NOTE(review): when the variable is unset this is "", so the f-string paths
# built below start with "/" (e.g. "/interim/...") — confirm that is intended.
DATA_PATH = os.environ.get("DATA_PATH", "")

In [None]:
# Lazily scan the interim bot-or-not table; no data is read until .collect().
bot_or_not_df = pl.scan_parquet(f"{DATA_PATH}/interim/bot_or_not_without_info.parquet")

In [None]:
# Lazily scan the raw links table (columns used below: id, fid, target_fid,
# type, deleted_at); no data is read until .collect().
links_lazy_df = pl.scan_parquet(f"{DATA_PATH}/raw/farcaster-links-0-1730134800.parquet")

In [None]:
# Show the LazyFrame's repr; this does not collect any rows.
bot_or_not_df

In [None]:
# links_lazy_df.filter(pl.col('deleted_at').is_null()).head(5).collect()

In [None]:
# Lazy single-column frame holding the fids of the bot-or-not table; used
# below as the key set for restricting the links.
bot_or_not_fids = bot_or_not_df.select("fid")

In [None]:
# Restrict the links to those whose source (`fid`) AND target (`target_fid`)
# both appear in the bot-or-not table.
#
# A semi-join is used because this is a pure membership filter: nothing from
# the right-hand side is needed, and unlike an inner join it can never
# multiply link rows if a fid happens to occur more than once in
# `bot_or_not_fids` (which would inflate the follower/following counts and
# duplicate graph edges computed further down).
links_filtered_by_fid = links_lazy_df.join(bot_or_not_fids, on="fid", how="semi")
links_between_bots = links_filtered_by_fid.join(
    bot_or_not_fids,
    left_on="target_fid",
    right_on="fid",
    how="semi",
)

In [None]:
# Execute the lazy query built above and materialize the result in memory.
links_between_bots_df = links_between_bots.collect()

In [None]:
# Inspect the collected links between accounts of the bot-or-not table.
links_between_bots_df

In [None]:
# Drop self-links, i.e. rows where an account targets itself.
links_df = links_between_bots_df.filter(pl.col('fid') != pl.col('target_fid'))

In [None]:
# bots_fids.collect()

In [None]:
# Per source account (`fid`): number of its "follow" links that have not been
# deleted, i.e. how many accounts it currently follows within this link set.
is_live_follow = (pl.col("type") == "follow") & pl.col("deleted_at").is_null()
following_count_df = links_df.filter(is_live_follow).group_by("fid").agg(
    pl.col("id").count().alias("following_count")
)

In [None]:
# Per target account: number of non-deleted "follow" links pointing at it.
# The grouping key is renamed to `fid` so the frame can be joined on `fid`.
is_live_follow = (pl.col("type") == "follow") & pl.col("deleted_at").is_null()
followers_per_target = links_df.filter(is_live_follow).group_by("target_fid").agg(
    pl.col("id").count().alias("followers_count")
)
followers_count_df = followers_per_target.rename({"target_fid": "fid"})

In [None]:
# Replace the table's own follower/following counts with the ones recomputed
# above from links between accounts of this table.
bots_df = bot_or_not_df.collect().drop(["followers_count", "following_count"])

# Left joins keep every account; accounts with no matching links get nulls,
# which are then replaced by 0.
# NOTE(review): fill_null(0) is applied to the whole frame, not only to the two
# count columns — confirm zero-filling nulls in other columns is intended.
result_df = (
    bots_df
    .join(following_count_df, on="fid", how="left")
    .join(followers_count_df, on="fid", how="left")
    .fill_null(0)
)

In [None]:
# Inspect the table with the recomputed follower/following counts.
result_df

In [None]:
# bot_or_not_df.collect().filter(pl.col('fid')==3)

In [None]:
# result_df.unique(subset=['fid'])

In [None]:
# Inspect the self-link-free links; the trailing comment is a disabled
# de-duplication by `id`.
links_df #.unique(subset=['id'])

In [None]:
# Display result_df again (same frame as shown earlier).
result_df

In [None]:
# Eagerly load the raw fnames table into memory.
fnames_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet")
# 320189  (presumably a row count noted by the author — TODO confirm)

In [None]:
# links_df = links_between_bots_df

# # G = nx.DiGraph()

# nodes = []
# for row in links_df.iter_rows():
#     # nodes.append([row[0], row[1]])

# # plt.figure(figsize=(10, 10))
# # pos = nx.spring_layout(G, seed=42)  # Positioning of the nodes
# # nx.draw(G, pos, with_labels=True, node_size=500, node_color="lightblue", font_size=10, font_weight="bold", edge_color="gray")

# # # Optional: Save the plot as an image
# # # plt.savefig("bot_graph_connections.png", format="PNG")

# # # Show the plot
# # plt.show()

In [None]:
# links_between_bots_df

In [None]:
# for row in links_between_bots_df.iter_rows():
#     print(row)
#     break

In [None]:
# len(nodes)

In [None]:
# Column subset for inspecting activity counters; `fid` is included as the
# identifier. Only referenced by the commented-out query in the next cell.
counts = ['fid','verifications_count', 'recasts_count', 'likes_count', 'total_casts_count','reply_casts_count', 'casts_count']

In [None]:
# Disabled alternative: look at the bot rows with the highest likes_count.
# result_df.filter(pl.col('bot')==True).sort('likes_count').tail().select(counts) #.describe()
# Reproducible (seeded) random sample from the rows labelled bot == True.
result_df.filter(pl.col('bot')==True).sample(seed=42)

In [None]:
# Spot-check: fname rows for fid 1.
fnames_df.filter(pl.col('fid')==1)

In [None]:
# Eagerly load the raw profile-with-addresses table into memory.
profile_with_addresses_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-profile_with_addresses-0-1730134800.parquet")

In [None]:
# Spot-check: profile rows for fid 1.
profile_with_addresses_df.filter(pl.col('fid')==1)

In [None]:
# Display the self-link-free links again.
links_df

In [None]:
# Number of non-null `type` values among links that have not been deleted
# (any link type).
links_df.filter(pl.col('deleted_at').is_null()).select("type").count()

In [None]:
# Same count restricted to links of type "follow" that have not been deleted.
links_df.filter(pl.col('deleted_at').is_null() & (pl.col('type') == "follow")).select('type').count()

In [None]:
# Non-deleted "follow" links between two different accounts.
not_deleted = pl.col("deleted_at").is_null()
not_self_link = pl.col('fid') != pl.col('target_fid')
is_follow = pl.col("type") == "follow"
df_filtered = links_df.filter(not_deleted & not_self_link & is_follow)

# Self-join matching each edge (fid -> target_fid) with its reverse edge
# (target_fid -> fid). Only edges whose reverse also exists survive, so every
# mutual pair shows up once per direction; columns of the reverse row carry the
# "_reverse" suffix.
mutual_links = df_filtered.join(
    df_filtered,
    left_on=["fid", "target_fid"],
    right_on=["target_fid", "fid"],
    how="inner",
    suffix="_reverse",
)


# Accounts that take part in at least one mutual follow, on either end.
mutual_source_fids = mutual_links.get_column('fid')
mutual_target_fids = mutual_links.get_column('target_fid')
in_mutual_graph = (
    pl.col("fid").is_in(mutual_source_fids)
    | pl.col("fid").is_in(mutual_target_fids)
)
bot_or_not_filtered = bot_or_not_df.collect().filter(in_mutual_graph)

# Give every remaining account a consecutive 0-based node id ("index"),
# following the row order of bot_or_not_filtered.
node_ids = pl.arange(0, len(bot_or_not_filtered)).alias("index")
bot_or_not_with_index = bot_or_not_filtered.with_columns(node_ids)

# fid -> node id lookup, reused for both ends of each edge.
index_by_fid = bot_or_not_with_index.select(["fid", "index"])

# Attach the node id of the edge's source account ...
mutual_links_with_fid_index = mutual_links.join(
    index_by_fid.rename({"index": "fid_index"}),
    on='fid',
    how='left',
)

# ... and of its target account.
mutual_links_with_index = mutual_links_with_fid_index.join(
    index_by_fid.rename({"index": "target_fid_index"}),
    left_on='target_fid',
    right_on='fid',
    how='left',
)

# Text form of each edge: "<source index> <target index>".
source_label = pl.col("fid_index").cast(pl.Utf8)
target_label = pl.col("target_fid_index").cast(pl.Utf8)
mutual_links_with_index = mutual_links_with_index.with_columns(
    (source_label + " " + target_label).alias("connection")
)

mutual_links_with_index

In [None]:
# bot_or_not_with_index.filter(pl.col("fid")==2)

In [None]:
# All edges as "<source index> <target index>" strings, one per row of
# mutual_links_with_index.
connections_list = mutual_links_with_index.get_column("connection").to_list()

# Show only a short preview: rendering the complete list would flood the
# notebook output. The full list is still available in `connections_list`.
connections_list[:10]

In [None]:
# Write the edge list to graph.txt, one edge per line, with no newline after
# the last edge.
with open('graph.txt', 'w') as graph_file:
    print("\n".join(connections_list), end="", file=graph_file)

In [None]:
# bot_or_not = bot_or_not_df.collect()

In [None]:
# (rows, columns) of the graph nodes labelled bot == True.
print(bot_or_not_with_index.filter(pl.col("bot")==True).shape)

In [None]:
# (rows, columns) of the graph nodes labelled bot == False.
print(bot_or_not_with_index.filter(pl.col("bot")==False).shape)

In [None]:
# Training labels: node ids of 300 randomly drawn bots ("sybils") and 300
# randomly drawn non-bots ("benigns"); the fixed seed keeps the draw repeatable.
sybil_nodes = bot_or_not_with_index.filter(pl.col("bot") == True)
benign_nodes = bot_or_not_with_index.filter(pl.col("bot") == False)

train_sybils = sybil_nodes.sample(300, seed=40).get_column("index").to_list()
train_benigns = benign_nodes.sample(300, seed=40).get_column("index").to_list()

In [None]:
# Test labels: every node of each class that was not drawn for training.
held_out_sybil = (pl.col("bot") == True) & ~pl.col("index").is_in(train_sybils)
held_out_benign = (pl.col("bot") == False) & ~pl.col("index").is_in(train_benigns)

test_sybils = bot_or_not_with_index.filter(held_out_sybil).get_column("index").to_list()
test_benigns = bot_or_not_with_index.filter(held_out_benign).get_column("index").to_list()

In [None]:
# test.txt layout: line 1 = space-separated benign node ids,
# line 2 = space-separated sybil node ids.
benign_ids_str = " ".join(str(node_id) for node_id in test_benigns)
sybil_ids_str = " ".join(str(node_id) for node_id in test_sybils)

with open('test.txt', 'w') as split_file:
    split_file.write(f"{benign_ids_str}\n{sybil_ids_str}\n")

In [None]:
# train.txt layout: line 1 = space-separated benign node ids,
# line 2 = space-separated sybil node ids.
benign_ids_str = " ".join(str(node_id) for node_id in train_benigns)
sybil_ids_str = " ".join(str(node_id) for node_id in train_sybils)

with open('train.txt', 'w') as split_file:
    split_file.write(f"{benign_ids_str}\n{sybil_ids_str}\n")

In [None]:
# Inspect the node table (accounts in the mutual-follow graph with their index).
bot_or_not_with_index

In [None]:
# NOTE: with IPython's default settings only the last expression of a cell is
# rendered, so this filtered frame is computed but not shown.
bot_or_not_with_index.filter(pl.col("bot")==False)
# Scratch arithmetic; presumably the two class sizes printed above added up to
# the total node count — TODO confirm.
4_655 + 7_192

In [None]:
# Display the node table again.
bot_or_not_with_index

In [None]:
# Print the held-out (non-training) rows of each class as plain text.
print(bot_or_not_with_index.filter((pl.col("bot")==True) & (~pl.col("index").is_in(train_sybils))))
print(bot_or_not_with_index.filter((pl.col("bot")==False) & (~pl.col("index").is_in(train_benigns))))

# Scratch arithmetic; presumably the two held-out class sizes added up
# (each 500 below the totals in the earlier cell) — TODO confirm.
4_155 + 6_692 #+ 1000

In [None]:
# Inspect the indexed edge table; the trailing comment is a disabled check for
# self-links.
mutual_links_with_index #.filter(pl.col('fid')==pl.col('target_fid'))

In [None]:
#df1 = mutual_links_with_index.join(bot_or_not_filtered.select(["fid", "bot"]), how="left", on="fid")
#df1.select(["fid_index","target_fid_index","bot"]).write_parquet(f"{DATA_PATH}/interim/test-bots.parquet")