In [None]:
import polars as pl
import os
import json
from urllib.parse import urlparse, parse_qs
# Raise the display limit for string values to 400 characters so long
# cast texts and URLs are not cut off when frames are rendered.
pl.Config.set_fmt_str_lengths(400)

In [None]:
# Root directory of the datasets, read from the DATA_PATH environment variable.
# NOTE(review): when the variable is unset this is "", so every f-string path
# built from it starts with "/" (e.g. "/interim/...") — confirm that is intended.
DATA_PATH = os.environ.get("DATA_PATH", "")

In [None]:
# Eagerly load the aggregated link statistics from the interim folder.
df = pl.read_parquet(
    f"{DATA_PATH}/interim/aggregated_links_statistics.parquet"
)

# The raw casts dump is only scanned lazily; nothing is read until a
# downstream query is collected.
casts_lazy_df = pl.scan_parquet(
    f"{DATA_PATH}/raw/farcaster-casts-0-1730134800.parquet"
)

In [None]:
# Display the aggregated link statistics loaded from the interim parquet file.
df

In [None]:
# Lazy per-author cast totals: casts with a `deleted_at` timestamp are
# excluded, then the `id` values are counted for every `fid`.
casts_count_lazy = (
    casts_lazy_df
    .filter(pl.col("deleted_at").is_null())
    .group_by("fid")
    .agg(
        pl.col("id").count().alias("casts_count")
    )
)

In [None]:
# https://warpcast.com/eljuniordiaz.eth/0xf612041a
# https://warpcast.com/falleccypollne/0x04253bc9

# https://warpcast.com/sayangel/0x8cfc0da0

In [None]:
# Account whose casts are used as labels: https://warpcast.com/botornot
BOT_OR_NOT_FID = 480626

# Restrict the lazy casts scan to casts authored by that account, materialise
# the result, and persist it to the interim folder.
bot_or_not_casts_lazy = casts_lazy_df.filter(
    pl.col("fid").eq(BOT_OR_NOT_FID)
)
bot_or_not_casts_df = bot_or_not_casts_lazy.collect()
bot_or_not_casts_df.write_parquet(
    f"{DATA_PATH}/interim/bot_or_not_casts.parquet"
)
# To skip the scan above, reload the persisted copy instead:
# bot_or_not_casts_df = pl.read_parquet(f"{DATA_PATH}/interim/bot_or_not_casts.parquet")

In [None]:
# Inspect the casts authored by the Bot-or-Not account (fid == BOT_OR_NOT_FID).
bot_or_not_casts_df

In [None]:
# Select the Bot-or-Not replies whose verdict flags the checked account as a bot.
bots_casts_df = (
    bot_or_not_casts_df
    # Drop explicit "not a bot." verdicts (case-insensitive).
    .filter(~pl.col("text").str.contains(r"(?i)not a bot\."))
    # Not excluded (kept from the original notebook for reference):
    # .filter(~pl.col("text").str.contains("Bot or Not? Someone checked : Casts: "))
    # .filter(~pl.col("text").str.contains("Bot or Not? Someone checked: Casts: "))
    # Drop free-limit notices. literal=True so the "." in the message is
    # matched as a plain character instead of a regex wildcard.
    .filter(
        ~pl.col("text").str.contains(
            "Hi ! You have reached your Bot or Not free limit. ", literal=True
        )
    )
    # Keep only the positive verdict phrasings.
    .filter(
        pl.col("text").str.contains(r"(?i)like a bot")
        | pl.col("text").str.contains(r"(?i)bot or low")
    )
    # Human-readable cast hash ("0x" + lowercase hex), computed natively
    # instead of through a per-row Python lambda.
    .with_columns(
        (pl.lit("0x") + pl.col("hash").bin.encode("hex")).alias("cast_hash")
    )
    # Early de-duplication. The key includes `embeds` because casts with an
    # empty `mentions` list carry the checked fid in the embed URL; keying on
    # `mentions` alone collapsed all of those casts into a single row.
    # keep="first" + maintain_order=True makes the retained row reproducible.
    .unique(subset=["mentions", "embeds"], keep="first", maintain_order=True)
)

In [None]:
# Replies whose text contains the "not a bot." verdict (case-insensitive).
not_bots_df = bot_or_not_casts_df.filter(
    pl.col("text").str.contains(r"(?i)not a bot\.")
)

In [None]:
# build fids
def transform_mentions(row):
    """Resolve the fid a cast refers to from its mentions or its first embed.

    Parameters
    ----------
    row : dict
        Mapping with ``mentions`` and ``embeds`` entries, each a JSON-encoded
        list. A missing, ``None`` or empty value is treated as an empty list.

    Returns
    -------
    int | None
        The first mentioned fid when ``mentions`` is non-empty. Otherwise the
        ``fid`` query parameter of the first embed's URL. ``None`` when
        neither source yields an integer (no embeds, first embed without a
        ``url`` key, URL without ``fid``, or a non-numeric ``fid`` value).
    """
    # `or '[]'` covers null/empty cells, which json.loads would reject.
    mentions = json.loads(row.get('mentions') or '[]')
    embeds = json.loads(row.get('embeds') or '[]')

    # A direct mention takes precedence over anything found in the embeds.
    if mentions:
        return int(mentions[0])

    if not embeds:
        return None

    # Only the first embed is consulted. It may be an object without a "url"
    # key, so look the key up defensively instead of indexing.
    first_embed = embeds[0]
    url = first_embed.get("url") if isinstance(first_embed, dict) else None
    if not url:
        return None

    fid_values = parse_qs(urlparse(url).query).get('fid')
    if not fid_values:
        return None

    try:
        return int(fid_values[0])
    except ValueError:
        # A non-numeric fid parameter means the target is unresolved; return
        # None rather than aborting the whole column computation.
        return None


In [None]:
# testing = not_bots_df.filter(pl.col("mentions") == "[]")
# testing = testing.with_columns(
#     pl.struct(["mentions", "embeds"]).map_elements(transform_mentions, return_dtype=pl.Int64).alias("fid_new")
# )

In [None]:
# Resolve the checked account's fid for both labelled frames. The result is
# stored under "fid", replacing the existing column of that name.
bots_casts_df = bots_casts_df.with_columns(
    pl.struct("mentions", "embeds")
    .map_elements(transform_mentions, return_dtype=pl.Int64)
    .alias("fid")
)
not_bots_df = not_bots_df.with_columns(
    pl.struct("mentions", "embeds")
    .map_elements(transform_mentions, return_dtype=pl.Int64)
    .alias("fid")
)

In [None]:
# One row per fid. keep="first" with maintain_order=True makes the surviving
# row (and therefore its `text`) and the row order reproducible between runs;
# the default keep="any" gives no such guarantee.
bots_casts_df = bots_casts_df.unique(subset=['fid'], keep='first', maintain_order=True)

In [None]:
# One row per fid. keep="first" with maintain_order=True makes the surviving
# row (and therefore its `text`) and the row order reproducible between runs;
# the default keep="any" gives no such guarantee.
not_bots_df = not_bots_df.unique(subset=['fid'], keep='first', maintain_order=True)

In [None]:
# Attach the boolean label column: False for "not a bot" verdicts,
# True for bot verdicts.
not_bots_df = not_bots_df.with_columns(bot=pl.lit(False))

bots_casts_df = bots_casts_df.with_columns(bot=pl.lit(True))

In [None]:
# Reduce both frames to the same three columns, in the same order, so they
# can be stacked.
not_bots_df = not_bots_df.select("fid", "bot", "text")
bots_casts_df = bots_casts_df.select("fid", "bot", "text")

In [None]:
# Inspect the bot-labelled rows (columns: fid, bot, text).
bots_casts_df #.unique(subset=['fid'])

In [None]:
# Stack the two labelled frames row-wise: not-bot rows first, then bot rows.
combined_df = pl.concat(
    [not_bots_df, bots_casts_df],
    how="vertical",
)

In [None]:
# Inspect the stacked frame of not-bot and bot rows.
combined_df

In [None]:
# combined_df.write_parquet(f"{DATA_PATH}/bot_or_not_checks.parquet")

In [None]:
# Display combined_df again (same frame as shown above; nothing has modified it).
combined_df

In [None]:
# Preview combined_df reduced to one row per fid. The result is only
# displayed, not assigned, so combined_df itself is unchanged.
combined_df.unique(subset=['fid'])

In [None]:
# Find fids that occur more than once in combined_df. Both input frames were
# already reduced to one row per fid, so a count above 1 means the fid appears
# in both the bot and the not-bot frame.
duplicateds = (
    combined_df.group_by("fid")
    # pl.len() replaces the deprecated argument-less pl.count(); the alias
    # keeps the "count" column name that the filter below relies on.
    .agg(pl.len().alias("count"))
    .filter(pl.col("count") > 1)
)

duplicateds

In [None]:
# Keep only the fid column of the duplicate report (still a one-column DataFrame).
duplicated_fids = duplicateds.select("fid")

In [None]:
# Load the raw fnames table. In the visible cells it is referenced only by a
# commented-out fid lookup.
fnames_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fnames-0-1730134800.parquet")

In [None]:
# combined_df.filter(pl.col('fid')==291366)

In [None]:
# fnames_df.filter(pl.col('fid')==291366)

In [None]:
# Drop every row whose fid is listed in duplicated_fids, so only fids that
# occur exactly once in combined_df remain.
without_duplicates = combined_df.filter(
    pl.col("fid").is_in(duplicated_fids.get_column("fid")).not_()
)

In [None]:
# Persist the labelled set, without the duplicated fids, to the interim folder.
without_duplicates.write_parquet(f"{DATA_PATH}/interim/bot_or_not_checks.parquet")