In [None]:
# !pip install polars
import polars as pl
import os
# Show up to 400 characters of each string value when polars renders a frame.
pl.Config.set_fmt_str_lengths(400)

In [None]:
# Root directory of the dataset, taken from the DATA_PATH environment variable.
# NOTE(review): when the variable is unset this is "", so every f-string path
# built from it starts at the filesystem root ("/raw/...") — confirm intended.
DATA_PATH = os.environ.get("DATA_PATH", "")

In [None]:
# Eagerly load the raw fids snapshot; it is the left-hand frame of the per-fid statistics joins.
fids_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fids-0-1730134800.parquet")

In [None]:
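# TODO: profiles are not loaded yet. The draft below still points at the fids file
# (another snapshot id noted here: farcaster-fids-0-1727974800).
# profiles_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-fids-0-1730134800.parquet")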

In [None]:
# Eagerly load the raw links snapshot (used for the following/followers counts and the mutual-link join).
links_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-links-0-1730134800.parquet")

In [None]:
# Scan the reactions snapshot lazily: nothing is read until the query built on it is collected.
reactions_lazy_df = pl.scan_parquet(f"{DATA_PATH}/raw/farcaster-reactions-0-1730134800.parquet")

In [None]:
# Eagerly load the raw verifications snapshot.
verifications_df = pl.read_parquet(f"{DATA_PATH}/raw/farcaster-verifications-0-1730134800.parquet")

In [None]:
# Verifications count per fid.
# Soft-deleted rows are excluded whenever the frame exposes a `deleted_at`
# column, matching how the reactions, links and casts aggregations in this
# notebook treat deletions. If the column is absent every row is counted.
verifications_count_df = (
    verifications_df
    .pipe(
        lambda frame: frame.filter(pl.col("deleted_at").is_null())
        if "deleted_at" in frame.columns
        else frame
    )
    .group_by("fid")
    .agg(pl.count("id").alias("verifications_count"))
)

In [None]:
# Tally of non-deleted reactions for every (fid, reaction_type) pair.
# This only extends the lazy plan; no data is read here.
reactions_count_lazy = (
    reactions_lazy_df
    .filter(pl.col("deleted_at").is_null())
    .group_by("fid", "reaction_type")
    .agg(pl.col("id").count().alias("count"))
)

In [None]:
# Execute the lazy reactions plan and materialise the grouped counts.
reactions_count_df = reactions_count_lazy.collect()

In [None]:
# Reshape the long (fid, reaction_type, count) table into one row per fid.
# The pivoted columns are named after the reaction_type values: "1" becomes
# likes_count and "2" becomes recasts_count. A fid with no rows for one of
# the types gets 0 instead of null.
reactions_pivot_df = (
    reactions_count_df
    .pivot(
        index="fid",
        columns="reaction_type",
        values="count",
        aggregate_function="first",
    )
    .rename({"1": "likes_count", "2": "recasts_count"})
    .fill_null(0)
)

In [None]:
# Following count: non-deleted "follow" links grouped by the fid that owns the link.
following_count_df = (
    links_df
    .filter(pl.col("type") == "follow")
    .filter(pl.col("deleted_at").is_null())
    .group_by("fid")
    .agg(pl.col("id").count().alias("following_count"))
)

In [None]:
# Followers count: non-deleted "follow" links grouped by the fid being followed.
# The group key is renamed from target_fid to fid so the result can be joined
# on "fid" like the other per-fid tables.
followers_count_df = (
    links_df
    .filter(pl.col("type") == "follow")
    .filter(pl.col("deleted_at").is_null())
    .group_by("target_fid")
    .agg(pl.col("id").count().alias("followers_count"))
    .rename({"target_fid": "fid"})
)

In [None]:
# Left-join every per-fid counter onto the fids table so that each fid keeps a
# row even when it has no activity; the resulting nulls are replaced with 0.
result_df = (
    fids_df
    .join(verifications_count_df, on="fid", how="left")
    .join(following_count_df, on="fid", how="left")
    .join(followers_count_df, on="fid", how="left")
    .join(reactions_pivot_df, on="fid", how="left")
    .fill_null(0)
)

In [None]:
# Persist the per-fid statistics to the interim folder.
result_df.write_parquet(f"{DATA_PATH}/interim/aggregated_links_statistics.parquet")

In [None]:
import psutil
import os

def print_memory_usage():
    """Print the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    print(f"Memory Usage: {rss_bytes / (1024 * 1024):.2f} MB")

In [None]:
# Report how much memory the kernel process holds at this point in the notebook.
print_memory_usage()

In [None]:
# Spot-check ten random rows of the aggregated statistics.
result_df.sample(n=10)

In [None]:
# fid of the single user inspected in the lookups that follow.
joao_fid = 12031

In [None]:
# Aggregated statistics row(s) for the inspected user.
user_statistics = result_df.filter(pl.col("fid").eq(joao_fid))


In [None]:
# Display the inspected user's aggregated statistics.
user_statistics

In [None]:
# Raw link rows whose fid is the inspected user (no type or deleted_at filtering).
links_df.filter(pl.col("fid").eq(joao_fid))

In [None]:
# Raw link rows whose target_fid is the inspected user (no type or deleted_at filtering).
links_df.filter(pl.col("target_fid").eq(joao_fid))

In [None]:
# TODO: the link lookups should only keep rows where deleted_at is null.

In [None]:
# Add cast statistics

In [None]:
# Scan the casts snapshot lazily: nothing is read until the query built on it is collected.
casts_lazy_df = pl.scan_parquet(f"{DATA_PATH}/raw/farcaster-casts-0-1730134800.parquet")

In [None]:
# Per-fid cast counters over non-deleted casts (still a lazy plan):
#   total_casts_count - every cast with a non-null id
#   reply_casts_count - casts that have a parent_hash
#   casts_count       - casts without a parent_hash
# The reply predicate used to AND `parent_hash.is_not_null()` with itself;
# the duplicate clause was redundant and has been removed.
casts_count_lazy = (
    casts_lazy_df
    .filter(pl.col("deleted_at").is_null())  # Filter out deleted casts
    .group_by("fid")
    .agg([
        pl.count("id").alias("total_casts_count"),
        pl.col("parent_hash").is_not_null().sum().alias("reply_casts_count"),
        pl.col("parent_hash").is_null().sum().alias("casts_count"),
    ])
)

In [None]:
# Execute the lazy casts plan and materialise the per-fid counters.
casts_count_df = casts_count_lazy.collect()

In [None]:
# Display the per-fid cast counters.
casts_count_df

In [None]:
# Read back the interim statistics file; this is the same path the notebook writes result_df to.
df = pl.read_parquet(f"{DATA_PATH}/interim/aggregated_links_statistics.parquet")

In [None]:
# df = df.drop(["casts_count","casts_count_right"])
# df

In [None]:
# Spot-check ten random rows of the cast counters.
casts_count_df.sample(n=10)

In [None]:
# Attach the cast counters to the per-fid statistics.
# `df` is read from the same parquet file this notebook overwrites with the
# joined result, so on a re-run it may already carry the cast columns. Drop
# them first so the join stays idempotent and produces no "*_right" duplicates.
result_df = (
    df
    .drop([
        name for name in casts_count_df.columns
        if name != "fid" and name in df.columns
    ])
    .join(casts_count_df, on="fid", how="left")
    .fill_null(0)  # fids without any cast get 0 for every counter
)

In [None]:
# Overwrite the interim statistics file with the version that includes the cast counters.
result_df.write_parquet(f"{DATA_PATH}/interim/aggregated_links_statistics.parquet")

In [None]:
# casts_by_fid = casts_lazy_df.filter((pl.col('fid')==400242) & (pl.col('deleted_at').is_null()))
# result = casts_by_fid.collect()

In [None]:
# result.filter(~pl.col("root_parent_url").str.contains("https://warpcast.com/~/channel"))

In [None]:
# b"\xed=\x85\x89\xeb\x89\xe8*\x1d\xa6P\x83\xe6[\xc9\x1af\x87IA"	.hex()

In [None]:
# Edge list of non-deleted "follow" links, excluding self-links, reduced to the
# two endpoint columns.
df_filtered = (
    links_df
    .filter(pl.col("deleted_at").is_null())
    .filter(pl.col("fid") != pl.col("target_fid"))
    .filter(pl.col("type") == "follow")
    .select("fid", "target_fid")
)

In [None]:
# Self-join that matches each (fid -> target_fid) edge with its reverse edge.
# Only edges that exist in both directions survive the inner join.
mutual_links = df_filtered.join(
    df_filtered,
    how="inner",
    left_on=["fid", "target_fid"],
    right_on=["target_fid", "fid"],
    suffix="_reverse",
)

In [None]:
# Re-encode both endpoint columns as Categorical.
# * Cast through Utf8 first: polars does not cast numeric columns straight to
#   Categorical, and the extra hop is a no-op for string input, so the cell
#   works for either dtype and can be re-run on its own output.
# * Build both columns under one StringCache so `fid` and `target_fid` share
#   a category mapping and can be joined against each other.
with pl.StringCache():
    df_filtered = df_filtered.with_columns([
        pl.col("fid").cast(pl.Utf8).cast(pl.Categorical),
        pl.col("target_fid").cast(pl.Utf8).cast(pl.Categorical),
    ])

In [None]:
# Same reverse-edge self-join as before, now run on the re-typed df_filtered.
mutual_links = df_filtered.join(
    df_filtered,
    how="inner",
    left_on=["fid", "target_fid"],
    right_on=["target_fid", "fid"],
    suffix="_reverse",
)