In [None]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
# Printed only after every import above has succeeded; a visible marker that this cell ran.
print("Imported libs")

In [None]:
# Read the full combined capture and, in the same chain, replace the
# `timestamp` column with its values interpreted as seconds since the epoch.
full_df = pl.read_csv("full_combined_fixed.csv").with_columns(
    pl.from_epoch("timestamp", time_unit="s")
)
full_df

In [None]:
# Read the region-focused capture and, in the same chain, replace the
# `timestamp` column with its values interpreted as seconds since the epoch.
region_focused_df = pl.read_csv("test1_combined.csv").with_columns(
    pl.from_epoch("timestamp", time_unit="s")
)
region_focused_df

### Some ideas
- Load balancer clusters/groups, identified by a common prefix.
  - How many clusters did we detect? How many per country/colocation center?
  - The same cluster is visible across multiple countries, but all in close proximity. Can we approximate cluster range/size? Visualize clusters on a map?
  - For some clusters, we have most/all instances. Extract instance count, country/colocation distribution, and size differences between larger and smaller clusters.
- Does each unique balancer ID appear in a single colocation center or in multiple? Is balancer ID 1:1 to a colocation center?
- Do some requests access a load balancer outside the country? How often and for which countries?


In [None]:
# For each balancer-ID prefix (first three characters of `balancerId`), collect
# the distinct client countries, colocation centers and balancer IDs observed.
grouped_by_prefix = (
    # This cell previously read from `df`, which no visible cell defines, so it
    # failed with NameError on a fresh kernel.
    # NOTE(review): `region_focused_df` is chosen to match the other prefix
    # analysis in this notebook; swap in `full_df` if the full capture was intended.
    region_focused_df
    .with_columns(
        prefix=pl.col("balancerId").str.slice(0, 3)
    )
    .group_by("prefix")
    .agg(
        # NOTE(review): this column name looks garbled ("codingCloudflare" is
        # repeated) -- confirm it against the CSV header before relying on it.
        pl.col("clientCountryAccodingCloudflarecodingCloudflarecodingCloudflare").unique().alias("countries"),
        pl.col("balancerColocationCenter").unique().alias("colocationCenters"),
        pl.col("balancerId").unique().alias("balancerIds"),
    )
    # group_by does not guarantee row order; sort so the displayed table is
    # the same on every run.
    .sort("prefix")
)
grouped_by_prefix

Looks like the prefixes loosely represent a region, but are not limited to a single country or colocation center. A prefix can perhaps be referred to as a 'load balancer group/cluster'.

In [None]:
# Number of distinct balancer IDs seen under each three-character ID prefix,
# largest groups first.
balancer_prefix = pl.col("balancerId").str.slice(0, 3).alias("prefix")

count_ids_per_prefix = (
    region_focused_df
    # Group directly on the derived prefix expression instead of materialising
    # it as a column first; the key column is still named "prefix".
    .group_by(balancer_prefix)
    .agg(unique_balancer_count=pl.col("balancerId").n_unique())
    .sort("unique_balancer_count", descending=True)
)
count_ids_per_prefix