In [1]:
import math
from itertools import combinations, islice
from typing import Optional

import numpy as np
import polars as pl
from tqdm import tqdm

In [2]:
# Load the answer records (one row per user/question pair with a score).
df = pl.read_json("data-medium.json")
# Show (rows, columns) first, then preview the leading rows.
print((df.height, df.width))
df.head(5)

(300000, 3)


user,question,score
str,str,i64
"""32121608-5d1b-…","""80c01e0f-a88d-…",0
"""c6ef79b6-dff8-…","""80c01e0f-a88d-…",0
"""7a241e06-be79-…","""80c01e0f-a88d-…",1
"""098132c1-deb8-…","""80c01e0f-a88d-…",0
"""80a35735-43fd-…","""80c01e0f-a88d-…",0


In [3]:
def k_corrset(df:pl.DataFrame, k: int, u_col:str="user", q_col:str="question", score_col:str="score"):
    """Brute-force search for the size-``k`` question subset that best tracks total score.

    For every combination of ``k`` distinct values of ``q_col``:

    1. keep the users that have exactly ``k`` rows inside the combination,
    2. sum their ``score_col`` over the combination ("qs_totals"),
    3. correlate that sum (``pl.corr``) with each user's sum of ``score_col``
       over all of their rows ("grand_totals").

    The combination with the largest correlation is returned.

    Parameters
    ----------
    df : pl.DataFrame
        Frame holding at least the columns ``u_col``, ``q_col`` and ``score_col``.
    k : int
        Number of questions per candidate subset.
    u_col, q_col, score_col : str
        Names of the user, question and score columns.

    Returns
    -------
    (top_subset, top_corr)
        ``top_subset`` is the winning tuple of ``k`` question values, or ``None``
        when no combination produced a non-null, non-NaN correlation above
        ``-1.0``; ``top_corr`` is the matching correlation (``-1.0`` if none).

    Notes
    -----
    Combinations are streamed in batches rather than materialised as one list,
    so memory use is bounded by the batch size instead of C(n_questions, k).
    """

    # Reduction: only keep users who answered >= k questions.
    good_pop = df.group_by(
        pl.col(u_col)
    ).count().filter(
        pl.col("count") >= k
    ).drop_in_place(u_col)

    # Sorted by user so the per-subset queries can flag the key as sorted.
    df_local = df.filter(
        pl.col(u_col).is_in(good_pop)
    ).sort(pl.col(u_col))

    # Precompute each user's total score once; reused by every subset query.
    grand_totals = df_local.lazy().group_by(
        pl.col(u_col)
    ).agg(
        pl.col(score_col).sum().alias("grand_totals")
    ).collect().lazy()

    def _corr_query(subset):
        # Lazy query producing a single "corr" value for one question subset.
        return (
            df_local.lazy().filter(
                pl.col(q_col).is_in(subset)
            ).group_by(
                pl.col(u_col).set_sorted()
            ).agg(
                pl.count(),
                pl.col(score_col).sum().alias("qs_totals")
            ).filter(
                # Only users with k rows inside the subset are kept.
                pl.col("count").eq(k)
            ).select(
                pl.col(u_col).set_sorted(),
                pl.col("qs_totals")
            ).join(
                grand_totals
                , on = u_col
            ).select(
                pl.corr(pl.col("qs_totals"), pl.col("grand_totals")).alias("corr")
            )
        )

    batch_size = 1024
    unique_questions = df_local.get_column(q_col).unique()
    # Count the combinations arithmetically instead of building them all.
    n_combs = math.comb(len(unique_questions), k)
    comb_iter = combinations(unique_questions, k)

    top_subset: Optional[tuple] = None
    top_corr: float = -1.0

    # Context manager guarantees the bar is closed even if the run is interrupted.
    with tqdm(total=n_combs) as pbar:
        while True:
            combs = list(islice(comb_iter, batch_size))
            if not combs:
                break

            frames = [_corr_query(subset) for subset in combs]

            for idx, frame in enumerate(pl.collect_all(frames)):
                if len(frame) < 1:
                    continue
                corr = frame.item(0, 0)
                if corr is None or np.isnan(corr):
                    continue
                # Strict comparison: the first subset reaching the maximum wins ties.
                if corr > top_corr:
                    top_subset = combs[idx]
                    top_corr = corr

            # Advance by the real batch length so the last partial batch is exact.
            pbar.update(len(combs))

    return top_subset, top_corr

# Not working: abandoned alternative implementation (k_corrset2), kept commented out for reference only.
# def k_corrset2(df:pl.DataFrame, k: int, u_col:str="user", q_col:str="question", score_col:str="score"):

#     # Reduction: Only Keep Users who answer >= k questions
#     good_pop = df.group_by(
#         pl.col(u_col)
#     ).count().filter(
#         pl.col("count") >= k
#     ).drop_in_place(u_col).sort().set_sorted()

#     df_local = df.filter(
#         pl.col(u_col).is_in(good_pop)
#     ).sort(pl.col(u_col))

#     # Precompute grand totals and reformat dataframe
#     reference = df_local.lazy().groupby(
#         pl.col(u_col)
#         , maintain_order=True
#     ).agg(
#         pl.col(score_col).alias("all_scores"), # This will be in a list
#         pl.col(q_col).alias("all_questions"), # This will be in a list
#         pl.col(score_col).sum().alias("grand_totals")
#     ).collect()

#     batch_size = 2048
#     unique_questions = df_local.get_column(q_col).unique()
#     all_combs = list(combinations(unique_questions, k))
#     pbar = tqdm(total=len(all_combs))
#     top_subset: Optional[tuple] = None
#     top_corr: float = -1.0
#     for i in range(len(all_combs)//batch_size + 1):
#         combs = all_combs[i*batch_size: (i+1)*batch_size]
        
#         frames = (
#             reference.lazy()
#             .with_columns(
#                 pl.col("all_questions").explode().is_in(subset).cast(pl.UInt8).implode().alias("q_idx")
#             ).filter(
#                 # Only users who answered all questions in subset are kept
#                 pl.col("q_idx").list.sum().first().eq(k)
#             ).with_columns(
#                 pl.col("all_scores").explode().dot(
#                     pl.col("q_idx").explode()
#                 ).alias("qs_totals")
#             ).select(
#                 pl.corr(pl.col("qs_totals"), pl.col("grand_totals")).alias("corr"),
#             )
#             for subset in combs
#         )

#         for i, frame in enumerate(pl.collect_all(frames)):
#             if len(frame) >= 1:
#                 idx, corr = i, frame.item(0,0)
#                 if corr is not None:
#                     if (not np.isnan(corr)) & (corr > top_corr):
#                         top_subset = combs[idx]
#                         top_corr = corr
        
#         pbar.update(batch_size)

#     pbar.close()
#     return top_subset, top_corr


In [4]:
# Exhaustive search over every 5-question subset of the loaded data.
k_corrset(df, k=5)

  0%|          | 8192/75287520 [00:13<32:53:46, 635.67it/s]

KeyboardInterrupt: 