## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

## Load the Dataset

In [None]:
# Read the results table and keep only the "professional_law" rows in one chained
# expression; every later cell that uses `df` sees this filtered frame.
df = pd.read_parquet("res.pqt").loc[lambda frame: frame["subject"] == "professional_law"]

## Topic Name

In [None]:
# Subject analysed by the cells below. Other subject names mentioned in this notebook:
# "high_school_macroeconomics" and "professional_psychology".
# NOTE(review): `df` is already restricted to "professional_law" in the loading cell, so
# any other value here makes `df[df["subject"] == topic_name]` empty — confirm the intent.
topic_name = "professional_law"

## Function to compute the average sample mean of the `correct` column for different sample sizes

In [None]:
def compute_avg_correct_vs_sample_size(df, max_k=500, n=1000, seed=42):
    """Average the sample mean of ``df['correct']`` for every subset size 1..max_k.

    For each size ``k`` the function draws ``n`` random subsets of ``k`` rows
    without replacement, takes the mean of ``correct`` in each subset, and
    records the average of those ``n`` means.

    NOTE: a later cell of this notebook redefines this name with a variant that
    returns ``(k_values, means)`` instead of just ``means``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``correct`` column.
    max_k : int
        Largest subset size; must not exceed ``len(df)``.
    n : int
        Number of random subsets drawn for each size.
    seed : int
        Seed of the private random generator used for the draws.

    Returns
    -------
    list
        ``max_k`` values; element ``k - 1`` is the averaged sample mean for size ``k``.

    Raises
    ------
    ValueError
        If ``max_k`` is larger than the number of rows in ``df``.
    """
    # Fail before the long loop starts rather than partway through it.
    if max_k > len(df):
        raise ValueError(
            f"max_k={max_k} exceeds the number of rows ({len(df)}); "
            "cannot sample without replacement"
        )

    # A private RandomState produces the same draws as seeding the global NumPy
    # generator, but leaves the global random state untouched.
    rng = np.random.RandomState(seed)

    # Only this column is needed, so sample it directly instead of copying
    # full rows on every draw.
    correct = df['correct']

    means = []
    for k in tqdm(range(1, max_k + 1), desc="Sample size loop"):
        trials = [
            correct.sample(n=k, replace=False, random_state=rng).mean()
            for _ in range(n)
        ]
        means.append(np.mean(trials))
    return means

## Execution

In [None]:
# Reference value: mean of `correct` over every row of `df`.
overall_mean = df.loc[:, 'correct'].mean()

# Random-sampling baseline with max_k=300 and n=1000 draws per size.
avg_means = compute_avg_correct_vs_sample_size(df, max_k=300, n=1000)

print("Overall mean of 'correct':", overall_mean)

Sample size loop: 100%|██████████| 300/300 [02:01<00:00,  2.47it/s]

Overall mean of 'correct': 0.6236138290932811





## Compute Sampling Baseline (Random)

We compute the average "correct" score over random subsets of increasing size (k = 1 to `max_k`), drawing many subsets per size to average out sampling noise. This provides a reference curve against which smarter selection strategies will be compared.

In [None]:
from tqdm import tqdm

def compute_avg_correct_vs_sample_size(df, max_k=300, n=1000, seed=42):
    """Random-sampling baseline: averaged sample mean of ``correct`` per subset size.

    For each size ``k`` in 1..max_k the function draws ``n`` random subsets of
    ``k`` rows without replacement, takes the mean of ``correct`` in each, and
    records the average of those ``n`` means.

    NOTE: this definition replaces the earlier one of the same name in this
    notebook; unlike that one it returns the subset sizes alongside the means.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``correct`` column.
    max_k : int
        Largest subset size; must not exceed ``len(df)``.
    n : int
        Number of random subsets drawn for each size.
    seed : int
        Seed of the private random generator used for the draws.

    Returns
    -------
    tuple of (list, list)
        ``[1, ..., max_k]`` and the matching list of averaged sample means.

    Raises
    ------
    ValueError
        If ``max_k`` is larger than the number of rows in ``df``.
    """
    # Fail before the long loop starts rather than partway through it.
    if max_k > len(df):
        raise ValueError(
            f"max_k={max_k} exceeds the number of rows ({len(df)}); "
            "cannot sample without replacement"
        )

    # A private RandomState produces the same draws as seeding the global NumPy
    # generator, but leaves the global random state untouched.
    rng = np.random.RandomState(seed)

    # Only this column is needed, so sample it directly instead of copying
    # full rows on every draw.
    correct = df['correct']

    sizes = list(range(1, max_k + 1))
    means = []
    for k in tqdm(sizes):
        trials = [
            correct.sample(n=k, replace=False, random_state=rng).mean()
            for _ in range(n)
        ]
        means.append(np.mean(trials))
    return sizes, means

# Keep only the rows whose subject matches the selected topic.
df_topic = df.loc[df["subject"].eq(topic_name)]

# Reference accuracy over all rows of the topic.
overall_mean = df_topic["correct"].mean()

# Random-sampling baseline curve (subset sizes and their averaged sample means).
k_values, avg_means = compute_avg_correct_vs_sample_size(df_topic, max_k=300, n=1000)

100%|██████████| 300/300 [02:01<00:00,  2.47it/s]


## Save Results

In [None]:
import json

# One row per subset size k with its averaged sample mean.
results_df = pd.DataFrame({"k": k_values, "avg_sample_mean": avg_means})

# Same table in two formats, index column omitted.
results_df.to_csv(f"sample_mean_results_{topic_name}.csv", index=False)
results_df.to_excel(f"sample_mean_results_{topic_name}.xlsx", index=False)

# JSON copy that additionally records the topic and its overall mean.
summary = {
    "k_values": k_values,
    "avg_sample_mean": avg_means,
    "overall_mean": overall_mean,
    "topic": topic_name,
}
with open(f"sample_mean_results_{topic_name}.json", "w") as json_file:
    json.dump(summary, json_file)

## KMeans

In [None]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
from tqdm import tqdm
import json


def kmeans_subset(X, df, k, seed):
    """Pick ``k`` distinct rows of ``df``, one per KMeans centroid fitted on ``X``.

    KMeans is fitted on ``X`` with ``k`` clusters and ``random_state=seed``.
    Centroids are then visited in order; each one claims the row of ``X`` that
    is nearest to it (Euclidean norm) among the rows no earlier centroid has
    claimed.

    Rows of ``X`` are looked up positionally in ``df`` (``df.iloc``), so row
    ``i`` of ``X`` is taken to describe row ``i`` of ``df``.

    Returns the selected rows of ``df`` in centroid order.
    """
    model = KMeans(n_clusters=k, random_state=seed, n_init='auto')
    model.fit(X)

    claimed = np.zeros(len(X), dtype=bool)
    picks = []
    for center in model.cluster_centers_:
        dist_to_center = np.linalg.norm(X - center, axis=1)
        # Rows taken by an earlier centroid are pushed out of reach so the
        # argmin lands on the nearest row that is still free.
        dist_to_center[claimed] = np.inf
        nearest = int(np.argmin(dist_to_center))
        claimed[nearest] = True
        picks.append(nearest)
    return df.iloc[picks]


# Parameters and data loading
# Automatically pick a topic with valid embeddings
def _has_embedding(value):
    """Return True when `value` is a non-empty list or NumPy array."""
    return isinstance(value, (list, np.ndarray)) and len(value) > 0


rows_with_embedding = df[df["embedding"].apply(_has_embedding)]
valid_topics = rows_with_embedding["subject"].unique()

# The first subject that has at least one usable embedding becomes the topic.
topic_name = None if len(valid_topics) == 0 else valid_topics[0]
if topic_name is None:
    raise ValueError("No topics found with valid embeddings")

# Rows of that topic, restricted to those carrying a usable embedding.
df_topic = rows_with_embedding[rows_with_embedding["subject"] == topic_name].copy()
if len(df_topic) == 0:
    raise ValueError(f"No valid embeddings found for topic '{topic_name}'")

# Stack the per-row embeddings into one array (one row per entry of df_topic,
# assuming every embedding is a 1-D vector of equal length).
X = np.vstack(df_topic["embedding"].values)

# Number of differently seeded KMeans fits per subset size, and the sizes to evaluate.
n_repeats = 100
k_values = list(range(1, 301))
kmeans_means = []

# Main evaluation loop
for k in tqdm(k_values, desc=f"KMeans avg over {n_repeats} runs"):
    # Seeds run from 42 to 42 + n_repeats - 1 and are the same for every k.
    accuracies = [
        kmeans_subset(X, df_topic, k=k, seed=run_seed)["correct"].mean()
        for run_seed in range(42, 42 + n_repeats)
    ]
    kmeans_means.append(np.mean(accuracies))

# Save results to CSV: one row per subset size with its averaged KMeans accuracy.
kmeans_columns = {"k": k_values, "kmeans_avg_accuracy": kmeans_means}
results_df = pd.DataFrame(kmeans_columns)
results_df.to_csv(f"kmeans_avg_results_{topic_name}.csv", index=False)


KMeans avg over 100 runs:  72%|███████▏  | 216/300 [3:45:30<2:42:39, 116.18s/it]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Set topic
topic_name = "professional_law"  # e.g., "professional_law", "high_school_macroeconomics", "professional_psychology"

# Load data: both curves are read back from the CSV files written by the earlier cells.
df_random = pd.read_csv(f"sample_mean_results_{topic_name}.csv")
df_kmeans = pd.read_csv(f"kmeans_avg_results_{topic_name}.csv")  # averaged KMeans results
overall_mean = df[df["subject"] == topic_name]["correct"].mean()

# Plot (explicit figure/axes objects instead of the implicit pyplot state)
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_random["k"], df_random["avg_sample_mean"],
        label="Random Sampling (avg over 1000 runs)", color="blue")
# The KMeans evaluation loop averages over n_repeats = 100 seeds, so the legend
# says 100 (it previously said 10, contradicting the loop and the note below).
ax.plot(df_kmeans["k"], df_kmeans["kmeans_avg_accuracy"],
        label="KMeans Clustering (avg over 100 runs)", color="orange")
ax.axhline(y=overall_mean, color="gray", linestyle="--",
           label=f"Overall mean = {overall_mean:.3f}")

ax.set_xlabel("Subset size (k)")
ax.set_ylabel("Accuracy")
ax.set_title(f"Accuracy vs Subset Size ({topic_name})\nRandom vs KMeans")
ax.legend()
ax.grid(True)
fig.tight_layout()
ax.set_ylim(0.3, 1.0)  # standard Y-axis range for fair comparison
fig.savefig(f"comparison_plot_{topic_name}.png", dpi=300)
plt.show()

# Note:
# - KMeans results are averaged over 100 repetitions with different seeds.
# - Random sampling is averaged over 1000 repetitions.