# Labelled Feedback Sampling
Takes a random or stratified sample of labelled feedback records to use as context for few-shot prompting.

In [None]:
from google.cloud import bigquery
import pandas as pd
import os

# BigQuery project, dataset and table names, read from environment variables.
# os.getenv returns None for any variable that is not set.
PUBLISHING_PROJECT_ID = os.getenv("PUBLISHING_PROJECT_ID")
LABELLED_FEEDBACK_DATASET = os.getenv("LABELLED_FEEDBACK_DATASET")
LABELLED_FEEDBACK_TABLE = os.getenv("LABELLED_FEEDBACK_TABLE")

In [None]:
def query_bigquery(project_id: str, dataset_id: str, query: str) -> pd.DataFrame:
    """Run a SQL query against BigQuery and return the result as a DataFrame.

    Args:
        project_id (str): BigQuery project ID the client is created for
        dataset_id (str): BigQuery dataset ID. It is not needed to run the
            query (the SQL text has to reference its tables itself); the
            parameter is kept so existing callers keep working.
        query (str): SQL query to get data from BigQuery

    Returns:
        pd.DataFrame: DataFrame containing the rows returned by the query
    """
    # Initialize a BigQuery client bound to the given project
    client = bigquery.Client(project=project_id)

    # Make a BigQuery API request to run the query
    query_job = client.query(query)

    # to_dataframe() waits for the job to finish before downloading the rows,
    # so no separate result() call is needed beforehand.
    return query_job.to_dataframe()

In [None]:
# Build the read query by substituting the table name (taken from the
# environment) for the placeholder in the SQL template.
query_read = """
SELECT * FROM  @feedback_sample_table
""".replace("@feedback_sample_table", str(LABELLED_FEEDBACK_TABLE))

# Run the query and load every labelled feedback record into a DataFrame
sample_df = query_bigquery(
    project_id=PUBLISHING_PROJECT_ID,
    dataset_id=LABELLED_FEEDBACK_DATASET,
    query=query_read,
)
print(len(sample_df))

In [None]:
# Display count of labels by distinct feedback_record_id
def count_labels(sample_df: pd.DataFrame) -> pd.DataFrame:
    """Count, per label, the distinct records carrying it and their share of all records.

    Args:
        sample_df (pd.DataFrame): data with a 'feedback_record_id' column and a
            list-like 'labels' column (one record may carry several labels)

    Returns:
        pd.DataFrame: one row per label with the columns 'labels',
            'count_unique_ids' (distinct feedback_record_id values carrying the
            label) and 'class_proportion' (count_unique_ids divided by the
            number of distinct feedback_record_id values in sample_df), sorted
            by 'count_unique_ids' in descending order. Because a record can
            carry several labels, the proportions may sum to more than 1.
    """
    sample_df_count = (
        sample_df.explode("labels")
        .groupby("labels")["feedback_record_id"]
        .nunique()
        .reset_index(name="count_unique_ids")
        .sort_values(by=["count_unique_ids"], ascending=False)
    )

    # Normalise by the number of records, not by the number of distinct labels:
    # only then is the value a proportion that can be compared between
    # datasets of different sizes (e.g. the full data versus a sample).
    n_unique_records = sample_df["feedback_record_id"].nunique()
    sample_df_count["class_proportion"] = (
        sample_df_count["count_unique_ids"] / n_unique_records
    )

    return sample_df_count

In [None]:
# Per-label counts and class proportions for the full labelled dataset
sample_df_count = count_labels(sample_df)

In [None]:
# Set sample size (used for both the stratified and the random sample below)
n = 25

In [None]:
def stratified_sample_with_underrepresented_bias(
    sample_df: pd.DataFrame, n: int, underrepresented_bias_frac=0.2
):
    """Create a stratified sample with a bias towards underrepresented classes. Class variable should be called 'labels'.

    Args:
        sample_df (pd.DataFrame): data to sample from
        n (int): desired sample size
        underrepresented_bias_frac(float): proportion of sample that should be made up of underrepresented classes

    Returns:
        pd.DataFrame: stratified sample
    """
    # Normalize the DataFrame by exploding 'labels'
    df_normalized = sample_df.explode("labels")

    # Calculate the proportion of each class
    class_proportions = df_normalized["labels"].value_counts(normalize=True)

    # Determine the number of samples for underrepresented classes (20% of n)
    samples_for_underrepresented = max(1, int(n * underrepresented_bias_frac))

    # Calculate sample sizes for each class, considering the total desired size n and the additional allocation for diversity
    total_samples_needed = (
        n + samples_for_underrepresented
    )  # Adjust total samples to include diversity allocation

    # Calculate initial sample size per class before adding diversity, attempting to respect original proportions
    initial_samples_per_class = (
        (class_proportions * (total_samples_needed - samples_for_underrepresented))
        .round()
        .astype(int)
    )

    # Ensure the sum of initial samples does not exceed total_samples_needed due to rounding adjustments
    while initial_samples_per_class.sum() > (
        total_samples_needed - samples_for_underrepresented
    ):
        initial_samples_per_class[initial_samples_per_class.idxmax()] -= 1

    # Sample based on calculated sizes
    initial_samples_list = [
        df_normalized[df_normalized["labels"] == cls].sample(
            n=min(cnt, len(df_normalized[df_normalized["labels"] == cls])),
            random_state=42,
        )
        for cls, cnt in initial_samples_per_class.items()
        if cnt > 0
    ]
    initial_samples = pd.concat(initial_samples_list)

    # Now add diversity: sample from underrepresented classes not already covered in initial_samples
    covered_classes = initial_samples["labels"].unique()
    additional_classes = df_normalized[~df_normalized["labels"].isin(covered_classes)][
        "labels"
    ].unique()

    if additional_classes.size > 0:
        additional_samples_list = [
            df_normalized[df_normalized["labels"] == cls].sample(n=1, random_state=42)
            for cls in additional_classes
        ]
        additional_samples = pd.concat(additional_samples_list)
        # Combine initial and additional samples
        final_sample = (
            pd.concat([initial_samples, additional_samples]).drop_duplicates().head(n)
        )
    else:
        final_sample = initial_samples.head(n)

    # Join back on to df to get full set of labels per record
    final_sample = pd.merge(
        final_sample.drop(columns=["labels"]),
        sample_df[["feedback_record_id", "labels"]],
        on="feedback_record_id",
        how="left",
    )

    return final_sample

In [None]:
# Generate stratified sample of size n with a small bias towards
# underrepresented classes (default underrepresented_bias_frac of 0.2)
stratified_sample_df = stratified_sample_with_underrepresented_bias(
    sample_df=sample_df, n=n
)

In [None]:
# Ratio of selected items by label in the stratified sample
strat_2_df_count = count_labels(stratified_sample_df)
# Look the proportions up by label. A plain column assignment would align on
# the integer index, which does not identify the same label in both frames
# whenever the sample is missing some labels. Labels absent from the sample
# get a proportion of 0.
sample_df_count["strat_sample_class_proportion"] = (
    sample_df_count["labels"]
    .map(strat_2_df_count.set_index("labels")["class_proportion"])
    .fillna(0.0)
)

In [None]:
# Random sample (seeded like the stratified sample so the comparison between
# the two is reproducible across notebook runs)
random_sample_df = sample_df.sample(n=n, random_state=42)

In [None]:
# Ratio of selected items by label
rand_df_count = count_labels(random_sample_df)
# Look the proportions up by label. A plain column assignment would align on
# the integer index, which does not identify the same label in both frames
# whenever the sample is missing some labels. Labels absent from the sample
# get a proportion of 0.
sample_df_count["random_sample_class_proportion"] = (
    sample_df_count["labels"]
    .map(rand_df_count.set_index("labels")["class_proportion"])
    .fillna(0.0)
)

In [None]:
# Create simple stratified sample, without greater representation of smaller categories
# stratified_sample = sample_df.explode("labels").groupby("labels", group_keys=False).apply(
#     lambda x: x.sample(frac=0.16)
# )

# strat_df_count = count_labels(stratified_sample)
# sample_df_count["strat_sample_class_proportion"] = strat_df_count["class_proportion"]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Reshape DataFrame to long format: one row per (label, proportion column),
# so the three proportions can be drawn as grouped bars
df_melted = sample_df_count.melt(
    id_vars=["labels"],
    value_vars=[
        "class_proportion",
        "random_sample_class_proportion",
        "strat_sample_class_proportion",
    ],
    var_name="Y_Variable",
    value_name="Value",
)

# Plot one group of bars per label, one bar per proportion column
plt.figure(figsize=(16, 12))
sns.barplot(x="labels", y="Value", hue="Y_Variable", data=df_melted)
plt.xticks(rotation=90)
plt.title(
    "Bar plot of label class distributions in original dataset, random sample and stratified sample"
)
plt.xlabel("Label")
plt.ylabel("Class proportion")
plt.legend(title="Variables", loc="upper right")
plt.show()