In [172]:
# --- Setup ---
import pandas as pd
import random


In [173]:
# Load the input table from a CSV file; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("sample_input2.csv")  # Replace with your file path

In [174]:
# Display the column labels of the loaded frame.
df.columns

Index(['part_no', 'part_description', 'new_commodity_description'], dtype='object')

In [175]:

def sample_unique_parts_per_group(
    df: pd.DataFrame,
    sample_dict: dict,
    random_state: int = 42,
) -> pd.DataFrame:
    """Sample unique part numbers per commodity group and return their rows.

    For every ``group -> n`` entry in ``sample_dict``, the rows whose
    ``new_commodity_description`` equals ``group`` are selected, ``n`` distinct
    ``part_no`` values are drawn from them, and *all* rows belonging to the
    drawn part numbers are kept (so a group may contribute more than ``n``
    rows when a part number appears on several rows).

    The input frame is not modified: cleaning (stripping whitespace from the
    column labels and from the ``part_no`` / ``new_commodity_description``
    values, after casting both to ``str``) is done on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Source table. After stripping the column labels it must contain
        ``part_no`` and ``new_commodity_description``.
    sample_dict : dict
        Mapping of group name to the number of unique part numbers to draw.
        Group names are compared exactly against the stripped column values.
    random_state : int, default 42
        Seed passed to ``Series.sample`` for each group, making the draw
        reproducible.

    Returns
    -------
    pd.DataFrame
        The selected rows of all groups concatenated with a fresh 0..N-1
        index. If no group produced any rows, an empty frame that still
        carries the cleaned column labels is returned.

    Notes
    -----
    A warning is printed (not raised) when a group has no rows, or when it
    has fewer unique part numbers than requested; in the latter case all of
    its unique part numbers are used.
    """
    # Work on a copy so the caller's frame keeps its original labels/values.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.strip()
    cleaned['part_no'] = cleaned['part_no'].astype(str).str.strip()
    cleaned['new_commodity_description'] = (
        cleaned['new_commodity_description'].astype(str).str.strip()
    )

    sampled_rows = []

    for group, n in sample_dict.items():
        group_df = cleaned[cleaned['new_commodity_description'] == group]

        if group_df.empty:
            print(f"⚠️ No rows found for group '{group}'")
            continue

        unique_parts = group_df['part_no'].drop_duplicates()

        if len(unique_parts) < n:
            print(f"⚠️ Only {len(unique_parts)} unique parts found for group '{group}', but {n} requested.")
            # Cap the request: sampling without replacement cannot exceed the pool.
            n = len(unique_parts)

        sampled_parts = unique_parts.sample(n=n, random_state=random_state)
        # Keep every row of each drawn part number, not just one row per part.
        sampled_rows.append(group_df[group_df['part_no'].isin(sampled_parts)])

    if not sampled_rows:
        # Preserve the schema so downstream code / exports still see the columns.
        return cleaned.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_rows, ignore_index=True)


In [176]:
# Number of unique part numbers to draw from each commodity group.
sample_dict = dict(LVT=2, LVP=3, VCT=4)

# Draw the sample, then export it to an Excel workbook without the row index.
df_sampled = sample_unique_parts_per_group(df, sample_dict)
df_sampled.to_excel("sampled_output_output1.xlsx", index=False)
