In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Build a stratified train/test split of the VQA dataset.
#
# The VQA rows are joined with per-image metadata on 'path', split 80/20 while
# stratifying on 'product_type', and written to two CSV files without the
# 'product_type' helper column.

# Configuration: every tunable value lives here so the file names used for
# saving and the ones shown in the summary cannot drift apart.
VQA_CSV = "vqa_dataset_groq_final.csv"
META_CSV = "sampled_metadata_stratified.csv"
TRAIN_CSV = "vqa_train.csv"
TEST_CSV = "vqa_test.csv"
TEST_FRACTION = 0.2
SEED = 42
# A stratified split needs at least two members in every class.
MIN_CLASS_SIZE = 2

# Step 1: Load both datasets
vqa_df = pd.read_csv(VQA_CSV)
meta_df = pd.read_csv(META_CSV)

# Step 2: Keep a single row per 'path' in both files (the first occurrence wins).
# NOTE(review): if the VQA file holds several question/answer rows per image
# path, all but the first are discarded here -- confirm this is intended.
vqa_df = vqa_df.drop_duplicates(subset='path')
meta_df = meta_df.drop_duplicates(subset='path')

# Step 3: Attach 'product_type' to each VQA row. The inner join drops VQA rows
# without metadata; 'path' is unique on both sides after Step 2, so the join
# must be one-to-one and pandas is asked to verify that.
merged_df = pd.merge(
    vqa_df,
    meta_df[['path', 'product_type']],
    on='path',
    how='inner',
    validate='one_to_one',
)

# Step 4: Drop rare or missing product_type (required for stratified split).
# value_counts() skips NaN by default, so rows with a missing product_type are
# never in valid_classes and are filtered out together with the rare classes.
counts = merged_df['product_type'].value_counts()
valid_classes = counts[counts >= MIN_CLASS_SIZE].index
filtered_df = merged_df[merged_df['product_type'].isin(valid_classes)]

if filtered_df.empty:
    # Fail with a clear message instead of an opaque error from the splitter.
    raise ValueError(
        "No rows left after merging and filtering rare/missing product_type; "
        "cannot create a stratified split."
    )

# Step 5: Stratified train-test split (seeded for reproducibility)
train_df, test_df = train_test_split(
    filtered_df,
    test_size=TEST_FRACTION,
    stratify=filtered_df['product_type'],
    random_state=SEED,
)

# Step 6: Drop 'product_type' column and save the splits
train_df.drop(columns=['product_type']).to_csv(TRAIN_CSV, index=False)
test_df.drop(columns=['product_type']).to_csv(TEST_CSV, index=False)

# Step 7: Sanity checks and summary printout
# The two splits must partition filtered_df exactly, and no image path may
# appear on both sides.
assert len(train_df) + len(test_df) == len(filtered_df), "split sizes do not add up"
assert set(train_df['path']).isdisjoint(test_df['path']), "path present in both splits"

# Note: 'Total merged entries' is the row count BEFORE the Step 4 filter, so it
# can be larger than train + test; the difference is the number of rows dropped
# for having a rare or missing product_type.
print("✅ Stratified split completed and saved:")
print(f"🔹 Total merged entries: {len(merged_df)}")
print(f"🟩 Train set: {len(train_df)} rows → '{TRAIN_CSV}'")
print(f"🟥 Test set:  {len(test_df)} rows → '{TEST_CSV}'")


✅ Stratified split completed and saved:
🔹 Total merged entries: 18203
🟩 Train set: 14464 rows → 'vqa_train.csv'
🟥 Test set:  3616 rows → 'vqa_test.csv'
