In [4]:
# Augment dataset using YOUR specific CCIF codes and NOVA assignments
import pandas as pd
import numpy as np

# Load datasets
df_train = pd.read_excel('beto_training_sample_5000_alfredo.xlsx')
df_full = pd.read_parquet('epf_nova_merged.parquet')

print("=== CHECKING FOR DUPLICATES IN ORIGINAL DATA ===")
print(f"Original training data: {len(df_train)} records")
print(f"Unique id_gasto in training: {df_train['id_gasto'].nunique()}")

# Check if training data already has duplicates
train_duplicates = df_train[df_train.duplicated(subset=['id_gasto'], keep=False)]
if len(train_duplicates) > 0:
    print(f"⚠️  WARNING: Training data has {len(train_duplicates)} duplicate id_gasto values!")
    print(train_duplicates[['id_gasto', 'descripcion', 'NOVA']].head())

print("\n=== EXTRACTING AUGMENTATION SAMPLES ===")

# YOUR SPECIFIC MAPPINGS
# NOVA 1 - Fresh produce, eggs
nova1_mapping = {
    '01.1.4.08.01': 'HUEVOS DE GALLINA, FRESCOS Y CON CÁSCARA',
    '01.1.6.02.02': 'NARANJAS FRESCAS',
    '01.1.7.02.02': 'ZAPALLOS FRESCOS O REFRIGERADOS',
    '01.1.7.02.07': 'TOMATES FRESCOS O REFRIGERADOS',
    '01.1.7.04.05': 'CEBOLLAS FRESCAS O REFRIGERADAS',
    '01.1.7.05.01': 'PAPAS Y OTROS TUBÉRCULOS'
}

# NOVA 2 - Basic ingredients
nova2_mapping = {
    '01.1.5.01.01': 'ACEITES VEGETALES, DE FRUTA O SEMILLAS',
    '01.1.9.03.07': 'VINAGRES'
}

# NOVA 3 - Processed foods
nova3_mapping = {
    '01.1.1.03.01': 'PAN CORRIENTE A GRANEL',
    '01.1.3.03.02': 'ATÚN EN CONSERVA',
    '01.1.3.03.03': 'JUREL EN CONSERVA'
}

# IMPORTANT: Get all id_gasto values from training data to avoid duplicates
existing_id_gastos = set(df_train['id_gasto'].values)
print(f"\nExisting id_gasto values in training data: {len(existing_id_gastos)}")

# Filter out any records from full dataset that already exist in training
df_full_filtered = df_full[~df_full['id_gasto'].isin(existing_id_gastos)].copy()
print(f"Full dataset after removing existing id_gastos: {len(df_full_filtered)} records")

# Extract samples for each NOVA category
augmentation_samples = []

# NOVA 1 - Extract 700 samples
print("\nNOVA 1 - Extracting 700 samples...")
nova1_data = df_full_filtered[df_full_filtered['ccif'].isin(nova1_mapping.keys())].copy()
print(f"Found {len(nova1_data)} total NOVA 1 candidates (excluding existing)")

# Show distribution by CCIF
for ccif, glosa in nova1_mapping.items():
    count = len(nova1_data[nova1_data['ccif'] == ccif])
    print(f"  {ccif} ({glosa}): {count} records")

# Sample 700
if len(nova1_data) >= 700:
    # Stratified sampling to get variety
    nova1_sample = nova1_data.groupby('ccif', group_keys=False).apply(
        lambda x: x.sample(min(len(x), 120), random_state=42)  # ~120 per category
    ).sample(700, random_state=42)
else:
    nova1_sample = nova1_data
    print(f"  ⚠️  Only {len(nova1_sample)} samples available for NOVA 1")

nova1_sample['NOVA'] = 1
augmentation_samples.append(nova1_sample)

# NOVA 2 - Extract 200 samples  
print("\nNOVA 2 - Extracting 200 samples...")
nova2_data = df_full_filtered[df_full_filtered['ccif'].isin(nova2_mapping.keys())].copy()
print(f"Found {len(nova2_data)} total NOVA 2 candidates (excluding existing)")

for ccif, glosa in nova2_mapping.items():
    count = len(nova2_data[nova2_data['ccif'] == ccif])
    print(f"  {ccif} ({glosa}): {count} records")

# Sample 200
if len(nova2_data) >= 200:
    nova2_sample = nova2_data.sample(200, random_state=42)
else:
    nova2_sample = nova2_data
    print(f"  ⚠️  Only {len(nova2_sample)} samples available for NOVA 2")
    
nova2_sample['NOVA'] = 2
augmentation_samples.append(nova2_sample)

# NOVA 3 - Extract 100 samples
print("\nNOVA 3 - Extracting 100 samples...")
nova3_data = df_full_filtered[df_full_filtered['ccif'].isin(nova3_mapping.keys())].copy()
print(f"Found {len(nova3_data)} total NOVA 3 candidates (excluding existing)")

for ccif, glosa in nova3_mapping.items():
    count = len(nova3_data[nova3_data['ccif'] == ccif])
    print(f"  {ccif} ({glosa}): {count} records")

# Sample 100
if len(nova3_data) >= 100:
    # Get variety across the 3 types
    nova3_sample = nova3_data.groupby('ccif', group_keys=False).apply(
        lambda x: x.sample(min(len(x), 34), random_state=42)  # ~33 per category
    ).sample(100, random_state=42)
else:
    nova3_sample = nova3_data
    print(f"  ⚠️  Only {len(nova3_sample)} samples available for NOVA 3")

nova3_sample['NOVA'] = 3
augmentation_samples.append(nova3_sample)

# Combine all augmentation
df_augment = pd.concat(augmentation_samples, ignore_index=True)

# VERIFY NO DUPLICATES IN AUGMENTATION
augment_duplicates = df_augment[df_augment.duplicated(subset=['id_gasto'], keep=False)]
if len(augment_duplicates) > 0:
    print(f"\n⚠️  WARNING: Augmentation has {len(augment_duplicates)} duplicate id_gasto values!")
    # Remove duplicates, keeping first occurrence
    df_augment = df_augment.drop_duplicates(subset=['id_gasto'], keep='first')
    print(f"Removed duplicates. New augmentation size: {len(df_augment)}")

print(f"\n=== AUGMENTATION SUMMARY ===")
print(f"Total new samples: {len(df_augment)}")
print("\nDistribution by NOVA:")
print(df_augment['NOVA'].value_counts().sort_index())

# Show some examples
print("\n=== SAMPLE OF AUGMENTATION DATA ===")
for nova in [1, 2, 3]:
    nova_samples = df_augment[df_augment['NOVA'] == nova]
    if len(nova_samples) > 0:
        print(f"\nNOVA {nova} examples:")
        examples = nova_samples.sample(min(3, len(nova_samples)))
        for _, row in examples.iterrows():
            print(f"  {row['ccif']}: {row['descripcion_gasto']} | {row['establecimiento']} (id: {row['id_gasto']})")

# Prepare for merging - match column names with training data
df_augment_final = pd.DataFrame({
    'id_gasto': df_augment['id_gasto'],
    'folio': df_augment['folio'],
    'descripcion': df_augment['descripcion_gasto'],
    'establecimiento': df_augment['establecimiento'],
    'ccif': df_augment['ccif'],
    'NOVA': df_augment['NOVA'].astype(int),
    'glosa': df_augment['glosa_ccif']
})

# Save augmentation samples
df_augment_final.to_csv('augmentation_1000_samples.csv', index=False)
print(f"\nSaved augmentation to 'augmentation_1000_samples.csv'")

# Combine with original training data
print("\n=== COMBINING WITH TRAINING DATA ===")

# Select relevant columns from training data
df_train_clean = df_train[['id_gasto','folio','descripcion', 'establecimiento', 'ccif', 'NOVA', 'glosa']].copy()

# Combine
df_combined = pd.concat([df_train_clean, df_augment_final], ignore_index=True)

# FINAL VERIFICATION - Check for duplicates in combined dataset
print("\n=== FINAL DUPLICATE CHECK ===")
final_duplicates = df_combined[df_combined.duplicated(subset=['id_gasto'], keep=False)]
if len(final_duplicates) > 0:
    print(f"⚠️  ERROR: Combined dataset has {len(final_duplicates)} duplicate id_gasto values!")
    print("Sample duplicates:")
    print(final_duplicates[['id_gasto', 'descripcion', 'NOVA']].head(10))
    
    # Remove duplicates, keeping the one from training data (first occurrence)
    df_combined = df_combined.drop_duplicates(subset=['id_gasto'], keep='first')
    print(f"\nRemoved duplicates. Final size: {len(df_combined)}")
else:
    print("✓ No duplicate id_gasto values found!")

print(f"\nOriginal training: {len(df_train_clean)} samples")
print(f"Augmentation: {len(df_augment_final)} samples")
print(f"Combined total: {len(df_combined)} samples")
print(f"Unique id_gasto values: {df_combined['id_gasto'].nunique()}")

# Final distribution
print("\n=== FINAL DISTRIBUTION ===")
final_dist = df_combined['NOVA'].value_counts().sort_index()
for nova, count in final_dist.items():
    print(f"NOVA {nova}: {count} samples ({count/len(df_combined)*100:.1f}%)")

# Save final training dataset
df_combined.to_csv('training_dataset_6000.csv', index=False)
print(f"\nSaved final training dataset to 'training_dataset_6000.csv'")

# Save a summary of what we did
with open('augmentation_summary.txt', 'w') as f:
    f.write("AUGMENTATION SUMMARY\n")
    f.write("="*50 + "\n")
    f.write(f"Original training samples: {len(df_train_clean)}\n")
    f.write(f"Augmentation samples added: {len(df_augment_final)}\n")
    f.write(f"Final total samples: {len(df_combined)}\n")
    f.write(f"Unique id_gasto values: {df_combined['id_gasto'].nunique()}\n")
    f.write("\nFinal NOVA distribution:\n")
    for nova, count in final_dist.items():
        f.write(f"  NOVA {nova}: {count} ({count/len(df_combined)*100:.1f}%)\n")

print("\n✓ Summary saved to 'augmentation_summary.txt'")

=== CHECKING FOR DUPLICATES IN ORIGINAL DATA ===
Original training data: 5000 records
Unique id_gasto in training: 5000

=== EXTRACTING AUGMENTATION SAMPLES ===

Existing id_gasto values in training data: 5000
Full dataset after removing existing id_gastos: 954626 records

NOVA 1 - Extracting 700 samples...
Found 76783 total NOVA 1 candidates (excluding existing)
  01.1.4.08.01 (HUEVOS DE GALLINA, FRESCOS Y CON CÁSCARA): 16559 records
  01.1.6.02.02 (NARANJAS FRESCAS): 4340 records
  01.1.7.02.02 (ZAPALLOS FRESCOS O REFRIGERADOS): 7921 records
  01.1.7.02.07 (TOMATES FRESCOS O REFRIGERADOS): 20054 records
  01.1.7.04.05 (CEBOLLAS FRESCAS O REFRIGERADAS): 12402 records
  01.1.7.05.01 (PAPAS Y OTROS TUBÉRCULOS): 15507 records


  nova1_sample = nova1_data.groupby('ccif', group_keys=False).apply(



NOVA 2 - Extracting 200 samples...
Found 8203 total NOVA 2 candidates (excluding existing)
  01.1.5.01.01 (ACEITES VEGETALES, DE FRUTA O SEMILLAS): 7601 records
  01.1.9.03.07 (VINAGRES): 602 records

NOVA 3 - Extracting 100 samples...
Found 119770 total NOVA 3 candidates (excluding existing)
  01.1.1.03.01 (PAN CORRIENTE A GRANEL): 113433 records
  01.1.3.03.02 (ATÚN EN CONSERVA): 4792 records
  01.1.3.03.03 (JUREL EN CONSERVA): 1545 records

=== AUGMENTATION SUMMARY ===
Total new samples: 1000

Distribution by NOVA:
NOVA
1    700
2    200
3    100
Name: count, dtype: int64

=== SAMPLE OF AUGMENTATION DATA ===

NOVA 1 examples:
  01.1.7.05.01: PAPAS FRESCAS EN MALLA | LIDER (id: 12242-1-65)
  01.1.6.02.02: NARANJAS FRESCAS | FERIA (id: 10229-1-54)
  01.1.7.02.07: TOMATES FRESCOS | FERIA (id: 9035-1-97)

NOVA 2 examples:
  01.1.5.01.01: ACEITE VEG. CAMPO | SUPERMERCADO AHORRA MAS (id: 13694-1-23)
  01.1.5.01.01: ACEITE VEGETAL | FERIA (id: 13844-1-16)
  01.1.5.01.01: ACEITE 04 MARAVIL

  nova3_sample = nova3_data.groupby('ccif', group_keys=False).apply(
