In [1]:
# EPF-NOVA Database Merger - LSE Master's Capstone Project
# Simple 3-step process as requested

# ============================================================================
# 0. LOAD THE LIBRARIES REQUIRED
# ============================================================================
import pandas as pd
import numpy as np
import os

# ============================================================================
# 1. OPEN THE DATABASE OF THE IX HOUSEHOLD BUDGET SURVEY
# ============================================================================
print("Step 1: Loading EPF database...")
df_epf = pd.read_stata("./data/base-cantidades-quintilizada-ix-epf-(stata).dta", 
                      convert_categoricals=False)
print(f"✓ EPF loaded: {df_epf.shape[0]:,} rows, {df_epf.shape[1]} columns")

# ============================================================================
# 2. MERGE/JOIN THE NOVA AND COICOP INFORMATION - JOIN WITH CCIF CODE
# ============================================================================
print("Step 2: Loading NOVA mapping...")
df_nova = pd.read_csv("./data/ccif_nova.txt")
print(f"✓ NOVA mapping loaded: {df_nova.shape[0]} rows")

print("Step 3: Merging EPF with NOVA on CCIF codes...")
# Clean CCIF codes for consistent matching
df_epf['ccif'] = df_epf['ccif'].astype(str).str.strip()
df_nova['CCIF'] = df_nova['CCIF'].astype(str).str.strip()

# Perform the merge
df_merged = df_epf.merge(df_nova, left_on='ccif', right_on='CCIF', how='left', indicator=True)

# Show results
merge_stats = df_merged['_merge'].value_counts()
matched = merge_stats.get('both', 0)
total = len(df_merged)
match_rate = (matched / total) * 100

print(f"✓ Merge completed:")
print(f"  - Total records: {total:,}")
print(f"  - With NOVA classification: {matched:,}")
print(f"  - Match rate: {match_rate:.1f}%")

# Show NOVA distribution
if matched > 0:
    nova_dist = df_merged[df_merged['_merge'] == 'both']['NOVA'].value_counts().sort_index()
    print(f"  - NOVA distribution:")
    for nova_level, count in nova_dist.items():
        print(f"    NOVA {nova_level}: {count:,} records")

print("✓ Merged dataset ready in 'df_merged' variable")

Step 1: Loading EPF database...
✓ EPF loaded: 958,410 rows, 26 columns
Step 2: Loading NOVA mapping...
✓ NOVA mapping loaded: 342 rows
Step 3: Merging EPF with NOVA on CCIF codes...
✓ Merge completed:
  - Total records: 958,410
  - With NOVA classification: 900,301
  - Match rate: 93.9%
  - NOVA distribution:
    NOVA 1.0: 325,663 records
    NOVA 2.0: 32,523 records
    NOVA 3.0: 232,346 records
    NOVA 4.0: 309,769 records
✓ Merged dataset ready in 'df_merged' variable


In [2]:
# ============================================================================
# CHECK CCIF CODES THAT DID NOT MERGE
# ============================================================================
print("\nChecking unmatched CCIF codes...")

# Get unmatched records
unmatched = df_merged[df_merged['_merge'] == 'left_only']
unmatched_codes = unmatched[['ccif', 'glosa_ccif']].drop_duplicates().sort_values('ccif')

print(f"Unmatched CCIF codes: {len(unmatched_codes)}")

if len(unmatched_codes) > 0:
    # Count frequency of each unmatched code
    unmatched_counts = unmatched['ccif'].value_counts()
    unmatched_codes['record_count'] = unmatched_codes['ccif'].map(unmatched_counts)
    unmatched_codes = unmatched_codes.sort_values('record_count', ascending=False)
    
    print(f"\nTop unmatched codes by frequency:")
    print("-" * 80)
    print(f"{'CCIF Code':<15} | {'Description':<40} | {'Records'}")
    print("-" * 80)
    for _, row in unmatched_codes.head(15).iterrows():
        print(f"{row['ccif']:<15} | {row['glosa_ccif']:<40} | {row['record_count']:>7}")
    
    # Save full list
    unmatched_codes.to_csv("unmatched_ccif_codes.csv", index=False)
    print(f"\n✓ Full list saved to: unmatched_ccif_codes.csv")
else:
    print("✓ All CCIF codes matched successfully!")


Checking unmatched CCIF codes...
Unmatched CCIF codes: 30

Top unmatched codes by frequency:
--------------------------------------------------------------------------------
CCIF Code       | Description                              | Records
--------------------------------------------------------------------------------
01.4.1.01.01    | GASTOS NO DESGLOSADOS EN ALIMENTOS Y BEBIDAS NO ALCOHÓLICAS |   22072
02.3.1.01.01    | CIGARRILLOS Y CIGARROS                   |   11043
02.5.1.01.01    | GASTOS NO DESGLOSADOS EN BEBIDAS ALCOHÓLICAS, TABACO Y ESTUPEFACIENTES |    9626
02.1.3.01.01    | CERVEZAS CON ALCOHOL                     |    6039
02.1.2.01.01    | VINO DE UVAS                             |    4335
11.1.1.01.09    | OTRAS BEBIDAS ALCOHÓLICAS FERMENTADAS O DESTILADAS (PURAS O COMBINADAS), ADQUIRIDAS EN RESTAURANTES, CAFÉS Y SIMILARES, CON SERVICIO COMPLETO |     767
11.1.1.01.08    | CERVEZAS ADQUIRIDAS EN RESTAURANTES, CAFÉS Y SIMILARES, CON SERVICIO COMPLETO |     611
02.1.

In [17]:
# ============================================================================
# SAVE THE MERGED DATASET
# ============================================================================
print("\nSaving merged dataset...")
output_filename = "epf_nova_merged_dataset.csv"
df_merged.to_csv(output_filename, index=False, encoding='utf-8')
print(f"✓ Merged dataset saved as: {output_filename}")
print(f"  - File contains {len(df_merged):,} records")
print(f"  - Ready for stratified sampling and BETO training preparation")


Saving merged dataset...
✓ Merged dataset saved as: epf_nova_merged_dataset.csv
  - File contains 958,410 records
  - Ready for stratified sampling and BETO training preparation


In [18]:
# 5,000 STRATIFIED SAMPLE FOR BETO TRAINING
# Following your agreed strategy: quintile balance + confidence distribution

import pandas as pd
import numpy as np

# Load the merged dataset
print("Loading merged dataset...")
df_merged = pd.read_csv("epf_nova_merged_dataset.csv")
print(f"✓ Dataset loaded: {len(df_merged):,} records")

# Filter to only records with NOVA classification
df_nova = df_merged[df_merged['_merge'] == 'both'].copy()
print(f"✓ Records with NOVA: {len(df_nova):,}")

# Check quintile column and its values
print(f"\nQuintile column analysis:")
if 'quintil' in df_nova.columns:
    quintile_col = 'quintil'
elif 'quintile' in df_nova.columns:
    quintile_col = 'quintile'
else:
    print("Available columns:")
    print(df_nova.columns.tolist())
    raise ValueError("No quintile column found!")

print(f"Using column: {quintile_col}")
quintile_values = df_nova[quintile_col].value_counts().sort_index()
print(f"Quintile distribution:")
for q, count in quintile_values.items():
    print(f"  {q}: {count:,}")

# Create confidence categories (HIGH/MEDIUM/LOW based on your mapping)
df_nova['confidence_category'] = df_nova['CONFIDENCE'].fillna('UNKNOWN')

print(f"\nConfidence distribution:")
conf_dist = df_nova['confidence_category'].value_counts()
for conf, count in conf_dist.items():
    print(f"  {conf}: {count:,}")

# ============================================================================
# STRATIFIED SAMPLING: 1,000 PER QUINTILE
# ============================================================================
print(f"\nStarting stratified sampling...")

sample_rows = []

# Use actual quintile values from the data
for quintil in df_nova[quintile_col].unique():
    if pd.isna(quintil):
        continue
        
    print(f"\nProcessing Quintil {quintil}:")
    
    quintil_data = df_nova[df_nova[quintile_col] == quintil].copy()
    print(f"  Available records: {len(quintil_data):,}")
    
    if len(quintil_data) == 0:
        print(f"  ⚠️ No data for quintil {quintil}")
        continue
    
    # CONFIDENCE DISTRIBUTION (85% Medium/Low, 15% High)
    medium_low = quintil_data[quintil_data['confidence_category'].isin(['MEDIUM', 'LOW'])]
    high_conf = quintil_data[quintil_data['confidence_category'] == 'HIGH']
    
    print(f"  Medium/Low confidence: {len(medium_low):,}")
    print(f"  High confidence: {len(high_conf):,}")
    
    # Sample 850 from Medium/Low confidence (where manual labeling adds most value)
    if len(medium_low) >= 850:
        sample_medium_low = medium_low.sample(n=850, random_state=42)
    else:
        sample_medium_low = medium_low.copy()
        print(f"  ⚠️ Only {len(medium_low)} Medium/Low available, taking all")
    
    # Sample 150 from High confidence (reliable training foundation)
    if len(high_conf) >= 150:
        # Apply selection heuristics for High confidence diversity
        high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()
        
        # Prefer descriptions ≥3 tokens
        high_long = high_conf[high_conf['token_count'] >= 3]
        high_short = high_conf[high_conf['token_count'] < 3]
        
        # Try to get diverse high confidence samples
        if len(high_long) >= 100:
            sample_high_long = high_long.sample(n=100, random_state=42)
            remaining_high = 50
        else:
            sample_high_long = high_long.copy()
            remaining_high = 150 - len(high_long)
        
        if remaining_high > 0 and len(high_short) > 0:
            sample_high_short = high_short.sample(n=min(remaining_high, len(high_short)), random_state=42)
            sample_high = pd.concat([sample_high_long, sample_high_short])
        else:
            sample_high = sample_high_long.copy()
            
    else:
        sample_high = high_conf.copy()
        print(f"  ⚠️ Only {len(high_conf)} High confidence available, taking all")
    
    # Combine samples for this quintil
    quintil_sample = pd.concat([sample_medium_low, sample_high])
    
    # If we don't have enough, fill from remaining data
    target_size = 1000
    if len(quintil_sample) < target_size:
        remaining_needed = target_size - len(quintil_sample)
        already_sampled = set(quintil_sample.index)
        remaining_data = quintil_data[~quintil_data.index.isin(already_sampled)]
        
        if len(remaining_data) >= remaining_needed:
            additional_sample = remaining_data.sample(n=remaining_needed, random_state=42)
            quintil_sample = pd.concat([quintil_sample, additional_sample])
        else:
            print(f"  ⚠️ Only {len(quintil_data)} total records available for quintil {quintil}")
            quintil_sample = quintil_data.sample(n=min(target_size, len(quintil_data)), random_state=42)
    
    print(f"  Final sample size: {len(quintil_sample)}")
    
    # Add to overall sample
    sample_rows.append(quintil_sample)

# Combine all quintil samples
print(f"\nCombining samples...")

if len(sample_rows) == 0:
    print("❌ No samples collected! Check quintile values and data availability.")
    print("Falling back to simple random sampling...")
    
    # Simple fallback: random sample of 5000 from available NOVA-classified records
    if len(df_nova) >= 5000:
        final_sample = df_nova.sample(n=5000, random_state=42)
        print(f"✓ Random sample of 5,000 records selected")
    else:
        final_sample = df_nova.copy()
        print(f"✓ Using all {len(final_sample):,} available records")
else:
    final_sample = pd.concat(sample_rows, ignore_index=True)
    print(f"✓ Stratified sample size: {len(final_sample):,}")

# ============================================================================
# SELECT AND RENAME REQUIRED VARIABLES
# ============================================================================
print(f"\nSelecting required variables...")

# Select the required variables (including the new ones)
required_vars = ['id_gasto', 'folio', 'ccif', 'glosa_ccif', 'establecimiento', 'descripcion_gasto', 'NOVA', 'CONFIDENCE', quintile_col]
sample_final = final_sample[required_vars].copy()

# Rename variables as requested
sample_final = sample_final.rename(columns={
    'glosa_ccif': 'glosa',
    'descripcion_gasto': 'descripcion',  # SUPER IMPORTANT for relabeling
    'NOVA': 'NOVA_preliminar',
    'CONFIDENCE': 'confidence',
    quintile_col: 'quintil'  # Ensure consistent naming
})

print(f"✓ Variables selected and renamed:")
print(f"  1. id_gasto")
print(f"  2. folio") 
print(f"  3. ccif")
print(f"  4. glosa")
print(f"  5. descripcion (SUPER IMPORTANT - original descriptions for relabeling)")
print(f"  6. NOVA_preliminar")
print(f"  7. confidence")
print(f"  8. quintil")

# ============================================================================
# FINAL SAMPLE STATISTICS
# ============================================================================
print(f"\nFinal Sample Statistics:")
print(f"  Total records: {len(sample_final):,}")

# NOVA distribution
nova_dist = sample_final['NOVA_preliminar'].value_counts().sort_index()
print(f"\n  NOVA Distribution:")
for nova, count in nova_dist.items():
    pct = (count/len(sample_final)*100)
    print(f"    NOVA {nova}: {count:,} ({pct:.1f}%)")

# Confidence distribution  
conf_dist = sample_final['confidence'].value_counts()
print(f"\n  Confidence Distribution:")
for conf, count in conf_dist.items():
    pct = (count/len(sample_final)*100)
    print(f"    {conf}: {count:,} ({pct:.1f}%)")

# ============================================================================
# SAVE SAMPLE FOR BETO TRAINING (WITH PROPER ENCODING)
# ============================================================================
output_filename = "beto_training_sample_5000.csv"
sample_final.to_csv(output_filename, index=False, encoding='utf-8-sig')  # UTF-8 with BOM for Excel compatibility

print(f"\n✓ 5,000 sample saved as: {output_filename}")
print(f"✓ Encoding: UTF-8 with BOM (fixes √â/√ç character issues)")
print(f"✓ Ready for manual NOVA labeling and BETO fine-tuning!")

# Show sample of descriptions to verify encoding
print(f"\nSample descriptions (to verify encoding):")
for i, desc in enumerate(sample_final['descripcion'].dropna().head(3)):
    print(f"  {i+1}. {desc}")

print(f"\nNext steps:")
print(f"  1. Manual review/correction using 'descripcion' (original descriptions)")
print(f"  2. Use 'glosa' for BETO training (standardized descriptions)")
print(f"  3. Target: Medium/Low confidence items for maximum impact")

Loading merged dataset...
✓ Dataset loaded: 958,410 records
✓ Records with NOVA: 900,301

Quintile column analysis:
Using column: quintil
Quintile distribution:
  I: 206,153
  II: 197,591
  III: 183,490
  IV: 169,107
  V: 143,960

Confidence distribution:
  HIGH: 782,564
  LOW: 60,446
  MEDIUM: 57,291

Starting stratified sampling...

Processing Quintil I:
  Available records: 206,153
  Medium/Low confidence: 19,805
  High confidence: 186,348


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()


  Final sample size: 1000

Processing Quintil IV:
  Available records: 169,107
  Medium/Low confidence: 25,196
  High confidence: 143,911


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()


  Final sample size: 1000

Processing Quintil II:
  Available records: 197,591
  Medium/Low confidence: 20,835
  High confidence: 176,756


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()


  Final sample size: 1000

Processing Quintil III:
  Available records: 183,490
  Medium/Low confidence: 22,055
  High confidence: 161,435


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()


  Final sample size: 1000

Processing Quintil V:
  Available records: 143,960
  Medium/Low confidence: 29,846
  High confidence: 114,114
  Final sample size: 1000

Combining samples...
✓ Stratified sample size: 5,000

Selecting required variables...
✓ Variables selected and renamed:
  1. id_gasto
  2. folio
  3. ccif
  4. glosa
  5. descripcion (SUPER IMPORTANT - original descriptions for relabeling)
  6. NOVA_preliminar
  7. confidence
  8. quintil

Final Sample Statistics:
  Total records: 5,000

  NOVA Distribution:
    NOVA 1.0: 462 (9.2%)
    NOVA 2.0: 35 (0.7%)
    NOVA 3.0: 1,274 (25.5%)
    NOVA 4.0: 3,229 (64.6%)

  Confidence Distribution:
    MEDIUM: 2,138 (42.8%)
    LOW: 2,112 (42.2%)
    HIGH: 750 (15.0%)

✓ 5,000 sample saved as: beto_training_sample_5000.csv
✓ Encoding: UTF-8 with BOM (fixes √â/√ç character issues)
✓ Ready for manual NOVA labeling and BETO fine-tuning!

Sample descriptions (to verify encoding):
  1. HELADO
  2. DESAYUNO
  3. ALMUERZO, PLATO DE FONDO, LI

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high_conf['token_count'] = high_conf['glosa_ccif'].str.split().str.len()


In [19]:
# ENHANCED SAMPLE EXPLORATION WITH MORE STATISTICS
import pandas as pd

df = pd.read_csv("beto_training_sample_5000.csv", encoding='utf-8-sig')

print(f"Sample: {df.shape}")

# Basic distributions
print(f"\nQuintiles: {df['quintil'].value_counts().sort_index().to_dict()}")
print(f"NOVA: {df['NOVA_preliminar'].value_counts().sort_index().to_dict()}")
print(f"Confidence: {df['confidence'].value_counts().to_dict()}")

# NOVA × Confidence by Quintile
print(f"\nNOVA × Confidence by Quintile (%):")
for q in ['I', 'II', 'III', 'IV', 'V']:
    qdata = df[df['quintil'] == q]
    cross = pd.crosstab(qdata['NOVA_preliminar'], qdata['confidence'], normalize='index') * 100
    print(f"\nQuintil {q}:")
    print(cross.round(1))

# Food categories (first 2 digits of CCIF)
df['food_category'] = df['ccif'].str[:2]
print(f"\nFood Categories (CCIF first 2 digits):")
cat_dist = df['food_category'].value_counts()
for cat, count in cat_dist.items():
    pct = count/len(df)*100
    print(f"  {cat}: {count} ({pct:.1f}%)")

# NOVA by Food Category
print(f"\nNOVA Distribution by Food Category (%):")
nova_by_cat = pd.crosstab(df['food_category'], df['NOVA_preliminar'], normalize='index') * 100
print(nova_by_cat.round(1))

# Confidence by Food Category
print(f"\nConfidence Distribution by Food Category (%):")
conf_by_cat = pd.crosstab(df['food_category'], df['confidence'], normalize='index') * 100
print(conf_by_cat.round(1))

Sample: (5000, 9)

Quintiles: {'I': 1000, 'II': 1000, 'III': 1000, 'IV': 1000, 'V': 1000}
NOVA: {1.0: 462, 2.0: 35, 3.0: 1274, 4.0: 3229}
Confidence: {'MEDIUM': 2138, 'LOW': 2112, 'HIGH': 750}

NOVA × Confidence by Quintile (%):

Quintil I:
confidence        HIGH   LOW  MEDIUM
NOVA_preliminar                     
1.0               75.3  24.7     0.0
2.0              100.0   0.0     0.0
3.0               12.5  83.1     4.4
4.0                6.9  26.4    66.7

Quintil II:
confidence        HIGH   LOW  MEDIUM
NOVA_preliminar                     
1.0               83.3  16.7     0.0
2.0              100.0   0.0     0.0
3.0               13.1  81.5     5.4
4.0                6.3  25.2    68.5

Quintil III:
confidence        HIGH   LOW  MEDIUM
NOVA_preliminar                     
1.0               72.4  27.6     0.0
2.0              100.0   0.0     0.0
3.0               18.5  77.2     4.3
4.0                5.9  25.1    69.0

Quintil IV:
confidence        HIGH   LOW  MEDIUM
NOVA_preliminar 

In [20]:
# Check NOVA 2 confidence distribution in original merged data
import pandas as pd

df_merged = pd.read_csv("epf_nova_merged_dataset.csv")
df_nova = df_merged[df_merged['_merge'] == 'both']

print("NOVA 2 Confidence in Original Data:")
nova2_data = df_nova[df_nova['NOVA'] == 2.0]
print(f"Total NOVA 2 records: {len(nova2_data):,}")

if len(nova2_data) > 0:
    conf_dist = nova2_data['CONFIDENCE'].value_counts()
    print("Confidence distribution:")
    for conf, count in conf_dist.items():
        pct = count/len(nova2_data)*100
        print(f"  {conf}: {count:,} ({pct:.1f}%)")
else:
    print("No NOVA 2 records found!")

NOVA 2 Confidence in Original Data:
Total NOVA 2 records: 32,523
Confidence distribution:
  HIGH: 32,523 (100.0%)


In [15]:
# MERGED DATABASE DESCRIPTIVES
import pandas as pd

df_merged = pd.read_csv("epf_nova_merged_dataset.csv")
df_nova = df_merged[df_merged['_merge'] == 'both']

print("MERGED DATABASE DESCRIPTIVES")
print("=" * 40)

print(f"Total EPF records: {len(df_merged):,}")
print(f"Records with NOVA: {len(df_nova):,}")
print(f"Match rate: {len(df_nova)/len(df_merged)*100:.1f}%")

# NOVA Distribution in Population
print(f"\nNOVA Distribution in Chilean Households:")
nova_dist = df_nova['NOVA'].value_counts().sort_index()
for nova, count in nova_dist.items():
    pct = count/len(df_nova)*100
    print(f"  NOVA {nova}: {count:,} ({pct:.1f}%)")

# Confidence Distribution in Population  
print(f"\nConfidence Distribution:")
conf_dist = df_nova['CONFIDENCE'].value_counts()
for conf, count in conf_dist.items():
    pct = count/len(df_nova)*100
    print(f"  {conf}: {count:,} ({pct:.1f}%)")

# Quintile Distribution
print(f"\nQuintile Distribution:")
quintil_dist = df_nova['quintil'].value_counts().sort_index()
for q, count in quintil_dist.items():
    pct = count/len(df_nova)*100
    print(f"  Quintil {q}: {count:,} ({pct:.1f}%)")

# Food Categories
df_nova['food_category'] = df_nova['ccif'].str[:2]
print(f"\nFood Categories:")
cat_dist = df_nova['food_category'].value_counts()
for cat, count in cat_dist.items():
    pct = count/len(df_nova)*100
    print(f"  Category {cat}: {count:,} ({pct:.1f}%)")

MERGED DATABASE DESCRIPTIVES
Total EPF records: 958,410
Records with NOVA: 900,301
Match rate: 93.9%

NOVA Distribution in Chilean Households:
  NOVA 1.0: 325,663 (36.2%)
  NOVA 2.0: 32,523 (3.6%)
  NOVA 3.0: 232,346 (25.8%)
  NOVA 4.0: 309,769 (34.4%)

Confidence Distribution:
  HIGH: 782,564 (86.9%)
  LOW: 60,446 (6.7%)
  MEDIUM: 57,291 (6.4%)

Quintile Distribution:
  Quintil I: 206,153 (22.9%)
  Quintil II: 197,591 (21.9%)
  Quintil III: 183,490 (20.4%)
  Quintil IV: 169,107 (18.8%)
  Quintil V: 143,960 (16.0%)

Food Categories:
  Category 01: 839,855 (93.3%)
  Category 11: 60,446 (6.7%)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nova['food_category'] = df_nova['ccif'].str[:2]
