In [3]:
# ==========================================
# UIDAI HACKATHON - DATA LOADING
# Working with YOUR exact folder structure
# ==========================================

import pandas as pd
import glob
import os
import warnings
warnings.filterwarnings('ignore')

# Title banner: a 70-character rule above and below two indented title lines.
print("=" * 70)
print(f"{'':20}UIDAI HACKATHON")
print(f"{'':15}DATA LOADING PIPELINE")
print("=" * 70)

# ==========================================
# PROJECT ROOT
# ==========================================

# The project root is taken to be the parent of the current working
# directory (assumes this runs from the notebooks/ folder — TODO confirm
# if it is ever launched from elsewhere).
BASE_PATH = os.path.abspath(os.pardir)
print(f"\n Base folder: {BASE_PATH}")

# ==========================================
# OUTPUT FOLDERS
# ==========================================

# Create each output folder under the project root; exist_ok makes
# re-running harmless.
for subdirs in (('data', 'processed'), ('outputs', 'figures')):
    os.makedirs(os.path.join(BASE_PATH, *subdirs), exist_ok=True)

print(" Folders ready:\n   - data/processed/\n   - outputs/figures/")

# ==========================================
# FUNCTION: COMBINE CSV FILES
# ==========================================

def combine_csv_files(folder_name: str, dataset_name: str) -> "pd.DataFrame | None":
    """
    Read every ``*.csv`` in ``<BASE_PATH>/data/raw/<folder_name>`` and
    concatenate them row-wise into one DataFrame.

    Progress is printed as each file is read. A file that cannot be read is
    reported and skipped rather than aborting the whole load; the skipped
    files are listed again once all files have been attempted.

    Args:
        folder_name: Sub-folder of ``data/raw`` to scan, e.g. 'enrollment',
            'demographic', or 'biometric'.
        dataset_name: Human-readable label, used only in the printed banner.

    Returns:
        The combined DataFrame (fresh 0..n-1 index), or ``None`` when the
        folder is missing, contains no CSV files, or no file could be read.

    Relies on the module-level ``BASE_PATH``.
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print(f" LOADING: {dataset_name}")
    print(rule)

    # Build path: <BASE_PATH>/data/raw/<folder_name>
    folder_path = os.path.join(BASE_PATH, 'data', 'raw', folder_name)
    print(f"Folder: {folder_path}")

    # isdir rather than exists: a regular file with this name is not a folder
    if not os.path.isdir(folder_path):
        print(" ERROR: Folder not found!")
        return None

    # Sort once so files are read (and stacked) in a stable, name-based order
    csv_pattern = os.path.join(folder_path, '*.csv')
    csv_files = sorted(glob.glob(csv_pattern))

    if not csv_files:
        print(" ERROR: No CSV files found!")
        print(f"   Looking for: {csv_pattern}")
        return None

    print(f" Found {len(csv_files)} CSV files\n")

    dfs = []
    failed = []  # basenames of files that could not be read

    for i, file in enumerate(csv_files, 1):
        filename = os.path.basename(file)
        print(f"  [{i}/{len(csv_files)}] {filename}...", end=' ')

        try:
            df = pd.read_csv(file, low_memory=False)
        except Exception as e:
            # Deliberately broad: one unreadable file (parse error, bad
            # encoding, permissions, ...) must not abort the remaining loads.
            print(f" ERROR: {str(e)}")
            failed.append(filename)
        else:
            print(f" {len(df):,} rows")
            dfs.append(df)

    # Repeat the failures after the loop so a partial load is not mistaken
    # for a complete one.
    if failed:
        print(f"\n WARNING: {len(failed)} file(s) skipped: {', '.join(failed)}")

    if not dfs:
        print(" No data loaded")
        return None

    print(f"\n Combining {len(dfs)} files...")
    combined_df = pd.concat(dfs, ignore_index=True)
    print(" SUCCESS!")
    print(f"   Total rows: {len(combined_df):,}")
    print(f"   Total columns: {len(combined_df.columns)}")

    # deep=True includes the actual size of object (string) columns
    memory_mb = combined_df.memory_usage(deep=True).sum() / 1024**2
    print(f"   Memory: {memory_mb:.1f} MB")

    return combined_df

# ==========================================
# LOAD ENROLLMENT DATA (3 FILES)
# ==========================================

df_enrollment = combine_csv_files('enrollment', 'ENROLLMENT DATA')

if df_enrollment is not None:
    # Persist the merged frame under data/processed/
    output_path = os.path.join(BASE_PATH, 'data', 'processed', 'enrollment_combined.csv')
    print(f"\n Saving to: {output_path}")
    df_enrollment.to_csv(output_path, index=False)
    print(" Saved!")

    # One line per column: position, name, dtype and non-null count
    print(f"\n ENROLLMENT COLUMNS ({len(df_enrollment.columns)} total):")
    column_facts = zip(
        df_enrollment.columns,
        df_enrollment.dtypes,
        df_enrollment.count(),
    )
    for position, (name, dtype, non_null) in enumerate(column_facts, start=1):
        print(f"   {position:2d}. {name:30s} | {str(dtype):10s} | {non_null:,} non-null")

    print("\n SAMPLE DATA (first 3 rows):")
    preview = df_enrollment.head(3)
    print(preview.to_string())

    # NOTE(review): the "date range" is the min/max of the first column as
    # stored. If that column holds date strings (object dtype), the
    # comparison is lexicographic rather than chronological — confirm the
    # date format before relying on this line.
    first_column = df_enrollment.iloc[:, 0]
    missing_total = df_enrollment.isna().sum().sum()
    duplicate_total = df_enrollment.duplicated().sum()

    print("\n BASIC STATS:")
    print(f"   Date range: {first_column.min()} to {first_column.max()}")
    print(f"   Missing values: {missing_total:,}")
    print(f"   Duplicate rows: {duplicate_total:,}")

# ==========================================
# LOAD DEMOGRAPHIC DATA (5 FILES)
# ==========================================

df_demographic = combine_csv_files('demographic', 'DEMOGRAPHIC UPDATE DATA')

if df_demographic is not None:
    # Persist the merged frame under data/processed/
    output_path = os.path.join(BASE_PATH, 'data', 'processed', 'demographic_combined.csv')
    print(f"\n Saving to: {output_path}")
    df_demographic.to_csv(output_path, index=False)
    print(" Saved!")

    # Numbered listing of the column names, in frame order
    demographic_columns = list(df_demographic.columns)
    print(f"\n DEMOGRAPHIC COLUMNS ({len(demographic_columns)} total):")
    for position, column_name in enumerate(demographic_columns, start=1):
        print(f"   {position:2d}. {column_name}")

# ==========================================
# LOAD BIOMETRIC DATA (4 FILES)
# ==========================================

df_biometric = combine_csv_files('biometric', 'BIOMETRIC UPDATE DATA')

if df_biometric is not None:
    # Persist the merged frame under data/processed/
    output_path = os.path.join(BASE_PATH, 'data', 'processed', 'biometric_combined.csv')
    print(f"\n Saving to: {output_path}")
    df_biometric.to_csv(output_path, index=False)
    print(" Saved!")

    # Numbered listing of the column names, in frame order
    biometric_columns = list(df_biometric.columns)
    print(f"\n BIOMETRIC COLUMNS ({len(biometric_columns)} total):")
    for position, column_name in enumerate(biometric_columns, start=1):
        print(f"   {position:2d}. {column_name}")

# ==========================================
# FINAL SUMMARY
# ==========================================

print(f"\n{'='*70}")
print(" "*25 + "SUMMARY")
print(f"{'='*70}")

# (section title, raw sub-folder, combined frame). Datasets whose frame is
# None (load failed or folder missing) are left out of the summary.
summary_rows = (
    ("ENROLLMENT DATA", 'enrollment', df_enrollment),
    ("DEMOGRAPHIC DATA", 'demographic', df_demographic),
    ("BIOMETRIC DATA", 'biometric', df_biometric),
)

for title, raw_folder, frame in summary_rows:
    if frame is None:
        continue

    # Count the CSVs actually present instead of hard-coding 3/5/4, so the
    # summary stays truthful when files are added to or removed from the
    # raw folder. This is the number of source files on disk; a file the
    # loader had to skip is still counted here.
    raw_pattern = os.path.join(BASE_PATH, 'data', 'raw', raw_folder, '*.csv')
    n_source_files = len(glob.glob(raw_pattern))

    print(f"\n {title}:")
    print(f"   Source CSV files: {n_source_files}")
    print(f"   Total records: {len(frame):,}")
    print(f"   Columns: {len(frame.columns)}")

print(f"\n{'='*70}")
print(" DATA LOADING COMPLETE!")
print(f"{'='*70}")

# ==========================================
# SAVE COLUMN REFERENCE FILE
# ==========================================

ref_path = os.path.join(BASE_PATH, 'data', 'processed', 'COLUMN_REFERENCE.txt')

# (heading line, frame, text written after the column list). The last
# section has no trailing blank line, hence the empty trailer.
reference_sections = (
    ("ENROLLMENT COLUMNS:\n", df_enrollment, "\n"),
    ("DEMOGRAPHIC COLUMNS:\n", df_demographic, "\n"),
    ("BIOMETRIC COLUMNS:\n", df_biometric, ""),
)

with open(ref_path, 'w', encoding='utf-8') as f:
    f.write("UIDAI HACKATHON - COLUMN REFERENCE\n")
    f.write("="*70 + "\n\n")

    for heading, frame, trailer in reference_sections:
        # A dataset that failed to load contributes no section at all
        if frame is None:
            continue
        f.write(heading)
        f.write("-"*70 + "\n")
        f.writelines(
            f"{position:2d}. {column_name}\n"
            for position, column_name in enumerate(frame.columns, 1)
        )
        f.write(trailer)

print(f"\n Column reference saved: {ref_path}")
print("\n NEXT STEP: Check the column names above and tell me!")



                    UIDAI HACKATHON
               DATA LOADING PIPELINE

 Base folder: c:\Users\ahmad\Desktop\hackathon
 Folders ready:
   - data/processed/
   - outputs/figures/

 LOADING: ENROLLMENT DATA
Folder: c:\Users\ahmad\Desktop\hackathon\data\raw\enrollment
 Found 3 CSV files

  [1/3] file1.csv...  500,000 rows
  [2/3] file2.csv...  500,000 rows
  [3/3] file3.csv...  6,029 rows

 Combining 3 files...
 SUCCESS!
   Total rows: 1,006,029
   Total columns: 7
   Memory: 199.1 MB

 Saving to: c:\Users\ahmad\Desktop\hackathon\data\processed\enrollment_combined.csv
 Saved!

 ENROLLMENT COLUMNS (7 total):
    1. date                           | object     | 1,006,029 non-null
    2. state                          | object     | 1,006,029 non-null
    3. district                       | object     | 1,006,029 non-null
    4. pincode                        | int64      | 1,006,029 non-null
    5. age_0_5                        | int64      | 1,006,029 non-null
    6. age_5_17           