# Biometric EDA (Universal 14-Step Framework)
Run top-to-bottom. This notebook mirrors the same framework used for enrollment and demographic EDA.


## STEP 1 - Business Understanding


In [1]:
# Aim: Define business context for biometric table.
# Expected Output: Business context table.
# What You Get: Clear semantic definition of row meaning and usage.
# Data Engineer Learning: Business meaning must be explicit before technical profiling.

import pandas as pd
import numpy as np
from pathlib import Path

business_context = pd.DataFrame([
    {'field': 'row_definition', 'value': 'One row represents biometric counts for one date-state-district-pincode record.'},
    {'field': 'table_type', 'value': 'Aggregated event summary table.'},
    {'field': 'measured_values', 'value': 'Biometric count columns (numeric measures).'},
    {'field': 'decision_support', 'value': 'Biometric trend tracking, quality controls, geographic comparisons.'},
])
display(business_context)


Unnamed: 0,field,value
0,row_definition,One row represents biometric counts for one da...
1,table_type,Aggregated event summary table.
2,measured_values,Biometric count columns (numeric measures).
3,decision_support,"Biometric trend tracking, quality controls, ge..."


## STEP 2 - Structural Profiling


In [2]:
# Aim: Load biometric table with robust path handling.
# Expected Output: Base dataframe loaded and row/column count printed.
# What You Get: Stable data source initialization.
# Data Engineer Learning: Portable ingestion logic reduces environment-specific failures.

candidate_files = [
    Path('scripts/EDA/panda_eda/data/data_aadhar_biometric_full.csv'),
    Path('scripts/EDA/panda_eda/eda_enrollment/data/data_aadhar_biometric_full.csv'),
    Path('data/data_aadhar_biometric_full.csv'),
]

biometric_path = next((f for f in candidate_files if f.exists()), None)
if biometric_path is None:
    raise FileNotFoundError(f'Could not find biometric file in: {candidate_files}')

data_aadhar_biometric_full = pd.read_csv(biometric_path)
print('Biometric path:', biometric_path)
print('Rows, Cols:', data_aadhar_biometric_full.shape)


Biometric path: data\data_aadhar_biometric_full.csv
Rows, Cols: (1861108, 6)


In [3]:
# Aim: Run structural profiling checks.
# Expected Output: shape, columns, info, head, and sample rows.
# What You Get: Schema overview and first quality signals.
# Data Engineer Learning: Early schema visibility prevents downstream assumptions.

df = data_aadhar_biometric_full
print('Shape:', df.shape)
print('\nColumns:')
print(list(df.columns))
print('\nInfo:')
print(df.info())
print('\nHead:')
display(df.head())
print('\nSample:')
display(df.sample(min(5, len(df)), random_state=42))

Shape: (1861108, 6)

Columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1861108 entries, 0 to 1861107
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   date          object
 1   state         object
 2   district      object
 3   pincode       int64 
 4   bio_age_5_17  int64 
 5   bio_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 85.2+ MB
None

Head:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815



Sample:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
690756,24-10-2025,Kerala,Malappuram,676304,6,16
1047602,09-11-2025,Andhra Pradesh,Krishna,521001,4,3
511058,20-09-2025,Gujarat,Anand,388130,1,3
920820,03-11-2025,Tamil Nadu,Tirunelveli,627803,8,5
231064,06-09-2025,West Bengal,Bankura,722205,9,13


## STEP 3 - Grain Identification


In [4]:
# Aim: Identify natural key and test uniqueness.
# Expected Output: full duplicate count and key duplicate count.
# What You Get: Evidence for row grain.
# Data Engineer Learning: Grain clarity is mandatory for trustworthy modeling.

df = data_aadhar_biometric_full.copy()
natural_key = ['date', 'state', 'district', 'pincode']
full_dups = int(df.duplicated().sum())
key_dups = int(df.duplicated(subset=natural_key).sum())
print('Natural Key:', natural_key)
print('Full-row duplicates:', full_dups)
print('Key-level duplicates:', key_dups)


Natural Key: ['date', 'state', 'district', 'pincode']
Full-row duplicates: 94896
Key-level duplicates: 94896


## STEP 4 - Duplicate Analysis


In [5]:
# Aim: Inspect duplicate groups and build deduplicated working table.
# Expected Output: duplicate sample + before/after key duplicate counts.
# What You Get: Clean working table for downstream steps.
# Data Engineer Learning: Separate raw and deduplicated tables for lineage traceability.

df = data_aadhar_biometric_full.copy()
natural_key = ['date', 'state', 'district', 'pincode']

dup_view = df[df.duplicated(subset=natural_key, keep=False)].sort_values(by=['date', 'state'])
print('Duplicate group rows:', len(dup_view))
display(dup_view.head(20))

df_dedup = df.drop_duplicates(subset=natural_key, keep='first').reset_index(drop=True)
print('Key duplicates before:', int(df.duplicated(subset=natural_key).sum()))
print('Key duplicates after :', int(df_dedup.duplicated(subset=natural_key).sum()))


Duplicate group rows: 189792


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
118922,01-09-2025,Assam,Sonitpur,784174,0,1
118923,01-09-2025,Assam,Sonitpur,784182,4,4
118924,01-09-2025,Assam,South Salmara Mankachar,783127,3,3
118925,01-09-2025,Assam,South Salmara Mankachar,783128,0,1
118926,01-09-2025,Assam,Tinsukia,786150,2,0
118927,01-09-2025,Assam,Tinsukia,786152,8,2
118928,01-09-2025,Assam,Tinsukia,786155,3,0
118929,01-09-2025,Assam,Tinsukia,786158,2,2
118930,01-09-2025,Assam,Tinsukia,786170,3,2
118931,01-09-2025,Assam,Tinsukia,786182,3,1


Key duplicates before: 94896
Key duplicates after : 0


## STEP 5 - Missing Value Analysis


In [6]:
# Aim: Analyze nulls, null%, and empty-string fields.
# Expected Output: missingness tables and empty-state count.
# What You Get: Completeness risk baseline.
# Data Engineer Learning: Null and empty-string defects should be measured separately.

df = df_dedup.copy()
null_count = df.isnull().sum().sort_values(ascending=False)
null_pct = (df.isnull().sum() / max(len(df),1) * 100).sort_values(ascending=False)
missing_tbl = pd.DataFrame({'null_count': null_count, 'null_pct': null_pct})
display(missing_tbl.head(20))

if 'state' in df.columns:
    empty_state = int(df['state'].astype(str).str.strip().eq('').sum())
    print('Empty-string state rows:', empty_state)


Unnamed: 0,null_count,null_pct
date,0,0.0
state,0,0.0
district,0,0.0
pincode,0,0.0
bio_age_5_17,0,0.0
bio_age_17_,0,0.0


Empty-string state rows: 0


## STEP 6 - Data Type and Format Validation


In [7]:
# Aim: Validate date type, numeric columns, and pincode format.
# Expected Output: datetime parsing status, pincode length distribution, object columns.
# What You Get: Data contract validation evidence.
# Data Engineer Learning: Type/format contracts prevent silent downstream failures.

df = df_dedup.copy()
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
print('Null dates after parse:', int(df['date'].isna().sum()))

pin_len = df['pincode'].astype(str).str.replace(r'\.0$','', regex=True).str.strip().str.len().value_counts(dropna=False).sort_index()
print('Pincode length distribution:')
display(pin_len.to_frame('count'))

obj_cols = df.select_dtypes(include=['object']).columns.tolist()
print('Object columns:', obj_cols)


Null dates after parse: 0
Pincode length distribution:


Unnamed: 0_level_0,count
pincode,Unnamed: 1_level_1
6,1766212


Object columns: ['state', 'district']


## STEP 7 - Domain Validation


In [8]:
# Aim: Validate value ranges and domain sanity.
# Expected Output: negative-measure diagnostics, date range, district sample.
# What You Get: Domain correctness signal.
# Data Engineer Learning: Domain checks separate true behavior from bad data.

df = df_dedup.copy()
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)

measure_cols = [c for c in df.columns if c not in ['date','state','district','pincode'] and pd.api.types.is_numeric_dtype(df[c])]
neg_rows = int((df[measure_cols] < 0).any(axis=1).sum()) if measure_cols else 0

print('Numeric measure columns:', measure_cols)
print('Rows with negative values:', neg_rows)
print('Date range:', df['date'].min(), 'to', df['date'].max())
print('District sample:')
display(pd.Series(df['district'].dropna().astype(str).unique()).head(20).to_frame('district'))


Numeric measure columns: ['bio_age_5_17', 'bio_age_17_']
Rows with negative values: 0
Date range: 2025-03-01 00:00:00 to 2025-12-29 00:00:00
District sample:


Unnamed: 0,district
0,Mahendragarh
1,Madhepura
2,Punch
3,Bhojpur
4,Madurai
5,Ratnagiri
6,Anand
7,Gandhinagar
8,Dhenkanal
9,Valsad


## STEP 8 - Cardinality and Distribution


In [9]:
# Aim: Compute key cardinality and distribution metrics.
# Expected Output: unique states, districts-per-state, unique pincodes.
# What You Get: Dimensional scaling profile.
# Data Engineer Learning: Cardinality informs partitioning and data model sizing.

df = df_dedup.copy()
print('Unique states:', df['state'].nunique())
print('Unique pincodes:', df['pincode'].nunique())
state_district = df.groupby('state')['district'].nunique().sort_values(ascending=False)
display(state_district.head(20).to_frame('district_nunique'))


Unique states: 57
Unique pincodes: 19707


Unnamed: 0_level_0,district_nunique
state,Unnamed: 1_level_1
Uttar Pradesh,90
Madhya Pradesh,60
West Bengal,58
Karnataka,54
Maharashtra,52
Bihar,47
Rajasthan,46
Odisha,46
Andhra Pradesh,45
Tamil Nadu,45


## STEP 9 - Cross-Column Consistency


In [10]:
# Aim: Test geo relationship stability.
# Expected Output: pincode->district and district->state uniqueness checks.
# What You Get: Referential consistency profile.
# Data Engineer Learning: Cross-column stability is key for reliable joins.

df = df_dedup.copy()
pin_to_district = df.groupby('pincode')['district'].nunique().sort_values(ascending=False)
dist_to_state = df.groupby('district')['state'].nunique().sort_values(ascending=False)

print('Pincodes with >1 district:', int((pin_to_district > 1).sum()))
print('Districts with >1 state  :', int((dist_to_state > 1).sum()))

display(pin_to_district.head(20).to_frame('district_nunique'))
display(dist_to_state.head(20).to_frame('state_nunique'))


Pincodes with >1 district: 7562
Districts with >1 state  : 86


Unnamed: 0_level_0,district_nunique
pincode,Unnamed: 1_level_1
509340,8
721144,8
509339,8
509371,7
500087,7
500090,7
500043,7
500055,7
500014,7
571313,6


Unnamed: 0_level_0,state_nunique
district,Unnamed: 1_level_1
Hooghly,5
South 24 Parganas,4
HOOGHLY,3
Howrah,3
hooghly,3
Gajapati,3
Diu,3
Kargil,3
Daman,3
Ganjam,2


## STEP 10 - Measures vs Dimensions Classification


In [11]:
# Aim: Classify columns into dimensions and measures.
# Expected Output: data dictionary table.
# What You Get: Fact/dimension readiness map.
# Data Engineer Learning: Explicit column roles reduce modeling ambiguity.

df = df_dedup.copy()
measure_cols = [c for c in df.columns if c not in ['date','state','district','pincode'] and pd.api.types.is_numeric_dtype(df[c])]
classification = []
for c in df.columns:
    if c == 'date':
        role = 'Time Dimension'
    elif c in ['state','district','pincode']:
        role = 'Location Dimension'
    elif c in measure_cols:
        role = 'Measure'
    else:
        role = 'Attribute'
    classification.append({'column': c, 'role': role, 'dtype': str(df[c].dtype)})

display(pd.DataFrame(classification))
print('dtypes:')
print(df.dtypes)
print('\nDescribe:')
display(df.describe(include='all'))


Unnamed: 0,column,role,dtype
0,date,Time Dimension,object
1,state,Location Dimension,object
2,district,Location Dimension,object
3,pincode,Location Dimension,int64
4,bio_age_5_17,Measure,int64
5,bio_age_17_,Measure,int64


dtypes:
date            object
state           object
district        object
pincode          int64
bio_age_5_17     int64
bio_age_17_      int64
dtype: object

Describe:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
count,1766212,1766212,1766212,1766212.0,1766212.0,1766212.0
unique,89,57,974,,,
top,12-11-2025,Tamil Nadu,Pune,,,
freq,23994,174934,11068,,,
mean,,,,521731.4,18.9426,19.70568
std,,,,198338.2,85.81746,90.30576
min,,,,110001.0,0.0,0.0
25%,,,,391150.0,1.0,1.0
50%,,,,522439.0,4.0,4.0
75%,,,,686681.0,11.0,10.0


## STEP 11 - Outlier Detection (Optional Advanced)


In [12]:
# Aim: Detect outliers using total_biometric and z-score.
# Expected Output: outlier count and top outlier rows.
# What You Get: anomaly candidates for investigation.
# Data Engineer Learning: Outlier monitoring supports proactive data quality control.

df = df_dedup.copy()
measure_cols = [c for c in df.columns if c not in ['date','state','district','pincode'] and pd.api.types.is_numeric_dtype(df[c])]
if measure_cols:
    df['total_biometric'] = df[measure_cols].fillna(0).sum(axis=1)
    mu = df['total_biometric'].mean()
    sd = df['total_biometric'].std()
    df['z'] = (df['total_biometric'] - mu) / (sd if pd.notna(sd) and sd != 0 else np.nan)
    outliers = df[df['z'].abs() > 3]
    print('Outlier rows (|z|>3):', len(outliers))
    display(outliers[['date','state','district','pincode','total_biometric','z']].head(20))
else:
    print('No numeric measure columns found for outlier detection.')


Outlier rows (|z|>3): 22756


Unnamed: 0,date,state,district,pincode,total_biometric,z
0,01-03-2025,Haryana,Mahendragarh,123029,857,4.91659
2,01-03-2025,Jammu and Kashmir,Punch,185101,1734,10.185533
3,01-03-2025,Bihar,Bhojpur,802158,1236,7.19359
4,01-03-2025,Tamil Nadu,Madurai,625514,1086,6.292403
5,01-03-2025,Maharashtra,Ratnagiri,416702,684,3.87722
15,01-03-2025,Bihar,Vaishali,844504,1384,8.082762
27,01-03-2025,Maharashtra,Wardha,442101,807,4.616194
28,01-03-2025,Odisha,Nabarangapur,764075,1043,6.034062
30,01-03-2025,Karnataka,Davangere,577002,605,3.402594
31,01-03-2025,Gujarat,Dahod,389382,910,5.235009


## STEP 12 - Trend and Time Analysis (Optional)


In [13]:
# Aim: Build monthly trend for total biometric counts.
# Expected Output: month-wise total trend table.
# What You Get: time-series movement view.
# Data Engineer Learning: Time trends are required for operational monitoring.

df = df_dedup.copy()
df['date'] = pd.to_datetime(df['date'], errors='coerce', dayfirst=True)
measure_cols = [c for c in df.columns if c not in ['date','state','district','pincode'] and pd.api.types.is_numeric_dtype(df[c])]
if measure_cols:
    df['total_biometric'] = df[measure_cols].fillna(0).sum(axis=1)
    df['month'] = df['date'].dt.to_period('M').astype(str)
    monthly = df.groupby('month', as_index=False)['total_biometric'].sum().sort_values('month')
    display(monthly.head(24))
    print('Total months:', monthly['month'].nunique())
else:
    print('No numeric measure columns found for monthly trend.')


Unnamed: 0,month,total_biometric
0,2025-03,8322222
1,2025-04,8641679
2,2025-05,7879956
3,2025-06,7899289
4,2025-07,9792552
5,2025-09,6513507
6,2025-10,4233854
7,2025-11,6770804
8,2025-12,8207196


Total months: 9


## STEP 13 - Data Quality Risk Summary


In [14]:
# Aim: Build quality risk snapshot dictionary/table.
# Expected Output: compact summary object and table.
# What You Get: one-glance decision summary.
# Data Engineer Learning: End EDA with explicit risk metrics.

df = df_dedup.copy()
summary = {
    'rows': len(df),
    'duplicates_key': int(df.duplicated(subset=['date','state','district','pincode']).sum()),
    'null_cells': int(df.isnull().sum().sum()),
    'unique_states': int(df['state'].nunique())
}
print(summary)

summary_tbl = pd.DataFrame([{'metric': k, 'value': v} for k, v in summary.items()])
display(summary_tbl)


{'rows': 1766212, 'duplicates_key': 0, 'null_cells': 0, 'unique_states': 57}


Unnamed: 0,metric,value
0,rows,1766212
1,duplicates_key,0
2,null_cells,0
3,unique_states,57


## STEP 14 - Documentation


In [15]:
# Aim: Create final EDA documentation artifact.
# Expected Output: findings table + markdown export.
# What You Get: durable documentation for handoff/interviews.
# Data Engineer Learning: Documentation makes EDA reusable and auditable.

df = df_dedup.copy()

gr_text = 'One row = biometric counts for one date-state-district-pincode combination.'
nk_text = '(date, state, district, pincode)'
issues = [
    f"Full-row duplicates: {int(df.duplicated().sum())}",
    f"Key duplicates: {int(df.duplicated(subset=['date','state','district','pincode']).sum())}",
    f"Total null cells: {int(df.isnull().sum().sum())}",
]
fixes = [
    'Enforce natural-key uniqueness in ingestion layer.',
    'Enforce date and pincode contracts.',
    'Track pincode referential conflicts as quality KPI.'
]
model_dir = [
    'Use date+location grain for conformed joins.',
    'Publish only quality-checked records to downstream models.'
]

eda_findings_doc = pd.DataFrame([
    {'section': 'Grain', 'details': gr_text},
    {'section': 'Natural Key', 'details': nk_text},
    {'section': 'Identified Issues', 'details': ' | '.join(issues)},
    {'section': 'Expected Fixes', 'details': ' | '.join(fixes)},
    {'section': 'Modeling Direction', 'details': ' | '.join(model_dir)},
])
display(eda_findings_doc)

report_dir = Path('scripts/EDA/panda_eda/consistency_reports')
report_dir.mkdir(parents=True, exist_ok=True)

csv_path = report_dir / 'final_biometric_eda_findings_table.csv'
md_path = report_dir / 'final_biometric_eda_findings.md'
eda_findings_doc.to_csv(csv_path, index=False)

md_lines = ['# Final Biometric EDA Findings', '']
for _, r in eda_findings_doc.iterrows():
    md_lines.append(f"## {r['section']}")
    md_lines.append(str(r['details']))
    md_lines.append('')
md_path.write_text('\n'.join(md_lines), encoding='utf-8')

print('Saved:')
print('-', csv_path)
print('-', md_path)


Unnamed: 0,section,details
0,Grain,One row = biometric counts for one date-state-...
1,Natural Key,"(date, state, district, pincode)"
2,Identified Issues,Full-row duplicates: 0 | Key duplicates: 0 | T...
3,Expected Fixes,Enforce natural-key uniqueness in ingestion la...
4,Modeling Direction,Use date+location grain for conformed joins. |...


Saved:
- scripts\EDA\panda_eda\consistency_reports\final_biometric_eda_findings_table.csv
- scripts\EDA\panda_eda\consistency_reports\final_biometric_eda_findings.md
