In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display


In [2]:
BASE_PATH = "/Users/ayush/Library/Mobile Documents/com~apple~CloudDocs/Developer/UIDAI/02_Cleaned_Data"

merged = pd.read_csv(f"{BASE_PATH}/merged.csv")


In [3]:
merged['dfi'] = merged['digital_friction_index']


In [4]:
# identify outliers using multiple statistical methods

def identify_outliers(data, method='iqr', threshold=1.5):
    
    outliers = {}

    if method == 'iqr':
        Q1 = data['dfi'].quantile(0.25)
        Q3 = data['dfi'].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR

        outliers['high_friction'] = data[data['dfi'] > upper_bound]
        outliers['low_friction'] = data[data['dfi'] < lower_bound]

    elif method == 'zscore':
        z_scores = np.abs(stats.zscore(data['dfi'], nan_policy='omit'))
        outliers['high_friction'] = data[z_scores > threshold]

    elif method == 'percentile':
        p95 = data['dfi'].quantile(0.95)
        outliers['high_friction'] = data[data['dfi'] > p95]

    else:
        raise ValueError("method must be one of: 'iqr', 'zscore', 'percentile'")

    return outliers


In [5]:
outlier_results = identify_outliers(merged, method='percentile')

high_friction_outliers = outlier_results['high_friction']

print("Total rows in system:", len(merged))
print("High-friction outliers (95th percentile):", len(high_friction_outliers))
print("Percentage affected:", round(len(high_friction_outliers)/len(merged)*100, 2), "%")

display(high_friction_outliers[['state','district','month','dfi']].head())


Total rows in system: 4398
High-friction outliers (95th percentile): 220
Percentage affected: 5.0 %


Unnamed: 0,state,district,month,dfi
12,Andhra Pradesh,adilabad,9,10334.0
13,Andhra Pradesh,adilabad,10,8297.0
14,Andhra Pradesh,adilabad,11,8811.0
15,Andhra Pradesh,adilabad,12,8884.0
66,Andhra Pradesh,hyderabad,10,9631.0


In [6]:
# Save high-friction outliers for ML pipeline
high_friction_outliers.to_csv(
    "../outputs/high_friction_outliers.csv",
    index=False
)

print("Saved: outputs/high_friction_outliers.csv")


Saved: outputs/high_friction_outliers.csv


In [7]:
#Identify outliers using a composite multi-dimensional stress score
    
def multi_dimensional_outliers(data):
    
    df = data.copy()

    # Safety against division by zero
    df['adult_stress_ratio'] = df['adult_system_stress'] / (df['age_18_greater'] + 1)
    df['update_enrol_ratio'] = df['total_updates'] / (df['total_enrolments'] + 1)

    # Composite score (weighted)
    df['composite_score'] = (
        0.4 * df['dfi'] +
        0.3 * df['adult_stress_ratio'] +
        0.3 * df['update_enrol_ratio']
    )

    # Percentile-based outliers on composite score
    threshold = df['composite_score'].quantile(0.95)

    outliers = {
        'high_friction': df[df['composite_score'] > threshold]
    }

    return outliers


In [8]:
multi_outliers = multi_dimensional_outliers(merged)

multi_high = multi_outliers['high_friction']

print("Multi-dimensional high-risk locations:", len(multi_high))

display(
    multi_high[['state','district','dfi','composite_score']]
    .sort_values('composite_score', ascending=False)
    .head(10)
)


Multi-dimensional high-risk locations: 220


Unnamed: 0,state,district,dfi,composite_score
3573,Telangana,nalgonda,35067.0,24551.275479
4300,West Bengal,howrah,32253.0,22580.319819
903,Chhattisgarh,janjgir champa,30972.0,21696.078261
956,Chhattisgarh,raigarh,28335.0,19856.008333
4255,West Bengal,birbhum,28357.0,19852.586063
849,Chhattisgarh,baloda bazar,25513.0,17875.212038
3478,Telangana,adilabad,25467.0,17832.632193
902,Chhattisgarh,janjgir champa,25185.0,17637.233028
3446,Tamil Nadu,tirunelveli,25149.0,17612.014757
3574,Telangana,nalgonda,24234.0,16969.688408


In [9]:
# Analyze geographic patterns of high-friction locations

def analyze_geographic_patterns(outliers):
    
    # State-level concentration 
    state_concentration = (
        outliers.groupby('state')
        .agg(
            outlier_count=('dfi', 'count'),
            avg_dfi=('dfi', 'mean'),
            max_dfi=('dfi', 'max'),
            districts_affected=('district', 'nunique')
        )
        .sort_values('outlier_count', ascending=False)
    )

    # Region mapping 
    region_map = {
        'North': [
            'Jammu And Kashmir','Himachal Pradesh','Punjab',
            'Uttarakhand','Haryana','Delhi','Rajasthan'
        ],
        'South': [
            'Tamil Nadu','Kerala','Karnataka',
            'Andhra Pradesh','Telangana'
        ],
        'East': [
            'West Bengal','Odisha','Jharkhand',
            'Bihar','Assam'
        ],
        'West': [
            'Maharashtra','Gujarat','Goa',
            'Madhya Pradesh','Chhattisgarh'
        ],
        'North-East': [
            'Arunachal Pradesh','Manipur','Meghalaya',
            'Mizoram','Nagaland','Tripura','Sikkim'
        ]
    }

    regional_summary = {}

    for region, states in region_map.items():
        region_df = outliers[outliers['state'].isin(states)]

        regional_summary[region] = {
            'outlier_count': len(region_df),
            'avg_dfi': region_df['dfi'].mean(),
            'states_affected': region_df['state'].nunique(),
            'districts_affected': region_df['district'].nunique()
        }

    return {
        'state_concentration': state_concentration,
        'regional_analysis': regional_summary
    }


In [10]:
geo_results = analyze_geographic_patterns(high_friction_outliers)

print("State-level concentration of high-friction locations")
display(geo_results['state_concentration'].head(10))

print(" Regional summary")
pd.DataFrame(geo_results['regional_analysis']).T


State-level concentration of high-friction locations


Unnamed: 0_level_0,outlier_count,avg_dfi,max_dfi,districts_affected
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tamil Nadu,38,11237.333333,25149.0,23
Chhattisgarh,37,13179.31982,30972.0,21
Andhra Pradesh,22,9379.272727,17635.0,8
Maharashtra,22,12884.040909,23623.0,15
Telangana,17,14308.009804,35067.0,7
Odisha,15,9076.422222,15066.0,10
West Bengal,15,14291.776667,32253.0,9
Haryana,13,9560.307692,14207.0,10
Uttar Pradesh,9,9068.018519,14470.0,7
Karnataka,8,8749.895833,13422.5,7


 Regional summary


Unnamed: 0,outlier_count,avg_dfi,states_affected,districts_affected
North,22.0,9163.704545,4.0,18.0
South,85.0,11136.447059,4.0,41.0
East,38.0,11276.8,4.0,25.0
West,63.0,12778.059259,3.0,39.0
North-East,3.0,7847.111111,1.0,2.0


In [11]:
# Analyze month-wise patterns in high-friction locations

def analyze_temporal_patterns(outliers):
    
    monthly_patterns = (
        outliers
        .groupby('month')
        .agg(
            outlier_count=('dfi', 'count'),
            avg_dfi=('dfi', 'mean'),
            dfi_volatility=('dfi', 'std'),
            states_affected=('state', 'nunique'),
            districts_affected=('district', 'nunique')
        )
        .sort_index()
    )

    return {
        'monthly_patterns': monthly_patterns
    }



In [12]:
temporal_results = analyze_temporal_patterns(high_friction_outliers)

print("=== Monthly outlier distribution ===")
display(temporal_results['monthly_patterns'])




=== Monthly outlier distribution ===


Unnamed: 0_level_0,outlier_count,avg_dfi,dfi_volatility,states_affected,districts_affected
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,52,11065.325321,5201.042538,14,49
10,42,11075.369048,4629.347362,8,39
11,37,11434.697748,6017.649036,12,36
12,89,11497.111236,4942.579795,15,87


In [13]:
#Compare high-friction outliers vs normal locations 

def analyze_stress_factors(outliers, full_data):
    
    # Separate normal locations
    normal = full_data.drop(outliers.index)

    metrics = [
        'total_enrolments',
        'total_updates',
        'adult_system_stress',
        'child_system_stress'
    ]

    factor_comparison = {}

    for m in metrics:
        outlier_avg = outliers[m].mean()
        normal_avg = normal[m].mean()

        factor_comparison[m] = {
            'outlier_avg': outlier_avg,
            'normal_avg': normal_avg,
            'ratio': outlier_avg / (normal_avg + 1)
        }

    # Correlation within outliers
    correlation = (
        outliers[metrics + ['dfi']]
        .corr()['dfi']
        .sort_values(ascending=False)
    )

    return {
        'factor_comparison': factor_comparison,
        'correlation': correlation
    }



In [14]:
stress_results = analyze_stress_factors(
    high_friction_outliers,
    merged
)

print(" Factor comparison (Outliers vs Normal) ")

factor_df = pd.DataFrame(stress_results['factor_comparison']).T
display(factor_df.sort_values('ratio', ascending=False))


 Factor comparison (Outliers vs Normal) 


Unnamed: 0,outlier_avg,normal_avg,ratio
adult_system_stress,16164.840909,10007.688368,1.615081
total_updates,23704.077273,15298.245333,1.549363
child_system_stress,7539.236364,5290.556965,1.424767
total_enrolments,1285.618182,1208.403542,1.063018


In [15]:
# Identify infrastructure and process bottlenecks in high-friction locations

def infrastructure_analysis(outliers):
    
    df = outliers.copy()

    df['enrol_update_ratio'] = df['total_enrolments'] / (df['total_updates'] + 1)
    df['adult_stress_ratio'] = df['adult_system_stress'] / (df['age_18_greater'] + 1)

    processing_bottlenecks = df[df['enrol_update_ratio'] > 5]
    adult_processing_issues = df[df['adult_stress_ratio'] > 0.5]

    return {
        'processing_bottlenecks': {
            'count': len(processing_bottlenecks),
            'locations': processing_bottlenecks[
                ['state','district','month','enrol_update_ratio']
            ]
        },
        'adult_processing_issues': {
            'count': len(adult_processing_issues),
            'locations': adult_processing_issues[
                ['state','district','month','adult_stress_ratio']
            ]
        }
    }


In [16]:
# Generate actionable recommendations for a high-friction location

def generate_location_recommendations(location):
    
    recommendations = []

    # Capacity issue
    if location['total_enrolments'] > 100000:
        recommendations.append("Increase processing capacity (servers / staff)")

    # Update inefficiency
    if location['total_updates'] / (location['total_enrolments'] + 1) < 0.1:
        recommendations.append("Optimize update processing workflows")

    # Adult stress dominance
    if location['adult_system_stress'] / (location['age_18_greater'] + 1) > 0.4:
        recommendations.append("Deploy adult-focused biometric capture support")

    # Extreme friction
    if location['dfi'] > location['dfi'].quantile(0.99):
        recommendations.append("Immediate infrastructure audit and intervention")

    if not recommendations:
        recommendations.append("Routine monitoring")

    return recommendations


In [17]:
infra_results = infrastructure_analysis(high_friction_outliers)

print("Processing bottlenecks detected:",
      infra_results['processing_bottlenecks']['count'])

print("Adult stress issues detected:",
      infra_results['adult_processing_issues']['count'])

display(infra_results['processing_bottlenecks']['locations'].head())
display(infra_results['adult_processing_issues']['locations'].head())


Processing bottlenecks detected: 0
Adult stress issues detected: 220


Unnamed: 0,state,district,month,enrol_update_ratio


Unnamed: 0,state,district,month,adult_stress_ratio
12,Andhra Pradesh,adilabad,9,10334.0
13,Andhra Pradesh,adilabad,10,8297.0
14,Andhra Pradesh,adilabad,11,8811.0
15,Andhra Pradesh,adilabad,12,8884.0
66,Andhra Pradesh,hyderabad,10,9631.0


In [18]:
# Generate case studies for top high-friction locations

def generate_case_studies(outliers, top_n=10):
    top = outliers.nlargest(top_n, 'dfi')

    studies = []
    for i, row in top.iterrows():
        studies.append({
            'state': row['state'],
            'district': row['district'],
            'dfi': row['dfi'],
            'total_enrolments': row['total_enrolments'],
            'total_updates': row['total_updates']
        })

    return studies


In [19]:
case_studies = generate_case_studies(high_friction_outliers, top_n=10)

print("TOP 3 CRITICAL OUTLIER CASE STUDIES ")

for cs in case_studies[:3]:
    display(pd.DataFrame.from_dict(cs, orient='index'))


TOP 3 CRITICAL OUTLIER CASE STUDIES 


Unnamed: 0,0
state,Telangana
district,nalgonda
dfi,35067.0
total_enrolments,3025
total_updates,44134.0


Unnamed: 0,0
state,West Bengal
district,howrah
dfi,32253.0
total_enrolments,3415
total_updates,36663.0


Unnamed: 0,0
state,Chhattisgarh
district,janjgir champa
dfi,30972.0
total_enrolments,919
total_updates,48080.0


In [20]:
#Generate system-wide recommendations based on outlier patterns

def generate_system_recommendations(outlier_analysis):
    
    system_recommendations = []
    
    # Regional patterns
    high_risk_regions = [region for region, data in outlier_analysis['regional_analysis'].items() 
                        if data['outlier_count'] > 10]
    
    if high_risk_regions:
        system_recommendations.append({
            'category': 'Regional Infrastructure',
            'recommendation': f'Targeted infrastructure upgrades for {", ".join(high_risk_regions)}',
            'priority': 'high',
            'expected_impact': '25-30% reduction in regional friction'
        })
    
    # Process improvements
    system_recommendations.append({
        'category': 'Process Optimization',
        'recommendation': 'Implement automated load balancing for high-stress locations',
        'priority': 'medium',
        'expected_impact': '15-20% improvement in processing efficiency'
    })
    
    # Monitoring enhancements
    system_recommendations.append({
        'category': 'Monitoring Enhancement',
        'recommendation': 'Real-time alert system for friction threshold breaches',
        'priority': 'high',
        'expected_impact': 'Early detection and prevention of system stress'
    })
    
    return system_recommendations

In [21]:
system_recs = generate_system_recommendations(geo_results)

print(" SYSTEM-WIDE RECOMMENDATIONS ")
display(pd.DataFrame(system_recs))


 SYSTEM-WIDE RECOMMENDATIONS 


Unnamed: 0,category,recommendation,priority,expected_impact
0,Regional Infrastructure,"Targeted infrastructure upgrades for North, So...",high,25-30% reduction in regional friction
1,Process Optimization,Implement automated load balancing for high-st...,medium,15-20% improvement in processing efficiency
2,Monitoring Enhancement,Real-time alert system for friction threshold ...,high,Early detection and prevention of system stress


In [22]:
merged_labeled = merged.copy()
merged_labeled['high_friction_label'] = 0
merged_labeled.loc[
    merged_labeled.index.isin(high_friction_outliers.index),
    'high_friction_label'
] = 1

merged_labeled.to_csv(
    "../outputs/merged_with_labels.csv",
    index=False
)
