In [2]:
!pip install pandas numpy scikit-learn



In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler

In [4]:
df_part1 = pd.read_csv("api_data_aadhar_enrolment_0_500000.csv")
df_part2 = pd.read_csv("api_data_aadhar_enrolment_500000_1000000.csv")
df_part3 = pd.read_csv("api_data_aadhar_enrolment_1000000_1006029.csv")

df = pd.concat([df_part1, df_part2, df_part3], ignore_index=True)

df.head()

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,02-03-2025,Meghalaya,East Khasi Hills,793121,11,61,37.0
1,09-03-2025,Karnataka,Bengaluru Urban,560043,14,33,39.0
2,09-03-2025,Uttar Pradesh,Kanpur Nagar,208001,29,82,12.0
3,09-03-2025,Uttar Pradesh,Aligarh,202133,62,29,15.0
4,09-03-2025,Karnataka,Bengaluru Urban,560016,14,16,21.0


In [5]:
df = df.rename(columns={
    'Date' : 'date',
    'State Name': 'state',
    'District Name': 'district',
    'Pincode': 'pincode',
    'Age_0_5': 'age_0_5',
    'Age_5_17': 'age_5_17',
    'Age_18_': 'age_18_greater'
})

In [7]:
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y', errors='coerce')
df['month'] = df['date'].dt.month

df['total_enrolments'] = df['age_0_5'] + df['age_5_17'] + df['age_18_greater']
df['child_enrolments'] = df['age_0_5'] + df['age_5_17']
df['adult_enrolments'] = df['age_18_greater']

df = df.sort_values(['district', 'month'])

In [13]:
df['rolling_mean_3'] = df.groupby('district')['total_enrolments'] \
    .transform(lambda x: x.rolling(3, min_periods=1).mean())

df['rolling_std_3'] = df.groupby('district')['total_enrolments'] \
    .transform(lambda x: x.rolling(3, min_periods=1).std())

df['spike_score'] = (df['total_enrolments'] - df['rolling_mean_3']) / \
    (df['rolling_std_3'] + 1)

df['child_adult_ratio'] = df['child_enrolments'] / \
    (df['adult_enrolments'] + 1)

df['mom_growth'] = df.groupby('district')['total_enrolments'] \
    .pct_change().fillna(0)

  .pct_change().fillna(0)


In [9]:
features = df[['total_enrolments',
               'spike_score',
               'child_adult_ratio',
               'mom_growth']].fillna(0)

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(features)

In [10]:
model = IsolationForest(
    n_estimators=200,
    contamination=0.08,
    random_state=42
)

df['anomaly_flag'] = model.fit_predict(X_scaled)
df['anomaly_score'] = model.decision_function(X_scaled)

In [11]:
df['norm_anomaly'] = MinMaxScaler().fit_transform(
    df[['anomaly_score']]
)

df['norm_spike'] = MinMaxScaler().fit_transform(
    df[['spike_score']]
)

df['norm_child_skew'] = MinMaxScaler().fit_transform(
    df[['child_adult_ratio']]
)

df['risk_score'] = (
    0.4 * (1 - df['norm_anomaly']) +
    0.3 * df['norm_spike'] +
    0.3 * df['norm_child_skew']
) * 100

In [12]:
def risk_label(score):
    if score >= 70:
        return "HIGH (Audit Recommended)"
    elif score >= 40:
        return "MEDIUM (Monitor)"
    else:
        return "LOW (Normal)"

df['risk_level'] = df['risk_score'].apply(risk_label)

df['explanation'] = np.where(
    df['risk_score'] >= 70,
    "Abnormal enrolment spike & demographic skew detected",
    "Normal enrolment behavior"
)

In [14]:
df[['state', 'district', 'month',
    'risk_score', 'risk_level', 'explanation']].head(300)

Unnamed: 0,state,district,month,risk_score,risk_level,explanation
23108,100000,100000,9.0,,LOW (Normal),Normal enrolment behavior
46946,100000,100000,9.0,16.870839,LOW (Normal),Normal enrolment behavior
97816,100000,100000,9.0,17.561149,LOW (Normal),Normal enrolment behavior
115798,100000,100000,9.0,20.225133,LOW (Normal),Normal enrolment behavior
153156,100000,100000,9.0,29.634954,LOW (Normal),Normal enrolment behavior
...,...,...,...,...,...,...
88809,Andhra Pradesh,Adilabad,9.0,38.818976,LOW (Normal),Normal enrolment behavior
89409,Telangana,Adilabad,9.0,12.108417,LOW (Normal),Normal enrolment behavior
90866,Andhra Pradesh,Adilabad,9.0,11.469916,LOW (Normal),Normal enrolment behavior
90934,Andhra Pradesh,Adilabad,9.0,15.102608,LOW (Normal),Normal enrolment behavior


In [15]:
df.to_csv("aadhaar_risk_output.csv", index=False)