# Friction Signal Engineering
This notebook derives core friction signals (UIS, RIS, BSS, TSD) from cleaned, aggregated UIDAI datasets.

### Imports & Setup

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

### Load Cleaned Datasets

In [2]:
# Directory holding the cleaned CSVs that the next cell reads. The path is
# relative, so it resolves against the kernel's current working directory.
PROCESSED_PATH = Path("../datasets/processed")

In [3]:
# Read the three cleaned tables (enrolments, demographic updates, biometric
# updates) from PROCESSED_PATH, in that order.
enrolment_df, demo_df, bio_df = (
    pd.read_csv(PROCESSED_PATH / csv_name)
    for csv_name in (
        "enrolment_cleaned.csv",
        "demographic_updates_cleaned.csv",
        "biometric_updates_cleaned.csv",
    )
)

# Quick load check: the (rows, columns) shape of each table on one line.
print(*(table.shape for table in (enrolment_df, demo_df, bio_df)))

(11394, 4) (12039, 4) (11979, 4)


### Merge Datasets (State–District–Period)

In [4]:
# Key shared by all three tables: one row per state-district-period.
merge_keys = ["state", "district", "period"]

# Enrolments are the base table. Left joins keep every enrolment row and
# attach update counts wherever the same key exists in an update table.
base_df = (
    enrolment_df
    .merge(demo_df, on=merge_keys, how="left")
    .merge(bio_df, on=merge_keys, how="left")
)

# A left join repeats a base row whenever an update table holds the same key
# more than once, which would double-count that row in every signal derived
# below. Fail loudly here instead of carrying duplicated rows forward.
assert len(base_df) == len(enrolment_df), (
    "merge changed the row count: duplicate state/district/period keys "
    "in the demographic or biometric update table"
)

# Fill missing update counts with zero (key present in enrolments but absent
# from an update table).
update_cols = ["demographic_update_count", "biometric_update_count"]
base_df[update_cols] = base_df[update_cols].fillna(0)

base_df

Unnamed: 0,state,district,period,enrolment_count,demographic_update_count,biometric_update_count
0,Andaman & Nicobar Islands,Andamans,2025-01,2,13.0,993.0
1,Andaman & Nicobar Islands,Andamans,2025-02,1,8.0,24.0
2,Andaman & Nicobar Islands,Andamans,2025-04,3,19.0,12.0
3,Andaman & Nicobar Islands,Andamans,2025-09,2,9.0,31.0
4,Andaman & Nicobar Islands,Andamans,2025-11,1,1.0,17.0
...,...,...,...,...,...,...
11389,West Bengal,West Midnapore,2025-12,45,154.0,160.0
11390,West Bengal,West Midnapore,NaT,703,4764.0,827.0
11391,Westbengal,Hooghly,2025-05,1,1.0,0.0
11392,Westbengal,Hooghly,2025-12,1,3.0,2.0


### Update Intensity Signal (UIS)

UIS = (Demographic + Biometric Updates) / Enrolments


In [5]:
# UIS: combined demographic + biometric updates per enrolment, row by row.
combined_updates = base_df["demographic_update_count"].add(
    base_df["biometric_update_count"]
)

# Zero enrolments become NaN so the division yields NaN instead of inf ...
enrolments_nonzero = base_df["enrolment_count"].replace(0, np.nan)

# ... and those undefined ratios are then stored as 0.
base_df["UIS"] = combined_updates.div(enrolments_nonzero).fillna(0)
base_df

Unnamed: 0,state,district,period,enrolment_count,demographic_update_count,biometric_update_count,UIS
0,Andaman & Nicobar Islands,Andamans,2025-01,2,13.0,993.0,503.000000
1,Andaman & Nicobar Islands,Andamans,2025-02,1,8.0,24.0,32.000000
2,Andaman & Nicobar Islands,Andamans,2025-04,3,19.0,12.0,10.333333
3,Andaman & Nicobar Islands,Andamans,2025-09,2,9.0,31.0,20.000000
4,Andaman & Nicobar Islands,Andamans,2025-11,1,1.0,17.0,18.000000
...,...,...,...,...,...,...,...
11389,West Bengal,West Midnapore,2025-12,45,154.0,160.0,6.977778
11390,West Bengal,West Midnapore,NaT,703,4764.0,827.0,7.953058
11391,Westbengal,Hooghly,2025-05,1,1.0,0.0,1.000000
11392,Westbengal,Hooghly,2025-12,1,3.0,2.0,5.000000


### Repeat Interaction Signal (RIS)
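Rolling update pressure over the last three available periods (observations) per district vs. that district's long-term average. Because some districts have gaps between periods, the window spans three rows, not necessarily three calendar months.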

In [6]:
# Order rows by district and then by period label so the rolling window
# below walks each district's rows in that order.
base_df = base_df.sort_values(by=["state", "district", "period"])

base_df["total_updates"] = base_df["demographic_update_count"].add(
    base_df["biometric_update_count"]
)

# One grouped view of total updates per (state, district), used twice below.
district_updates = base_df.groupby(["state", "district"])["total_updates"]

# Mean over the current row and up to two preceding rows of the same district.
# The window counts rows, not calendar months; min_periods=1 lets a district's
# first rows use whatever shorter history exists.
rolling_mean = district_updates.rolling(window=3, min_periods=1).mean()
# Drop the two group levels so the values align on the original row index.
base_df["rolling_updates"] = rolling_mean.droplevel([0, 1])

# Mean over all of a district's rows, broadcast back onto each of its rows.
base_df["historical_avg"] = district_updates.transform("mean")

# RIS = recent mean / long-run mean. A zero long-run mean is turned into NaN
# before dividing, and the resulting undefined ratios are stored as 0.
long_run_nonzero = base_df["historical_avg"].replace(0, np.nan)
base_df["RIS"] = base_df["rolling_updates"].div(long_run_nonzero).fillna(0)
base_df

Unnamed: 0,state,district,period,enrolment_count,demographic_update_count,biometric_update_count,UIS,total_updates,rolling_updates,historical_avg,RIS
0,Andaman & Nicobar Islands,Andamans,2025-01,2,13.0,993.0,503.000000,1006.0,1006.000000,190.285714,5.286787
1,Andaman & Nicobar Islands,Andamans,2025-02,1,8.0,24.0,32.000000,32.0,519.000000,190.285714,2.727477
2,Andaman & Nicobar Islands,Andamans,2025-04,3,19.0,12.0,10.333333,31.0,356.333333,190.285714,1.872623
3,Andaman & Nicobar Islands,Andamans,2025-09,2,9.0,31.0,20.000000,40.0,34.333333,190.285714,0.180430
4,Andaman & Nicobar Islands,Andamans,2025-11,1,1.0,17.0,18.000000,18.0,29.666667,190.285714,0.155906
...,...,...,...,...,...,...,...,...,...,...,...
11389,West Bengal,West Midnapore,2025-12,45,154.0,160.0,6.977778,314.0,408.333333,1656.076923,0.246567
11390,West Bengal,West Midnapore,NaT,703,4764.0,827.0,7.953058,5591.0,2091.666667,1656.076923,1.263025
11391,Westbengal,Hooghly,2025-05,1,1.0,0.0,1.000000,1.0,1.000000,8.000000,0.125000
11392,Westbengal,Hooghly,2025-12,1,3.0,2.0,5.000000,5.0,3.000000,8.000000,0.375000


### Biometric Stress Signal (BSS)

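BSS = Biometric Updates / (Demographic + Biometric Updates), i.e. the share of all updates that are biometric (0 when a row has no updates).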

In [7]:
# BSS: share of a row's total updates that were biometric.
updates_nonzero = base_df["total_updates"].replace(0, np.nan)

# Rows with no updates at all have an undefined share; store 0 for them.
base_df["BSS"] = base_df["biometric_update_count"].div(updates_nonzero).fillna(0)
base_df

Unnamed: 0,state,district,period,enrolment_count,demographic_update_count,biometric_update_count,UIS,total_updates,rolling_updates,historical_avg,RIS,BSS
0,Andaman & Nicobar Islands,Andamans,2025-01,2,13.0,993.0,503.000000,1006.0,1006.000000,190.285714,5.286787,0.987078
1,Andaman & Nicobar Islands,Andamans,2025-02,1,8.0,24.0,32.000000,32.0,519.000000,190.285714,2.727477,0.750000
2,Andaman & Nicobar Islands,Andamans,2025-04,3,19.0,12.0,10.333333,31.0,356.333333,190.285714,1.872623,0.387097
3,Andaman & Nicobar Islands,Andamans,2025-09,2,9.0,31.0,20.000000,40.0,34.333333,190.285714,0.180430,0.775000
4,Andaman & Nicobar Islands,Andamans,2025-11,1,1.0,17.0,18.000000,18.0,29.666667,190.285714,0.155906,0.944444
...,...,...,...,...,...,...,...,...,...,...,...,...
11389,West Bengal,West Midnapore,2025-12,45,154.0,160.0,6.977778,314.0,408.333333,1656.076923,0.246567,0.509554
11390,West Bengal,West Midnapore,NaT,703,4764.0,827.0,7.953058,5591.0,2091.666667,1656.076923,1.263025,0.147916
11391,Westbengal,Hooghly,2025-05,1,1.0,0.0,1.000000,1.0,1.000000,8.000000,0.125000,0.000000
11392,Westbengal,Hooghly,2025-12,1,3.0,2.0,5.000000,5.0,3.000000,8.000000,0.375000,0.400000


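### Temporal Spike Deviation (TSD)

TSD = (Total Updates − district mean of Total Updates) / district standard deviation of Total Updates, i.e. a per-district z-score of each period's total updates.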

In [8]:
# Number of distinct period labels present for each state, summarised across states.
periods_per_state = base_df.groupby("state")["period"].nunique()
periods_per_state.describe()


count    48.000000
mean     11.562500
std       2.967241
min       1.000000
25%      12.000000
50%      13.000000
75%      13.000000
max      13.000000
Name: period, dtype: float64

In [9]:
# Standard deviation of row-level total updates within each state,
# summarised across states.
updates_std_by_state = base_df.groupby("state")["total_updates"].std()
updates_std_by_state.describe()


count       47.000000
mean     11096.375062
std      11624.226086
min          3.109126
25%        825.130816
50%       6043.197786
75%      19138.748107
max      42629.172284
Name: total_updates, dtype: float64

In [10]:
# Parse the period labels into timestamps. They go into a separate column so
# the original labels stay untouched.
period_timestamps = pd.to_datetime(base_df["period"])
base_df["period_dt"] = period_timestamps


In [11]:
# Re-sort rows within each district by the parsed timestamp column.
chronological_keys = ["state", "district", "period_dt"]
base_df = base_df.sort_values(by=chronological_keys)


In [12]:
def _zscore_within_group(updates):
    """Z-score one group's total updates against that group's own mean and std.

    Parameters
    ----------
    updates : pd.Series
        The ``total_updates`` values of a single (state, district) group.

    Returns
    -------
    pd.Series
        ``(updates - mean) / std``. A constant group (std == 0) is divided by 1,
        so every row scores 0. A group whose std is NaN yields NaN.
    """
    spread = updates.std()  # computed once per group
    if spread == 0:
        spread = 1
    return (updates - updates.mean()) / spread


# Temporal Spike Deviation (district-level baseline): each row is compared
# with the mean and standard deviation of its own (state, district) group.
base_df["TSD"] = (
    base_df.groupby(["state", "district"])["total_updates"]
    .transform(_zscore_within_group)
)
# Groups with an undefined (NaN) standard deviation, e.g. a district with a
# single row, produce NaN above; treat those rows as "no deviation".
base_df["TSD"] = base_df["TSD"].fillna(0)
base_df["TSD"].describe()

count    1.139400e+04
mean     9.743927e-19
std      9.534040e-01
min     -1.212062e+00
25%     -3.334802e-01
50%     -3.074994e-01
75%     -2.861346e-01
max      3.326111e+00
Name: TSD, dtype: float64

### Min-Max Normalization of Signals

In [13]:
# Min-max scale each signal to [0, 1]. This overwrites the column in place,
# so the unscaled values are no longer available afterwards.
for signal in ("UIS", "RIS", "BSS"):
    lowest, highest = base_df[signal].min(), base_df[signal].max()
    if not highest > lowest:
        # Zero range (or undefined bounds): nothing to scale, store 0.
        base_df[signal] = 0
        continue
    base_df[signal] = (base_df[signal] - lowest) / (highest - lowest)


In [14]:
# Freeze raw TSD so the z-score stays available next to the scaled version.
base_df["TSD_raw"] = base_df["TSD"]

tsd_min = base_df["TSD_raw"].min()
tsd_max = base_df["TSD_raw"].max()

# Min-max scale to [0, 1]. When every value is identical the range is zero
# and the division would give NaN for all rows, so store 0.0 instead. This
# is the same zero-range rule applied to UIS/RIS/BSS.
if tsd_max > tsd_min:
    base_df["TSD_norm"] = (base_df["TSD_raw"] - tsd_min) / (tsd_max - tsd_min)
else:
    base_df["TSD_norm"] = 0.0

base_df["TSD_norm"].describe()


count    11394.000000
mean         0.267082
std          0.210085
min          0.000000
25%          0.193598
50%          0.199323
75%          0.204031
max          1.000000
Name: TSD_norm, dtype: float64

### Save Signal Dataset

In [15]:
# Signals are written to a "signals" sub-folder of the processed-data
# directory. The path is built from PROCESSED_PATH so the two locations cannot
# drift apart; it is the same location the previous hard-coded literal pointed to.
SIGNAL_PATH = PROCESSED_PATH / "signals"
# Create the folder (and any missing parents); a no-op if it already exists.
SIGNAL_PATH.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information, so keep it out of the CSV.
base_df.to_csv(SIGNAL_PATH / "friction_signals.csv", index=False)

print("Friction signals generated and saved.")

Friction signals generated and saved.


### Quick Sanity Checks

In [16]:
# Summary statistics for the four signal columns. UIS, RIS and BSS were
# min-max scaled in place earlier, while TSD is still the unscaled z-score
# (its scaled version lives in TSD_norm), so the ranges are not comparable.
signal_cols = ["UIS", "RIS", "BSS", "TSD"]
base_df[signal_cols].describe()

Unnamed: 0,UIS,RIS,BSS,TSD
count,11394.0,11394.0,11394.0,11394.0
mean,0.002101,0.126689,0.515981,9.743926999999999e-19
std,0.01421,0.231641,0.246605,0.953404
min,0.0,0.0,0.0,-1.212062
25%,0.000225,0.008986,0.338456,-0.3334802
50%,0.000354,0.014762,0.522398,-0.3074994
75%,0.000583,0.07155,0.69695,-0.2861346
max,1.0,1.0,1.0,3.326111
