In [1]:
import numpy as np 
import pandas as pd
from pathlib import Path



In [2]:
PROJECT_ROOT = Path.cwd().parent
DATA_PATH = (
    PROJECT_ROOT
    
    / "external"
    / "kaggle_benchmarks"
    / "keystroke_dynamics"
    / "raw"
    / "DSL-StrongPasswordData.csv"
)

assert DATA_PATH.exists(), f"File not found: {DATA_PATH}"



In [3]:
df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,subject,sessionIndex,rep,H.period,DD.period.t,UD.period.t,H.t,DD.t.i,UD.t.i,H.i,...,H.a,DD.a.n,UD.a.n,H.n,DD.n.l,UD.n.l,H.l,DD.l.Return,UD.l.Return,H.Return
0,s002,1,1,0.1491,0.3979,0.2488,0.1069,0.1674,0.0605,0.1169,...,0.1349,0.1484,0.0135,0.0932,0.3515,0.2583,0.1338,0.3509,0.2171,0.0742
1,s002,1,2,0.1111,0.3451,0.234,0.0694,0.1283,0.0589,0.0908,...,0.1412,0.2558,0.1146,0.1146,0.2642,0.1496,0.0839,0.2756,0.1917,0.0747
2,s002,1,3,0.1328,0.2072,0.0744,0.0731,0.1291,0.056,0.0821,...,0.1621,0.2332,0.0711,0.1172,0.2705,0.1533,0.1085,0.2847,0.1762,0.0945
3,s002,1,4,0.1291,0.2515,0.1224,0.1059,0.2495,0.1436,0.104,...,0.1457,0.1629,0.0172,0.0866,0.2341,0.1475,0.0845,0.3232,0.2387,0.0813
4,s002,1,5,0.1249,0.2317,0.1068,0.0895,0.1676,0.0781,0.0903,...,0.1312,0.1582,0.027,0.0884,0.2517,0.1633,0.0903,0.2517,0.1614,0.0818


In [4]:
df.columns

Index(['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t',
       'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e',
       'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r',
       'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o', 'UD.Shift.r.o', 'H.o',
       'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n', 'H.n', 'DD.n.l',
       'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return', 'H.Return'],
      dtype='object')

# Meta columns

In [5]:
meta_cols = ["subject", "sessionIndex", "rep"]


In [6]:
hold_cols = [c for c in df.columns if c.startswith("H.")]
dd_cols   = [c for c in df.columns if c.startswith("DD.")]
ud_cols   = [c for c in df.columns if c.startswith("UD.")]

In [7]:
print("Hold columns:", hold_cols)
print("DD columns:", dd_cols)
print("UD columns:", ud_cols)


Hold columns: ['H.period', 'H.t', 'H.i', 'H.e', 'H.five', 'H.Shift.r', 'H.o', 'H.a', 'H.n', 'H.l', 'H.Return']
DD columns: ['DD.period.t', 'DD.t.i', 'DD.i.e', 'DD.e.five', 'DD.five.Shift.r', 'DD.Shift.r.o', 'DD.o.a', 'DD.a.n', 'DD.n.l', 'DD.l.Return']
UD columns: ['UD.period.t', 'UD.t.i', 'UD.i.e', 'UD.e.five', 'UD.five.Shift.r', 'UD.Shift.r.o', 'UD.o.a', 'UD.a.n', 'UD.n.l', 'UD.l.Return']


# Summary statistics

In [8]:
df[hold_cols + dd_cols + ud_cols].describe()

Unnamed: 0,H.period,H.t,H.i,H.e,H.five,H.Shift.r,H.o,H.a,H.n,H.l,...,UD.period.t,UD.t.i,UD.i.e,UD.e.five,UD.five.Shift.r,UD.Shift.r.o,UD.o.a,UD.a.n,UD.n.l,UD.l.Return
count,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,...,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0,20400.0
mean,0.093379,0.085727,0.081565,0.089138,0.076904,0.095937,0.088354,0.106259,0.089899,0.095589,...,0.170769,0.083358,0.077806,0.288295,0.361983,0.154984,0.068577,0.044411,0.112731,0.226259
std,0.029626,0.027424,0.026887,0.030635,0.021746,0.0339,0.026427,0.038828,0.030738,0.028348,...,0.226836,0.125755,0.228512,0.266695,0.260886,0.181619,0.108509,0.105197,0.159571,0.230759
min,0.0014,0.0093,0.0032,0.0021,0.0014,0.0014,0.0069,0.004,0.0037,0.0037,...,-0.2358,-0.1621,-0.16,-0.1505,0.0856,-0.0865,-0.2287,-0.2355,-0.1758,-0.1245
25%,0.0744,0.066,0.062,0.0686,0.061,0.0702,0.0715,0.0821,0.0673,0.0774,...,0.0498,0.0272,0.0074,0.1332,0.229675,0.0547,0.017,-0.009,0.0235,0.1141
50%,0.0895,0.081,0.0771,0.0834,0.0742,0.0935,0.0863,0.1019,0.0853,0.0937,...,0.1087,0.0578,0.0412,0.2004,0.302,0.1022,0.0444,0.0227,0.0955,0.1603
75%,0.1079,0.0998,0.0969,0.1027,0.0906,0.1167,0.1019,0.1223,0.1079,0.1111,...,0.2124,0.0964,0.0934,0.3694,0.4089,0.191,0.0803,0.0689,0.1457,0.2551
max,0.3761,0.2411,0.3312,0.3254,0.1989,0.2817,0.6872,2.0353,0.3577,0.3407,...,12.4517,4.7999,25.9158,4.8827,8.2908,4.012,2.8152,2.5242,3.9782,5.8364


In [9]:
print("Hold min/max:",
      df[hold_cols].min().min(),
      df[hold_cols].max().max())

print("DD min/max:",
      df[dd_cols].min().min(),
      df[dd_cols].max().max())

print("UD min/max:",
      df[ud_cols].min().min(),
      df[ud_cols].max().max())


Hold min/max: 0.0014 2.0353
DD min/max: 0.0011 25.9873
UD min/max: -0.2358 25.9158


# safe timing bounds

In [10]:
df[hold_cols] = df[hold_cols].where(
    (df[hold_cols] >= 0.02) & (df[hold_cols] <= 1.5)
)


In [11]:
df[dd_cols] = df[dd_cols].where(
    (df[dd_cols] >= 0.01) & (df[dd_cols] <= 2.0)
)


In [12]:
df[ud_cols] = df[ud_cols].where(
    (df[ud_cols] >= 0.01) & (df[ud_cols] <= 2.0)
)


In [13]:
print("Hold min/max:",
      df[hold_cols].min().min(),
      df[hold_cols].max().max())

print("DD min/max:",
      df[dd_cols].min().min(),
      df[dd_cols].max().max())

print("UD min/max:",
      df[ud_cols].min().min(),
      df[ud_cols].max().max())


Hold min/max: 0.0201 0.9211
DD min/max: 0.01 1.9954
UD min/max: 0.01 1.9998


In [14]:
df["hold_mean"] = df[hold_cols].mean(axis=1)
df["hold_std"]  = df[hold_cols].std(axis=1)
df["hold_min"]  = df[hold_cols].min(axis=1)
df["hold_max"]  = df[hold_cols].max(axis=1)
df["hold_cv"]   = df["hold_std"] / df["hold_mean"]
df["hold_skew"] = df[hold_cols].skew(axis=1)


In [15]:
df[[
    "hold_mean",
    "hold_std",
    "hold_cv",
    "hold_min",
    "hold_max",
    "hold_skew"
]].head()


Unnamed: 0,hold_mean,hold_std,hold_cv,hold_min,hold_max,hold_skew
0,0.115782,0.02253,0.194593,0.0742,0.1491,-0.218866
1,0.1001,0.029165,0.291362,0.0689,0.157,0.849661
2,0.111109,0.029826,0.268437,0.0731,0.1621,0.356638
3,0.1054,0.023856,0.226337,0.0813,0.1457,0.950587
4,0.092582,0.025764,0.278281,0.043,0.1312,-0.057069


# DD (DOWN → DOWN) FLIGHT FEATURES

In [16]:
df["dd_mean"]     = df[dd_cols].mean(axis=1)
df["dd_std"]      = df[dd_cols].std(axis=1)
df["dd_median"]   = df[dd_cols].median(axis=1)
df["dd_iqr"]      = (
    df[dd_cols].quantile(0.75, axis=1) -
    df[dd_cols].quantile(0.25, axis=1)
)
df["dd_variance"] = df[dd_cols].var(axis=1)


In [17]:
df[[
    "dd_mean",
    "dd_std",
    "dd_median",
    "dd_iqr",
    "dd_variance"
]].head()


Unnamed: 0,dd_mean,dd_std,dd_median,dd_iqr,dd_variance
0,0.54039,0.493923,0.3512,0.453225,0.24396
1,0.434,0.361336,0.2699,0.482675,0.130564
2,0.39526,0.298385,0.2776,0.3248,0.089034
3,0.4645,0.405723,0.2505,0.435675,0.164611
4,0.39389,0.311851,0.2417,0.46025,0.097251


# UD (UP → DOWN) FLIGHT FEATURES

In [18]:
df["ud_mean"]     = df[ud_cols].mean(axis=1)
df["ud_std"]      = df[ud_cols].std(axis=1)
df["ud_median"]   = df[ud_cols].median(axis=1)
df["ud_iqr"]      = (
    df[ud_cols].quantile(0.75, axis=1) -
    df[ud_cols].quantile(0.25, axis=1)
)
df["ud_variance"] = df[ud_cols].var(axis=1)


In [19]:
df[[
    "ud_mean",
    "ud_std",
    "ud_median",
    "ud_iqr",
    "ud_variance"
]].head()


Unnamed: 0,ud_mean,ud_std,ud_median,ud_iqr,ud_variance
0,0.42045,0.492082,0.23295,0.447575,0.242145
1,0.33136,0.363031,0.17065,0.456525,0.131792
2,0.28249,0.304275,0.15495,0.3697,0.092583
3,0.35669,0.410564,0.14555,0.411425,0.168563
4,0.30023,0.315,0.15795,0.434725,0.099225


In [20]:
session_group = df.groupby(["subject", "sessionIndex"])


In [21]:
df["intra_session_variance"] = session_group["hold_mean"].transform("var")

df["rhythm_stability"] = 1 / (
    session_group["hold_mean"].transform("std") + 1e-6
)

df["tempo_consistency"] = session_group["dd_mean"].transform("std")


In [22]:
df[[
    "intra_session_variance",
    "rhythm_stability",
    "tempo_consistency"
]].head()


Unnamed: 0,intra_session_variance,rhythm_stability,tempo_consistency
0,3.5e-05,169.368217,0.059142
1,3.5e-05,169.368217,0.059142
2,3.5e-05,169.368217,0.059142
3,3.5e-05,169.368217,0.059142
4,3.5e-05,169.368217,0.059142


In [23]:
def early_late_delta(series):
    if series.notna().sum() < 2:
        return 0.0
    return series.iloc[-1] - series.iloc[0]


In [24]:
df["early_late_hold_delta"] = session_group["hold_mean"].transform(early_late_delta)
df["early_late_dd_delta"]   = session_group["dd_mean"].transform(early_late_delta)
df["early_late_ud_delta"]   = session_group["ud_mean"].transform(early_late_delta)


In [25]:
df[[
    "early_late_hold_delta",
    "early_late_dd_delta",
    "early_late_ud_delta"
]].head()


Unnamed: 0,early_late_hold_delta,early_late_dd_delta,early_late_ud_delta
0,-0.015682,-0.16184,-0.14256
1,-0.015682,-0.16184,-0.14256
2,-0.015682,-0.16184,-0.14256
3,-0.015682,-0.16184,-0.14256
4,-0.015682,-0.16184,-0.14256


In [26]:
def outlier_rate(row, cols):
    values = row[cols].dropna()
    if len(values) < 3:
        return 0.0

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    return ((values < lower) | (values > upper)).mean()


In [27]:
df["hold_outlier_rate"] = df.apply(lambda r: outlier_rate(r, hold_cols), axis=1)
df["dd_outlier_rate"]   = df.apply(lambda r: outlier_rate(r, dd_cols), axis=1)
df["ud_outlier_rate"]   = df.apply(lambda r: outlier_rate(r, ud_cols), axis=1)


In [28]:
df[[
    "hold_outlier_rate",
    "dd_outlier_rate",
    "ud_outlier_rate"
]].head()


Unnamed: 0,hold_outlier_rate,dd_outlier_rate,ud_outlier_rate
0,0.0,0.1,0.1
1,0.0,0.0,0.0
2,0.0,0.1,0.0
3,0.0,0.0,0.1
4,0.0,0.0,0.0


# final column list

In [29]:
final_cols = [
    "subject", "sessionIndex", "rep",

    "hold_mean", "hold_std", "hold_cv", "hold_min", "hold_max", "hold_skew",

    "dd_mean", "dd_std", "dd_median", "dd_iqr", "dd_variance",

    "ud_mean", "ud_std", "ud_median", "ud_iqr", "ud_variance",

    "intra_session_variance", "rhythm_stability", "tempo_consistency",

    "early_late_hold_delta", "early_late_dd_delta", "early_late_ud_delta",

    "hold_outlier_rate", "dd_outlier_rate", "ud_outlier_rate"
]


In [30]:
final_df = df[final_cols].copy()


In [31]:
final_df.head()


Unnamed: 0,subject,sessionIndex,rep,hold_mean,hold_std,hold_cv,hold_min,hold_max,hold_skew,dd_mean,...,ud_variance,intra_session_variance,rhythm_stability,tempo_consistency,early_late_hold_delta,early_late_dd_delta,early_late_ud_delta,hold_outlier_rate,dd_outlier_rate,ud_outlier_rate
0,s002,1,1,0.115782,0.02253,0.194593,0.0742,0.1491,-0.218866,0.54039,...,0.242145,3.5e-05,169.368217,0.059142,-0.015682,-0.16184,-0.14256,0.0,0.1,0.1
1,s002,1,2,0.1001,0.029165,0.291362,0.0689,0.157,0.849661,0.434,...,0.131792,3.5e-05,169.368217,0.059142,-0.015682,-0.16184,-0.14256,0.0,0.0,0.0
2,s002,1,3,0.111109,0.029826,0.268437,0.0731,0.1621,0.356638,0.39526,...,0.092583,3.5e-05,169.368217,0.059142,-0.015682,-0.16184,-0.14256,0.0,0.1,0.0
3,s002,1,4,0.1054,0.023856,0.226337,0.0813,0.1457,0.950587,0.4645,...,0.168563,3.5e-05,169.368217,0.059142,-0.015682,-0.16184,-0.14256,0.0,0.0,0.1
4,s002,1,5,0.092582,0.025764,0.278281,0.043,0.1312,-0.057069,0.39389,...,0.099225,3.5e-05,169.368217,0.059142,-0.015682,-0.16184,-0.14256,0.0,0.0,0.0


In [35]:
final_df.shape
df.columns


Index(['subject', 'sessionIndex', 'rep', 'H.period', 'DD.period.t',
       'UD.period.t', 'H.t', 'DD.t.i', 'UD.t.i', 'H.i', 'DD.i.e', 'UD.i.e',
       'H.e', 'DD.e.five', 'UD.e.five', 'H.five', 'DD.five.Shift.r',
       'UD.five.Shift.r', 'H.Shift.r', 'DD.Shift.r.o', 'UD.Shift.r.o', 'H.o',
       'DD.o.a', 'UD.o.a', 'H.a', 'DD.a.n', 'UD.a.n', 'H.n', 'DD.n.l',
       'UD.n.l', 'H.l', 'DD.l.Return', 'UD.l.Return', 'H.Return', 'hold_mean',
       'hold_std', 'hold_min', 'hold_max', 'hold_cv', 'hold_skew', 'dd_mean',
       'dd_std', 'dd_median', 'dd_iqr', 'dd_variance', 'ud_mean', 'ud_std',
       'ud_median', 'ud_iqr', 'ud_variance', 'intra_session_variance',
       'rhythm_stability', 'tempo_consistency', 'early_late_hold_delta',
       'early_late_dd_delta', 'early_late_ud_delta', 'hold_outlier_rate',
       'dd_outlier_rate', 'ud_outlier_rate'],
      dtype='object')

In [33]:
PROJECT_ROOT = Path.cwd().parents[2]
print(PROJECT_ROOT)


/Users/bipinpaudel/Downloads/My projects/projects/humanSign


In [34]:
FINAL_DATASET_PATH = (
    PROJECT_ROOT
    / "ml"
    / "models"
    / "humansign_phase1_final_dataset.csv"
)

final_df.to_csv(
    FINAL_DATASET_PATH,
    index=False
)

print(f"Final dataset exported to: {export_path}")


NameError: name 'export_path' is not defined

In [None]:
final_df