In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
import joblib

# Load dataset
df = pd.read_csv("ci_main_cleaned.csv")

# Required columns
required_cols = ['time_taken', 'key_presses', 'mouse_distance', 'mouse_points', 'attempts', 'mouse_movements']
for col in required_cols:
    if col not in df.columns:
        print(f"[!] Column '{col}' missing — filling with 0s.")
        df[col] = 0

# Feature Engineering
df['avg_key_interval'] = df['time_taken'] / df['key_presses'].replace(0, 1)
df['mouse_density'] = df['mouse_distance'] / df['mouse_points'].replace(0, 1)
df['time_per_attempt'] = df['time_taken'] / df['attempts'].replace(0, 1)
df['time_per_key'] = df['time_taken'] / df['key_presses'].replace(0, 1)

def mouse_jitter(movement_str):
    try:
        points = ast.literal_eval(movement_str)
        distances = [euclidean(p1.values(), p2.values()) for p1, p2 in zip(points[:-1], points[1:])]
        return np.std(distances) if distances else 0
    except Exception:
        return 0

df['mouse_jitter'] = df['mouse_movements'].apply(mouse_jitter)

# Features and scaling
features = ['time_taken', 'key_presses', 'mouse_distance', 'mouse_points', 'attempts',
            'avg_key_interval', 'mouse_density', 'time_per_attempt', 'time_per_key', 'mouse_jitter']
X = df[features]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

joblib.dump(scaler, "scaler.save")

# Tensor reshape for LSTM: (samples, timesteps, features)
X_reshaped = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Define LSTM Autoencoder in TensorFlow
input_dim = X_reshaped.shape[2]
timesteps = X_reshaped.shape[1]
latent_dim = 32

inputs = Input(shape=(timesteps, input_dim))
encoded = LSTM(latent_dim)(inputs)
decoded = RepeatVector(timesteps)(encoded)
decoded = LSTM(input_dim, return_sequences=True)(decoded)
decoded = TimeDistributed(Dense(input_dim))(decoded)

autoencoder = Model(inputs, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# Train
autoencoder.fit(X_reshaped, X_reshaped, epochs=50, batch_size=32, verbose=1)

# Save the model
autoencoder.save("lstm_autoencoder_model.h5")

[!] Column 'mouse_distance' missing — filling with 0s.
[!] Column 'mouse_points' missing — filling with 0s.
Epoch 1/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.5538  
Epoch 2/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 970us/step - loss: 0.4972
Epoch 3/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 886us/step - loss: 0.6564
Epoch 4/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 898us/step - loss: 0.4960
Epoch 5/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 869us/step - loss: 0.7497
Epoch 6/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 876us/step - loss: 0.3781
Epoch 7/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 874us/step - loss: 0.2262
Epoch 8/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 851us/step - loss: 0.5952
Epoch 9/50
[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 872us/ste



In [2]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import euclidean
from tensorflow.keras.models import load_model
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.optimizers import Adam


# ========== Load the trained TensorFlow LSTM Autoencoder ==========
model = load_model("lstm_autoencoder_model.h5", compile=False)
model.compile(optimizer=Adam(), loss=MeanSquaredError())


# ========== Load and Preprocess Inference Dataset ==========
df = pd.read_csv("human_captcha_solve_data.csv")

# Ensure all required columns are present
required_cols = ['time_taken', 'key_presses', 'mouse_distance', 'mouse_points', 'attempts', 'mouse_movements']
for col in required_cols:
    if col not in df.columns:
        print(f"[!] Column '{col}' missing — filling with 0s.")
        df[col] = 0

# Feature Engineering
df['avg_key_interval'] = df['time_taken'] / df['key_presses'].replace(0, 1)
df['mouse_density'] = df['mouse_distance'] / df['mouse_points'].replace(0, 1)
df['time_per_attempt'] = df['time_taken'] / df['attempts'].replace(0, 1)
df['time_per_key'] = df['time_taken'] / df['key_presses'].replace(0, 1)

def mouse_jitter(movement_str):
    try:
        points = ast.literal_eval(movement_str)
        distances = [euclidean(p1.values(), p2.values()) for p1, p2 in zip(points[:-1], points[1:])]
        return np.std(distances) if distances else 0
    except Exception:
        return 0

df['mouse_jitter'] = df['mouse_movements'].apply(mouse_jitter)

# Selected features
features = ['time_taken', 'key_presses', 'mouse_distance', 'mouse_points', 'attempts',
            'avg_key_interval', 'mouse_density', 'time_per_attempt', 'time_per_key', 'mouse_jitter']
X = df[features]

# Scale and reshape input for LSTM
X_scaled = scaler.transform(X)
X_reshaped = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# ========== Perform Inference ==========
reconstructions = model.predict(X_reshaped)
mse = np.mean(np.mean(np.square(X_reshaped - reconstructions), axis=1), axis=1)

# Use the 95th percentile as threshold
threshold = np.percentile(mse, 95)
df['reconstruction_error'] = mse
df['anomaly'] = (mse > threshold).astype(int)

# ========== Save Results ==========
df.to_csv("tf_anomaly_inference_results.csv", index=False)
print("✅ Inference complete. Results saved to 'tf_anomaly_inference_results.csv'")


[!] Column 'mouse_distance' missing — filling with 0s.
[!] Column 'mouse_points' missing — filling with 0s.
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
✅ Inference complete. Results saved to 'tf_anomaly_inference_results.csv'


In [3]:
df["anomaly"].value_counts()

anomaly
0    1726
1      91
Name: count, dtype: int64