## Read in DataFrame

In [1]:
import pandas as pd
import ast

# Load the labelled behavioral biometric dataset into a DataFrame
df = pd.read_csv('/home/a8tariq/safeVM/labelled_108.csv')
df.head()

Unnamed: 0,script_id,script_url,code,max_api_aggregation_score,behavioral_api_agg_count,fp_api_agg_count,max_aggregated_apis,max_behavioral_api_aggregation_score,aggregated_behavioral_apis,max_fingerprinting_api_aggregation_score,...,behavioral_source_apis,behavioral_source_api_count,fingerprinting_source_api_count,behavioral_apis_access_count,fingerprinting_api_access_count,graph_construction_failure,dataflow_to_sink,apis_going_to_sink,is_behavioral_biometric,id
0,2901,https://js.datadome.co/tags.js,/** DataDome is a cybersecurity solution to de...,4,0.0,4.0,"[""Navigator.hardwareConcurrency"", ""Navigator.u...",1,"[""MouseEvent.timeStamp""]",4,...,"[""MouseEvent.{1031362"", ""MouseEvent.{43378"", ""...",203,25,"{""MouseEvent.{2174"": 1, ""MouseEvent.{20608"": 1...","{""Screen.width"": 1, ""Screen.height"": 1, ""Navig...",f,f,{},f,1
1,2914,https://banking.bendigobank.com.au/static/asse...,"function startsWith(c,d){return(c.indexOf(d)==...",4,0.0,4.0,"[""Screen.width"", ""Screen.availHeight"", ""Screen...",2,"[""KeyboardEvent.target"", ""FocusEvent.target""]",4,...,"[""KeyboardEvent.ctrlKey"", ""KeyboardEvent.type""...",7,15,"{""FocusEvent.type"": 6, ""FocusEvent.target"": 2,...","{""Screen.width"": 1, ""Screen.height"": 1, ""Navig...",f,f,{},f,2
2,2915,https://banking.bendigobank.com.au/static/asse...,;window.NREUM||(NREUM={});NREUM.init={distribu...,2,2.0,0.0,"[""PointerEvent.type"", ""PointerEvent.timeStamp""]",2,"[""PointerEvent.type"", ""PointerEvent.timeStamp""]",1,...,"[""PointerEvent.type"", ""PointerEvent.timeStamp""]",2,1,"{""PointerEvent.type"": 1, ""PointerEvent.timeSta...","{""Navigator.userAgent"": 4}",f,f,{},f,3
3,2917,https://js-agent.newrelic.com/nr-spa-1211.min.js,"!function(t,n,e){function r(e,o){if(!n[e]){if(...",15,15.0,0.0,"[""MouseEvent.type"", ""PointerEvent.__nrNode"", ""...",15,"[""MouseEvent.type"", ""PointerEvent.__nrNode"", ""...",1,...,"[""MouseEvent.__nrNode"", ""TouchEvent.type"", ""De...",15,1,"{""FocusEvent.type"": 1, ""MouseEvent.type"": 1, ""...","{""Navigator.userAgent"": 1}",f,f,{},f,4
4,2904,https://app.laybuy.com/assets/index-8ea4bd62.js,"var uM=Object.defineProperty;var dM=(e,t,n)=>t...",-1,-1.0,-1.0,,-1,,-1,...,"[""MouseEvent.screenX"", ""MouseEvent.shiftKey"", ...",79,10,"{""FocusEvent.view"": 1, ""MouseEvent.view"": 1, ""...","{""Window.innerWidth"": 2, ""Window.innerHeight"":...",t,f,{},f,5


## Remove Duplicates and Preprocess

In [2]:
# Remove rows with duplicate 'script_url'
df = df.drop_duplicates(subset=['script_url'], keep='first')


# Filter out rows where 'script_url' contains 'datadome'
df = df[~df['script_url'].str.contains('datadome', case=False, na=False)]

# Remove unnecessary features
df.drop(columns=['script_url', 'script_id', 'code', 'attached_listeners'], axis=1, inplace=True)
df.head()

print(f"Dimensions of the dataset: {df.shape}")


Dimensions of the dataset: (602, 19)


## Create Simple DataFrame with Behavioral APIs

In [3]:
# Create a new dataframe with simple features 
simple_df = df[['behavioral_source_apis', 'behavioral_source_api_count', 'is_behavioral_biometric']].copy()

# Parse the string array into a Python list
simple_df['behavioral_source_apis'] = simple_df['behavioral_source_apis'].apply(ast.literal_eval)

# 1. Extract the string before the period (e.g., 'MouseEvent', 'KeyboardEvent', etc.)
simple_df['behavioral_source_apis'] = simple_df['behavioral_source_apis'].apply(lambda x: [event.split('.')[0] for event in x])

# 2. Flatten the list into individual event types and count the occurrences
# We will first explode the list of event types and then use pivot_table to count occurrences
df_exploded = simple_df.explode('behavioral_source_apis')

# 3. Create new columns for each event type and count how many times they appear per row
df_count = df_exploded.pivot_table(index=df_exploded.index, columns='behavioral_source_apis', aggfunc='size', fill_value=0)

# 4. Join the new columns with the original DataFrame
df_final = simple_df.join(df_count)

# 5. Optional: Drop the original 'events' column if not needed
df_final.drop(columns='behavioral_source_apis', inplace=True)
df_final.drop(columns='behavioral_source_api_count', inplace=True)
df_final


Unnamed: 0,is_behavioral_biometric,DeviceMotionEvent,DeviceMotionEventAcceleration,DeviceMotionEventRotationRate,DeviceOrientationEvent,FocusEvent,InputEvent,KeyboardEvent,MouseEvent,PointerEvent,TextEvent,Touch,TouchEvent,WheelEvent
1,f,0,0,0,0,3,0,4,0,0,0,0,0,0
2,f,0,0,0,0,0,0,0,0,2,0,0,0,0
3,f,2,0,0,2,2,0,2,2,3,0,0,2,0
4,f,0,0,0,0,10,0,3,26,38,0,0,1,1
5,f,0,0,0,0,0,0,0,0,5,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,t,1,1,0,4,3,0,13,27,28,0,0,0,17
785,t,2,3,3,4,4,0,13,17,18,0,10,6,14
787,t,2,3,0,7,0,0,8,9,8,0,1,8,0
788,t,2,3,0,7,0,0,8,9,8,0,1,8,7


## Unbalanced Prediction

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

X = df_final.drop('is_behavioral_biometric', axis=1)
Y = df_final['is_behavioral_biometric']
# Split the data into training and testing sets (90% train, 10% test)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=45)

# Initialize and train a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=45)
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Print a detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 91.80%
Classification Report:
              precision    recall  f1-score   support

           f       0.95      0.96      0.95        54
           t       0.67      0.57      0.62         7

    accuracy                           0.92        61
   macro avg       0.81      0.77      0.78        61
weighted avg       0.91      0.92      0.92        61



## Cross Validation Results (Unbalanced)

In [5]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the balanced training set
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")

Cross-validation scores: [0.93577982 0.97222222 0.96296296 0.94444444 0.97222222]
Average cross-validation score: 0.9575263336731228


## Class Weighting Prediction

In [6]:
# Class balance property should handle under-represented part of dataset
clf = DecisionTreeClassifier(class_weight='balanced', random_state=45)

# Step 3: Train the classifier on the training data
clf.fit(X_train, y_train)

# Step 4: Make predictions on the test data
y_pred = clf.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Detailed classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))



Model Accuracy: 95.08%
Classification Report:
              precision    recall  f1-score   support

           f       0.96      0.98      0.97        54
           t       0.83      0.71      0.77         7

    accuracy                           0.95        61
   macro avg       0.90      0.85      0.87        61
weighted avg       0.95      0.95      0.95        61



## Cross Validation Results (Class Weighting)

In [7]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the balanced training set
scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


Cross-validation scores: [0.9266055  0.96296296 0.97222222 0.93518519 0.9537037 ]
Average cross-validation score: 0.950135915732246


## Oversampled Prediction

In [8]:
from imblearn.over_sampling import RandomOverSampler

# Step 2: Balance the training set using oversampling (or undersampling, SMOTE, etc.)
ros = RandomOverSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


# Train a Decision Tree classifier on the balanced dataset
clf = DecisionTreeClassifier(random_state=45)
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate
y_pred = clf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 95.08%
Classification Report:
              precision    recall  f1-score   support

           f       0.96      0.98      0.97        54
           t       0.83      0.71      0.77         7

    accuracy                           0.95        61
   macro avg       0.90      0.85      0.87        61
weighted avg       0.95      0.95      0.95        61



## Cross Validation Results (Oversampled)

In [9]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the balanced training set
scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


Cross-validation scores: [0.97395833 0.97905759 0.97905759 0.98429319 0.96858639]
Average cross-validation score: 0.9769906195462479


## Undersampled Prediction

In [10]:
from imblearn.under_sampling import RandomUnderSampler

# Step 2: Balance the training set using oversampling (or undersampling, SMOTE, etc.)
ros = RandomUnderSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


# Train a Decision Tree classifier on the balanced dataset
clf = DecisionTreeClassifier(random_state=45)
clf.fit(X_train_resampled, y_train_resampled)

# Make predictions and evaluate
y_pred = clf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 81.97%
Classification Report:
              precision    recall  f1-score   support

           f       0.96      0.83      0.89        54
           t       0.36      0.71      0.48         7

    accuracy                           0.82        61
   macro avg       0.66      0.77      0.68        61
weighted avg       0.89      0.82      0.84        61



## Cross Validation Results (Undersampled)

In [11]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the balanced training set
scores = cross_val_score(clf, X_train_resampled, y_train_resampled, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


Cross-validation scores: [0.88461538 0.92       0.88       0.8        0.88      ]
Average cross-validation score: 0.8729230769230769


## ROC-AUC Results

In [12]:
from sklearn.metrics import roc_auc_score

# Get probability predictions for ROC-AUC score
y_pred_proba = clf.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")


ROC-AUC Score: 0.77
