# Classification using More Features

## Read in DataFrame

In [77]:
import pandas as pd
import ast

# Read the labelled script dataset from disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
labelled_csv_path = '/home/a8tariq/safeVM/labelled_108.csv'
df = pd.read_csv(labelled_csv_path)

# Preview the first rows whose 'is_behavioral_biometric' label is 't'.
is_positive = df['is_behavioral_biometric'].eq('t')
df.loc[is_positive].head()

Unnamed: 0,script_id,script_url,code,max_api_aggregation_score,behavioral_api_agg_count,fp_api_agg_count,max_aggregated_apis,max_behavioral_api_aggregation_score,aggregated_behavioral_apis,max_fingerprinting_api_aggregation_score,...,behavioral_source_apis,behavioral_source_api_count,fingerprinting_source_api_count,behavioral_apis_access_count,fingerprinting_api_access_count,graph_construction_failure,dataflow_to_sink,apis_going_to_sink,is_behavioral_biometric,id
34,3184,https://mpsnare.iesnare.com/general5/wdp.js?lo...,/*\n Copyright(c) 2023 TransUnion LLC. All Rig...,17,17.0,0.0,"[""MouseEvent.type"", ""PointerEvent.pointerType""...",17,"[""MouseEvent.type"", ""PointerEvent.pointerType""...",6,...,"[""DeviceOrientationEvent.beta"", ""MouseEvent.sc...",29,12,"{""MouseEvent.code"": 1, ""MouseEvent.type"": 14, ...","{""Screen.width"": 1, ""Screen.height"": 1, ""Navig...",f,f,{},t,45
57,3402,https://api-mastercard-eu.nd.nudatasecurity.co...,"var ndjsStaticVersion=""sync-1"",nslyyidtyi={};f...",6,0.0,6.0,"[""Screen.colorDepth"", ""Screen.pixelDepth"", ""Sc...",4,"[""DeviceMotionEvent.rotationRate"", ""DeviceMoti...",6,...,"[""DeviceMotionEventAcceleration.z"", ""DeviceOri...",12,19,"{""DeviceOrientationEvent.beta"": 1, ""DeviceOrie...","{""Screen.width"": 5, ""Screen.height"": 5, ""Navig...",f,f,{},t,72
100,3852,https://fingerprint.app.bureau.id/index.js,"parcelRequire=function(t,e,n,r){var o,i=""funct...",-1,-1.0,-1.0,,-1,,-1,...,"[""PointerEvent._sentryCaptured"", ""PointerEvent...",5,31,"{""MouseEvent.x"": 1, ""MouseEvent.y"": 1, ""Pointe...","{""Screen.width"": 2, ""Screen.height"": 2, ""Navig...",t,f,{},t,134
104,3907,https://static.digitaltrust.feedzai.cloud/js/d...,"/*6181*/(function () {function _b(a,b){var c=_...",14,0.0,14.0,"[""Window.innerWidth"", ""Navigator.oscpu"", ""Wind...",4,"[""KeyboardEvent.location"", ""KeyboardEvent.targ...",14,...,"[""MouseEvent.getModifierState"", ""MouseEvent.sc...",20,26,"{""MouseEvent.type"": 1, ""KeyboardEvent.key"": 4,...","{""Screen.width"": 2, ""Screen.height"": 2, ""Windo...",f,f,{},t,141
106,3883,https://login.sparkasse.at/sts/7hSTR7CfYN/gWqv...,"try{(function(){var pb=['pKDZ','B E5',' Cor','...",3,3.0,0.0,"[""MouseEvent.type"", ""PointerEvent.type"", ""Whee...",3,"[""MouseEvent.type"", ""PointerEvent.type"", ""Whee...",0,...,"[""PointerEvent.width"", ""MouseEvent.screenX"", ""...",67,37,"{""MouseEvent.type"": 5, ""WheelEvent.type"": 5, ""...","{""Screen.width"": 2, ""Screen.height"": 2, ""Navig...",f,f,{},t,143


## Remove Duplicates and Preprocess

In [78]:
# Keep only the first row seen for each 'script_url'.
df = df.drop_duplicates(subset=['script_url'], keep='first')

# Exclude scripts whose URL mentions 'datadome' (case-insensitive). Rows with
# a missing URL are kept: na=False makes them count as non-matching.
is_datadome = df['script_url'].str.contains('datadome', case=False, na=False)
df = df.loc[~is_datadome]

# Drop the columns that are not used as features. A new frame is returned
# (rather than inplace=True) so the filtered slice created above is not
# mutated in place.
df = df.drop(columns=['script_url', 'script_id', 'code', 'attached_listeners'])

print(f"Dimensions of the dataset: {df.shape}")

Dimensions of the dataset: (602, 19)


In [79]:
# Column names remaining after preprocessing, as a plain Python list.
df.columns.tolist()

['max_api_aggregation_score',
 'behavioral_api_agg_count',
 'fp_api_agg_count',
 'max_aggregated_apis',
 'max_behavioral_api_aggregation_score',
 'aggregated_behavioral_apis',
 'max_fingerprinting_api_aggregation_score',
 'aggregated_fingerprinting_apis',
 'fingerprinting_source_apis',
 'behavioral_source_apis',
 'behavioral_source_api_count',
 'fingerprinting_source_api_count',
 'behavioral_apis_access_count',
 'fingerprinting_api_access_count',
 'graph_construction_failure',
 'dataflow_to_sink',
 'apis_going_to_sink',
 'is_behavioral_biometric',
 'id']

## Create DataFrame with more features

In [80]:
# Start from the two list-valued API columns plus the label.
complicated_df = df[['aggregated_behavioral_apis', 'behavioral_source_apis', 'is_behavioral_biometric']].copy()


def _to_interface_names(apis):
    """Reduce 'Interface.member' API names to the part before the first '.'.

    Lists are mapped element-wise (e.g. 'MouseEvent.type' -> 'MouseEvent').
    Any other value, such as NaN for a missing cell, is returned unchanged.
    """
    if isinstance(apis, list):
        return [api.split('.')[0] for api in apis]
    return apis


def _rows_holding_lists(frame, column):
    """Return the index labels of the rows whose `column` cell is a list."""
    holds_list = frame[column].map(lambda cell: isinstance(cell, list))
    return frame.index[holds_list.astype(bool).to_numpy()]


def _count_interfaces_per_row(exploded, column, listed_rows):
    """Count how often each interface name occurs per original row.

    Parameters:
        exploded: frame already exploded on `column`, so the index label of
            an original row repeats once per list item.
        column: name of the exploded column holding the interface names.
        listed_rows: index labels of the original rows whose cell was a list.

    Returns:
        A frame indexed by `listed_rows` with one count column per interface.

    `explode` turns an empty list into NaN and the pivot drops NaN keys, so
    a row with an empty list would otherwise be absent from the counts and
    later be indistinguishable from a row with no list at all. Re-indexing
    on `listed_rows` restores those rows with a count of 0.
    """
    counts = exploded.pivot_table(
        index=exploded.index,
        columns=column,
        aggfunc='size',
        fill_value=0
    )
    return counts.reindex(listed_rows, fill_value=0)


# 1. Parse the stringified arrays into Python lists and keep only the
#    interface part of each API name. 'behavioral_source_apis' is parsed
#    unconditionally; 'aggregated_behavioral_apis' may be NaN, and NaN cells
#    are passed through untouched.
complicated_df['behavioral_source_apis'] = (
    complicated_df['behavioral_source_apis']
    .apply(ast.literal_eval)
    .apply(_to_interface_names)
)
complicated_df['aggregated_behavioral_apis'] = (
    complicated_df['aggregated_behavioral_apis']
    .apply(lambda cell: ast.literal_eval(cell) if pd.notnull(cell) else cell)
    .apply(_to_interface_names)
)

# 2. Explode both columns (one row per list item, original index repeated).
df_exploded_source = complicated_df.explode('behavioral_source_apis')
df_exploded_aggregated = complicated_df.explode('aggregated_behavioral_apis')

# 3. Count occurrences of each interface per original row.
df_source_count = _count_interfaces_per_row(
    df_exploded_source,
    'behavioral_source_apis',
    _rows_holding_lists(complicated_df, 'behavioral_source_apis'),
)

# 4. Same for the aggregated APIs; the '_agg' suffix keeps these columns
#    distinct from the source counts.
df_aggregated_count = _count_interfaces_per_row(
    df_exploded_aggregated,
    'aggregated_behavioral_apis',
    _rows_holding_lists(complicated_df, 'aggregated_behavioral_apis'),
).add_suffix('_agg')

# 5. Attach the counts to the label, drop the raw list columns, and mark rows
#    that had no list at all (NaN cell) with the sentinel -1.
df_final = (
    complicated_df
    .join([df_source_count, df_aggregated_count])
    .drop(columns=['behavioral_source_apis', 'aggregated_behavioral_apis'])
    .fillna(-1)
)

# Display the final DataFrame
df_final

Unnamed: 0,is_behavioral_biometric,DeviceMotionEvent,DeviceMotionEventAcceleration,DeviceMotionEventRotationRate,DeviceOrientationEvent,FocusEvent,InputEvent,KeyboardEvent,MouseEvent,PointerEvent,...,DeviceOrientationEvent_agg,FocusEvent_agg,InputEvent_agg,KeyboardEvent_agg,MouseEvent_agg,PointerEvent_agg,TextEvent_agg,Touch_agg,TouchEvent_agg,WheelEvent_agg
1,f,0,0,0,0,3,0,4,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,f,0,0,0,0,0,0,0,0,2,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
3,f,2,0,0,2,2,0,2,2,3,...,2.0,2.0,0.0,2.0,2.0,3.0,0.0,0.0,2.0,0.0
4,f,0,0,0,0,10,0,3,26,38,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5,f,0,0,0,0,0,0,0,0,5,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,t,1,1,0,4,3,0,13,27,28,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
785,t,2,3,3,4,4,0,13,17,18,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
787,t,2,3,0,7,0,0,8,9,8,...,0.0,0.0,0.0,8.0,9.0,8.0,0.0,1.0,8.0,0.0
788,t,2,3,0,7,0,0,8,9,8,...,0.0,0.0,0.0,8.0,9.0,8.0,0.0,1.0,8.0,7.0


## Unbalanced Prediction

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report

def _report_split(split_name, y_true, y_hat):
    """Print accuracy (as a percentage) and the classification report for one
    split, and return the accuracy."""
    accuracy = accuracy_score(y_true, y_hat)
    print(f"{split_name} Accuracy: {accuracy * 100:.2f}%")
    print(f"{split_name} Classification Report:")
    print(classification_report(y_true, y_hat))
    return accuracy


# Separate the label column from the feature columns.
Y = df_final['is_behavioral_biometric']
X = df_final.drop(columns='is_behavioral_biometric')

# Hold out 10% of the rows for testing; the seed fixes the split.
# NOTE(review): the split is not stratified, so the share of 't' rows in the
# test set is left to chance - consider stratify=Y.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=45)

# Fit a depth-limited random forest on the training split, with no class
# re-balancing.
clf = RandomForestClassifier(max_depth=5, random_state=45).fit(X_train, y_train)

# Predict on both splits so training and test performance can be compared.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Report metrics for the training split, then for the held-out split.
train_accuracy = _report_split("Training", y_train, y_train_pred)
test_accuracy = _report_split("Test", y_test, y_test_pred)


Training Accuracy: 96.67%
Training Classification Report:
              precision    recall  f1-score   support

           f       0.97      1.00      0.98       478
           t       0.96      0.75      0.84        63

    accuracy                           0.97       541
   macro avg       0.96      0.87      0.91       541
weighted avg       0.97      0.97      0.96       541

Test Accuracy: 95.08%
Test Classification Report:
              precision    recall  f1-score   support

           f       0.95      1.00      0.97        54
           t       1.00      0.57      0.73         7

    accuracy                           0.95        61
   macro avg       0.97      0.79      0.85        61
weighted avg       0.95      0.95      0.94        61



### Cross Validation Results (Unbalanced)

In [82]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current classifier on the training split.
# The training data used here has not been resampled; scoring is left at the
# estimator's default.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
mean_score = scores.mean()
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {mean_score}")

Cross-validation scores: [0.93577982 0.96296296 0.96296296 0.90740741 0.96296296]
Average cross-validation score: 0.9464152225620115


## Class Weighting Prediction

In [83]:
# Random forest with class_weight='balanced', intended to compensate for the
# under-represented class without resampling the data.
clf = RandomForestClassifier(class_weight='balanced', max_depth=5, random_state=45)

# Fit on the same training split as before.
clf.fit(X_train, y_train)

# Predict on both splits.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

# Accuracy on each split.
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Print accuracy and the classification report: training first, then test.
for split_name, split_accuracy, y_true, y_hat in (
    ("Training", train_accuracy, y_train, y_train_pred),
    ("Test", test_accuracy, y_test, y_test_pred),
):
    print(f"{split_name} Accuracy: {split_accuracy * 100:.2f}%")
    print(f"{split_name} Classification Report:")
    print(classification_report(y_true, y_hat))




Training Accuracy: 98.34%
Training Classification Report:
              precision    recall  f1-score   support

           f       0.99      0.99      0.99       478
           t       0.94      0.92      0.93        63

    accuracy                           0.98       541
   macro avg       0.96      0.96      0.96       541
weighted avg       0.98      0.98      0.98       541

Test Accuracy: 98.36%
Test Classification Report:
              precision    recall  f1-score   support

           f       0.98      1.00      0.99        54
           t       1.00      0.86      0.92         7

    accuracy                           0.98        61
   macro avg       0.99      0.93      0.96        61
weighted avg       0.98      0.98      0.98        61



In [84]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current (class-weighted) classifier on the
# training split. The data itself is not resampled; any balancing comes from
# the estimator's class_weight setting.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
mean_score = scores.mean()
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {mean_score}")

Cross-validation scores: [0.96330275 0.96296296 0.96296296 0.96296296 0.96296296]
Average cross-validation score: 0.9630309208290859


## Oversampling Prediction

In [85]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training split by random oversampling. Only the training data
# is resampled; the test split is left as it is.
ros = RandomOverSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree on the resampled training data.
# NOTE(review): this is a single DecisionTreeClassifier, whereas the earlier
# sections fit a RandomForestClassifier - the scores are not like-for-like.
clf = DecisionTreeClassifier(max_depth=5, random_state=45).fit(
    X_train_resampled, y_train_resampled
)

# Evaluate on the untouched test split.
y_pred = clf.predict(X_test)
model_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {model_accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 86.89%
Classification Report:
              precision    recall  f1-score   support

           f       0.96      0.89      0.92        54
           t       0.45      0.71      0.56         7

    accuracy                           0.87        61
   macro avg       0.71      0.80      0.74        61
weighted avg       0.90      0.87      0.88        61



## Undersampling Prediction

In [86]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by random undersampling. Only the training data
# is resampled; the test split is left as it is.
# (The sampler keeps the name `ros` even though it is a RandomUnderSampler.)
ros = RandomUnderSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Fit a depth-limited decision tree on the resampled training data.
# NOTE(review): this is a single DecisionTreeClassifier, whereas the earlier
# sections fit a RandomForestClassifier - the scores are not like-for-like.
clf = DecisionTreeClassifier(max_depth=5, random_state=45).fit(
    X_train_resampled, y_train_resampled
)

# Evaluate on the untouched test split.
y_pred = clf.predict(X_test)
model_accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {model_accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 75.41%
Classification Report:
              precision    recall  f1-score   support

           f       0.95      0.76      0.85        54
           t       0.28      0.71      0.40         7

    accuracy                           0.75        61
   macro avg       0.62      0.74      0.62        61
weighted avg       0.88      0.75      0.79        61

