## Read in DataFrame

In [None]:
import pandas as pd
import ast

# Location of the labelled behavioral-biometric CSV (absolute path on the
# original author's machine).
dataset_path = '/home/a8tariq/safeVM/labelled_108.csv'

# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(dataset_path)
df.head()

## Remove Duplicates and Preprocess

In [None]:
# Keep only the first row seen for each distinct 'script_url'.
df = df.drop_duplicates(subset=['script_url'], keep='first')

# Exclude every script whose URL mentions 'datadome' (case-insensitive).
# Rows with a missing URL are treated as non-matching (na=False) and are kept.
is_datadome = df['script_url'].str.contains('datadome', case=False, na=False)
df = df[~is_datadome]

# Discard the columns that are not used as features further down.
unused_columns = ['script_url', 'script_id', 'code', 'attached_listeners']
df.drop(columns=unused_columns, inplace=True)
df.head()

print(f"Dimensions of the dataset: {df.shape}")


## Create Simple DataFrame with Behavioral APIs

In [None]:
# Build the model input table: one integer count column per behavioral API
# interface (the text before the '.' in each API entry) plus the target label.
simple_df = df[['behavioral_source_apis', 'behavioral_source_api_count', 'is_behavioral_biometric']].copy()

# The API list is stored as its string representation; parse it into a real
# Python list. literal_eval only evaluates literals, never arbitrary code.
simple_df['behavioral_source_apis'] = simple_df['behavioral_source_apis'].apply(ast.literal_eval)

# 1. Keep only the interface name before the period
#    (e.g., 'MouseEvent', 'KeyboardEvent', etc.)
simple_df['behavioral_source_apis'] = simple_df['behavioral_source_apis'].apply(
    lambda apis: [api.split('.')[0] for api in apis]
)

# 2. Explode the lists so that every (row, interface) occurrence gets its own
#    line; the original row label is repeated in the index.
df_exploded = simple_df.explode('behavioral_source_apis')

# 3. Count, per original row, how many times each interface appears.
df_count = df_exploded.pivot_table(index=df_exploded.index, columns='behavioral_source_apis', aggfunc='size', fill_value=0)

# A row whose API list is empty explodes to a single NaN entry, which the
# pivot drops, so that row would be missing from df_count and end up with NaN
# in every feature column after the join. Re-align to the full index so such
# rows get an explicit count of 0 and the columns stay integer-typed.
df_count = df_count.reindex(simple_df.index, fill_value=0)

# 4. Attach the count columns to the per-row frame (aligned on the index).
df_final = simple_df.join(df_count)

# 5. Drop the raw API list and the precomputed API count; only the
#    per-interface counts and the label are kept for modelling.
df_final = df_final.drop(columns=['behavioral_source_apis', 'behavioral_source_api_count'])
df_final


## Unbalanced Prediction

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separate the per-interface count features from the target label.
X = df_final.drop('is_behavioral_biometric', axis=1)
Y = df_final['is_behavioral_biometric']

# Split the data into training and testing sets (90% train, 10% test).
# stratify=Y keeps the class proportions identical in both parts: with an
# imbalanced label and only 10% held out, a purely random split can leave the
# minority class under-represented (or absent) in the test set, which makes
# the metrics below unreliable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.1, random_state=45, stratify=Y
)

# Baseline: a Decision Tree trained on the data as-is (no class balancing).
clf = DecisionTreeClassifier(random_state=45)
clf.fit(X_train, y_train)

# Predict the held-out test rows.
y_pred = clf.predict(X_test)

# Overall accuracy; on imbalanced data read it together with the per-class
# precision/recall in the report below.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision, recall and F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

## Cross Validation Results (Unbalanced)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current `clf` on the original training split.
# Note: X_train / y_train are NOT resampled here, so the class imbalance is
# still present in every fold.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")

## Class Weighting Prediction

In [None]:
# Compensate for the under-represented class through class weights instead of
# resampling: class_weight='balanced' is passed to the tree, and the training
# data itself is left untouched. fit() returns the estimator, so construction
# and training are chained.
clf = DecisionTreeClassifier(class_weight='balanced', random_state=45).fit(X_train, y_train)

# Predict the held-out test rows.
y_pred = clf.predict(X_test)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Per-class precision, recall and F1.
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))



## Cross Validation Results (Class Weighting)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current `clf` on the original training split.
# The training data is not resampled here; if `clf` is the class-weighted tree
# from the previous cell, balancing happens through its class weights only.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


## Oversampled Prediction

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Balance the training split by random oversampling. Only the training data
# is resampled; the test split is left untouched.
ros = RandomOverSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Fit an unweighted Decision Tree on the resampled training data
# (fit() returns the estimator itself).
clf = DecisionTreeClassifier(random_state=45).fit(X_train_resampled, y_train_resampled)

# Evaluate on the original, non-resampled test split.
y_pred = clf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))


## Cross Validation Results (Oversampled)

In [None]:
from sklearn.model_selection import cross_val_score

from imblearn.pipeline import Pipeline

# Cross-validating directly on X_train_resampled leaks information: random
# oversampling duplicates minority rows, so copies of the same row can land in
# both the training and the validation part of a fold and inflate the scores.
# Putting the sampler inside an imblearn Pipeline makes cross_val_score
# oversample only the training part of each fold, while every validation fold
# keeps the original, non-resampled rows.
oversampled_cv_model = Pipeline([
    ('oversample', RandomOverSampler(random_state=45)),
    ('tree', DecisionTreeClassifier(random_state=45)),
])

# 5-fold cross-validation on the original training split.
scores = cross_val_score(oversampled_cv_model, X_train, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


## Undersampled Prediction

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by random UNDERsampling. The variable keeps the
# name `ros` used by the oversampling cell, but here it holds a
# RandomUnderSampler. Only the training data is resampled.
ros = RandomUnderSampler(random_state=45)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Fit an unweighted Decision Tree on the reduced training data
# (fit() returns the estimator itself).
clf = DecisionTreeClassifier(random_state=45).fit(X_train_resampled, y_train_resampled)

# Evaluate on the original, non-resampled test split.
y_pred = clf.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred) * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred))

## Cross Validation Results (Undersampled)

In [None]:
from sklearn.model_selection import cross_val_score

from imblearn.pipeline import Pipeline

# Cross-validating on X_train_resampled scores the model on artificially
# balanced validation folds, so the numbers neither reflect the real class
# distribution nor compare with the other cross-validation cells. With the
# sampler inside an imblearn Pipeline, only the training part of each fold is
# undersampled and every validation fold keeps the original rows.
undersampled_cv_model = Pipeline([
    ('undersample', RandomUnderSampler(random_state=45)),
    ('tree', DecisionTreeClassifier(random_state=45)),
])

# 5-fold cross-validation on the original training split.
scores = cross_val_score(undersampled_cv_model, X_train, y_train, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")


## ROC-AUC Results

In [None]:
from sklearn.metrics import roc_auc_score

# ROC-AUC needs a continuous score rather than hard labels. `clf` is whichever
# model was fitted most recently (the undersampled tree when the notebook is
# run top to bottom).
class_probabilities = clf.predict_proba(X_test)

# Second column = probability of clf.classes_[1]; presumably the positive
# "is behavioral biometric" label -- confirm against the label encoding.
y_pred_proba = class_probabilities[:, 1]

roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.2f}")
