In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import json
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

In [2]:
# Load the JSON data
with open('../data/extracted_and_reformatted_data.json', 'r') as file:
    json_data = json.load(file)

In [3]:
'''
Preprocess the data
''' 

# Convert the JSON data to a DataFrame
df = pd.DataFrame(json_data)

fixed_length = 50

def convert_and_pad_or_truncate(arr, length):
    arr = np.fromstring(arr.strip('[]'), sep=',').tolist()
    if len(arr) > length:
        return arr[:length]
    elif len(arr) < length:
        return np.pad(arr, (0, length - len(arr)), 'constant')
    return arr

# Apply padding/truncation to each operationOutput
df['operationOutput'] = df['operationOutput'].apply(lambda x: convert_and_pad_or_truncate(x, fixed_length))

# Determine the plaform of the data
def determine_platform(platform):
    if 'Linux' in platform:
        return 'Linux'
    elif 'Win' in platform:
        return 'Windows'
    elif 'iPad' in platform or 'iPhone' in platform:
        # return 'iOS'
        return 'macOS'
    elif 'Mac' in platform:
        return 'macOS'
    else:
        return 'Other'

# Get the platform from component
df['platform'] = df['components'].apply(lambda x: determine_platform(x['platform']['value']))

# Remove all linux data
df = df[df['platform'] != 'Linux']

# Compute summary statistics for each operationOutput
df['mean'] = df['operationOutput'].apply(np.mean)
df['std'] = df['operationOutput'].apply(np.std)
df['min'] = df['operationOutput'].apply(np.min)
df['max'] = df['operationOutput'].apply(np.max)

df

Unnamed: 0,entryId,browserData,operationDetails,operationOutput,components,platform,mean,std,min,max
1,2501,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_inverse_saw_tooth', '...","[62.0, 23.0, 121.0, 36.0, 20.0, 4975.0, 77.0, ...","{'fonts': {'value': ['Arabic Typesetting', 'Ba...",Windows,744.6000,1783.131650,0.000,9303.000000
3,31066,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[6.714999675750732, 19.454999826848507, 17.815...","{'fonts': {'value': [], 'duration': 102}, 'dom...",macOS,16.9993,17.302972,3.610,73.980000
6,32059,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[38.16500003449619, 11.799999978393316, 37.215...","{'fonts': {'value': [], 'duration': 46}, 'domB...",macOS,63.0564,36.484954,5.990,194.470000
7,26061,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[134.0100000379607, 321.53999991714954, 753.10...","{'fonts': {'value': [], 'duration': 2343}, 'do...",Windows,894.4812,516.625525,97.235,2533.650000
8,52602,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[31.884999945759773, 1512.275000102818, 2693.9...","{'fonts': {'value': [], 'duration': 2415}, 'do...",Windows,1345.6076,2604.141791,0.000,12178.610000
...,...,...,...,...,...,...,...,...,...,...
54022,23278,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[14.710000017657876, 88.61500001512468, 29.815...","{'fonts': {'value': [], 'duration': 196}, 'dom...",macOS,229.7320,230.288465,6.165,1123.255000
54023,29869,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[16.580000519752502, 8.229999803006649, 279.88...","{'fonts': {'value': [], 'duration': 2569}, 'do...",Windows,159.9590,124.756627,8.230,501.809999
54024,43047,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[1280.7550001889467, 1033.9449997991323, 632.3...","{'fonts': {'value': [], 'duration': 2273}, 'do...",Windows,1271.2530,1595.840569,0.000,5705.545000
54026,15611,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[32.44499955326319, 26.405000127851963, 49.999...","{'fonts': {'value': [], 'duration': 66}, 'domB...",macOS,34.4791,36.735076,4.100,222.475000


In [6]:
'''
Get number of classes and balance the data
'''

# Get the number of classes
num_classes = df['platform'].nunique()

# find min number of samples in the classes
min_samples = df['platform'].value_counts().min()
print("Min Samples:",min_samples)
# Balance the data
df = df.groupby('platform').apply(lambda x: x.sample(n=min_samples)).reset_index(drop=True)

Min Samples: 16818


In [7]:
'''
Feature Engineering
'''

X = np.vstack(df['operationOutput'].values)
X = np.hstack([X, df[['mean', 'std', 'min', 'max']].values])
y = df['platform']

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [8]:
'''
Encode labels
'''

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Classes: {label_encoder.classes_}")

Classes: ['Windows' 'macOS']


In [9]:
'''
Split the data
'''

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [10]:
'''
Build Model
'''

model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define early stopping and learning rate reduction
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
'''
Train the model
'''
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping, reduce_lr], verbose=2)

Epoch 1/100
673/673 - 8s - 11ms/step - accuracy: 0.4990 - loss: 4.2678 - val_accuracy: 0.5098 - val_loss: 1.7454 - learning_rate: 0.0010
Epoch 2/100
673/673 - 3s - 5ms/step - accuracy: 0.4987 - loss: 1.1052 - val_accuracy: 0.5046 - val_loss: 0.7995 - learning_rate: 0.0010
Epoch 3/100
673/673 - 3s - 5ms/step - accuracy: 0.4968 - loss: 0.7471 - val_accuracy: 0.5015 - val_loss: 0.7237 - learning_rate: 0.0010
Epoch 4/100
673/673 - 3s - 5ms/step - accuracy: 0.5026 - loss: 0.7244 - val_accuracy: 0.5030 - val_loss: 0.7483 - learning_rate: 0.0010
Epoch 5/100
673/673 - 3s - 5ms/step - accuracy: 0.5087 - loss: 0.7220 - val_accuracy: 0.5091 - val_loss: 0.7217 - learning_rate: 0.0010
Epoch 6/100
673/673 - 3s - 5ms/step - accuracy: 0.4989 - loss: 0.7285 - val_accuracy: 0.4861 - val_loss: 0.7363 - learning_rate: 0.0010
Epoch 7/100
673/673 - 3s - 5ms/step - accuracy: 0.5033 - loss: 0.7301 - val_accuracy: 0.4905 - val_loss: 0.7250 - learning_rate: 0.0010
Epoch 8/100
673/673 - 3s - 5ms/step - accuracy:

In [12]:
'''
Test accuracy
'''
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(y_test, y_pred_classes)
print(f"Accuracy: {accuracy * 100:.2f}%")

[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Accuracy: 49.46%
