In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
import joblib
import json
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

In [2]:
# Load the JSON data.
# The encoding is stated explicitly: without it open() falls back to the
# locale's default encoding, so the same file could decode differently
# (or fail to decode) depending on the machine running the notebook.
with open('../data/extracted_and_reformatted_data.json', 'r', encoding='utf-8') as file:
    json_data = json.load(file)

In [3]:
'''
Preprocess the data
''' 

# Number of values every operationOutput sequence is padded/truncated to
fixed_length = 50

# Build the working DataFrame from the loaded JSON data
df = pd.DataFrame(data=json_data)

def convert_and_pad_or_truncate(arr, length):
    """Parse a bracketed, comma-separated numeric string into a fixed-length list.

    Parameters
    ----------
    arr : str
        Text such as ``"[1.5, 2, 3]"``. Leading/trailing square brackets are
        stripped and the remaining comma-separated values are parsed as floats.
    length : int
        Number of elements the result must contain.

    Returns
    -------
    list of float
        The first ``length`` parsed values, right-padded with ``0.0`` when
        fewer than ``length`` values were parsed. The result is always a plain
        list, whichever of truncation, padding or neither was needed, so every
        row ends up with the same element type.
    """
    values = np.fromstring(arr.strip('[]'), sep=',').tolist()
    # Slicing is a no-op for short inputs, and the pad count is zero for long
    # or exact-length inputs, so one expression covers all three cases.
    values = values[:length]
    return values + [0.0] * (length - len(values))

# Normalise every operationOutput to exactly `fixed_length` values
df['operationOutput'] = df['operationOutput'].apply(
    convert_and_pad_or_truncate, args=(fixed_length,)
)

# Determine the platform of the data
def determine_platform(platform, coarse=False):
    """Return the class label to use for a raw platform string.

    Parameters
    ----------
    platform : str
        Platform string, e.g. ``'Linux armv7l'``, ``'Win32'`` or ``'iPad'``.
    coarse : bool, optional
        When ``False`` (the default) the string is returned unchanged. This is
        what the function has effectively always done, because an early
        ``return`` made the grouping logic unreachable. When ``True`` the
        string is collapsed into an operating-system family.

    Returns
    -------
    str
        ``platform`` itself, or, with ``coarse=True``, one of ``'Linux'``,
        ``'Windows'``, ``'iOS'``, ``'macOS'`` or ``'Other'``.
    """
    if not coarse:
        return platform

    # Checked in order; the first family with a matching marker wins.
    families = (
        (('Linux',), 'Linux'),
        (('Win',), 'Windows'),
        (('iPad', 'iPhone'), 'iOS'),
        (('Mac',), 'macOS'),
    )
    for markers, family in families:
        if any(marker in platform for marker in markers):
            return family
    return 'Other'

# Label each row with the platform value stored inside its components entry
df['platform'] = df['components'].apply(
    lambda components: determine_platform(components['platform']['value'])
)

# Add one summary-statistic column per reducer, computed over each
# row's operationOutput values
for stat_name, stat_fn in (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max)):
    df[stat_name] = df['operationOutput'].apply(stat_fn)

df

Unnamed: 0,entryId,browserData,operationDetails,operationOutput,components,platform,mean,std,min,max
0,47810,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'D', 'executableDuration': ...","[400.94999969005585, 77.61000096797943, 317.58...","{'fonts': {'value': [], 'duration': 1009}, 'do...",Linux armv6l,706.4904,770.257254,77.610001,3362.630000
1,2501,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_inverse_saw_tooth', '...","[62.0, 23.0, 121.0, 36.0, 20.0, 4975.0, 77.0, ...","{'fonts': {'value': ['Arabic Typesetting', 'Ba...",Win32,744.6000,1783.131650,0.000000,9303.000000
2,36364,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[11.064999969676137, 4.780000075697899, 4.9649...","{'fonts': {'value': [], 'duration': 69}, 'domB...",Linux armv7l,20.0932,24.710226,3.035000,129.170000
3,31066,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[6.714999675750732, 19.454999826848507, 17.815...","{'fonts': {'value': [], 'duration': 102}, 'dom...",iPad,16.9993,17.302972,3.610000,73.980000
4,16921,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[37.899999995715916, 83.11499998671934, 55.470...","{'fonts': {'value': [], 'duration': 437}, 'dom...",Linux armv7l,510.8120,707.875512,22.840000,3146.630000
...,...,...,...,...,...,...,...,...,...,...
54023,29869,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[16.580000519752502, 8.229999803006649, 279.88...","{'fonts': {'value': [], 'duration': 2569}, 'do...",Win32,159.9590,124.756627,8.230000,501.809999
54024,43047,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[1280.7550001889467, 1033.9449997991323, 632.3...","{'fonts': {'value': [], 'duration': 2273}, 'do...",Win32,1271.2530,1595.840569,0.000000,5705.545000
54025,25375,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[145.22499998565763, 313.1049999501556, 1582.4...","{'fonts': {'value': [], 'duration': 721}, 'dom...",Linux x86_64,911.7509,643.372538,57.295000,3396.915000
54026,15611,Host: frieza.herokuapp.com\nConnection: close\...,"{'executableName': 'exec_step', 'executableDur...","[32.44499955326319, 26.405000127851963, 49.999...","{'fonts': {'value': [], 'duration': 66}, 'domB...",iPhone,34.4791,36.735076,4.100000,222.475000


In [4]:
'''
Get number of classes and balance the data
'''

# Get the number of classes
num_classes = df['platform'].nunique()

# Find the size of the smallest class
min_samples = df['platform'].value_counts().min()
print("Min Samples:", min_samples)

# Balance the data by downsampling every platform to the smallest class size.
# - random_state pins the draw so re-running the notebook trains on the same rows.
# - GroupBy.sample is used instead of groupby(...).apply(lambda x: x.sample(...)):
#   recent pandas versions deprecate apply() operating on the grouping column,
#   and the 'platform' column must survive because it is used as the label later.
df = (
    df.groupby('platform')
      .sample(n=min_samples, random_state=42)
      .reset_index(drop=True)
)

Min Samples: 63


In [5]:
'''
Feature Engineering
'''

# Feature matrix: each row is the operationOutput values followed by the
# four summary statistics (mean, std, min, max) of that same row.
X = np.hstack([
    np.vstack(df['operationOutput'].values),
    df[['mean', 'std', 'min', 'max']].values,
])
y = df['platform']

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so statistics of the test rows influence the
# scaling. Consider fitting on the training rows only and then transforming
# the test rows with that fitted scaler.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

In [6]:
'''
Encode labels
'''

# Learn the platform -> integer mapping, then encode the labels with it
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
print(f"Classes: {label_encoder.classes_}")

Classes: ['Linux aarch64' 'Linux armv5tejl' 'Linux armv6l' 'Linux armv7l'
 'Linux armv8l' 'Linux i386' 'Linux i686' 'Linux x86_64' 'MacIntel'
 'Win32' 'iPad' 'iPhone']


In [7]:
'''
Split the data
'''

# Hold out 20% of the rows for testing.
# The split is stratified on the encoded labels: the dataset was balanced to a
# small, equal number of rows per platform, and a purely random split could
# leave some platforms over- or under-represented in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [11]:
'''
Build Model
'''

# Fully connected classifier: four Dense -> BatchNorm -> Dropout stages of
# decreasing width, followed by a softmax over the encoded platform classes.
# The input is declared with an explicit Input object rather than passing
# `input_shape` to the first Dense layer, which Keras warns against for
# Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(512, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(256, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model. Labels are integer class ids, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Cut the learning rate to 20% after 5 epochs without val_loss improvement.
# min_lr has to sit below the optimizer's starting rate (0.001 for the default
# 'adam'); with min_lr=0.001 the floor equalled the starting rate, so the
# callback could never actually reduce anything.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=1e-6)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [12]:
'''
Train the model
'''
# Fit for up to 100 epochs, holding out 20% of the training data for
# validation; the callbacks watch val_loss. verbose=2 logs one line per epoch.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=2,
)

Epoch 1/100
16/16 - 4s - 229ms/step - accuracy: 0.0911 - loss: 10.1806 - val_accuracy: 0.0909 - val_loss: 9.3214 - learning_rate: 0.0010
Epoch 2/100
16/16 - 0s - 9ms/step - accuracy: 0.0890 - loss: 10.0212 - val_accuracy: 0.0826 - val_loss: 9.2462 - learning_rate: 0.0010
Epoch 3/100
16/16 - 0s - 8ms/step - accuracy: 0.0911 - loss: 9.7696 - val_accuracy: 0.1074 - val_loss: 9.1627 - learning_rate: 0.0010
Epoch 4/100
16/16 - 0s - 8ms/step - accuracy: 0.1222 - loss: 9.5923 - val_accuracy: 0.0992 - val_loss: 9.0616 - learning_rate: 0.0010
Epoch 5/100
16/16 - 0s - 8ms/step - accuracy: 0.1035 - loss: 9.4103 - val_accuracy: 0.0992 - val_loss: 8.9639 - learning_rate: 0.0010
Epoch 6/100
16/16 - 0s - 8ms/step - accuracy: 0.1077 - loss: 9.2057 - val_accuracy: 0.1157 - val_loss: 8.8773 - learning_rate: 0.0010
Epoch 7/100
16/16 - 0s - 8ms/step - accuracy: 0.1159 - loss: 9.0333 - val_accuracy: 0.1405 - val_loss: 8.7883 - learning_rate: 0.0010
Epoch 8/100
16/16 - 0s - 7ms/step - accuracy: 0.1097 - los

In [13]:
'''
Test accuracy
'''
# Predict class probabilities for the test rows, take the most probable
# class per row, and compare against the true encoded labels.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_classes)
print(f"Accuracy: {accuracy * 100:.2f}%")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Accuracy: 9.87%
