In [1]:
import data_sanitize
import importlib
importlib.reload(data_sanitize)
import pandas as pd

traces = data_sanitize.get_traces()
training_data = data_sanitize.get_training_data(traces)

Reading traces...: 100%|██████████| 5000/5000 [00:32<00:00, 153.37it/s]
Extracting features...: 100%|██████████| 100/100 [00:11<00:00,  8.81it/s]


In [2]:
# convert the training data which is a numpy array of shape(5000,11) to a pandas dataframe
df = pd.DataFrame(training_data, columns=[
    'grid',
    'num_outgoing',
    'num_incoming',
    'outgoing_ratio',
    'incoming_ratio',
    'outgoing_bytes',
    'incoming_bytes',
    'avg_outgoing_freq',
    'avg_incoming_freq',
    'std_outgoing_freq',
    'std_incoming_freq',
    'avg_outgoing_bytes',
    'avg_incoming_bytes',
    'std_outgoing_bytes',
    'std_incoming_bytes',
    'min_outgoing_bytes',
    'min_incoming_bytes',
    'max_outgoing_bytes',
    'max_incoming_bytes',
    'min_outgoing_freq',
    'min_incoming_freq',
    'max_outgoing_freq',
    'max_incoming_freq'
])
df.head()

Unnamed: 0,grid,num_outgoing,num_incoming,outgoing_ratio,incoming_ratio,outgoing_bytes,incoming_bytes,avg_outgoing_freq,avg_incoming_freq,std_outgoing_freq,...,std_outgoing_bytes,std_incoming_bytes,min_outgoing_bytes,min_incoming_bytes,max_outgoing_bytes,max_incoming_bytes,min_outgoing_freq,min_incoming_freq,max_outgoing_freq,max_incoming_freq
0,56.0,157.0,133.0,0.541379,0.458621,414308.0,40486.0,0.042547,0.050559,0.078255,...,2958.765773,376.784956,52.0,52.0,14532.0,2948.0,2.4e-05,2.3e-05,0.277571,0.320129
1,56.0,183.0,169.0,0.519886,0.480114,417216.0,43430.0,0.182481,0.197864,0.517925,...,2576.4991,351.063178,52.0,52.0,14532.0,2948.0,1.9e-05,3e-05,3.837083,3.837594
2,56.0,141.0,121.0,0.538168,0.461832,404004.0,37718.0,0.042269,0.048917,0.078905,...,3039.149268,386.163097,52.0,52.0,13084.0,2948.0,7e-06,4.1e-05,0.34957,0.349689
3,56.0,193.0,163.0,0.542135,0.457865,416870.0,39366.0,0.079154,0.093722,0.174337,...,2191.360052,350.848653,52.0,52.0,11362.0,2948.0,6e-06,2.5e-05,0.819853,0.863817
4,56.0,152.0,131.0,0.537102,0.462898,404774.0,38238.0,0.038678,0.044894,0.072406,...,2816.24043,377.484669,52.0,52.0,13084.0,2948.0,2.2e-05,1.9e-05,0.283542,0.283656


In [3]:
# what are the unique grids in the dataset?
label_count = len(df['grid'].unique())
label_count

100

In [9]:
X_train.shape

(4000, 22)

In [53]:
# Create a grid search for different tebnsorflow models with different number of layers and neurons
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

X = df.drop('grid', axis=1)
y = df['grid']

# stratify the data to ensure that the training and testing sets have the same distribution of grids
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# validation data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print("Before feature selection: ", X_train.shape)
selector = SelectFromModel(RandomForestClassifier().fit(X_train, y_train), prefit=True)
X_train = selector.transform(X_train)
X_val = selector.transform(X_val)
X_test = selector.transform(X_test)
print("After feature selection: ", X_train.shape)


model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(label_count, activation='softmax'),
])
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['sparse_categorical_accuracy'])
model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val))

Before feature selection:  (3200, 22)
After feature selection:  (3200, 10)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/10

<keras.callbacks.History at 0x45fe4ea40>

In [54]:
model.evaluate(X_test, y_test)



[1.475314736366272, 0.6380000114440918]

In [50]:
# Create a grid search for different tebnsorflow models with different number of layers and neurons
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from tensorflow.keras.models import Sequential
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
# use multiple cores
from joblib import Parallel, delayed
import multiprocessing

X = df.drop('grid', axis=1)
y = df['grid']

# stratify the data to ensure that the training and testing sets have the same distribution of grids
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
# split the train into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

class SavedModels:
    accuracy = 0
    model = None
    layers = None

# number of cores
num_cores = multiprocessing.cpu_count()
best_models = []
for i in range(num_cores):
    best_models.append(SavedModels())


# training one layer
def train_model(layers):
    # create model
    model = Sequential()
    for layer in layers:
        type = layer[0]
        value = layer[1]
        if type == 'dense':
            model.add(Dense(value, activation='relu'))
        elif type == 'dropout':
            model.add(Dropout(value))
        elif type == 'batchnorm':
            model.add(BatchNormalization())
    model.add(Dense(label_count, activation='softmax'))

    # compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['sparse_categorical_accuracy'])

    # train model
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

    # evaluate the model
    _, accuracy = model.evaluate(X_val, y_val, verbose=0)
    return model, accuracy

def train_chunk(chunk):
    bestModel = SavedModels()
    for i, layers in enumerate(chunk):
        print(f"Training model {i} of {len(chunk)}")
        model, accuracy = train_model(layers)
        print(f"Layer: {layers}, Accuracy: {accuracy}")
        if accuracy > bestModel.accuracy:
            bestModel.accuracy = accuracy
            bestModel.model = model
            bestModel.layers = layers
    return bestModel.accuracy, bestModel.layers

In [51]:
import numpy as np
import itertools
# number of cores
num_cores = multiprocessing.cpu_count()

layer_options = ['dense', 'batchnorm', 'dropout']
layer_values = [16, 32, 64, 128]
layers_list = []
# create the list of all possible ways to create a model with 1-6 layers
# length 1: 4 possible models [(dense, 16)], [(dense, 32)], [(dense, 64)], [(dense, 128)]
# length 2: 16 possible models: [(dense 16), (dense 16)], [(dense 16), (dense 32)], [(dense 16), (dense 64)], [(dense 16), (dense 128)], [(dense 32), (dense 16)], [(dense 32), (dense 32)], [(dense 32), (dense 64)], [(dense 32), (dense 128)], [(dense 64), (dense 16)], [(dense 64), (dense 32)], [(dense 64), (dense 64)], [(dense 64), (dense 128)], [(dense 128), (dense 16)], [(dense 128), (dense 32)], [(dense 128), (dense 64)], [(dense 128), (dense 128)]
# and so on
for i in range(1, 5):
    temp = list(itertools.product(layer_options, layer_values, repeat=i))
    for t in temp:
        tuples = []
        skip = False
        for i in range(0, len(t), 2):
            if(t[i] == 'dropout'):
                if(t[i+1] == 16):
                    tuples.append((t[i], 0.1))
                elif(t[i+1] == 32):
                    tuples.append((t[i], 0.2))
                elif(t[i+1] == 64):
                    tuples.append((t[i], 0.3))
                else:
                    skip = True
                    break
            elif(t[i] == 'batchnorm'):
                if (t[i + 1] == 16):
                    tuples.append((t[i], 0))
                else:
                    skip = True
                    break
            else:
                tuples.append((t[i], t[i+1]))
        if skip:
            continue
        if len(tuples) == 1:
            if tuples[0][0] == 'dropout' or tuples[0][0] == 'batchnorm':
                continue
        elif len(tuples) == 0:
            continue
        layers_list.append(tuples)

In [52]:
# Divide the layers list into equal chunks
layers_chunks = np.array_split(layers_list, num_cores)

  result = getattr(asarray(obj), method)(*args, **kwds)


In [53]:

# Each core will train ONE chunk
best_models = Parallel(n_jobs=num_cores)(delayed(train_chunk)(chunk) for chunk in layers_chunks)

Training model 0 of 468
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Training model 0 of 468
Training model 0 of 467
Training model 0 of 468
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Training model 0 of 468
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Training model 0 of 467
Training model 0 of 468
Training model 0 of 468
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Training model 0 of 467
Training model 0 of 467
Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB

Metal device set to: A

2023-05-22 00:12:41.490181: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.554380: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.583308: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.589986: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.590110: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.590700: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.595953: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.600753: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-05-22 00:12:41.607374: W tensorflow

Layer: [('dense', 16)], Accuracy: 0.5412499904632568
Training model 1 of 468
Layer: [('dropout', 0.2), ('dense', 64), ('dense', 16)], Accuracy: 0.3675000071525574
Training model 1 of 468
Layer: [('dropout', 0.1), ('dense', 64), ('dense', 16), ('dropout', 0.3)], Accuracy: 0.5174999833106995
Training model 1 of 467
Layer: [('dropout', 0.3), ('dense', 16), ('dropout', 0.1), ('dropout', 0.1)], Accuracy: 0.41749998927116394
Training model 1 of 467
Layer: [('dropout', 0.2), ('dense', 32), ('dense', 128), ('dense', 64)], Accuracy: 0.18125000596046448
Training model 1 of 467
Layer: [('dense', 32), ('batchnorm', 0), ('dropout', 0.3), ('dense', 16)], Accuracy: 0.5062500238418579
Training model 1 of 468
Layer: [('dense', 128), ('dense', 128), ('batchnorm', 0), ('dense', 16)], Accuracy: 0.19875000417232513
Training model 1 of 468
Layer: [('batchnorm', 0), ('dense', 64), ('dropout', 0.2), ('batchnorm', 0)], Accuracy: 0.5887500047683716
Training model 1 of 467
Layer: [('dense', 64), ('batchnorm', 0)

In [None]:
for accuracy,layers in best_models:
    print(f"Accuracy: {accuracy}, Layers: {layers}")
    # print(model.model.evaluate(X_test, y_test, verbose=0))