<a href="https://colab.research.google.com/github/bhattacharjee/msc-ai-project/blob/main/DeepANN/deep_ann_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
# Selects the dtype-downcasting branch in reduce_df_size (float64 -> float32 when True).
USE_FLOAT32 = False
# When True, load_datasets passes every loaded frame through reduce_df_size.
REDUCE_DF_SIZE = True
# Number of epochs passed to model.fit in Model.train_model.
NUM_EPOCHS = 1
# Mini-batch size passed to model.fit in Model.train_model.
BATCH_SIZE = 32
# When False, load_datasets skips files whose path starts with "MSCPROJDATA/n1".
INCLUDE_NAPIERONE = True

!pip install tensorflow-addons

import tensorflow as tf
from keras.utils import np_utils
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import logging, sys, random, glob
from google.colab import drive
from functools import lru_cache
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tensorflow import keras as K
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model
import IPython

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

import gc
import tensorflow_addons as tfa

# Mount Google Drive (Colab only); later cells copy the dataset from, and save results to, /content/drive/MyDrive.
drive.mount("/content/drive")

def set_random_seed(seed=1):
    """Seed NumPy, Python's ``random`` module and TensorFlow with the same value.

    Parameters
    ----------
    seed : int, optional
        Value handed to all three generators. Defaults to 1, which is the
        value this helper always used before it took a parameter.
    """
    np.random.seed(seed)
    random.seed(seed)
    tf.random.set_seed(seed)

# Route log records of level INFO and above from the root logger to stdout.
root = logging.getLogger()
root.setLevel(logging.INFO)

# Re-running this cell used to stack one more StreamHandler on the root logger
# each time, so every record was printed once per execution. Handlers installed
# by an earlier execution are recognised by name and removed first.
_NOTEBOOK_HANDLER_NAME = "notebook-stdout"
for _stale_handler in [h for h in root.handlers if h.get_name() == _NOTEBOOK_HANDLER_NAME]:
    root.removeHandler(_stale_handler)

handler = logging.StreamHandler(sys.stdout)
handler.set_name(_NOTEBOOK_HANDLER_NAME)
handler.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
root.addHandler(handler)

# Copy the dataset directory from Drive into the working directory, but only when it exists on Drive and no local copy is present yet.
!if [[ -d /content/drive/MyDrive/MSCPROJDATA && ! -d MSCPROJDATA ]]; then cp -r /content/drive/MyDrive/MSCPROJDATA .; fi

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
 def plot_history(history):
    """Plot every training metric together with its validation curve, one subplot per metric.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` attribute that maps metric names to
        per-epoch value lists, e.g. the value ``model.fit`` returns in
        ``Model.train_model``. Keys starting with ``val_`` are drawn on the
        subplot of the metric they belong to.

    Notes
    -----
    Sets ``figure.figsize`` and ``font.size`` in the global matplotlib
    ``rcParams`` (as it always did), so later figures inherit them.
    Returns ``None``; nothing is drawn when no training metric is present.
    """
    import matplotlib.pyplot as plt

    recorded = {str(k): v for k, v in history.history.items()}
    # Pick training metrics by name instead of "first half of the keys": the
    # half split breaks as soon as any key has no val_ twin.
    keys = [k for k in recorded if not k.startswith("val_")]
    if not keys:
        return

    length = 10
    height = length * 0.5 * len(keys)
    plt.rcParams["figure.figsize"] = (length, height)
    plt.rcParams['font.size'] = 15

    # squeeze=False keeps `ax` two-dimensional even for a single metric;
    # without it subplots(1) returns a bare Axes and indexing it fails.
    fig, ax = plt.subplots(len(keys), squeeze=False)

    for i, key in enumerate(keys):
        val_key = f"val_{key}"
        panel = ax[i][0]
        panel.plot(recorded[key], label=key)
        if val_key in recorded:
            panel.plot(recorded[val_key], label=val_key)
        panel.legend()

    plt.show()

In [56]:
# Names of the feature groups; they match the keys returned by get_columns_from_df, apart from its "all_columns" entry.
dataset_names = ["baseline", "advanced-only", "fourier-only", "baseline-and-fourier", "advanced-and-fourier", "advanced", "fourier"]
def _without_markers(columns, markers):
    """Return the columns, in order, whose name contains none of the marker substrings."""
    return [c for c in columns if not any(marker in c for marker in markers)]


def _ordered_union(*column_lists):
    """Concatenate the lists, dropping repeats while keeping first-seen order."""
    return list(dict.fromkeys(c for columns in column_lists for c in columns))


def get_columns_from_df(thisdf):
    """Group the column names of ``thisdf`` into the feature sets used for training.

    Columns are selected purely by substrings of their names:

    * baseline: starts with ``baseline`` and contains none of ``head``,
      ``tail``, ``filesize``, ``begin``, ``end``.
    * advanced-only: contains ``advanced`` and none of ``begin``, ``end``,
      ``head``, ``tail``, ``start``.
    * fourier-only: contains both ``fourier`` and ``1byte``, and none of
      ``value``, ``begin``, ``end``, ``head``, ``tail``, ``start``.

    Parameters
    ----------
    thisdf : object with a ``columns`` iterable of ``str`` (a DataFrame).

    Returns
    -------
    dict
        Maps ``"baseline"``, ``"advanced-only"``, ``"fourier-only"``,
        ``"baseline-and-fourier"``, ``"advanced-and-fourier"``, ``"advanced"``
        (advanced-only plus baseline), ``"fourier"`` (advanced plus
        fourier-only) and ``"all_columns"`` (every column, unfiltered) to
        lists of column names.

    Notes
    -----
    The combined lists used to be built with ``list(set(...))``, whose order
    changes from one interpreter run to the next for strings. They are now
    de-duplicated in first-seen order so the feature order is reproducible.
    """
    all_columns = [c for c in thisdf.columns]

    baseline_columns = _without_markers(
        [c for c in all_columns if c.startswith('baseline')],
        ("head", "tail", "filesize", "begin", "end"),
    )

    advanced_columns_only = _ordered_union(_without_markers(
        [c for c in all_columns if "advanced" in c],
        ("begin", "end", "head", "tail", "start"),
    ))
    advanced_columns = _ordered_union(advanced_columns_only, baseline_columns)

    fourier_columns_only = _ordered_union(_without_markers(
        [c for c in all_columns if "fourier" in c and "1byte" in c],
        ("value", "begin", "end", "head", "tail", "start"),
    ))
    fourier_columns = _ordered_union(advanced_columns, fourier_columns_only)

    baseline_and_fourier = _ordered_union(baseline_columns, fourier_columns_only)
    advanced_and_fourier = _ordered_union(advanced_columns_only, fourier_columns_only)

    return {
        "baseline": baseline_columns,
        "advanced-only": advanced_columns_only,
        "fourier-only": fourier_columns_only,
        "baseline-and-fourier": baseline_and_fourier,
        "advanced-and-fourier": advanced_and_fourier,
        "advanced": advanced_columns,
        "fourier": fourier_columns,
        "all_columns": all_columns,
    }


def reduce_df_size(df):
    """Shrink ``df`` in place by dropping unused columns and downcasting numeric dtypes.

    * Every column whose name contains ``fourier.value``, ``tail``, ``head``,
      ``end`` or ``begin`` is dropped.
    * When ``USE_FLOAT32`` is true, ``float64`` columns become ``float32``.
    * ``int64`` columns are downcast to the smallest integer dtype that still
      holds all their values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Notes
    -----
    Integers used to be forced to ``int8`` with ``astype``, which silently
    wraps any value outside [-128, 127]. That cast was also skipped whenever
    ``USE_FLOAT32`` was on. ``pd.to_numeric(..., downcast="integer")`` keeps
    the values intact and now runs regardless of ``USE_FLOAT32``.
    """
    log = logging.getLogger(__name__)
    original_usage = df.memory_usage().sum() / (1024**2)

    # Collect first, drop once: a single drop avoids rebuilding the frame per column.
    drop_markers = ("fourier.value", "tail", "head", "end", "begin")
    to_drop = [col for col in df.columns if any(marker in col for marker in drop_markers)]
    if to_drop:
        df.drop(columns=to_drop, inplace=True)

    for col in df.columns:
        dtype_name = str(df[col].dtype)
        if USE_FLOAT32 and "float64" in dtype_name:
            df[col] = df[col].astype('float32')
        elif "int64" in dtype_name:
            df[col] = pd.to_numeric(df[col], downcast="integer")

    new_usage = df.memory_usage().sum() / (1024**2)
    log.info("reduce_df_size: %.2f MiB -> %.2f MiB", original_usage, new_usage)
    return df

def call_gc():
    """Run three sweeps of the garbage collector, each collecting generations 0, 1 and 2 in turn."""
    generations = (0, 1, 2)
    sweep = 0
    while sweep < 3:
        for generation in generations:
            gc.collect(generation)
        sweep += 1

@lru_cache(maxsize=5)
def load_datasets():
    """Read every matching parquet file into a labelled, shuffled DataFrame.

    For each file matched under ``MSCPROJDATA`` the frame is read, given an
    ``is_encrypted`` column (1 when the lower-cased path contains ``encr``,
    else 0), optionally passed through ``reduce_df_size`` (when
    ``REDUCE_DF_SIZE`` is true), shuffled and re-indexed. Files whose path
    starts with ``MSCPROJDATA/n1`` are skipped unless ``INCLUDE_NAPIERONE``
    is true.

    Returns
    -------
    dict
        Maps file path to its DataFrame. An empty dict (with a logged
        warning) when nothing matched.

    Notes
    -----
    The result is memoised by ``lru_cache``. Callers that change
    ``INCLUDE_NAPIERONE`` must call ``load_datasets.cache_clear()`` first,
    and all callers share the same returned dict object.

    Files are processed in sorted order and shuffled with a fixed
    ``random_state``, so repeated loads give identical frames.
    """
    loaded = dict()
    print("Reloading Datasets")
    # NOTE(review): "**" is only recursive in glob when it is a whole path
    # component; in "**.parquet.gz" it behaves like "*", so only files directly
    # inside MSCPROJDATA are matched despite recursive=True. Left unchanged so
    # the set of loaded files stays the same - confirm whether
    # "MSCPROJDATA/**/*.parquet.gz" was intended.
    for file in sorted(glob.glob("MSCPROJDATA/**.parquet.gz", recursive=True)):
        if not INCLUDE_NAPIERONE and file.startswith("MSCPROJDATA/n1"):
            continue

        df = pd.read_parquet(file)
        df["is_encrypted"] = 1 if "encr" in file.lower() else 0

        if REDUCE_DF_SIZE:
            df = reduce_df_size(df)

        # Fixed seed: an unseeded shuffle made every later split irreproducible.
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        loaded[file] = df
        call_gc()

    if not loaded:
        logging.getLogger(__name__).warning(
            "load_datasets: no files matched MSCPROJDATA/**.parquet.gz")

    return loaded

@lru_cache(maxsize=5)
def get_columns():
    """Return the feature-group mapping derived from the first loaded dataset (memoised)."""
    # All loaded frames are taken to share one schema, so the first stands in for the rest.
    first_frame = [*load_datasets().values()][0]
    return get_columns_from_df(first_frame)


In [57]:
@lru_cache(maxsize=5)
def get_train_test_df():
    """Concatenate all datasets, min-max scale the features and split 80/20.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split(..., test_size=0.2, random_state=42)``. ``X_*``
        hold every column except ``is_encrypted``, scaled per column to
        [0, 1]; ``y_*`` hold the ``is_encrypted`` label.

    Notes
    -----
    The result is memoised by ``lru_cache``; call
    ``get_train_test_df.cache_clear()`` after the underlying data changes.

    Fixes over the earlier version:

    * ``fillna`` returns a new frame; its result was discarded, so NaNs
      reached the model. It is now assigned back.
    * A constant column has range 0 and scaled to 0/0 = NaN; such columns
      are now divided by 1 and therefore become all zeros.
    * The shuffle uses a fixed ``random_state`` so the split is reproducible.
    """
    frames = load_datasets()
    df = pd.concat([v for v in frames.values()])
    y_cols = "is_encrypted"
    x_cols = [str(c) for c in df.columns if y_cols != str(c)]

    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    df = df.fillna(0.0)

    X = df[x_cols]
    y = df[y_cols]

    # NOTE(review): min/max are taken over all rows before the split, so the
    # test rows influence the scaling of the training rows. Kept as-is so the
    # reported numbers stay comparable; consider fitting on the training part only.
    X = X - X.min()
    col_range = X.max()
    X = X / col_range.where(col_range != 0, 1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    return X_train, X_test, y_train, y_test

# Eagerly load the data and build the split; the result is memoised by lru_cache (the run cells below clear that cache before training).
_ = get_train_test_df()

Reloading Datasets


KeyboardInterrupt: ignored

In [None]:
class Model:
    """Static helpers that build, train and evaluate the dense binary classifier.

    NOTE(review): this class shadows the ``Model`` name imported from
    ``tensorflow.keras.models`` at the top of the notebook.
    """

    @staticmethod
    def print_and_plot(model):
        """Print the layer summary of ``model`` and display its architecture diagram."""
        # summary() prints on its own and returns None; wrapping it in print()
        # added a stray "None" line to the output.
        model.summary()
        plot = tf.keras.utils.plot_model(
            model,
            show_shapes=True,
            expand_nested=True)
        IPython.display.display(plot)

    @staticmethod
    def create(name:str, columns:list):
        """Build and compile the 8-4-2-1 dense network for the given feature columns.

        Parameters
        ----------
        name : str
            Feature-group name. Kept for interface compatibility; not used here.
        columns : list
            Feature column names. Only their count is used, as the input width.

        Returns
        -------
        The compiled ``tf.keras.Sequential`` model (binary cross-entropy loss,
        Adam optimizer; accuracy, precision, recall and F1 metrics).
        """
        # The input width equals the number of selected columns, so there is no
        # need to load and copy the whole training matrix just to read its shape.
        input_dim = len(columns)

        model = tf.keras.Sequential(
            [
                layers.Dense(8, input_dim=input_dim, activation='relu', name="Dense-8"),
                layers.Dropout(0.2),
                layers.Dense(4, activation='relu', name="Dense-4"),
                layers.Dropout(0.2),
                layers.Dense(2, activation='relu', name="Dense-2"),
                layers.Dense(1, activation='sigmoid'),
            ]
        )

        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=[
                tf.keras.metrics.BinaryAccuracy(name='accuracy'),
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
                # Without a threshold the addon metric marks the per-row maximum
                # as the positive class; with a single output that is always
                # "positive", which makes the score meaningless.
                tfa.metrics.F1Score(num_classes=1, threshold=0.5, name='f1_score'),
            ])

        return model

    @staticmethod
    def train_model(name):
        """Train the network on one feature group and score it on the test split.

        Parameters
        ----------
        name : str
            Key into the mapping returned by ``get_columns()``.

        Returns
        -------
        tuple
            ``(name, model, history, acc, f1, prec, rec, auc)``. ``history`` is
            the value returned by ``model.fit``; the metrics are computed with
            scikit-learn on the test split, thresholding predictions at 0.5
            (``auc`` uses the raw scores).
        """
        tf.random.set_seed(42)
        np.random.seed(42)
        X_train, X_test, y_train, y_test = get_train_test_df()

        columns = get_columns()[name]

        X_train = X_train[columns]
        X_test = X_test[columns]

        # Hold out the last fifth of the training rows for validation. Positional
        # slicing from one boundary keeps X and y aligned even when the fifth is
        # empty (the old y_train[-0:] selected every row in that case).
        n_rows = X_train.shape[0]
        n_cross_val = n_rows // 5
        n_train = n_rows - n_cross_val

        X_cross_val = X_train.iloc[n_train:]
        y_cross_val = y_train.iloc[n_train:]

        X_train = X_train.iloc[:n_train]
        y_train = y_train.iloc[:n_train]

        es_clbk = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)

        model = Model.create(name, columns)
        history = model.fit(X_train, y_train,
                            epochs=NUM_EPOCHS,
                            validation_data=(X_cross_val, y_cross_val),
                            callbacks=[es_clbk],
                            batch_size=BATCH_SIZE)

        Model.print_and_plot(model)

        # predict() yields one column of probabilities; flatten it for scikit-learn.
        y_score = model.predict(X_test).ravel()

        fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_true=y_test, y_score=y_score)
        auc = sklearn.metrics.auc(fpr, tpr)
        y_pred = y_score > 0.5

        acc = sklearn.metrics.accuracy_score(y_test, y_pred)
        prec = sklearn.metrics.precision_score(y_test, y_pred)
        rec = sklearn.metrics.recall_score(y_test, y_pred)
        f1 = sklearn.metrics.f1_score(y_test, y_pred)

        return name, model, history, acc, f1, prec, rec, auc

In [None]:
class Runner:
    """Accumulates the scores and training histories of successive training runs."""

    def __init__(self):
        """Start with empty score columns and no recorded histories."""
        score_columns = ("MeasureName", "Accuracy", "F1", "Precision", "Recall", "AUC")
        self.scores_dict = {column: [] for column in score_columns}
        self.training_histories = dict()
        self.scores_df = None

    def _scores_frame(self):
        """Build a fresh DataFrame from the scores collected so far."""
        return pd.DataFrame(self.scores_dict)

    def run(self, name:str):
        """Train on the feature group ``name`` and record its history and scores."""
        banner = f"Running : {name}"
        print(banner)
        print("-" * len(banner))

        (name, model, history,
         accuracy, f1, precision, recall, auc) = Model.train_model(name)

        self.training_histories[name] = history
        new_row = {
            "MeasureName": name,
            "Accuracy": accuracy,
            "F1": f1,
            "Precision": precision,
            "Recall": recall,
            "AUC": auc,
        }
        for column, value in new_row.items():
            self.scores_dict[column].append(value)

    def print(self):
        """Print the score table as plain text."""
        print(self._scores_frame())

    def print_latex(self):
        """Print the score table, rounded to three decimals, as LaTeX without the index."""
        rounded = self._scores_frame().round(3)
        print(rounded.to_latex(index=False))

    def pickle_histories(self, pickle_filename):
        """Pickle the recorded training histories to ``pickle_filename``."""
        import pickle
        with open(pickle_filename, "wb") as sink:
            pickle.dump(self.training_histories, sink, pickle.HIGHEST_PROTOCOL)

    def save_result_df(self, csv_filename):
        """Write the score table to ``csv_filename`` as CSV."""
        self._scores_frame().to_csv(csv_filename)




In [None]:
# Run this if using NapierOne
print("USING NAPIER ONE")
print("******************************************")
print()
load_datasets.cache_clear()
get_columns.cache_clear()
get_train_test_df.cache_clear()

INCLUDE_NAPIERONE = True
runner = Runner()
# Now run all the different combinations of features
_ = [runner.run(str(c)) for c in get_columns().keys() if "all" != str(c).lower()]
runner.pickle_histories("n1.history.pickle")
runner.save_result_df("n1.DenseAnnResult.csv")
!mkdir -p /content/drive/MyDrive/ProjSave
!rm -f /content/drive/MyDrive/ProjSave/n1.history.pickle
!rm -f /content/drive/MyDrive/ProjSave/n1.DenseAnnResult.csv
!cp n1.history.pickle /content/drive/MyDrive/ProjSave
!cp n1.DenseAnnResult.csv /content/drive/MyDrive/ProjSave
!ls -l /content/drive/MyDrive/ProjSave

using_napier_runner = runner

In [None]:
# Run this if not using Napier One
print("NOT USING NAPIER ONE")
print("******************************************")
print()
load_datasets.cache_clear()
get_columns.cache_clear()
get_train_test_df.cache_clear()

INCLUDE_NAPIERONE = False
runner = Runner()
# Now run all the different combinations of features
_ = [runner.run(str(c)) for c in get_columns().keys() if "all" != str(c).lower()

runner.pickle_histories("history.pickle")
runner.save_result_df("DenseAnnResult.csv")
!mkdir -p /content/drive/MyDrive/ProjSave
!rm -f /content/drive/MyDrive/ProjSave/history.pickle
!rm -f /content/drive/MyDrive/ProjSave/DenseAnnResult.csv
!cp history.pickle /content/drive/MyDrive/ProjSave
!cp DenseAnnResult.csv /content/drive/MyDrive/ProjSave
!ls -l /content/drive/MyDrive/ProjSave

not_using_napier_runner = runner

In [None]:
# Report the scores of the run that included NapierOne: plain text, LaTeX, then a rich table.
runner = using_napier_runner
for banner_line in ("USING NAPIER ONE", "******************************************"):
    print(banner_line)
for show_report in (runner.print, runner.print_latex):
    show_report()
    print()
    print("--------------------------------")
    print()
# Last expression of the cell, so the notebook renders it as a table.
pd.DataFrame(runner.scores_dict)

In [None]:
# Report the scores of the run that excluded NapierOne: plain text, LaTeX, then a rich table.
runner = not_using_napier_runner
for banner_line in ("NOT USING NAPIER ONE", "******************************************"):
    print(banner_line)
for show_report in (runner.print, runner.print_latex):
    show_report()
    print()
    print("--------------------------------")
    print()
# Last expression of the cell, so the notebook renders it as a table.
pd.DataFrame(runner.scores_dict)