<a href="https://colab.research.google.com/github/bhattacharjee/RansomFoRRT/blob/main/compare_attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import IPython

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model
from functools import lru_cache

from google.colab import drive
# Mount the user's Google Drive at /content/drive (uses google.colab, so this
# cell only works inside Colab).
drive.mount('/content/drive')
# Copy every file from the Drive experiment folder into the current working
# directory; get_datasets() below opens its parquet files by bare file name.
!cp /content/drive/MyDrive/newexperiment/* .

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Feature-group switches read by get_datasets(). True means "drop the group".
# Drops every column whose name contains "fourier".
DO_NOT_USE_FOURIER_FEATURES = False
# Drops every column whose name contains "advanced".
DO_NOT_USE_ADVANCED_FEATURES = True
# Drops every column whose name contains "head", "tail", "begin" or "end".
DO_NOT_USE_BEGIN_END_FEATURES = True
# NOTE(review): not referenced by any cell shown here; run() has its own
# num_epochs default of 10 — confirm whether this was meant to be passed in.
NUM_EPOCHS = 3

In [3]:
@lru_cache(maxsize=1)
def _load_raw_frames():
    """Read the three parquet inputs once and return them row-shuffled.

    Only this expensive, flag-independent step is cached, so every feature
    configuration is evaluated on the same shuffled rows. The returned frames
    are shared between calls and must not be mutated by callers.

    Returns:
        Tuple ``(plaintext, encrypted, base32_encrypted)`` of DataFrames.
    """
    paths = (
        "plaintext.expanded.parquet.gz",
        "expanded.pyencrypted_v1.parquet.gz",
        "expanded.pyencrypted_v1.b32.parquet.gz",
    )
    return tuple(
        pd.read_parquet(path).sample(frac=1).reset_index(drop=True)
        for path in paths
    )


def get_datasets():
    """Build the scaled, labelled frames and train/val/test splits.

    The feature columns are chosen from the module-level switches
    ``DO_NOT_USE_BEGIN_END_FEATURES``, ``DO_NOT_USE_FOURIER_FEATURES`` and
    ``DO_NOT_USE_ADVANCED_FEATURES`` *at call time*. The previous version
    wrapped this whole function in ``lru_cache``; because it takes no
    arguments, the first result was returned forever and later changes to the
    switches were silently ignored.

    Returns:
        Tuple ``(df_pt, df_et, df_32, train_df, val_df, test_df, test_df_32)``.
        The first three are the min-max scaled frames with an ``is_encrypted``
        label column (0 for plaintext, 1 for both encrypted variants); the
        remaining four are shuffled splits with rows containing NaN removed.
    """
    raw_pt, raw_et, raw_32 = _load_raw_frames()

    # Substrings that disqualify a column from being used as a feature.
    excluded = ["fourier.value", "Unnamed", "filename", "filesize"]
    if DO_NOT_USE_BEGIN_END_FEATURES:
        # NOTE(review): "end" is also a substring of "extended", so any column
        # whose name contains "extended" is dropped here as well — confirm
        # that this is intended.
        excluded += ["head", "tail", "begin", "end"]
    if DO_NOT_USE_FOURIER_FEATURES:
        excluded.append("fourier")
    if DO_NOT_USE_ADVANCED_FEATURES:
        excluded.append("advanced")

    columns = [
        name
        for name in (str(col) for col in raw_pt.columns)
        if name != "extended.extension"
        and not any(token in name for token in excluded)
    ]

    df_pt = raw_pt[columns]
    df_et = raw_et[columns]
    df_32 = raw_32[columns]

    # NOTE(review): the scaling statistics are computed over all rows,
    # including those that later end up in the validation and test splits.
    combined = pd.concat([df_pt, df_et, df_32])
    themin = combined.min()
    # A constant column has max == min; dividing by that zero range would turn
    # the whole column into NaN and the dropna() calls below would then discard
    # every row. Use a range of 1 instead so such a column scales to 0.
    value_range = (combined.max() - themin).replace(0, 1)
    del combined

    # The arithmetic creates new frames, so the cached raw frames stay intact.
    df_pt = (df_pt - themin) / value_range
    df_et = (df_et - themin) / value_range
    df_32 = (df_32 - themin) / value_range

    df_pt["is_encrypted"] = 0
    df_et["is_encrypted"] = 1
    df_32["is_encrypted"] = 1

    # Positional splits: rows 0-39999 train, rows 40000-59999 test, last 20000
    # rows validation. Assumes each frame has at least 80000 rows; with fewer,
    # the slices overlap — TODO confirm against the data.
    train_df = pd.concat([df_pt.head(40000), df_et.head(40000)]).sample(frac=1).reset_index(drop=True)
    test_df = pd.concat(
        [df_pt.head(60000).tail(20000), df_et.head(60000).tail(20000)]
    ).sample(frac=1).reset_index(drop=True)
    val_df = pd.concat([df_pt.tail(20000), df_et.tail(20000)]).sample(frac=1).reset_index(drop=True)

    test_df_32 = pd.concat(
        [
            df_pt.head(60000).tail(20000),
            df_et.head(60000).tail(20000),
            df_32.head(60000).tail(20000),
        ]
    ).sample(frac=1).reset_index(drop=True)
    test_df_32 = test_df_32.dropna()

    train_df = train_df.dropna()
    test_df = test_df.dropna()
    val_df = val_df.dropna()

    return df_pt, df_et, df_32, train_df, val_df, test_df, test_df_32
    

In [4]:
class Model:
    """Factory for the binary classifier used in the experiments.

    Note: this class shares its name with ``tensorflow.keras.models.Model``,
    which is imported at the top of the notebook and is shadowed from here on.
    """

    @staticmethod
    def create(X):
        """Build and compile a dense network sized to the number of features.

        Args:
            X: 2-D array-like; only ``X.shape[1]`` (the feature count) is used.

        Returns:
            A compiled ``tf.keras.Sequential`` model with a single sigmoid
            output, binary cross-entropy loss, the Adam optimizer and an AUC
            metric.

        Raises:
            ValueError: if ``X`` has fewer than 4 feature columns. Previously
                this case fell through every branch and failed later with an
                ``UnboundLocalError``.
        """
        n_features = X.shape[1]
        if n_features < 4:
            raise ValueError(
                f"Model.create needs at least 4 input features, got {n_features}"
            )

        # Hidden-layer widths for the deep variants; each is followed by a
        # Dropout(0.2) and the stack ends with Dense(2) -> Dense(1).
        if n_features >= 160:
            # NOTE(review): 169 may be a typo for 160; kept as in the original.
            widths = [169, 80, 40, 20, 10, 5]
        elif n_features >= 80:
            widths = [80, 40, 20, 10, 5]
        elif n_features >= 40:
            widths = [40, 20, 10, 5]
        else:
            widths = None

        if widths is None:
            # Small-input variant: gelu on the first layer and an extra dropout
            # before the output, unlike the deeper stacks.
            stack = [
                layers.Dense(5, input_dim=n_features, activation='gelu'),
                layers.Dropout(0.2),
                layers.Dense(2, activation='gelu'),
                layers.Dropout(0.2),
                layers.Dense(1, activation='sigmoid'),
            ]
        else:
            stack = [
                layers.Dense(widths[0], input_dim=n_features, activation='relu'),
                layers.Dropout(0.2),
            ]
            for width in widths[1:]:
                stack.append(layers.Dense(width, activation='gelu'))
                stack.append(layers.Dropout(0.2))
            stack.append(layers.Dense(2, activation='gelu'))
            stack.append(layers.Dense(1, activation='sigmoid'))

        model = tf.keras.Sequential(stack)
        model.compile(
            loss='binary_crossentropy',
            optimizer='adam',
            metrics=[tf.keras.metrics.AUC()],
        )
        return model

In [5]:
def split_x_y(df):
    """Separate a labelled frame into a feature matrix and a label vector.

    Every column other than ``is_encrypted`` is treated as a feature.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays: the feature columns in their original
        order, and the values of the ``is_encrypted`` column.
    """
    feature_names = []
    for col in df.columns:
        if "is_encrypted" != col:
            feature_names.append(str(col))

    features = df[feature_names].to_numpy()
    labels = df["is_encrypted"].to_numpy()
    return features, labels



In [6]:
def _score_predictions(model, X, y):
    """Return ``(ROC AUC, F1 at a 0.5 threshold)`` for ``model`` on ``(X, y)``."""
    predicted = model.predict(X)
    return roc_auc_score(y, predicted), f1_score(y, predicted > 0.5)


def run(description, num_epochs=10):
    """Train one model on the current feature selection and print its scores.

    The model is fitted on the training split with early stopping on the
    validation loss, then scored twice: on the regular test split, and on a
    set made of plaintext rows plus base32-encoded encrypted rows.

    Args:
        description: Label printed above the results.
        num_epochs: Maximum number of training epochs.
    """
    # Only the frames used below are named; the rest of the tuple is ignored.
    df_pt, _, df_32, train_df, val_df, test_df, _ = get_datasets()

    trainX, trainY = split_x_y(train_df)
    valX, valY = split_x_y(val_df)
    testX, testY = split_x_y(test_df)
    model = Model.create(trainX)

    print(f"Shape = {trainX.shape}")

    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=5,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    model.fit(
        trainX,
        trainY,
        epochs=num_epochs,
        validation_data=(valX, valY),
        batch_size=16,
        callbacks=[es_callback],
    )

    test_auc, test_f1 = _score_predictions(model, testX, testY)
    print(f"\n\n{description}\n")
    print(f"With test files: AUC = {test_auc} , F1 score = {test_f1}")
    print()

    # Unlike the seventh frame returned by get_datasets(), this set leaves out
    # the directly-encrypted rows: it holds only plaintext and base32-encoded
    # encrypted rows, taken from the same row range as the test split.
    test_df_32 = pd.concat(
        [df_pt.head(60000).tail(20000), df_32.head(60000).tail(20000)]
    ).sample(frac=1).reset_index(drop=True)
    test_df_32 = test_df_32.dropna()
    test32X, test32Y = split_x_y(test_df_32)

    test32_auc, test32_f1 = _score_predictions(model, test32X, test32Y)
    print(f"With base32 encoded encrypted files: AUC = {test32_auc} F1 = {test32_f1}")

    print('-' * 80)

In [7]:
# Experiment: baseline + Fourier + advanced features, begin/end features excluded.
# The begin/end switch is set explicitly so this cell's configuration does not
# depend on whether a later cell (which sets it to False) has already been run.
DO_NOT_USE_BEGIN_END_FEATURES = True
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = False
run("using all features")


Shape = (80000, 81)
Epoch 1/10
Epoch 2/10
 340/5000 [=>............................] - ETA: 27s - loss: 0.4099 - auc: 0.8296

KeyboardInterrupt: ignored

In [None]:
# Experiment: baseline + advanced features, begin/end features excluded.
# The begin/end switch is set explicitly so this cell's configuration does not
# depend on whether a later cell (which sets it to False) has already been run.
DO_NOT_USE_BEGIN_END_FEATURES = True
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = False
run("Using baseline and advanced")

In [None]:
# Experiment: baseline + Fourier features, begin/end features excluded.
# The begin/end switch is set explicitly so this cell's configuration does not
# depend on whether a later cell (which sets it to False) has already been run.
DO_NOT_USE_BEGIN_END_FEATURES = True
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline and Fourier")

In [None]:
# Experiment: baseline features only, begin/end features excluded.
# The begin/end switch is set explicitly so this cell's configuration does not
# depend on whether a later cell (which sets it to False) has already been run.
DO_NOT_USE_BEGIN_END_FEATURES = True
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline only")

In [None]:
# Experiment: baseline + Fourier + advanced features, with begin/end features kept.
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = False
run("using all features (begin and end)")


In [None]:
# Experiment: baseline + advanced features, with begin/end features kept.
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = False
run("Using baseline and advanced (begin and end)")

In [None]:
# Experiment: baseline + Fourier features, with begin/end features kept.
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline and Fourier (begin and end)")

In [None]:
# Experiment: baseline features only, with begin/end features kept.
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline only (begin and end)")