<a href="https://colab.research.google.com/github/bhattacharjee/RansomFoRRT/blob/main/compare_attack.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import IPython

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model
from functools import lru_cache

from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/newexperiment/* .

Mounted at /content/drive


In [2]:
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
DO_NOT_USE_BEGIN_END_FEATURES = True
USE_FOURIER_ONLY=False
NUM_EPOCHS = 3

In [3]:
@lru_cache(maxsize=2)
def read_from_parquet():
    df_pt = pd.read_parquet("plaintext.expanded.parquet.gz")
    df_et = pd.read_parquet("expanded.pyencrypted_v1.parquet.gz")
    df_32 = pd.read_parquet("expanded.pyencrypted_v1.b32.parquet.gz")
    df_pt = df_pt[df_pt["extended.extension"] != ".webp"].sample(frac=1).reset_index(drop=True)
    df_et = df_et[df_et["extended.extension"] != ".webp"].sample(frac=1).reset_index(drop=True)
    df_32 = df_32[df_32["extended.extension"] != ".webp"].sample(frac=1).reset_index(drop=True)
    print(df_32["extended.extension"].unique(), df_32.dropna().shape)
    return df_pt, df_et, df_32


def get_datasets():
    global DO_NOT_USE_FOURIER_FEATURES
    global DO_NOT_USE_ADVANCED_FEATURES
    
    df_pt, df_et, df_32 = read_from_parquet()

    columns = [str(c) for c in df_pt.columns]
    columns = [c for c in columns if "fourier.value" not in c]
    columns = [c for c in columns if "extended.extension" != c and "Unnamed" not in c and "filename" not in c]
    columns = [c for c in columns if "filesize" not in c]

    if DO_NOT_USE_BEGIN_END_FEATURES:
        columns = [c for c in columns if "head" not in c and "tail" not in c]
        columns = [c for c in columns if "begin" not in c and "end" not in c]
    
    if DO_NOT_USE_FOURIER_FEATURES:
        columns = [c for c in columns if "fourier" not in c]
    if DO_NOT_USE_ADVANCED_FEATURES:
        columns = [c for c in columns if "advanced" not in c]
    if USE_FOURIER_ONLY:
        columns = [c for c in columns if "fourier" in c]
    
    df_pt = df_pt[columns]
    df_et = df_et[columns]
    df_32 = df_32[columns]
    
    combined = pd.concat([df_pt, df_et, df_32])
    themax = combined.max()
    themin = combined.min()
    del combined
    
    df_pt = (df_pt - themin) / (themax - themin)
    df_et = (df_et - themin) / (themax - themin)
    df_32 = (df_32 - themin) / (themax - themin)
    
    
    df_pt["is_encrypted"] = 0
    df_et["is_encrypted"] = 1
    df_32["is_encrypted"] = 1
    
    nrows = df_pt.shape[0]
    val_samples = int(nrows * 0.1)
    test_samples = int(nrows * 0.1)
    train_samples = nrows - val_samples - test_samples

    train_df = pd.concat([df_pt.head(train_samples), df_et.head(train_samples)]).sample(frac=1).reset_index(drop=True)

    test_df = pd.concat([df_pt.head(train_samples + test_samples).tail(test_samples), df_et.head(train_samples + test_samples).tail(test_samples)]).sample(frac=1).reset_index(drop=True)
    val_df = pd.concat([df_pt.tail(val_samples), df_et.tail(val_samples)]).sample(frac=1).reset_index(drop=True)
    
    test_df_32 = pd.concat([df_pt.head(train_samples + test_samples).tail(test_samples), df_32.head(test_samples)]).sample(frac=1).reset_index(drop=True)
    test_df_32 = test_df_32.dropna()

    train_df = train_df.dropna()
    test_df = test_df.dropna()
    val_df = val_df.dropna()

    return df_pt, df_et, df_32, train_df, val_df, test_df, test_df_32
    

In [4]:
class Model:
    def create(X):
        if X.shape[1] >= 160:
            model = tf.keras.Sequential(
                [
                    layers.Dense(169, input_dim=X.shape[1], activation='relu'),
                    layers.Dropout(0.2),
                    layers.Dense(80, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(40, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(20, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(10, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(5, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(2, activation='gelu'),
                    layers.Dense(1, activation='sigmoid')
                ]
            )
        elif X.shape[1] >= 80:
            model = tf.keras.Sequential(
                [
                    layers.Dense(80, input_dim=X.shape[1], activation='relu'),
                    layers.Dropout(0.2),
                    layers.Dense(40, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(20, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(10, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(5, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(2, activation='gelu'),
                    layers.Dense(1, activation='sigmoid')
                ]
            )
        elif X.shape[1] >= 40:
            model = tf.keras.Sequential(
                [
                    layers.Dense(40, input_dim=X.shape[1], activation='relu'),
                    layers.Dropout(0.2),
                    layers.Dense(20, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(10, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(5, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(2, activation='gelu'),
                    layers.Dense(1, activation='sigmoid')
                ]
            )
        elif X.shape[1] >= 35:
            model = tf.keras.Sequential(
                [
                    layers.Dense(36, input_dim=X.shape[1], activation='relu'),
                    layers.Dropout(0.2),
                    layers.Dense(20, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(10, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(5, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(2, activation='gelu'),
                    layers.Dense(1, activation='sigmoid')
                ]
            )
        elif X.shape[1] >= 4:
            model = tf.keras.Sequential(
                [
                    layers.Dense(5, input_dim=X.shape[1], activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(2, activation='gelu'),
                    layers.Dropout(0.2),
                    layers.Dense(1, activation='sigmoid')
                ]
            )
        model.compile(\
                      loss='binary_crossentropy', \
                      optimizer='adam', \
                      metrics=[tf.keras.metrics.AUC()])
        #Model.print_and_plot(model)
        return model

In [5]:
def split_x_y(df):
    x_columns = [str(c) for c in df.columns if "is_encrypted" != c]
    X, y = df[x_columns].to_numpy(), df["is_encrypted"].to_numpy()
    return X, y



In [6]:
dictdf = {
    "Run": [],
    "auc": [],
    "base32_auc": [],
    "f1": [],
    "base32_f1": [],
    "begin_end_used": [],
}
def run(description, num_epochs=NUM_EPOCHS):
    global dictdf
    df_pt, df_et, df_32, train_df, val_df, test_df, test32_df = get_datasets()

    trainX, trainY = split_x_y(train_df)
    valX, valY = split_x_y(val_df)
    testX, testY = split_x_y(test_df)
    model = Model.create(trainX)

    print(f"Shape = {trainX.shape}")

    es_callback = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        min_delta=0,
        patience=5,
        verbose=1,
        mode="auto",
        baseline=None,
        restore_best_weights=True,
    )
    hist = model.fit(trainX, trainY,
                    epochs=num_epochs,
                    validation_data=(valX, valY),
                    batch_size=16,
                    callbacks=[es_callback])
    
    test_predict_y = model.predict(testX)
    test_auc = roc_auc_score(testY, test_predict_y)
    test_f1 = f1_score(testY, test_predict_y > 0.5)
    print()
    print()
    print(description)
    print()
    print(f"With test files: AUC = {test_auc}")
    print()


    nrows = df_pt.shape[0]
    val_samples = int(nrows * 0.1)
    test_samples = int(nrows * 0.1)
    train_samples = nrows - val_samples - test_samples

    #test_df_32 = pd.concat([df_pt.head(60000).tail(20000), df_et.head(60000).tail(20000), df_32.head(60000).tail(20000)]).sample(frac=1).reset_index(drop=True)
    test_df_32 = pd.concat([df_pt.head(train_samples).tail(test_samples), df_32.head(train_samples).tail(test_samples)]).sample(frac=1).reset_index(drop=True)
    test_df_32 = test_df_32.dropna()
    test32X, test32Y = split_x_y(test_df_32)
    
    test32_predict_y = model.predict(test32X)
    test32_auc = roc_auc_score(test32Y, test32_predict_y)
    test32_f1 = f1_score(test32Y, test32_predict_y > 0.5)
    print(f"With base32 encoded encrypted files: AUC = {test32_auc}")

    dictdf["Run"].append(description)
    dictdf["f1"].append(test_f1)
    dictdf["auc"].append(test_auc)
    dictdf["base32_f1"].append(test32_f1)
    dictdf["base32_auc"].append(test32_auc)
    if "begin and end" in description.lower():
        dictdf["begin_end_used"] = 1
    else:
        dictdf["begin_end_used"] = 0

    print('-' * 80)

In [7]:
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = False
run("using all features")


['.txt' '.xml' '.jpg' '.html' '.pdf' '.doc' '.xls' '.ps' '.ppt' '.gif'
 '.swf' '.csv' '.text' '.gz' '.f' '.wp' '.png' '.log' '.pps' '.dbase3'
 '.unk' '.java' '.rtf' '.eps' '.tmp' '.troff' '.kmz' '.hlp' '.sql' '.kml'
 '.fits' '.dwf' '.tex' '.sys' '.sgml' '.xbm' '.pptx' '.fm' '.docx' '.odp'
 '.zip' '.gls' '.bmp' '.xlsx' '.squeak' '.pub' '.exported'] (49228, 675)
Shape = (78768, 81)
Epoch 1/3
Epoch 2/3
Epoch 3/3


using all features

With test files: AUC = 0.9966031034978662

With base32 encoded encrypted files: AUC = 0.9247793824210302
--------------------------------------------------------------------------------


In [8]:
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = False
run("Using baseline and advanced")

Shape = (78768, 45)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline and advanced

With test files: AUC = 0.9932523143236852

With base32 encoded encrypted files: AUC = 0.8034090032875308
--------------------------------------------------------------------------------


In [9]:
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
USE_FOURIER_ONLY=True
run("Using Fourier only")
USE_FOURIER_ONLY=False

Shape = (78768, 36)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using Fourier only

With test files: AUC = 0.9947868173824543

With base32 encoded encrypted files: AUC = 0.7893308716340619
--------------------------------------------------------------------------------


In [10]:
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline and Fourier")

Shape = (78768, 40)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline and Fourier

With test files: AUC = 0.9984393268016408

With base32 encoded encrypted files: AUC = 0.1903753202539874
--------------------------------------------------------------------------------


In [11]:
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline only")

Shape = (78768, 4)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline only

With test files: AUC = 0.970714148436041

With base32 encoded encrypted files: AUC = 0.6423038903027002
--------------------------------------------------------------------------------


In [12]:
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = False
run("using all features (begin and end)")




Shape = (74607, 169)
Epoch 1/3
Epoch 2/3
Epoch 3/3


using all features (begin and end)

With test files: AUC = 0.9990375090151243

With base32 encoded encrypted files: AUC = 0.9687605529087553
--------------------------------------------------------------------------------


In [13]:
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = False
run("Using baseline and advanced (begin and end)")



Shape = (74607, 133)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline and advanced (begin and end)

With test files: AUC = 0.9990246665628183

With base32 encoded encrypted files: AUC = 0.9402381049919155
--------------------------------------------------------------------------------


In [14]:
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline and Fourier (begin and end)")

Shape = (74607, 46)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline and Fourier (begin and end)

With test files: AUC = 0.9994101711113562

With base32 encoded encrypted files: AUC = 0.9959762162579106
--------------------------------------------------------------------------------


In [15]:
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = True
DO_NOT_USE_ADVANCED_FEATURES = True
run("Using baseline only (begin and end)")

Shape = (74607, 10)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using baseline only (begin and end)

With test files: AUC = 0.9985076654657589

With base32 encoded encrypted files: AUC = 0.9637419816346633
--------------------------------------------------------------------------------


In [16]:
DO_NOT_USE_BEGIN_END_FEATURES = False
DO_NOT_USE_FOURIER_FEATURES = False
DO_NOT_USE_ADVANCED_FEATURES = True
USE_FOURIER_ONLY=True
run("Using Fourier only (begin and end)")
USE_FOURIER_ONLY=False

Shape = (78768, 36)
Epoch 1/3
Epoch 2/3
Epoch 3/3


Using Fourier only (begin and end)

With test files: AUC = 0.9954007011616075

With base32 encoded encrypted files: AUC = 0.8668551838588523
--------------------------------------------------------------------------------


In [17]:
pd.DataFrame(dictdf)

Unnamed: 0,Run,auc,base32_auc,f1,base32_f1,begin_end_used
0,using all features,0.996603,0.924779,0.988254,0.0,1
1,Using baseline and advanced,0.993252,0.803409,0.95835,0.0,1
2,Using Fourier only,0.994787,0.789331,0.978302,0.0,1
3,Using baseline and Fourier,0.998439,0.190375,0.986257,0.0,1
4,Using baseline only,0.970714,0.642304,0.941448,0.0,1
5,using all features (begin and end),0.999038,0.968761,0.995953,0.0,1
6,Using baseline and advanced (begin and end),0.999025,0.940238,0.992339,0.0,1
7,Using baseline and Fourier (begin and end),0.99941,0.995976,0.994946,0.98792,1
8,Using baseline only (begin and end),0.998508,0.963742,0.993339,0.029621,1
9,Using Fourier only (begin and end),0.995401,0.866855,0.972182,0.0,1


In [18]:
print(pd.DataFrame(dictdf).to_latex(index=False))

\begin{tabular}{lrrrrr}
\toprule
                                        Run &      auc &  base32\_auc &       f1 &  base32\_f1 &  begin\_end\_used \\
\midrule
                         using all features & 0.996603 &    0.924779 & 0.988254 &   0.000000 &               1 \\
                Using baseline and advanced & 0.993252 &    0.803409 & 0.958350 &   0.000000 &               1 \\
                         Using Fourier only & 0.994787 &    0.789331 & 0.978302 &   0.000000 &               1 \\
                 Using baseline and Fourier & 0.998439 &    0.190375 & 0.986257 &   0.000000 &               1 \\
                        Using baseline only & 0.970714 &    0.642304 & 0.941448 &   0.000000 &               1 \\
         using all features (begin and end) & 0.999038 &    0.968761 & 0.995953 &   0.000000 &               1 \\
Using baseline and advanced (begin and end) & 0.999025 &    0.940238 & 0.992339 &   0.000000 &               1 \\
 Using baseline and Fourier (begin and end

In [19]:
print(pd.DataFrame(dictdf).round(3).to_latex(index=False))

\begin{tabular}{lrrrrr}
\toprule
                                        Run &   auc &  base32\_auc &    f1 &  base32\_f1 &  begin\_end\_used \\
\midrule
                         using all features & 0.997 &       0.925 & 0.988 &      0.000 &               1 \\
                Using baseline and advanced & 0.993 &       0.803 & 0.958 &      0.000 &               1 \\
                         Using Fourier only & 0.995 &       0.789 & 0.978 &      0.000 &               1 \\
                 Using baseline and Fourier & 0.998 &       0.190 & 0.986 &      0.000 &               1 \\
                        Using baseline only & 0.971 &       0.642 & 0.941 &      0.000 &               1 \\
         using all features (begin and end) & 0.999 &       0.969 & 0.996 &      0.000 &               1 \\
Using baseline and advanced (begin and end) & 0.999 &       0.940 & 0.992 &      0.000 &               1 \\
 Using baseline and Fourier (begin and end) & 0.999 &       0.996 & 0.995 &      0.988 &  

In [20]:
print(pd.DataFrame(dictdf))

                                           Run       auc  base32_auc  \
0                           using all features  0.996603    0.924779   
1                  Using baseline and advanced  0.993252    0.803409   
2                           Using Fourier only  0.994787    0.789331   
3                   Using baseline and Fourier  0.998439    0.190375   
4                          Using baseline only  0.970714    0.642304   
5           using all features (begin and end)  0.999038    0.968761   
6  Using baseline and advanced (begin and end)  0.999025    0.940238   
7   Using baseline and Fourier (begin and end)  0.999410    0.995976   
8          Using baseline only (begin and end)  0.998508    0.963742   
9           Using Fourier only (begin and end)  0.995401    0.866855   

         f1  base32_f1  begin_end_used  
0  0.988254   0.000000               1  
1  0.958350   0.000000               1  
2  0.978302   0.000000               1  
3  0.986257   0.000000               1  
4 

In [21]:
pd.DataFrame(dictdf)

Unnamed: 0,Run,auc,base32_auc,f1,base32_f1,begin_end_used
0,using all features,0.996603,0.924779,0.988254,0.0,1
1,Using baseline and advanced,0.993252,0.803409,0.95835,0.0,1
2,Using Fourier only,0.994787,0.789331,0.978302,0.0,1
3,Using baseline and Fourier,0.998439,0.190375,0.986257,0.0,1
4,Using baseline only,0.970714,0.642304,0.941448,0.0,1
5,using all features (begin and end),0.999038,0.968761,0.995953,0.0,1
6,Using baseline and advanced (begin and end),0.999025,0.940238,0.992339,0.0,1
7,Using baseline and Fourier (begin and end),0.99941,0.995976,0.994946,0.98792,1
8,Using baseline only (begin and end),0.998508,0.963742,0.993339,0.029621,1
9,Using Fourier only (begin and end),0.995401,0.866855,0.972182,0.0,1


In [22]:
pd.DataFrame(dictdf).round(3)

Unnamed: 0,Run,auc,base32_auc,f1,base32_f1,begin_end_used
0,using all features,0.997,0.925,0.988,0.0,1
1,Using baseline and advanced,0.993,0.803,0.958,0.0,1
2,Using Fourier only,0.995,0.789,0.978,0.0,1
3,Using baseline and Fourier,0.998,0.19,0.986,0.0,1
4,Using baseline only,0.971,0.642,0.941,0.0,1
5,using all features (begin and end),0.999,0.969,0.996,0.0,1
6,Using baseline and advanced (begin and end),0.999,0.94,0.992,0.0,1
7,Using baseline and Fourier (begin and end),0.999,0.996,0.995,0.988,1
8,Using baseline only (begin and end),0.999,0.964,0.993,0.03,1
9,Using Fourier only (begin and end),0.995,0.867,0.972,0.0,1
