## Train with one dataset and test with another

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

DEBUG = False
N_JOBS = 8

In [2]:
!ls

expanded.base32.des3.csv.gz
expanded.des3.csv.gz
expanded.plaintext.base32.csv.gz
expanded.pyencrypted_v1.b32.csv.gz
expanded.pyencrypted_v1.csv.gz
expanded.pyencrypted_v2.base32.csv.gz
expanded.pyencrypted_v2.csv.gz
expanded_encrypted_v3.csv.gz
expanded_encrypted_v3_base32.csv.gz
n1.encrypted.expanded.ransomware.csv.gz
n1.expanded.plaintext.base32.csv.gz
n1.expanded.plaintext.csv.gz
n1.expanded.pyencrypted_v1.base32.csv.gz
n1.expanded.pyencrypted_v1.csv.gz
n1.expanded.pyencrypted_v2.base32.csv.gz
n1.expanded.pyencrypted_v2.csv.gz
n1.expanded.pyencrypted_v3.base32.csv.gz
n1.expanded.pyencrypted_v3.csv.gz
n1.plaintext.base32.csv.gz
n1.plaintext.csv.gz
n1.zip.expanded.encrypted.base32.csv.gz
n1.zip.expanded.encrypted.csv.gz
n1.zip.expanded.encrypted.v2.base32.csv.gz
n1.zip.expanded.encrypted.v2.csv.gz
n1.zip.expanded.plaintext.base32.csv.gz
n1.zip.expanded.plaintext.csv.gz
napierone_1.1.ipynb
plaintext.base32.combined.csv.gz
plaintext.combined.csv.gz
plaintex

In [3]:
def call_gc():
    for i in range(3):
        for j in range(3):
            gc.collect(j)

In [4]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Load all datasets only once
    
    We want to load the datasets only once. Once loaded
    serve from cache
    """
    train_datasets = {}
    test_datasets = {}
    for file in glob.glob("*.csv.gz"):
        print(f"Loading {file}")
        df = pd.read_csv(file)
        df = df.sample(frac=1).reset_index(drop=True)
        df["csv_filename"] = file
        df["is_encrypted"] = 1 if "encr" in file.lower() or "ransomware" in file.lower() else 0
        df["is_ransomware"] = True if "ransomware" in file.lower() else False
        if "ransomware" in file.lower():
            df["is_encrypted"] = True
        if file.startswith("n1."):
            test_datasets[file] = df
        else:
            train_datasets[file] = df
        if DEBUG and len(test_datasets) > 2 and len(train_datasets) > 2:
            break
    return train_datasets, test_datasets

In [6]:
def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    call_gc()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
        
    call_gc()
    
    trainX = traindf[columns].to_numpy() 
    testX = testdf[columns].to_numpy()
    trainY = traindf["is_encrypted"].to_numpy()
    testY = testdf["is_encrypted"].to_numpy()
    
    estimators = [\
                  ('std,', MinMaxScaler()), \
                  (clfname, clf())]
    pipeline = Pipeline(estimators)
    
    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")
    
    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")
    
    return y_pred

In [7]:
def get_results_for_classifier(clf, clfname="clf"):
    print(f"Evaluating : {clf}")
    results = {
        "Feature Set": [],
        "Accuracy": [],
        "F1-Score": [],
        "Precision": [],
        "Recall": []
    }

    traindf, testdf = load_datasets_once()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
    
    if DEBUG:
        qraindf = traindf.head(5000)
        testdf = testdf.head(5000)
    
    testdf_copy = testdf.copy()

    columns = get_columns(traindf)

    for desc, cols in columns.items():
        y_pred = run_model(traindf, testdf, cols, desc, clf, clfname)
        testdf_copy[f"Prediction:{desc}"] = y_pred
    return testdf_copy
    

In [8]:
rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
lr_clf = lambda: LogisticRegression(\
            n_jobs=8, \
            solver='saga', \
            random_state=42, \
            max_iter=1000, \
            multi_class='ovr')
rf_results = get_results_for_classifier(rfc_clf, "Random Forest")
#lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

Evaluating : <function <lambda> at 0x7fb85254ca60>
Loading n1.zip.expanded.plaintext.csv.gz
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading expanded_encrypted_v3.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading n1.encrypted.expanded.ransomware.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading n1.zip.expanded.encrypted.v2.csv.gz
Loading n1.zip.expanded.encrypted.v2.base32.csv.gz
Loading expanded_encrypted_v3_base32.csv.gz
Loading n1.expanded.pyencrypted_v3.base32.csv.gz
Loading n1.zip.expanded.plaintext.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.pyencrypted_v3.c

In [9]:
def custom_result_print(rf_results, match_string, notmatches=None):
    colnames = rf_results.columns
    colnames = [c for c in colnames if "name" in c or "is_encrypted" == c or "Prediction:" in c]

    rf_results2 = rf_results[colnames]

    def is_password_protected(x):
        if notmatches is not None:
            if isinstance(notmatches, list):
                for x1 in notmatches:
                    if x1.lower() in x:
                        return False
            else:
                if notmatches.lower() in x:
                    return False
        if (isinstance(match_string, list)):
            for x1 in match_string:
                if x1.lower() in x:
                    return True
        else:
            if match_string.lower() in x.lower():
                return True
        return False

    ppr = rf_results2[rf_results2["extended.base_filename"].map(is_password_protected)]

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    for colname in ppr.columns:
        if "Prediction" in colname:
            y_pred = ppr[colname]
            y_true = ppr["is_encrypted"]
            f1 = f1_score(y_true, y_pred)
            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            colname = colname.split(":")[1]
            
            dfdict["FeatureSet"].append(colname)
            dfdict["Accuracy"].append(acc)
            dfdict["F1"].append(f1)
            dfdict["Precision"].append(prec)
            dfdict["Recall"].append(recall)
            print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [36]:
print("Office Files: ")
print("------------------------")
rdf = custom_result_print(rf_results, [".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"]).sort_values(by="F1")
print()
print()
print(rdf.to_latex())
rdf

Office Files: 
------------------------
                                               Baseline only: 		 0.707 		 0.655 		19505
                                               Advanced only: 		 0.695 		 0.621 		19505
                                                Fourier only: 		 0.850 		 0.809 		19505
                                        Baseline and Fourier: 		 0.900 		 0.880 		19505
                                        Advanced and Fourier: 		 0.853 		 0.832 		19505
                                       Baseline and Advanced: 		 0.722 		 0.646 		19505
                             Baseline, Advanced, and Fourier: 		 0.865 		 0.845 		19505


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
1 &                    Advanced only &  0.620508 &  0.695040 &   0.682278 &  0.708288 \\
0 &                    Baseline only &  0.655012 &  0.707065 &   0.734135 &  0.681921 \\
5 &            Baseline and Adva

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.620508,0.69504,0.682278,0.708288
0,Baseline only,0.655012,0.707065,0.734135,0.681921
5,Baseline and Advanced,0.646398,0.721637,0.694747,0.750693
2,Fourier only,0.80928,0.849563,0.819409,0.882022
4,Advanced and Fourier,0.832197,0.853288,0.915192,0.799227
6,"Baseline, Advanced, and Fourier",0.845014,0.865327,0.921617,0.815518
3,Baseline and Fourier,0.880031,0.899502,0.920615,0.879335


In [37]:
print("Passsword protected files (excluding archives)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, ["password"], [".7z", ".gz", ".zip"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding archives)
------------------------------------------------
                                               Baseline only: 		 0.609 		 0.498 		7000
                                               Advanced only: 		 0.598 		 0.488 		7000
                                                Fourier only: 		 0.819 		 0.762 		7000
                                        Baseline and Fourier: 		 0.886 		 0.860 		7000
                                        Advanced and Fourier: 		 0.809 		 0.784 		7000
                                       Baseline and Advanced: 		 0.632 		 0.493 		7000
                             Baseline, Advanced, and Fourier: 		 0.840 		 0.815 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.497857 &  0.609488 &   0.571339 &  0.653095 \\
1 &                    Advanced only &  0.487714 &  0.598252 &   0.56496

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.487714,0.598252,0.56496,0.635714
0,Baseline only,0.497857,0.609488,0.571339,0.653095
5,Baseline and Advanced,0.492714,0.63183,0.559596,0.725476
4,Advanced and Fourier,0.783571,0.808736,0.86079,0.762619
2,Fourier only,0.761571,0.819431,0.750942,0.901667
6,"Baseline, Advanced, and Fourier",0.814857,0.840158,0.871546,0.810952
3,Baseline and Fourier,0.859714,0.886316,0.862551,0.911429


In [38]:
print("Passsword protected files (excluding office files)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, ["password"], [".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding office files)
------------------------------------------------
                                               Baseline only: 		 0.600 		 0.480 		1000
                                               Advanced only: 		 0.658 		 0.520 		1000
                                                Fourier only: 		 0.814 		 0.740 		1000
                                        Baseline and Fourier: 		 0.833 		 0.780 		1000
                                        Advanced and Fourier: 		 0.781 		 0.716 		1000
                                       Baseline and Advanced: 		 0.673 		 0.540 		1000
                             Baseline, Advanced, and Fourier: 		 0.789 		 0.730 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.480 &  0.600000 &   0.557143 &  0.650000 \\
1 &                    Advanced only &     0.520 &  0.658120 &   0.5

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.48,0.6,0.557143,0.65
1,Advanced only,0.52,0.65812,0.574627,0.77
5,Baseline and Advanced,0.54,0.673295,0.586634,0.79
4,Advanced and Fourier,0.716,0.780864,0.727011,0.843333
6,"Baseline, Advanced, and Fourier",0.73,0.789392,0.741935,0.843333
2,Fourier only,0.74,0.813754,0.713568,0.946667
3,Baseline and Fourier,0.78,0.832827,0.765363,0.913333


In [39]:
print("Passsword protected files (all)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, ["password"], None)
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (all)
------------------------------------------------
                                               Baseline only: 		 0.609 		 0.498 		7000
                                               Advanced only: 		 0.598 		 0.488 		7000
                                                Fourier only: 		 0.819 		 0.762 		7000
                                        Baseline and Fourier: 		 0.886 		 0.860 		7000
                                        Advanced and Fourier: 		 0.809 		 0.784 		7000
                                       Baseline and Advanced: 		 0.632 		 0.493 		7000
                             Baseline, Advanced, and Fourier: 		 0.840 		 0.815 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.497857 &  0.609488 &   0.571339 &  0.653095 \\
1 &                    Advanced only &  0.487714 &  0.598252 &   0.564960 &  0.635714 \

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.487714,0.598252,0.56496,0.635714
0,Baseline only,0.497857,0.609488,0.571339,0.653095
5,Baseline and Advanced,0.492714,0.63183,0.559596,0.725476
4,Advanced and Fourier,0.783571,0.808736,0.86079,0.762619
2,Fourier only,0.761571,0.819431,0.750942,0.901667
6,"Baseline, Advanced, and Fourier",0.814857,0.840158,0.871546,0.810952
3,Baseline and Fourier,0.859714,0.886316,0.862551,0.911429


In [41]:
print("Image files (all)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, [".jpeg", ".jpg", ".png", ".bmp", ".ico", ".webp"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (all)
------------------------------------------------
                                               Baseline only: 		 0.671 		 0.627 		23104
                                               Advanced only: 		 0.682 		 0.606 		23104
                                                Fourier only: 		 0.728 		 0.649 		23104
                                        Baseline and Fourier: 		 0.788 		 0.746 		23104
                                        Advanced and Fourier: 		 0.738 		 0.694 		23104
                                       Baseline and Advanced: 		 0.710 		 0.658 		23104
                             Baseline, Advanced, and Fourier: 		 0.754 		 0.719 		23104


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.626731 &  0.670513 &   0.714693 &  0.631477 \\
1 &                    Advanced only &  0.605999 &  0.681813 &   0.662883 &  0.701857 \\
2 &  

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.626731,0.670513,0.714693,0.631477
1,Advanced only,0.605999,0.681813,0.662883,0.701857
5,Baseline and Advanced,0.658241,0.710217,0.724685,0.696315
2,Fourier only,0.648546,0.727535,0.681567,0.780153
4,Advanced and Fourier,0.694468,0.738449,0.761094,0.717113
6,"Baseline, Advanced, and Fourier",0.71901,0.753867,0.796635,0.715458
3,Baseline and Fourier,0.745672,0.787517,0.791467,0.783607


In [42]:
print("Image files (excluding webp)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, [".jpeg", ".jpg", ".png", ".bmp", ".ico"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (excluding webp)
------------------------------------------------
                                               Baseline only: 		 0.674 		 0.644 		14099
                                               Advanced only: 		 0.704 		 0.638 		14099
                                                Fourier only: 		 0.745 		 0.685 		14099
                                        Baseline and Fourier: 		 0.818 		 0.793 		14099
                                        Advanced and Fourier: 		 0.767 		 0.738 		14099
                                       Baseline and Advanced: 		 0.735 		 0.698 		14099
                             Baseline, Advanced, and Fourier: 		 0.780 		 0.759 		14099


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.644301 &  0.674456 &   0.751591 &  0.611680 \\
1 &                    Advanced only &  0.637634 &  0.704083 &   0.692886 &  0.7156

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.644301,0.674456,0.751591,0.61168
1,Advanced only,0.637634,0.704083,0.692886,0.715648
5,Baseline and Advanced,0.697709,0.735016,0.778685,0.695985
2,Fourier only,0.684942,0.745094,0.726744,0.764394
4,Advanced and Fourier,0.738279,0.767339,0.825981,0.716472
6,"Baseline, Advanced, and Fourier",0.758564,0.779961,0.864698,0.71035
3,Baseline and Fourier,0.792893,0.817637,0.870594,0.770752


In [44]:
print("Image files (only webp)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, [".webp"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (only webp)
------------------------------------------------
                                               Baseline only: 		 0.665 		 0.599 		9005
                                               Advanced only: 		 0.648 		 0.556 		9005
                                                Fourier only: 		 0.703 		 0.592 		9005
                                        Baseline and Fourier: 		 0.746 		 0.672 		9005
                                        Advanced and Fourier: 		 0.697 		 0.626 		9005
                                       Baseline and Advanced: 		 0.674 		 0.596 		9005
                             Baseline, Advanced, and Fourier: 		 0.717 		 0.657 		9005


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.599223 &  0.664871 &   0.667164 &  0.662595 \\
1 &                    Advanced only &  0.556469 &  0.647920 &   0.618583 &  0.680178 \\
2 &   

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.556469,0.64792,0.618583,0.680178
0,Baseline only,0.599223,0.664871,0.667164,0.662595
5,Baseline and Advanced,0.596446,0.674489,0.653532,0.696835
4,Advanced and Fourier,0.625875,0.697277,0.677611,0.71812
2,Fourier only,0.59156,0.702812,0.623691,0.804923
6,"Baseline, Advanced, and Fourier",0.657079,0.716853,0.71034,0.723487
3,Baseline and Fourier,0.671738,0.746092,0.696105,0.803813


In [47]:
print("video files")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, [".mpeg", ".mpg", ".avi", ".xvid", ".mp4"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

video & audio files
------------------------------------------------
                                               Baseline only: 		 0.708 		 0.619 		2000
                                               Advanced only: 		 0.630 		 0.524 		2000
                                                Fourier only: 		 0.681 		 0.598 		2000
                                        Baseline and Fourier: 		 0.907 		 0.894 		2000
                                        Advanced and Fourier: 		 0.704 		 0.656 		2000
                                       Baseline and Advanced: 		 0.688 		 0.571 		2000
                             Baseline, Advanced, and Fourier: 		 0.741 		 0.695 		2000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &    0.6185 &  0.708445 &   0.654199 &  0.772500 \\
1 &                    Advanced only &    0.5240 &  0.629860 &   0.590379 &  0.675000 \\
2 &       

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.524,0.62986,0.590379,0.675
2,Fourier only,0.5975,0.68144,0.648832,0.7175
5,Baseline and Advanced,0.571,0.687546,0.610608,0.786667
4,Advanced and Fourier,0.6565,0.704007,0.728814,0.680833
0,Baseline only,0.6185,0.708445,0.654199,0.7725
6,"Baseline, Advanced, and Fourier",0.695,0.741306,0.75475,0.728333
3,Baseline and Fourier,0.894,0.90718,0.95572,0.863333


In [50]:
print("audio files")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, [".wav", ".mp3", ".aac", ".flac", ".ogg"])
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

audio files
------------------------------------------------
                                               Baseline only: 		 0.739 		 0.704 		1000
                                               Advanced only: 		 0.740 		 0.656 		1000
                                                Fourier only: 		 0.605 		 0.553 		1000
                                        Baseline and Fourier: 		 0.724 		 0.712 		1000
                                        Advanced and Fourier: 		 0.751 		 0.720 		1000
                                       Baseline and Advanced: 		 0.800 		 0.742 		1000
                             Baseline, Advanced, and Fourier: 		 0.730 		 0.713 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.704 &  0.738516 &   0.785714 &  0.696667 \\
1 &                    Advanced only &     0.656 &  0.740181 &   0.676796 &  0.816667 \\
2 &               

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.553,0.605472,0.643527,0.571667
3,Baseline and Fourier,0.712,0.724138,0.851351,0.63
6,"Baseline, Advanced, and Fourier",0.713,0.7295,0.839479,0.645
0,Baseline only,0.704,0.738516,0.785714,0.696667
1,Advanced only,0.656,0.740181,0.676796,0.816667
4,Advanced and Fourier,0.72,0.75089,0.805344,0.703333
5,Baseline and Advanced,0.742,0.8,0.747826,0.86


In [22]:
def custom_result_print_ransomware(rf_results, match_string, notmatches=None):
    colnames = rf_results.columns
    colnames = [c for c in colnames if "name" in c or "is_encrypted" == c or "Prediction:" in c or "ransomware" in c]

    rf_results2 = rf_results[colnames]

    ppr = rf_results2[rf_results2["is_ransomware"] == 1]

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    for colname in ppr.columns:
        if "Prediction" in colname:
            y_pred = ppr[colname]
            y_true = ppr["is_encrypted"]
            f1 = f1_score(y_true, y_pred)
            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            colname = colname.split(":")[1]
            
            dfdict["FeatureSet"].append(colname)
            dfdict["Accuracy"].append(acc)
            dfdict["F1"].append(f1)
            dfdict["Precision"].append(prec)
            dfdict["Recall"].append(recall)
            print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [24]:
rdf = custom_result_print_ransomware(rf_results, None)
rdf.sort_values(by="F1")[["FeatureSet", "Recall"]]

                                               Baseline only: 		 0.670 		 0.504 		707
                                               Advanced only: 		 0.707 		 0.547 		707
                                                Fourier only: 		 0.797 		 0.662 		707
                                        Baseline and Fourier: 		 0.727 		 0.571 		707
                                        Advanced and Fourier: 		 0.643 		 0.474 		707
                                       Baseline and Advanced: 		 0.681 		 0.516 		707
                             Baseline, Advanced, and Fourier: 		 0.629 		 0.458 		707


Unnamed: 0,FeatureSet,Recall
6,"Baseline, Advanced, and Fourier",0.458274
4,Advanced and Fourier,0.473833
0,Baseline only,0.503536
5,Baseline and Advanced,0.516266
1,Advanced only,0.547383
3,Baseline and Fourier,0.571429
2,Fourier only,0.661952


In [25]:
print(rdf.sort_values(by="F1")[["FeatureSet", "Recall"]].to_latex())

\begin{tabular}{llr}
\toprule
{} &                       FeatureSet &    Recall \\
\midrule
6 &  Baseline, Advanced, and Fourier &  0.458274 \\
4 &             Advanced and Fourier &  0.473833 \\
0 &                    Baseline only &  0.503536 \\
5 &            Baseline and Advanced &  0.516266 \\
1 &                    Advanced only &  0.547383 \\
3 &             Baseline and Fourier &  0.571429 \\
2 &                     Fourier only &  0.661952 \\
\bottomrule
\end{tabular}



In [51]:
rf_results.to_csv("../randomforest_results.csv.gz")