## Train with one dataset and test with another

In [2]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

# Default size applied to every matplotlib figure created in this notebook.
plt.rcParams["figure.figsize"] = (20,20)

# Enables the debug-only shortcuts in load_datasets_once() and
# get_results_for_classifier().
DEBUG = False
# Worker count passed as n_jobs to the RandomForestClassifier factory.
N_JOBS = 8

In [3]:
def call_gc():
    """Run three full sweeps of the garbage collector.

    Each sweep collects generations 0, 1 and 2 in ascending order.
    """
    sweeps = 3
    generations = (0, 1, 2)
    for _ in range(sweeps):
        for generation in generations:
            gc.collect(generation)

In [4]:
def get_columns(thisdf):
    """Group the feature columns of ``thisdf`` into named feature sets.

    Columns are selected purely by substrings of their names:

    * baseline -- name starts with ``"baseline"`` and contains none of
      ``begin``, ``end``, ``head``, ``tail`` or ``filesize``.
    * advanced -- name contains ``"advanced"`` and none of ``begin``,
      ``end``, ``head``, ``tail`` or ``start``.
    * fourier -- name contains both ``"fourier"`` and ``"1byte"`` and none
      of ``value``, ``begin``, ``end``, ``head``, ``tail`` or ``start``.

    Combined sets are de-duplicated and keep first-seen order (baseline,
    then advanced, then fourier; frame order within each group). A
    ``list(set(...))`` round-trip would instead yield an order that depends
    on string hashing, which can differ between interpreter runs and would
    change the feature order handed to the model from run to run.

    Args:
        thisdf: Object exposing ``columns``, an iterable of string column
            names (for example a pandas DataFrame).

    Returns:
        dict mapping a feature-set label to its list of column names.
    """
    positional = ("begin", "end", "head", "tail")

    def lacks(name, tokens):
        # True when none of the banned substrings occurs in the name.
        return all(token not in name for token in tokens)

    def union(*groups):
        # dict.fromkeys drops repeats but, unlike set(), keeps insertion order.
        return list(dict.fromkeys(name for group in groups for name in group))

    names = list(thisdf.columns)

    baseline_columns = [
        c for c in names
        if c.startswith("baseline") and lacks(c, positional + ("filesize",))
    ]
    advanced_columns_only = union(
        c for c in names
        if "advanced" in c and lacks(c, positional + ("start",))
    )
    fourier_columns_only = union(
        c for c in names
        if "fourier" in c
        and "1byte" in c
        and lacks(c, positional + ("value", "start"))
    )

    return {
        "Baseline only": baseline_columns,
        "Advanced only": advanced_columns_only,
        "Fourier only": fourier_columns_only,
        "Baseline and Fourier": union(baseline_columns, fourier_columns_only),
        "Advanced and Fourier": union(advanced_columns_only, fourier_columns_only),
        "Baseline and Advanced": union(baseline_columns, advanced_columns_only),
        "Baseline, Advanced, and Fourier": union(
            baseline_columns, advanced_columns_only, fourier_columns_only
        ),
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Read every ``*.csv.gz`` file in the working directory, exactly once.

    Each file is loaded, its rows are shuffled, and two columns are added:
    ``csv_filename`` (the file it came from) and ``is_encrypted`` (1 when
    the lower-cased filename contains ``"encr"``, otherwise 0). Files whose
    name starts with ``"n1."`` go to the test dictionary, all others to the
    training dictionary. Repeated calls are answered from the cache.

    Returns:
        (train_datasets, test_datasets): two dicts keyed by filename.
    """
    training = {}
    held_out = {}
    for path in glob.glob("*.csv.gz"):
        print(f"Loading {path}")
        frame = pd.read_csv(path).sample(frac=1).reset_index(drop=True)
        frame["csv_filename"] = path
        frame["is_encrypted"] = int("encr" in path.lower())

        bucket = held_out if path.startswith("n1.") else training
        bucket[path] = frame

        # In debug mode stop as soon as both sides hold more than two files.
        enough_for_debug = len(held_out) > 2 and len(training) > 2
        if DEBUG and enough_for_debug:
            break
    return training, held_out

In [6]:
def _as_single_frame(data):
    """Collapse a dict or list of DataFrames into one; pass anything else through."""
    if isinstance(data, dict):
        data = list(data.values())
    if isinstance(data, list):
        data = pd.concat(data)
    return data


def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    """Fit a scaler + classifier pipeline on ``traindf`` and predict ``testdf``.

    Args:
        traindf: Training data as a DataFrame, or a list/dict of DataFrames
            that is concatenated first.
        testdf: Test data, in the same accepted forms as ``traindf``.
        columns: Names of the feature columns to feed the model.
        description: Label of the feature set; not used inside this
            function, kept so existing callers keep working.
        clf: Zero-argument callable returning a fresh, unfitted estimator.
        clfname: Name given to the classifier step of the pipeline.

    Returns:
        The array returned by ``pipeline.predict`` for the test features.
    """
    call_gc()

    # Normalise BEFORE shuffling: a dict or list has no .sample(), so
    # shuffling first would raise AttributeError for those inputs.
    traindf = _as_single_frame(traindf)
    testdf = _as_single_frame(testdf)
    call_gc()

    traindf = traindf.sample(frac=1).reset_index(drop=True)
    call_gc()

    trainX = traindf[columns].to_numpy()
    testX = testdf[columns].to_numpy()
    trainY = traindf["is_encrypted"].to_numpy()
    # testY is not needed here; callers score the predictions themselves.

    # The first step is a MinMaxScaler despite its 'std,' label.
    estimators = [
        ('std,', MinMaxScaler()),
        (clfname, clf()),
    ]
    pipeline = Pipeline(estimators)

    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")

    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")

    return y_pred

In [7]:
def get_results_for_classifier(clf, clfname="clf"):
    """Train ``clf`` on every feature set and collect the test predictions.

    The cached datasets are concatenated into one training and one test
    frame. For each feature set returned by ``get_columns`` a model is
    trained via ``run_model`` and its predictions are stored in a new
    ``Prediction:<feature set>`` column on a copy of the test frame.

    Args:
        clf: Zero-argument callable returning a fresh, unfitted estimator.
        clfname: Name forwarded to ``run_model`` for the classifier step.

    Returns:
        A copy of the test DataFrame with one prediction column per
        feature set.
    """
    print(f"Evaluating : {clf}")

    def as_frame(data):
        # Accept a dict or list of DataFrames as well as a single DataFrame.
        if isinstance(data, dict):
            data = list(data.values())
        if isinstance(data, list):
            data = pd.concat(data)
        return data

    traindf, testdf = (as_frame(part) for part in load_datasets_once())

    if DEBUG:
        # Truncate BOTH frames (the training frame used to be assigned to a
        # misspelled name and was therefore never shortened).
        # NOTE(review): head() takes rows from whichever files were
        # concatenated first, so class balance is not guaranteed here.
        traindf = traindf.head(5000)
        testdf = testdf.head(5000)

    testdf_copy = testdf.copy()

    columns = get_columns(traindf)

    for desc, cols in columns.items():
        y_pred = run_model(traindf, testdf, cols, desc, clf, clfname)
        testdf_copy[f"Prediction:{desc}"] = y_pred
    return testdf_copy
    

In [8]:
# Zero-argument factories: run_model() calls clf() to build a fresh,
# unfitted estimator for every feature set.
rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
lr_clf = lambda: LogisticRegression(
    n_jobs=N_JOBS,  # follow the notebook-wide setting instead of a literal 8
    solver='saga',
    random_state=42,
    max_iter=1000,
    multi_class='ovr',
)
rf_results = get_results_for_classifier(rfc_clf, "Random Forest")
#lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

Evaluating : <function <lambda> at 0x7f7e1236fb80>
Loading n1.zip.expanded.plaintext.csv.gz
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading expanded_encrypted_v3.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading n1.zip.expanded.encrypted.v2.csv.gz
Loading n1.zip.expanded.encrypted.v2.base32.csv.gz
Loading expanded_encrypted_v3_base32.csv.gz
Loading n1.expanded.pyencrypted_v3.base32.csv.gz
Loading n1.zip.expanded.plaintext.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.pyencrypted_v3.csv.gz
Loading n1.expanded.plaintext.base32.csv.g

In [7]:
def custom_result_print(rf_results, match_string, notmatches=None):
    """Score every ``Prediction*`` column on a filename-selected subset of rows.

    Rows are selected by case-insensitive substring tests against the
    ``extended.base_filename`` column. Both the patterns and the filename
    are lower-cased, so a list of patterns behaves the same way as a single
    string pattern.

    Args:
        rf_results: DataFrame holding ``is_encrypted``,
            ``extended.base_filename`` and the prediction columns.
        match_string: Substring, or iterable of substrings; a row is kept
            when its filename contains at least one of them. ``None`` keeps
            every row (and ``notmatches`` is then not applied).
        notmatches: Optional substring or iterable of substrings; a row is
            dropped when its filename contains any of them. Exclusion wins
            over inclusion.

    Returns:
        DataFrame with one row per prediction column and the columns
        ``FeatureSet``, ``Accuracy``, ``F1``, ``Precision`` and ``Recall``.
        A one-line summary per feature set is also printed.
    """
    def lowered(patterns):
        # Normalise "one string" and "several strings" to a lower-cased list.
        if patterns is None:
            return []
        if isinstance(patterns, str):
            return [patterns.lower()]
        return [pattern.lower() for pattern in patterns]

    wanted = lowered(match_string)
    unwanted = lowered(notmatches)

    def is_selected(filename):
        name = filename.lower()
        if any(pattern in name for pattern in unwanted):
            return False
        return any(pattern in name for pattern in wanted)

    colnames = [
        c for c in rf_results.columns
        if "name" in c or c == "is_encrypted" or "Prediction:" in c
    ]
    rf_results2 = rf_results[colnames]

    if match_string is None:
        print("Matched None")
        ppr = rf_results2
    else:
        ppr = rf_results2[rf_results2["extended.base_filename"].map(is_selected)]

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    y_true = ppr["is_encrypted"]
    for column in ppr.columns:
        if "Prediction" not in column:
            continue
        y_pred = ppr[column]
        f1 = f1_score(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        # "Prediction:<feature set>" -> "<feature set>"
        colname = column.split(":")[1]

        dfdict["FeatureSet"].append(colname)
        dfdict["Accuracy"].append(acc)
        dfdict["F1"].append(f1)
        dfdict["Precision"].append(prec)
        dfdict["Recall"].append(recall)
        print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [10]:
# Office documents: score the rows whose filename contains any of these
# substrings, then order the summary by ascending F1 before printing it.
# NOTE(review): ".doc" is listed twice and "odf"/"opf" have no leading
# dot -- confirm these are the intended patterns.
print("Office Files: ", "------------------------", sep="\n")
rdf = custom_result_print(
    rf_results,
    [".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"],
)
rdf = rdf.sort_values(by="F1")
print("\n")
print(rdf.to_latex())
rdf

Office Files: 
------------------------
                                               Baseline only: 		 0.706 		 0.656 		18990
                                               Advanced only: 		 0.696 		 0.625 		18990
                                                Fourier only: 		 0.854 		 0.814 		18990
                                        Baseline and Fourier: 		 0.904 		 0.887 		18990
                                        Advanced and Fourier: 		 0.866 		 0.848 		18990
                                       Baseline and Advanced: 		 0.719 		 0.645 		18990
                             Baseline, Advanced, and Fourier: 		 0.876 		 0.858 		18990


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
1 &                    Advanced only &  0.625066 &  0.696453 &   0.677168 &  0.716869 \\
0 &                    Baseline only &  0.655661 &  0.706046 &   0.723712 &  0.689222 \\
5 &            Baseline and Adva

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.625066,0.696453,0.677168,0.716869
0,Baseline only,0.655661,0.706046,0.723712,0.689222
5,Baseline and Advanced,0.645129,0.71861,0.685384,0.755222
2,Fourier only,0.814271,0.85367,0.809505,0.902931
4,Advanced and Fourier,0.847709,0.865788,0.918653,0.818676
6,"Baseline, Advanced, and Fourier",0.858083,0.876404,0.917779,0.838599
3,Baseline and Fourier,0.886519,0.903679,0.920758,0.887221


In [11]:
# Password-protected files, dropping any whose name looks like an archive.
# The LaTeX dump keeps custom_result_print's row order; only the displayed
# frame is sorted by F1.
print("Password protected files (excluding archives)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    ["password"],
    [".7z", ".gz", ".zip"],
)
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding archives)
------------------------------------------------
                                               Baseline only: 		 0.610 		 0.495 		7000
                                               Advanced only: 		 0.604 		 0.494 		7000
                                                Fourier only: 		 0.822 		 0.763 		7000
                                        Baseline and Fourier: 		 0.886 		 0.861 		7000
                                        Advanced and Fourier: 		 0.817 		 0.792 		7000
                                       Baseline and Advanced: 		 0.623 		 0.485 		7000
                             Baseline, Advanced, and Fourier: 		 0.848 		 0.822 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.495143 &  0.609848 &   0.568547 &  0.657619 \\
1 &                    Advanced only &  0.494143 &  0.604225 &   0.56941

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.494143,0.604225,0.569412,0.643571
0,Baseline only,0.495143,0.609848,0.568547,0.657619
5,Baseline and Advanced,0.484714,0.623211,0.555183,0.710238
4,Advanced and Fourier,0.791714,0.816788,0.864822,0.77381
2,Fourier only,0.762857,0.822041,0.74766,0.912857
6,"Baseline, Advanced, and Fourier",0.822429,0.847616,0.873642,0.823095
3,Baseline and Fourier,0.860571,0.886194,0.868373,0.904762


In [12]:
# Password-protected files, dropping any whose name matches the office
# patterns. The LaTeX dump keeps custom_result_print's row order; only the
# displayed frame is sorted by F1.
print("Password protected files (excluding office files)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    ["password"],
    [".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"],
)
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding office files)
------------------------------------------------
                                               Baseline only: 		 0.614 		 0.492 		1000
                                               Advanced only: 		 0.672 		 0.539 		1000
                                                Fourier only: 		 0.826 		 0.756 		1000
                                        Baseline and Fourier: 		 0.820 		 0.766 		1000
                                        Advanced and Fourier: 		 0.789 		 0.727 		1000
                                       Baseline and Advanced: 		 0.670 		 0.537 		1000
                             Baseline, Advanced, and Fourier: 		 0.787 		 0.730 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.492 &  0.613982 &   0.564246 &  0.673333 \\
1 &                    Advanced only &     0.539 &  0.672353 &   0.5

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.492,0.613982,0.564246,0.673333
5,Baseline and Advanced,0.537,0.669522,0.585518,0.781667
1,Advanced only,0.539,0.672353,0.586121,0.788333
6,"Baseline, Advanced, and Fourier",0.73,0.78673,0.747748,0.83
4,Advanced and Fourier,0.727,0.789189,0.735252,0.851667
3,Baseline and Fourier,0.766,0.819723,0.762178,0.886667
2,Fourier only,0.756,0.826211,0.721393,0.966667


In [13]:
# Every password-protected file, with no exclusions. The LaTeX dump keeps
# custom_result_print's row order; only the displayed frame is sorted by F1.
print("Password protected files (all)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, ["password"], None)
print()
print()
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (all)
------------------------------------------------
                                               Baseline only: 		 0.610 		 0.495 		7000
                                               Advanced only: 		 0.604 		 0.494 		7000
                                                Fourier only: 		 0.822 		 0.763 		7000
                                        Baseline and Fourier: 		 0.886 		 0.861 		7000
                                        Advanced and Fourier: 		 0.817 		 0.792 		7000
                                       Baseline and Advanced: 		 0.623 		 0.485 		7000
                             Baseline, Advanced, and Fourier: 		 0.848 		 0.822 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.495143 &  0.609848 &   0.568547 &  0.657619 \\
1 &                    Advanced only &  0.494143 &  0.604225 &   0.569412 &  0.643571 \

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.494143,0.604225,0.569412,0.643571
0,Baseline only,0.495143,0.609848,0.568547,0.657619
5,Baseline and Advanced,0.484714,0.623211,0.555183,0.710238
4,Advanced and Fourier,0.791714,0.816788,0.864822,0.77381
2,Fourier only,0.762857,0.822041,0.74766,0.912857
6,"Baseline, Advanced, and Fourier",0.822429,0.847616,0.873642,0.823095
3,Baseline and Fourier,0.860571,0.886194,0.868373,0.904762


In [14]:
# Image formats, WebP included. The LaTeX dump keeps custom_result_print's
# row order; only the displayed frame is sorted by F1.
print("Image files (all)", "------------------------------------------------", sep="\n")
rdf = custom_result_print(
    rf_results,
    [".jpeg", ".jpg", ".png", ".bmp", ".ico", ".webp"],
)
print("\n")
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (all)
------------------------------------------------
                                               Baseline only: 		 0.674 		 0.631 		23020
                                               Advanced only: 		 0.681 		 0.606 		23020
                                                Fourier only: 		 0.732 		 0.654 		23020
                                        Baseline and Fourier: 		 0.790 		 0.750 		23020
                                        Advanced and Fourier: 		 0.744 		 0.701 		23020
                                       Baseline and Advanced: 		 0.707 		 0.655 		23020
                             Baseline, Advanced, and Fourier: 		 0.757 		 0.722 		23020


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.630582 &  0.674251 &   0.715878 &  0.637200 \\
1 &                    Advanced only &  0.605517 &  0.680595 &   0.661810 &  0.700478 \\
2 &  

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.630582,0.674251,0.715878,0.6372
1,Advanced only,0.605517,0.680595,0.66181,0.700478
5,Baseline and Advanced,0.6553,0.7066,0.722058,0.69179
2,Fourier only,0.653779,0.732425,0.68286,0.789748
4,Advanced and Fourier,0.700782,0.743941,0.764517,0.724443
6,"Baseline, Advanced, and Fourier",0.721677,0.756675,0.79575,0.721257
3,Baseline and Fourier,0.749913,0.790418,0.794904,0.785983


In [15]:
# Same image patterns as the "all" report minus ".webp". The LaTeX dump
# keeps custom_result_print's row order; only the displayed frame is sorted.
print("Image files (excluding webp)", "------------------------------------------------", sep="\n")
rdf = custom_result_print(
    rf_results,
    [".jpeg", ".jpg", ".png", ".bmp", ".ico"],
)
print("\n")
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (excluding webp)
------------------------------------------------
                                               Baseline only: 		 0.679 		 0.649 		14015
                                               Advanced only: 		 0.704 		 0.638 		14015
                                                Fourier only: 		 0.749 		 0.689 		14015
                                        Baseline and Fourier: 		 0.820 		 0.795 		14015
                                        Advanced and Fourier: 		 0.772 		 0.742 		14015
                                       Baseline and Advanced: 		 0.727 		 0.691 		14015
                             Baseline, Advanced, and Fourier: 		 0.781 		 0.759 		14015


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.649090 &  0.679358 &   0.751912 &  0.619574 \\
1 &                    Advanced only &  0.638316 &  0.704414 &   0.691076 &  0.7182

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.64909,0.679358,0.751912,0.619574
1,Advanced only,0.638316,0.704414,0.691076,0.718278
5,Baseline and Advanced,0.691045,0.727467,0.772697,0.68724
2,Fourier only,0.689333,0.748962,0.726917,0.772387
4,Advanced and Fourier,0.741919,0.771553,0.822737,0.726365
6,"Baseline, Advanced, and Fourier",0.758616,0.780908,0.857366,0.71697
3,Baseline and Fourier,0.794506,0.820045,0.863989,0.780354


In [16]:
# WebP files only. The LaTeX dump keeps custom_result_print's row order;
# only the displayed frame is sorted by F1.
print("Image files (only webp)", "------------------------------------------------", sep="\n")
webp_patterns_result = custom_result_print(rf_results, [".webp"])
rdf = webp_patterns_result
print("\n")
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (only webp)
------------------------------------------------
                                               Baseline only: 		 0.667 		 0.602 		9005
                                               Advanced only: 		 0.644 		 0.554 		9005
                                                Fourier only: 		 0.709 		 0.598 		9005
                                        Baseline and Fourier: 		 0.749 		 0.681 		9005
                                        Advanced and Fourier: 		 0.704 		 0.637 		9005
                                       Baseline and Advanced: 		 0.677 		 0.600 		9005
                             Baseline, Advanced, and Fourier: 		 0.722 		 0.664 		9005


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.601777 &  0.666976 &   0.669338 &  0.664631 \\
1 &                    Advanced only &  0.554470 &  0.644389 &   0.618302 &  0.672774 \\
2 &   

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.55447,0.644389,0.618302,0.672774
0,Baseline only,0.601777,0.666976,0.669338,0.664631
5,Baseline and Advanced,0.599667,0.676884,0.656239,0.698871
4,Advanced and Fourier,0.636757,0.704437,0.688206,0.721451
2,Fourier only,0.598445,0.709371,0.626936,0.816768
6,"Baseline, Advanced, and Fourier",0.664187,0.722314,0.716785,0.727929
3,Baseline and Fourier,0.680511,0.749062,0.708347,0.794744


In [17]:
# Video containers/codecs, selected by filename substring. The LaTeX dump
# keeps custom_result_print's row order; only the displayed frame is sorted.
print("video files", "------------------------------------------------", sep="\n")
rdf = custom_result_print(
    rf_results,
    [".mpeg", ".mpg", ".avi", ".xvid", ".mp4"],
)
print("\n")
print(rdf.to_latex())
rdf.sort_values(by="F1")

video files
------------------------------------------------
                                               Baseline only: 		 0.713 		 0.624 		2000
                                               Advanced only: 		 0.636 		 0.530 		2000
                                                Fourier only: 		 0.697 		 0.617 		2000
                                        Baseline and Fourier: 		 0.887 		 0.873 		2000
                                        Advanced and Fourier: 		 0.713 		 0.664 		2000
                                       Baseline and Advanced: 		 0.676 		 0.567 		2000
                             Baseline, Advanced, and Fourier: 		 0.758 		 0.721 		2000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &    0.6235 &  0.712924 &   0.657063 &  0.779167 \\
1 &                    Advanced only &    0.5305 &  0.635623 &   0.594771 &  0.682500 \\
2 &               

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.5305,0.635623,0.594771,0.6825
5,Baseline and Advanced,0.567,0.676141,0.613297,0.753333
2,Fourier only,0.617,0.697233,0.663158,0.735
0,Baseline only,0.6235,0.712924,0.657063,0.779167
4,Advanced and Fourier,0.664,0.713311,0.730769,0.696667
6,"Baseline, Advanced, and Fourier",0.721,0.758442,0.789189,0.73
3,Baseline and Fourier,0.873,0.88691,0.952199,0.83


In [18]:
# Audio formats, selected by filename substring. The LaTeX dump keeps
# custom_result_print's row order; only the displayed frame is sorted by F1.
print("audio files", "------------------------------------------------", sep="\n")
rdf = custom_result_print(
    rf_results,
    [".wav", ".mp3", ".aac", ".flac", ".ogg"],
)
print("\n")
print(rdf.to_latex())
rdf.sort_values(by="F1")

audio files
------------------------------------------------
                                               Baseline only: 		 0.747 		 0.709 		1000
                                               Advanced only: 		 0.748 		 0.667 		1000
                                                Fourier only: 		 0.639 		 0.582 		1000
                                        Baseline and Fourier: 		 0.718 		 0.710 		1000
                                        Advanced and Fourier: 		 0.748 		 0.717 		1000
                                       Baseline and Advanced: 		 0.814 		 0.768 		1000
                             Baseline, Advanced, and Fourier: 		 0.737 		 0.714 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.709 &  0.746736 &   0.781421 &  0.715000 \\
1 &                    Advanced only &     0.667 &  0.747536 &   0.685675 &  0.821667 \\
2 &               

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.582,0.639033,0.663082,0.616667
3,Baseline and Fourier,0.71,0.718447,0.860465,0.616667
6,"Baseline, Advanced, and Fourier",0.714,0.736648,0.823045,0.666667
0,Baseline only,0.709,0.746736,0.781421,0.715
1,Advanced only,0.667,0.747536,0.685675,0.821667
4,Advanced and Fourier,0.717,0.748444,0.801905,0.701667
5,Baseline and Advanced,0.768,0.814103,0.783951,0.846667


In [21]:
# Persist the per-row predictions as a gzip-compressed CSV.
# NOTE(review): this writes to the parent directory, while the later
# read_csv cell loads the same filename from the current directory --
# confirm the path is intended.
rf_results.to_csv("../napier1_random_forest_results.csv.gz")

In [3]:
# Reload previously saved predictions so the reports can run without retraining.
# NOTE(review): if the file was written with the default index, an extra
# "Unnamed: 0" column is presumably present after loading -- verify.
rf_results = pd.read_csv("napier1_random_forest_results.csv.gz")

In [5]:
# Alias for the loaded predictions, used by the report cells that follow.
df = rf_results
def custom_result_print(rf_results, match_string, notmatches=None):
    """Score every ``Prediction*`` column on a filename-selected subset of rows.

    Rows are selected by case-insensitive substring tests against the
    ``extended.base_filename`` column. Both the patterns and the filename
    are lower-cased, so a list of patterns behaves the same way as a single
    string pattern.

    Args:
        rf_results: DataFrame holding ``is_encrypted``,
            ``extended.base_filename`` and the prediction columns.
        match_string: Substring, or iterable of substrings; a row is kept
            when its filename contains at least one of them. A falsy value
            (``None``, ``""``, ``[]``) keeps every row, prints
            ``"Matched None"``, and skips ``notmatches`` as well.
        notmatches: Optional substring or iterable of substrings; a row is
            dropped when its filename contains any of them. Exclusion wins
            over inclusion.

    Returns:
        DataFrame with one row per prediction column and the columns
        ``FeatureSet``, ``Accuracy``, ``F1``, ``Precision`` and ``Recall``.
        A one-line summary per feature set is also printed.
    """
    def lowered(patterns):
        # Normalise "one string" and "several strings" to a lower-cased list.
        if patterns is None:
            return []
        if isinstance(patterns, str):
            return [patterns.lower()]
        return [pattern.lower() for pattern in patterns]

    colnames = [
        c for c in rf_results.columns
        if "name" in c or c == "is_encrypted" or "Prediction:" in c
    ]
    rf_results2 = rf_results[colnames]

    if match_string:
        wanted = lowered(match_string)
        unwanted = lowered(notmatches)

        def is_selected(filename):
            name = filename.lower()
            if any(pattern in name for pattern in unwanted):
                return False
            return any(pattern in name for pattern in wanted)

        ppr = rf_results2[rf_results2["extended.base_filename"].map(is_selected)]
    else:
        print("Matched None")
        ppr = rf_results2

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    y_true = ppr["is_encrypted"]
    for column in ppr.columns:
        if "Prediction" not in column:
            continue
        y_pred = ppr[column]
        f1 = f1_score(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        # "Prediction:<feature set>" -> "<feature set>"
        colname = column.split(":")[1]

        dfdict["FeatureSet"].append(colname)
        dfdict["Accuracy"].append(acc)
        dfdict["F1"].append(f1)
        dfdict["Precision"].append(prec)
        dfdict["Recall"].append(recall)
        print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

# Whole test set: passing None as the pattern makes custom_result_print
# score every row. The summary is printed as LaTeX without its index.
print("All Files: ", "------------------------", sep="\n")
rdf = custom_result_print(df, None)
print("\n")
latex_table = rdf.to_latex(index=False)
print(latex_table)
rdf

All Files: 
------------------------
Matched None
                                               Baseline only: 		 0.692 		 0.642 		90022
                                               Advanced only: 		 0.708 		 0.643 		90022
                                                Fourier only: 		 0.799 		 0.739 		90022
                                        Baseline and Fourier: 		 0.830 		 0.793 		90022
                                        Advanced and Fourier: 		 0.792 		 0.755 		90022
                                       Baseline and Advanced: 		 0.722 		 0.665 		90022
                             Baseline, Advanced, and Fourier: 		 0.796 		 0.762 		90022


\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &       F1 &  Precision &   Recall \\
\midrule
                  Baseline only &  0.642365 & 0.691574 &   0.717266 & 0.667659 \\
                  Advanced only &  0.643054 & 0.708007 &   0.695847 & 0.720599 \\
                   Fourier only &  0.738819 &

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.642365,0.691574,0.717266,0.667659
1,Advanced only,0.643054,0.708007,0.695847,0.720599
2,Fourier only,0.738819,0.798984,0.74283,0.864322
3,Baseline and Fourier,0.79345,0.829971,0.82071,0.839444
4,Advanced and Fourier,0.755204,0.791662,0.809642,0.774463
5,Baseline and Advanced,0.664582,0.722215,0.718414,0.726055
6,"Baseline, Advanced, and Fourier",0.761969,0.796238,0.819315,0.774426


In [6]:
# Archive formats, selected by filename substring. The summary is printed
# as LaTeX without its index and then displayed unsorted.
print("All Archives", "------------------------------------------------", sep="\n")
rdf = custom_result_print(
    df,
    [".7z", ".gz", ".zip", ".bzip2", ".tar.gz", ".tar.bz2", ".bz2"],
)
print("\n")
latex_table = rdf.to_latex(index=False)
print(latex_table)
rdf

All Archives
------------------------------------------------
                                               Baseline only: 		 0.611 		 0.510 		12200
                                               Advanced only: 		 0.640 		 0.547 		12200
                                                Fourier only: 		 0.733 		 0.611 		12200
                                        Baseline and Fourier: 		 0.709 		 0.588 		12200
                                        Advanced and Fourier: 		 0.674 		 0.579 		12200
                                       Baseline and Advanced: 		 0.623 		 0.526 		12200
                             Baseline, Advanced, and Fourier: 		 0.650 		 0.553 		12200


\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &       F1 &  Precision &   Recall \\
\midrule
                  Baseline only &  0.510492 & 0.610945 &   0.589811 & 0.633649 \\
                  Advanced only &  0.547377 & 0.639697 &   0.618471 & 0.662432 \\
                   Fourier only &

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.510492,0.610945,0.589811,0.633649
1,Advanced only,0.547377,0.639697,0.618471,0.662432
2,Fourier only,0.611148,0.733363,0.627791,0.881622
3,Baseline and Fourier,0.588279,0.709132,0.620428,0.827432
4,Advanced and Fourier,0.579344,0.674407,0.635613,0.718243
5,Baseline and Advanced,0.526475,0.623231,0.602294,0.645676
6,"Baseline, Advanced, and Fourier",0.553033,0.649843,0.619112,0.683784
