## Train with one dataset and test with another

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

# Give every matplotlib figure a large default canvas.
plt.rcParams.update({"figure.figsize": (20, 20)})

# Debug switch read by the data-loading and evaluation functions below.
DEBUG = False
# Worker count passed as ``n_jobs`` to the estimators.
N_JOBS = 8

In [3]:
def call_gc():
    """Run three sweeps of the garbage collector.

    Each sweep collects generation 0, then 1, then 2.
    """
    sweeps = 3
    for _ in range(sweeps):
        for generation in (0, 1, 2):
            gc.collect(generation)

In [2]:
def get_columns(thisdf):
    """Group the columns of ``thisdf`` into the feature sets to evaluate.

    Columns are picked purely by substrings of their names:

    * baseline -- name starts with ``"baseline"``; names containing
      ``"filesize"`` are dropped.
    * advanced -- name contains ``"advanced"``; names containing
      ``"start"`` are dropped.
    * fourier -- name contains both ``"fourier"`` and ``"1byte"``; names
      containing ``"value"`` or ``"start"`` are dropped.

    In every group, names containing ``"begin"``, ``"end"``, ``"head"`` or
    ``"tail"`` are dropped as well.

    Combined sets are built with an order-preserving de-duplication rather
    than ``list(set(...))``: set iteration order for strings can change from
    one interpreter run to the next, which would reorder the feature-matrix
    columns between runs.

    Args:
        thisdf: object exposing a ``columns`` iterable of string names
            (a DataFrame in this file).

    Returns:
        dict mapping a feature-set label to its list of column names, in
        the order the names appear in ``thisdf.columns`` (baseline first,
        then advanced, then fourier for the combined sets).
    """
    def _without(names, *fragments):
        # Keep only the names that contain none of the given substrings.
        return [n for n in names if not any(frag in n for frag in fragments)]

    def _unique(names):
        # Drop repeats while keeping first-seen order.
        return list(dict.fromkeys(names))

    all_columns = list(thisdf.columns)
    # Substrings excluded from every feature group.
    positional = ("begin", "end", "head", "tail")

    baseline_columns = _without(
        [c for c in all_columns if c.startswith("baseline")],
        "filesize", *positional,
    )
    advanced_columns_only = _unique(_without(
        [c for c in all_columns if "advanced" in c],
        "start", *positional,
    ))
    fourier_columns_only = _unique(_without(
        [c for c in all_columns if "fourier" in c and "1byte" in c],
        "value", "start", *positional,
    ))

    return {
        "Baseline only": baseline_columns,
        "Advanced only": advanced_columns_only,
        "Fourier only": fourier_columns_only,
        "Baseline and Fourier": _unique(baseline_columns + fourier_columns_only),
        "Advanced and Fourier": _unique(advanced_columns_only + fourier_columns_only),
        "Baseline and Advanced": _unique(baseline_columns + advanced_columns_only),
        "Baseline, Advanced, and Fourier": _unique(
            baseline_columns + advanced_columns_only + fourier_columns_only
        ),
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Read every ``*.csv.gz`` file of the working directory, at most once.

    The call is memoised with ``lru_cache``, so later calls return the same
    pair of dictionaries instead of reading the files again.

    Returns:
        ``(train_datasets, test_datasets)``: two dicts mapping a file name
        to its row-shuffled DataFrame. Files whose name starts with
        ``"n1."`` land in the test dict, every other file in the train
        dict. Each frame gains a ``csv_filename`` and an ``is_encrypted``
        column.
    """
    train_datasets, test_datasets = {}, {}
    for path in glob.glob("*.csv.gz"):
        print(f"Loading {path}")
        frame = pd.read_csv(path).sample(frac=1).reset_index(drop=True)
        frame["csv_filename"] = path
        # The label is derived from the file name only.
        # NOTE(review): names such as "expanded.des3.csv.gz" contain no
        # "encr" and are therefore labelled 0 -- confirm this is intended.
        frame["is_encrypted"] = int("encr" in path.lower())
        bucket = test_datasets if path.startswith("n1.") else train_datasets
        bucket[path] = frame
        # In debug mode stop once more than two files of each kind are loaded.
        if DEBUG and len(test_datasets) > 2 and len(train_datasets) > 2:
            break
    return train_datasets, test_datasets

In [6]:
def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    """Fit a scaler + classifier pipeline on ``traindf`` and predict ``testdf``.

    Args:
        traindf: training data -- a DataFrame, a list of DataFrames, or a
            dict whose values are DataFrames (lists and dicts are
            concatenated into one frame).
        testdf: test data, accepted in the same forms as ``traindf``.
        columns: names of the feature columns fed to the model.
        description: label of the feature set; not used by this function.
        clf: zero-argument callable returning a new estimator.
        clfname: name given to the classifier step of the pipeline.

    Returns:
        The pipeline's predictions for the rows of ``testdf``.
    """
    def _to_frame(data):
        # Collapse a dict/list of frames into one frame; pass anything else through.
        if isinstance(data, dict):
            data = list(data.values())
        if isinstance(data, list):
            data = pd.concat(data)
        return data

    call_gc()
    traindf = _to_frame(traindf)
    testdf = _to_frame(testdf)

    call_gc()
    # Shuffle the training rows (no seed is given, so the order differs per call).
    traindf = traindf.sample(frac=1).reset_index(drop=True)
    call_gc()

    label = "is_encrypted"
    trainX = traindf[columns].to_numpy()
    testX = testdf[columns].to_numpy()
    trainY = traindf[label].to_numpy()
    testY = testdf[label].to_numpy()

    # Min-max scaling followed by a freshly built classifier.
    pipeline = Pipeline([
        ('std,', MinMaxScaler()),
        (clfname, clf()),
    ])

    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")

    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")

    return y_pred

In [7]:
def get_results_for_classifier(clf, clfname="clf"):
    """Train ``clf`` on every feature set and collect its test predictions.

    The train and test datasets come from ``load_datasets_once()`` and are
    concatenated into one frame each. For every feature set returned by
    ``get_columns`` a model is trained with ``run_model`` and its
    predictions are stored next to the test rows.

    Args:
        clf: zero-argument callable returning a new estimator.
        clfname: name given to the classifier step of the pipeline.

    Returns:
        A copy of the concatenated test frame with one extra
        ``Prediction:<feature set>`` column per feature set.
    """
    def _as_frame(data):
        # Collapse a dict/list of frames into one frame; pass anything else through.
        if isinstance(data, dict):
            data = list(data.values())
        if isinstance(data, list):
            data = pd.concat(data)
        return data

    print(f"Evaluating : {clf}")

    traindf, testdf = load_datasets_once()
    traindf = _as_frame(traindf)
    testdf = _as_frame(testdf)

    if DEBUG:
        # Keep debug runs small. The training frame used to be assigned to a
        # misspelled name here, so it was never actually truncated.
        traindf = traindf.head(5000)
        testdf = testdf.head(5000)

    testdf_copy = testdf.copy()

    columns = get_columns(traindf)

    for desc, cols in columns.items():
        y_pred = run_model(traindf, testdf, cols, desc, clf, clfname)
        testdf_copy[f"Prediction:{desc}"] = y_pred
    return testdf_copy
    

In [8]:
# Estimator factories: every call builds a new, identically configured model.
rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
lr_clf = lambda: LogisticRegression(
    n_jobs=N_JOBS,  # use the notebook-wide worker count, like the forest above
    solver='saga',
    random_state=42,
    max_iter=1000,
    multi_class='ovr',
)
# Random-forest run, currently disabled:
#rf_results = get_results_for_classifier(rfc_clf, "Random Forest")
lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

# hack to avoid having rewrite code below
# (the cells that follow read ``rf_results`` but report logistic regression)
rf_results = lr_results

Evaluating : <function <lambda> at 0x7fb7fab6a9d0>
Loading n1.zip.expanded.plaintext.csv.gz
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading expanded_encrypted_v3.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading n1.zip.expanded.encrypted.v2.csv.gz
Loading n1.zip.expanded.encrypted.v2.base32.csv.gz
Loading expanded_encrypted_v3_base32.csv.gz
Loading n1.expanded.pyencrypted_v3.base32.csv.gz
Loading n1.zip.expanded.plaintext.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.pyencrypted_v3.csv.gz
Loading n1.expanded.plaintext.base32.csv.g

In [9]:
def custom_result_print(rf_results, match_string, notmatches=None):
    """Score every prediction column on the rows selected by file name.

    Rows are selected with case-insensitive substring tests against the
    ``extended.base_filename`` column. For each ``Prediction:<feature set>``
    column the F1, accuracy, precision and recall against ``is_encrypted``
    are computed; one summary line per feature set is printed (F1,
    accuracy, number of selected rows).

    Args:
        rf_results: DataFrame with ``is_encrypted``, the ``Prediction:...``
            columns and (when any filter is given) ``extended.base_filename``.
        match_string: a substring or a list of substrings; a row is kept
            when its file name contains at least one of them. ``None`` or
            an empty value keeps every row.
        notmatches: optional substring or list of substrings; a row whose
            file name contains any of them is dropped, even if it matches.

    Returns:
        DataFrame with the columns ``FeatureSet``, ``Accuracy``, ``F1``,
        ``Precision`` and ``Recall``, one row per prediction column. It has
        no rows when the filters select nothing.
    """
    def _patterns(spec):
        # Normalise None / one string / several strings to a list of
        # lower-cased substrings.
        if not spec:
            return []
        if isinstance(spec, str):
            return [spec.lower()]
        return [item.lower() for item in spec]

    wanted = _patterns(match_string)
    unwanted = _patterns(notmatches)

    def is_password_protected(x):
        # Generic row filter (the name is historical). The file name is
        # lower-cased once so that every comparison is case-insensitive;
        # before, only the single-string match branch ignored case.
        name = str(x).lower()
        if any(pattern in name for pattern in unwanted):
            return False
        return not wanted or any(pattern in name for pattern in wanted)

    colnames = [
        c for c in rf_results.columns
        if "name" in c or c == "is_encrypted" or "Prediction:" in c
    ]
    rf_results2 = rf_results[colnames]

    if not wanted:
        print("Matched None")
    if wanted or unwanted:
        ppr = rf_results2[rf_results2["extended.base_filename"].map(is_password_protected)]
    else:
        ppr = rf_results2

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    if ppr.empty:
        # Nothing selected: the metrics are undefined, so hand back the
        # empty table instead of calling them.
        return pd.DataFrame(dfdict)

    y_true = ppr["is_encrypted"]
    for colname in ppr.columns:
        if "Prediction:" not in colname:
            continue
        y_pred = ppr[colname]
        f1 = f1_score(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        # Everything after the first colon is the feature-set label.
        colname = colname.split(":", 1)[1]

        dfdict["FeatureSet"].append(colname)
        dfdict["Accuracy"].append(acc)
        dfdict["F1"].append(f1)
        dfdict["Precision"].append(prec)
        dfdict["Recall"].append(recall)
        print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [10]:
# Office documents: rows whose file name contains one of the markers below.
# NOTE(review): ".doc" is listed twice and "odf"/"opf" carry no leading dot --
# confirm the intended extensions.
print("Office Files: ")
print("------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=[".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"],
)
rdf = rdf.sort_values(by="F1")  # lowest F1 first
print("\n")
print(rdf.to_latex())
rdf

Office Files: 
------------------------
                                               Baseline only: 		 0.523 		 0.526 		18990
                                               Advanced only: 		 0.646 		 0.533 		18990
                                                Fourier only: 		 0.703 		 0.646 		18990
                                        Baseline and Fourier: 		 0.726 		 0.694 		18990
                                        Advanced and Fourier: 		 0.763 		 0.709 		18990
                                       Baseline and Advanced: 		 0.667 		 0.562 		18990
                             Baseline, Advanced, and Fourier: 		 0.770 		 0.713 		18990


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.525803 &  0.522560 &   0.659971 &  0.432508 \\
1 &                    Advanced only &  0.533491 &  0.645654 &   0.593151 &  0.708355 \\
5 &            Baseline and Adva

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.525803,0.52256,0.659971,0.432508
1,Advanced only,0.533491,0.645654,0.593151,0.708355
5,Baseline and Advanced,0.562401,0.666774,0.613851,0.729682
2,Fourier only,0.64613,0.70276,0.7084,0.697209
3,Baseline and Fourier,0.693681,0.72622,0.78301,0.677111
4,Advanced and Fourier,0.708689,0.763388,0.744535,0.783219
6,"Baseline, Advanced, and Fourier",0.712954,0.769601,0.742275,0.799017


In [11]:
# Password-protected files whose name does not contain an archive extension.
print("Passsword protected files (excluding archives)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=["password"],
    notmatches=[".7z", ".gz", ".zip"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding archives)
------------------------------------------------
                                               Baseline only: 		 0.563 		 0.536 		7000
                                               Advanced only: 		 0.586 		 0.427 		7000
                                                Fourier only: 		 0.697 		 0.620 		7000
                                        Baseline and Fourier: 		 0.713 		 0.665 		7000
                                        Advanced and Fourier: 		 0.721 		 0.635 		7000
                                       Baseline and Advanced: 		 0.610 		 0.457 		7000
                             Baseline, Advanced, and Fourier: 		 0.728 		 0.636 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.535857 &  0.563248 &   0.646805 &  0.498810 \\
1 &                    Advanced only &  0.426714 &  0.585904 &   0.51702

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.535857,0.563248,0.646805,0.49881
1,Advanced only,0.426714,0.585904,0.517028,0.675952
5,Baseline and Advanced,0.457143,0.609536,0.536153,0.70619
2,Fourier only,0.620143,0.697187,0.668195,0.72881
3,Baseline and Fourier,0.664571,0.713379,0.731964,0.695714
4,Advanced and Fourier,0.635429,0.72091,0.666667,0.784762
6,"Baseline, Advanced, and Fourier",0.636429,0.728301,0.660151,0.812143


In [12]:
# Password-protected files whose name does not contain an office-document marker.
print("Passsword protected files (excluding office files)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=["password"],
    notmatches=[".xls", ".csv", ".ppt", ".doc", ".doc", "odf", "opf"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (excluding office files)
------------------------------------------------
                                               Baseline only: 		 0.549 		 0.508 		1000
                                               Advanced only: 		 0.451 		 0.386 		1000
                                                Fourier only: 		 0.676 		 0.588 		1000
                                        Baseline and Fourier: 		 0.671 		 0.578 		1000
                                        Advanced and Fourier: 		 0.678 		 0.593 		1000
                                       Baseline and Advanced: 		 0.514 		 0.415 		1000
                             Baseline, Advanced, and Fourier: 		 0.691 		 0.596 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.508 &  0.549451 &   0.609756 &  0.500000 \\
1 &                    Advanced only &     0.386 &  0.450805 &   0.4

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.386,0.450805,0.486486,0.42
5,Baseline and Advanced,0.415,0.513716,0.512438,0.515
0,Baseline only,0.508,0.549451,0.609756,0.5
3,Baseline and Fourier,0.578,0.670827,0.630499,0.716667
2,Fourier only,0.588,0.676101,0.639881,0.716667
4,Advanced and Fourier,0.593,0.678261,0.645113,0.715
6,"Baseline, Advanced, and Fourier",0.596,0.691131,0.638418,0.753333


In [13]:
# Every password-protected file, with no exclusions.
print("Passsword protected files (all)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, match_string=["password"], notmatches=None)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Passsword protected files (all)
------------------------------------------------
                                               Baseline only: 		 0.563 		 0.536 		7000
                                               Advanced only: 		 0.586 		 0.427 		7000
                                                Fourier only: 		 0.697 		 0.620 		7000
                                        Baseline and Fourier: 		 0.713 		 0.665 		7000
                                        Advanced and Fourier: 		 0.721 		 0.635 		7000
                                       Baseline and Advanced: 		 0.610 		 0.457 		7000
                             Baseline, Advanced, and Fourier: 		 0.728 		 0.636 		7000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.535857 &  0.563248 &   0.646805 &  0.498810 \\
1 &                    Advanced only &  0.426714 &  0.585904 &   0.517028 &  0.675952 \

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.535857,0.563248,0.646805,0.49881
1,Advanced only,0.426714,0.585904,0.517028,0.675952
5,Baseline and Advanced,0.457143,0.609536,0.536153,0.70619
2,Fourier only,0.620143,0.697187,0.668195,0.72881
3,Baseline and Fourier,0.664571,0.713379,0.731964,0.695714
4,Advanced and Fourier,0.635429,0.72091,0.666667,0.784762
6,"Baseline, Advanced, and Fourier",0.636429,0.728301,0.660151,0.812143


In [14]:
# Image files of every listed format, WebP included.
print("Image files (all)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=[".jpeg", ".jpg", ".png", ".bmp", ".ico", ".webp"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (all)
------------------------------------------------
                                               Baseline only: 		 0.542 		 0.504 		23020
                                               Advanced only: 		 0.483 		 0.413 		23020
                                                Fourier only: 		 0.652 		 0.584 		23020
                                        Baseline and Fourier: 		 0.639 		 0.575 		23020
                                        Advanced and Fourier: 		 0.629 		 0.567 		23020
                                       Baseline and Advanced: 		 0.517 		 0.473 		23020
                             Baseline, Advanced, and Fourier: 		 0.636 		 0.573 		23020


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.503519 &  0.542016 &   0.606928 &  0.489647 \\
1 &                    Advanced only &  0.413423 &  0.483455 &   0.512531 &  0.457501 \\
2 &  

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.413423,0.483455,0.512531,0.457501
5,Baseline and Advanced,0.472502,0.516927,0.573687,0.470388
0,Baseline only,0.503519,0.542016,0.606928,0.489647
4,Advanced and Fourier,0.567333,0.628608,0.648085,0.610266
6,"Baseline, Advanced, and Fourier",0.573197,0.636071,0.651195,0.621633
3,Baseline and Fourier,0.574587,0.638755,0.651124,0.626846
2,Fourier only,0.584144,0.652283,0.654494,0.650087


In [15]:
# Image files again, this time without the ".webp" marker.
print("Image files (excluding webp)")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=[".jpeg", ".jpg", ".png", ".bmp", ".ico"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (excluding webp)
------------------------------------------------
                                               Baseline only: 		 0.539 		 0.508 		14015
                                               Advanced only: 		 0.525 		 0.430 		14015
                                                Fourier only: 		 0.652 		 0.602 		14015
                                        Baseline and Fourier: 		 0.641 		 0.591 		14015
                                        Advanced and Fourier: 		 0.642 		 0.591 		14015
                                       Baseline and Advanced: 		 0.534 		 0.481 		14015
                             Baseline, Advanced, and Fourier: 		 0.653 		 0.601 		14015


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.507884 &  0.539248 &   0.615244 &  0.479962 \\
1 &                    Advanced only &  0.429682 &  0.524707 &   0.524738 &  0.5246

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.429682,0.524707,0.524738,0.524676
5,Baseline and Advanced,0.480628,0.533726,0.57845,0.495422
0,Baseline only,0.507884,0.539248,0.615244,0.479962
3,Baseline and Fourier,0.59051,0.641066,0.676121,0.609466
4,Advanced and Fourier,0.590724,0.641948,0.675601,0.611488
2,Fourier only,0.601784,0.651905,0.685467,0.621477
6,"Baseline, Advanced, and Fourier",0.600999,0.652929,0.682851,0.62552


In [16]:
# WebP images on their own.
print("Image files (only webp)")
print("------------------------------------------------")
rdf = custom_result_print(rf_results, match_string=[".webp"])
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

Image files (only webp)
------------------------------------------------
                                               Baseline only: 		 0.546 		 0.497 		9005
                                               Advanced only: 		 0.409 		 0.388 		9005
                                                Fourier only: 		 0.653 		 0.557 		9005
                                        Baseline and Fourier: 		 0.635 		 0.550 		9005
                                        Advanced and Fourier: 		 0.609 		 0.531 		9005
                                       Baseline and Advanced: 		 0.489 		 0.460 		9005
                             Baseline, Advanced, and Fourier: 		 0.611 		 0.530 		9005


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.496724 &  0.546165 &   0.595025 &  0.504720 \\
1 &                    Advanced only &  0.388118 &  0.409052 &   0.486356 &  0.352952 \\
2 &   

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.388118,0.409052,0.486356,0.352952
5,Baseline and Advanced,0.459856,0.489397,0.565365,0.431427
0,Baseline only,0.496724,0.546165,0.595025,0.50472
4,Advanced and Fourier,0.530927,0.608816,0.609268,0.608366
6,"Baseline, Advanced, and Fourier",0.529928,0.611116,0.606713,0.615584
3,Baseline and Fourier,0.549806,0.635432,0.617981,0.653896
2,Fourier only,0.556691,0.652809,0.615751,0.694614


In [17]:
# Video files, selected by the extensions below.
print("video files")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=[".mpeg", ".mpg", ".avi", ".xvid", ".mp4"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

video files
------------------------------------------------
                                               Baseline only: 		 0.545 		 0.500 		2000
                                               Advanced only: 		 0.574 		 0.408 		2000
                                                Fourier only: 		 0.596 		 0.540 		2000
                                        Baseline and Fourier: 		 0.575 		 0.522 		2000
                                        Advanced and Fourier: 		 0.598 		 0.534 		2000
                                       Baseline and Advanced: 		 0.562 		 0.402 		2000
                             Baseline, Advanced, and Fourier: 		 0.616 		 0.549 		2000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &    0.5000 &  0.545455 &   0.600000 &  0.500000 \\
1 &                    Advanced only &    0.4085 &  0.574001 &   0.505390 &  0.664167 \\
2 &               

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.5,0.545455,0.6,0.5
5,Baseline and Advanced,0.4015,0.561699,0.50098,0.639167
1,Advanced only,0.4085,0.574001,0.50539,0.664167
3,Baseline and Fourier,0.5225,0.57461,0.617225,0.5375
2,Fourier only,0.54,0.596491,0.62963,0.566667
4,Advanced and Fourier,0.534,0.597582,0.620072,0.576667
6,"Baseline, Advanced, and Fourier",0.549,0.61617,0.629565,0.603333


In [18]:
# Audio files, selected by the extensions below.
print("audio files")
print("------------------------------------------------")
rdf = custom_result_print(
    rf_results,
    match_string=[".wav", ".mp3", ".aac", ".flac", ".ogg"],
)
print("\n")
# The LaTeX dump keeps evaluation order; the displayed table is sorted by F1.
print(rdf.to_latex())
rdf.sort_values(by="F1")

audio files
------------------------------------------------
                                               Baseline only: 		 0.525 		 0.458 		1000
                                               Advanced only: 		 0.555 		 0.421 		1000
                                                Fourier only: 		 0.601 		 0.556 		1000
                                        Baseline and Fourier: 		 0.582 		 0.541 		1000
                                        Advanced and Fourier: 		 0.555 		 0.507 		1000
                                       Baseline and Advanced: 		 0.489 		 0.444 		1000
                             Baseline, Advanced, and Fourier: 		 0.552 		 0.488 		1000


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &     0.458 &  0.525394 &   0.553506 &  0.500000 \\
1 &                    Advanced only &     0.421 &  0.554958 &   0.514979 &  0.601667 \\
2 &               

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
5,Baseline and Advanced,0.444,0.488971,0.545082,0.443333
0,Baseline only,0.458,0.525394,0.553506,0.5
6,"Baseline, Advanced, and Fourier",0.488,0.552448,0.580882,0.526667
4,Advanced and Fourier,0.507,0.554652,0.605523,0.511667
1,Advanced only,0.421,0.554958,0.514979,0.601667
3,Baseline and Fourier,0.541,0.581586,0.641851,0.531667
2,Fourier only,0.556,0.600719,0.652344,0.556667


In [19]:
# Persist the per-row predictions one directory up. ``rf_results`` is the
# alias assigned from the logistic-regression results earlier in the notebook.
rf_results.to_csv(path_or_buf="../napier1_logistic_regression_results.csv.gz")

In [14]:
def custom_result_print(rf_results, match_string, notmatches=None):
    """Score every prediction column on the rows selected by file name.

    Rows are selected with case-insensitive substring tests against the
    ``extended.base_filename`` column. For each ``Prediction:<feature set>``
    column the F1, accuracy, precision and recall against ``is_encrypted``
    are computed; one summary line per feature set is printed (F1,
    accuracy, number of selected rows).

    Args:
        rf_results: DataFrame with ``is_encrypted``, the ``Prediction:...``
            columns and (when any filter is given) ``extended.base_filename``.
        match_string: a substring or a list of substrings; a row is kept
            when its file name contains at least one of them. ``None`` or
            an empty value keeps every row and prints ``Matched None``.
        notmatches: optional substring or list of substrings; a row whose
            file name contains any of them is dropped, even if it matches.
            Exclusions are applied whether or not ``match_string`` is given.

    Returns:
        DataFrame with the columns ``FeatureSet``, ``Accuracy``, ``F1``,
        ``Precision`` and ``Recall``, one row per prediction column. It has
        no rows when the filters select nothing.
    """
    def _patterns(spec):
        # Normalise None / one string / several strings to a list of
        # lower-cased substrings.
        if not spec:
            return []
        if isinstance(spec, str):
            return [spec.lower()]
        return [item.lower() for item in spec]

    wanted = _patterns(match_string)
    unwanted = _patterns(notmatches)

    def is_password_protected(x):
        # Generic row filter (the name is historical). The file name is
        # lower-cased once so that every comparison is case-insensitive;
        # before, only the single-string match branch ignored case.
        name = str(x).lower()
        if any(pattern in name for pattern in unwanted):
            return False
        return not wanted or any(pattern in name for pattern in wanted)

    colnames = [
        c for c in rf_results.columns
        if "name" in c or c == "is_encrypted" or "Prediction:" in c
    ]
    rf_results2 = rf_results[colnames]

    if not wanted:
        print("Matched None")
    if wanted or unwanted:
        ppr = rf_results2[rf_results2["extended.base_filename"].map(is_password_protected)]
    else:
        ppr = rf_results2

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    if ppr.empty:
        # Nothing selected: the metrics are undefined, so hand back the
        # empty table instead of calling them.
        return pd.DataFrame(dfdict)

    y_true = ppr["is_encrypted"]
    for colname in ppr.columns:
        if "Prediction:" not in colname:
            continue
        y_pred = ppr[colname]
        f1 = f1_score(y_true, y_pred)
        acc = accuracy_score(y_true, y_pred)
        prec = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        # Everything after the first colon is the feature-set label.
        colname = colname.split(":", 1)[1]

        dfdict["FeatureSet"].append(colname)
        dfdict["Accuracy"].append(acc)
        dfdict["F1"].append(f1)
        dfdict["Precision"].append(prec)
        dfdict["Recall"].append(recall)
        print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

# Overall scores: no file-name filter is passed.
# NOTE(review): ``df`` is not defined in the visible cells -- presumably the
# saved results frame loaded elsewhere; confirm before re-running.
print("All Files: ")
print("------------------------")
rdf = custom_result_print(df, match_string=None)
rdf = rdf.sort_values(by="F1")  # lowest F1 first
print("\n")
print(rdf.to_latex(index=False))
rdf

All Files: 
------------------------
Matched None
                                               Baseline only: 		 0.532 		 0.519 		90022
                                               Advanced only: 		 0.557 		 0.494 		90022
                                                Fourier only: 		 0.674 		 0.615 		90022
                                        Baseline and Fourier: 		 0.677 		 0.628 		90022
                                        Advanced and Fourier: 		 0.682 		 0.626 		90022
                                       Baseline and Advanced: 		 0.581 		 0.520 		90022
                             Baseline, Advanced, and Fourier: 		 0.694 		 0.636 		90022


\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &       F1 &  Precision &   Recall \\
\midrule
                  Baseline only &  0.519106 & 0.532106 &   0.640025 & 0.455329 \\
                  Advanced only &  0.493535 & 0.557053 &   0.586650 & 0.530299 \\
          Baseline and Advanced &  0.520351 &

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.519106,0.532106,0.640025,0.455329
1,Advanced only,0.493535,0.557053,0.58665,0.530299
5,Baseline and Advanced,0.520351,0.581132,0.610999,0.554049
2,Fourier only,0.615216,0.673676,0.686448,0.66137
3,Baseline and Fourier,0.627635,0.676716,0.706954,0.648959
4,Advanced and Fourier,0.62648,0.68156,0.698298,0.665606
6,"Baseline, Advanced, and Fourier",0.636444,0.694262,0.701325,0.68734


In [19]:
# Archive files, selected by the extensions below.
# NOTE(review): ``df`` is not defined in the visible cells -- presumably the
# saved results frame loaded elsewhere; confirm before re-running.
print("All Archives")
print("------------------------------------------------")
rdf = custom_result_print(
    df,
    match_string=[".7z", ".gz", ".zip", ".bzip2", ".tar.gz", ".tar.bz2", ".bz2"],
)
print("\n")
print(rdf.to_latex(index=False))
rdf

All Archives
------------------------------------------------
                                               Baseline only: 		 0.548 		 0.500 		12200
                                               Advanced only: 		 0.235 		 0.364 		12200
                                                Fourier only: 		 0.635 		 0.550 		12200
                                        Baseline and Fourier: 		 0.623 		 0.543 		12200
                                        Advanced and Fourier: 		 0.563 		 0.506 		12200
                                       Baseline and Advanced: 		 0.336 		 0.404 		12200
                             Baseline, Advanced, and Fourier: 		 0.595 		 0.526 		12200


\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &       F1 &  Precision &   Recall \\
\midrule
                  Baseline only &  0.499836 & 0.547933 &   0.606428 & 0.499730 \\
                  Advanced only &  0.364016 & 0.234586 &   0.434417 & 0.160676 \\
                   Fourier only &

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.499836,0.547933,0.606428,0.49973
1,Advanced only,0.364016,0.234586,0.434417,0.160676
2,Fourier only,0.549754,0.634652,0.624885,0.64473
3,Baseline and Fourier,0.543115,0.622511,0.623948,0.621081
4,Advanced and Fourier,0.506148,0.563374,0.607439,0.52527
5,Baseline and Advanced,0.404098,0.335709,0.518341,0.248243
6,"Baseline, Advanced, and Fourier",0.52582,0.595143,0.617216,0.574595
