## Train with one dataset and test with another

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

DEBUG = False
N_JOBS = 8

In [2]:
!ls

Iteration_2.1_EXT2_RF_weighted_importances.ipynb
expanded.base32.des3.csv.gz
expanded.des3.csv.gz
expanded.plaintext.base32.csv.gz
expanded.pyencrypted_v1.b32.csv.gz
expanded.pyencrypted_v1.csv.gz
expanded.pyencrypted_v2.base32.csv.gz
expanded.pyencrypted_v2.csv.gz
expanded_encrypted_v3.csv.gz
expanded_encrypted_v3_base32.csv.gz
iteration_2.1_compare_base32_alternate_encryption_scheme.ipynb
iteration_2.1_compare_base32_logistic_regression.ipynb
n1.expanded.plaintext.base32.csv.gz
n1.expanded.plaintext.csv.gz
n1.expanded.pyencrypted_v1.base32.csv.gz
n1.expanded.pyencrypted_v1.csv.gz
n1.expanded.pyencrypted_v2.base32.csv.gz
n1.expanded.pyencrypted_v2.csv.gz
n1.expanded.pyencrypted_v3.base32.csv.gz
n1.expanded.pyencrypted_v3.csv.gz
n1.plaintext.base32.csv.gz
n1.plaintext.csv.gz
napierone_1.0.ipynb
plaintext.base32.combined.csv.gz
plaintext.combined.csv.gz
plaintext.expanded.csv.gz
simple-average.pickle
weighted-results.pickle
weighted-scaled-results.pickle


In [3]:
def call_gc():
    for i in range(3):
        for j in range(3):
            gc.collect(j)

In [4]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Load all datasets only once
    
    We want to load the datasets only once. Once loaded
    serve from cache
    """
    train_datasets = {}
    test_datasets = {}
    for file in glob.glob("*.csv.gz"):
        print(f"Loading {file}")
        df = pd.read_csv(file)
        df = df.sample(frac=1).reset_index(drop=True)
        df["is_encrypted"] = 1 if "encr" in file.lower() else 0
        if file.startswith("n1."):
            test_datasets[file] = df
        else:
            train_datasets[file] = df
        if DEBUG and len(test_datasets) > 2 and len(train_datasets) > 2:
            break
    return train_datasets, test_datasets

In [6]:
def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    call_gc()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
        
    call_gc()
    
    trainX = traindf[columns].to_numpy() 
    testX = testdf[columns].to_numpy()
    trainY = traindf["is_encrypted"].to_numpy()
    testY = testdf["is_encrypted"].to_numpy()
    
    estimators = [\
                  ('std,', MinMaxScaler()), \
                  (clfname, clf())]
    pipeline = Pipeline(estimators)
    
    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")
    
    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")
    
    return y_pred

In [7]:
def get_results_for_classifier(clf, clfname="clf"):
    print(f"Evaluating : {clf}")
    results = {
        "Feature Set": [],
        "Accuracy": [],
        "F1-Score": [],
        "Precision": [],
        "Recall": []
    }

    traindf, testdf = load_datasets_once()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
    
    if DEBUG:
        qraindf = traindf.head(5000)
        testdf = testdf.head(5000)
    
    testdf_copy = testdf.copy()

    columns = get_columns(traindf)

    for desc, cols in columns.items():
        y_pred = run_model(traindf, testdf, cols, desc, clf, clfname)
        testdf_copy[f"Prediction:{desc}"] = y_pred
    return testdf_copy
    

In [8]:
rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
lr_clf = lambda: LogisticRegression(\
            n_jobs=8, \
            solver='saga', \
            random_state=42, \
            max_iter=1000, \
            multi_class='ovr')
rf_results = get_results_for_classifier(rfc_clf, "Random Forest")
#lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

Evaluating : <function <lambda> at 0x7fd0bd213af0>
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading expanded_encrypted_v3.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading expanded_encrypted_v3_base32.csv.gz
Loading n1.expanded.pyencrypted_v3.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.pyencrypted_v3.csv.gz
Loading n1.expanded.plaintext.base32.csv.gz
Loading n1.plaintext.csv.gz
Loading expanded.pyencrypted_v1.b32.csv.gz
Loading n1.expanded.pyencrypted_v1.csv.gz
Training started...
Done.
Prediction started...
Done.
Training starte

In [47]:
def custom_result_print(rf_results, match_string, notmatches=None):
    colnames = rf_results.columns
    colnames = [c for c in colnames if "name" in c or "is_encrypted" == c or "Prediction:" in c]

    rf_results2 = rf_results[colnames]

    def is_password_protected(x):
        if notmatches is not None:
            if isinstance(notmatches, list):
                for x1 in notmatches:
                    if x1.lower() in x:
                        return False
            else:
                if notmatches.lower() in x:
                    return False
        if (isinstance(match_string, list)):
            for x1 in match_string:
                if x1.lower() in x:
                    return True
        else:
            if match_string.lower() in x.lower():
                return True
        return False

    ppr = rf_results2[rf_results2["extended.base_filename"].map(is_password_protected)]

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    for colname in ppr.columns:
        if "Prediction" in colname:
            y_pred = ppr[colname]
            y_true = ppr["is_encrypted"]
            f1 = f1_score(y_true, y_pred)
            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            colname = colname.split(":")[1]
            
            dfdict["FeatureSet"].append(colname)
            dfdict["Accuracy"].append(acc)
            dfdict["F1"].append(f1)
            dfdict["Precision"].append(prec)
            dfdict["Recall"].append(recall)
            print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [41]:
rdf = custom_result_print(rf_results, [".xls", ".csv", ".ppt", ".docx", ".doc", "webp"])

                                               Baseline only: 		 0.693 		 0.639 		28000
                                               Advanced only: 		 0.681 		 0.603 		28000
                                                Fourier only: 		 0.806 		 0.749 		28000
                                        Baseline and Fourier: 		 0.846 		 0.814 		28000
                                        Advanced and Fourier: 		 0.811 		 0.779 		28000
                                       Baseline and Advanced: 		 0.712 		 0.638 		28000
                             Baseline, Advanced, and Fourier: 		 0.823 		 0.794 		28000


In [43]:
print(rdf.to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.638929 &  0.692892 &   0.707506 &  0.678869 \\
1 &                    Advanced only &  0.602643 &  0.681441 &   0.656516 &  0.708333 \\
2 &                     Fourier only &  0.748607 &  0.806488 &   0.749323 &  0.873095 \\
3 &             Baseline and Fourier &  0.813786 &  0.846294 &   0.838337 &  0.854405 \\
4 &             Advanced and Fourier &  0.778857 &  0.811058 &   0.832081 &  0.791071 \\
5 &            Baseline and Advanced &  0.638250 &  0.712203 &   0.681326 &  0.746012 \\
6 &  Baseline, Advanced, and Fourier &  0.794036 &  0.822603 &   0.851168 &  0.795893 \\
\bottomrule
\end{tabular}



In [45]:
rdf.sort_values(by="F1")

Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.602643,0.681441,0.656516,0.708333
0,Baseline only,0.638929,0.692892,0.707506,0.678869
5,Baseline and Advanced,0.63825,0.712203,0.681326,0.746012
2,Fourier only,0.748607,0.806488,0.749323,0.873095
4,Advanced and Fourier,0.778857,0.811058,0.832081,0.791071
6,"Baseline, Advanced, and Fourier",0.794036,0.822603,0.851168,0.795893
3,Baseline and Fourier,0.813786,0.846294,0.838337,0.854405


In [49]:
rdf = custom_result_print(rf_results, ["webp"], ["base32", "b32"])
rdf.sort_values(by="F1")

                                               Baseline only: 		 0.701 		 0.653 		4505
                                               Advanced only: 		 0.623 		 0.549 		4505
                                                Fourier only: 		 0.665 		 0.597 		4505
                                        Baseline and Fourier: 		 0.745 		 0.696 		4505
                                        Advanced and Fourier: 		 0.723 		 0.675 		4505
                                       Baseline and Advanced: 		 0.679 		 0.608 		4505
                             Baseline, Advanced, and Fourier: 		 0.730 		 0.683 		4505


Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.54939,0.622677,0.6257,0.619682
2,Fourier only,0.59667,0.665316,0.662509,0.668147
5,Baseline and Advanced,0.607991,0.678909,0.667501,0.690714
0,Baseline only,0.653496,0.701015,0.726767,0.677026
4,Advanced and Fourier,0.67525,0.722654,0.741058,0.705142
6,"Baseline, Advanced, and Fourier",0.682797,0.730428,0.745189,0.716241
3,Baseline and Fourier,0.696337,0.744681,0.751412,0.738069


In [56]:
rdf = custom_result_print(rf_results, ["password"], [".7z", ".gz", ".zip"])
rdf.sort_values(by="F1")

                                               Baseline only: 		 0.605 		 0.492 		7000
                                               Advanced only: 		 0.602 		 0.493 		7000
                                                Fourier only: 		 0.825 		 0.768 		7000
                                        Baseline and Fourier: 		 0.880 		 0.853 		7000
                                        Advanced and Fourier: 		 0.822 		 0.796 		7000
                                       Baseline and Advanced: 		 0.640 		 0.505 		7000
                             Baseline, Advanced, and Fourier: 		 0.842 		 0.818 		7000


Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.492714,0.60195,0.568735,0.639286
0,Baseline only,0.491714,0.604754,0.566847,0.648095
5,Baseline and Advanced,0.505143,0.640216,0.567797,0.73381
4,Advanced and Fourier,0.795857,0.821887,0.862412,0.785
2,Fourier only,0.768143,0.824749,0.754594,0.909286
6,"Baseline, Advanced, and Fourier",0.818143,0.842353,0.877677,0.809762
3,Baseline and Fourier,0.852714,0.880353,0.858728,0.903095


In [19]:
rf_results

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,extended.extension,extended.base_filename,baseline.head_shannon_entropy,baseline.tail_shannon_entropy,baseline.shannon_entropy,baseline.montecarlo_pi,baseline.chisquare_full,baseline.chisquare_begin,...,fourier.value.4byte.249,fourier.value.4byte.250,is_encrypted,Prediction:Baseline only,Prediction:Advanced only,Prediction:Fourier only,Prediction:Baseline and Fourier,Prediction:Advanced and Fourier,Prediction:Baseline and Advanced,"Prediction:Baseline, Advanced, and Fourier"
0,96,96,.mp4,base32.0070-mp4.mp4,4.783723,3.576158,5.021144,0.916173,4.487420e+06,245.848845,...,6.524194e+16,5.675740e+16,0,1,1,1,1,1,1,1
1,3,3,.webp,base32.0014-webp-lossless-c2.webp,4.918240,4.955872,5.034564,0.877066,3.498306e+06,250.152382,...,6.358850e+16,5.461427e+16,0,0,1,1,0,1,1,1
2,42,42,.eps,base32.0016-eps-from-web.eps,4.835545,4.888585,4.755883,0.574371,5.516668e+04,236.906310,...,5.207635e+16,4.462496e+16,0,0,0,0,0,0,0,0
3,80,80,.png,base32.0073-png-c3.png,4.741412,4.731011,5.034336,0.872250,6.072265e+06,211.019614,...,6.552776e+16,5.553798e+16,0,1,1,1,1,1,1,1
4,39,39,.pptx,base32.0028-pptx.pptx,2.095376,3.831430,5.025329,0.922990,1.135653e+07,189.294554,...,6.587653e+16,5.560126e+16,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8884,75,75,.svg,0061-svg.svg,7.154911,7.161527,7.999897,3.133198,8.036218e+07,5747.977905,...,1.563723e+18,1.587008e+18,1,1,1,1,1,1,1,1
8885,91,91,.zip,0080-zip-lzma.zip,7.207293,7.082288,7.999962,3.130137,1.969117e+08,5879.505709,...,1.529225e+18,1.542678e+18,1,0,0,0,0,0,0,0
8886,26,26,.png,0061-png-c7.png,7.283503,7.101178,7.999852,3.133698,5.943885e+07,5232.351325,...,1.486020e+18,1.509867e+18,1,1,0,1,1,0,1,1
8887,8,8,.eps,0014-eps-from-web.eps,7.125576,7.255942,7.999513,3.128182,1.717457e+07,5108.788398,...,1.495703e+18,1.589314e+18,1,1,1,1,1,1,1,1


In [13]:
custom_result_print(rf_results)
rf_results

Prediction:Baseline only : 0.6053386278402823 0.4939179632248939 7070
Prediction:Advanced only : 0.6050438840128874 0.4971711456859972 7070
Prediction:Fourier only  : 0.8253186248259611 0.7693069306930693 7070
Prediction:Baseline and Fourier: 0.87983472971422 0.8519094766619519 7070
Prediction:Advanced and Fourier: 0.8234423195558298 0.7975954738330976 7070
Prediction:Baseline and Advanced: 0.6429086414666804 0.5096181046676096 7070
Prediction:Baseline, Advanced, and Fourier: 0.8432671081677704 0.8192362093352192 7070


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,extended.extension,extended.base_filename,baseline.head_shannon_entropy,baseline.tail_shannon_entropy,baseline.shannon_entropy,baseline.montecarlo_pi,baseline.chisquare_full,baseline.chisquare_begin,...,fourier.value.4byte.249,fourier.value.4byte.250,is_encrypted,Prediction:Baseline only,Prediction:Advanced only,Prediction:Fourier only,Prediction:Baseline and Fourier,Prediction:Advanced and Fourier,Prediction:Baseline and Advanced,"Prediction:Baseline, Advanced, and Fourier"
0,96,96,.mp4,base32.0070-mp4.mp4,4.783723,3.576158,5.021144,0.916173,4.487420e+06,245.848845,...,6.524194e+16,5.675740e+16,0,1,1,1,1,1,1,1
1,3,3,.webp,base32.0014-webp-lossless-c2.webp,4.918240,4.955872,5.034564,0.877066,3.498306e+06,250.152382,...,6.358850e+16,5.461427e+16,0,0,1,1,0,1,1,1
2,42,42,.eps,base32.0016-eps-from-web.eps,4.835545,4.888585,4.755883,0.574371,5.516668e+04,236.906310,...,5.207635e+16,4.462496e+16,0,0,0,0,0,0,0,0
3,80,80,.png,base32.0073-png-c3.png,4.741412,4.731011,5.034336,0.872250,6.072265e+06,211.019614,...,6.552776e+16,5.553798e+16,0,1,1,1,1,1,1,1
4,39,39,.pptx,base32.0028-pptx.pptx,2.095376,3.831430,5.025329,0.922990,1.135653e+07,189.294554,...,6.587653e+16,5.560126e+16,0,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8884,75,75,.svg,0061-svg.svg,7.154911,7.161527,7.999897,3.133198,8.036218e+07,5747.977905,...,1.563723e+18,1.587008e+18,1,1,1,1,1,1,1,1
8885,91,91,.zip,0080-zip-lzma.zip,7.207293,7.082288,7.999962,3.130137,1.969117e+08,5879.505709,...,1.529225e+18,1.542678e+18,1,0,0,0,0,0,0,0
8886,26,26,.png,0061-png-c7.png,7.283503,7.101178,7.999852,3.133698,5.943885e+07,5232.351325,...,1.486020e+18,1.509867e+18,1,1,0,1,1,0,1,1
8887,8,8,.eps,0014-eps-from-web.eps,7.125576,7.255942,7.999513,3.128182,1.717457e+07,5108.788398,...,1.495703e+18,1.589314e+18,1,1,1,1,1,1,1,1


In [55]:
def is_password_protected(x):
    if "password" in x.lower():
        return True
    else:
        return False
    
df = rf_results[rf_results["extended.base_filename"].map(is_password_protected)]
colnames = [c for c in df.columns if "name" in c or "is_encrypted" == c or "Prediction:" in c]
df = df[colnames]
df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,extended.base_filename,is_encrypted,Prediction:Baseline only,Prediction:Advanced only,Prediction:Fourier only,Prediction:Baseline and Fourier,Prediction:Advanced and Fourier,Prediction:Baseline and Advanced,"Prediction:Baseline, Advanced, and Fourier"
6,44,44,base32.0012-pptx-password.pptx,0,1,0,1,1,0,1,1
8,80,80,base32.0027-xlsx-password.xlsx,0,0,0,0,0,0,1,0
22,84,84,base32.0059-docx-password.docx,0,0,1,0,0,0,1,0
40,90,90,base32.0099-doc-password.doc,0,1,0,0,0,0,1,0
50,41,41,base32.0004-xls-password.xls,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
8837,62,62,0094-docx-password.docx,1,0,0,1,1,0,0,0
8845,45,45,0032-xls-password.xls,1,0,1,1,1,0,1,1
8861,15,15,0036-ppt-password.ppt,1,0,0,1,1,0,1,0
8865,41,41,0042-pdf-password.pdf,1,1,0,1,1,1,0,1
