## Train with one dataset and test with another

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

In [2]:
!ls

Iteration_2.1_EXT2_RF_weighted_importances.ipynb
RF_CS5_DES3.ipynb
expanded.base32.des3.csv.gz
expanded.des3.csv.gz
expanded.plaintext.base32.csv.gz
expanded.pyencrypted_v1.b32.csv.gz
expanded.pyencrypted_v1.csv.gz
expanded.pyencrypted_v2.base32.csv.gz
expanded.pyencrypted_v2.csv.gz
expanded_encrypted_v3.csv.gz
expanded_encrypted_v3_base32.csv.gz
iteration_2.1_compare_base32_alternate_encryption_scheme.ipynb
iteration_2.1_compare_base32_logistic_regression.ipynb
n1.expanded.plaintext.base32.csv.gz
n1.expanded.plaintext.csv.gz
n1.expanded.pyencrypted_v1.base32.csv.gz
n1.expanded.pyencrypted_v1.csv.gz
n1.expanded.pyencrypted_v2.base32.csv.gz
n1.expanded.pyencrypted_v2.csv.gz
n1.expanded.pyencrypted_v3.base32.csv.gz
n1.expanded.pyencrypted_v3.csv.gz
n1.plaintext.base32.csv.gz
n1.plaintext.csv.gz
plaintext.base32.combined.csv.gz
plaintext.combined.csv.gz
plaintext.expanded.csv.gz
simple-average.pickle
weighted-results.pickle
weighted-scaled-results.pickle


In [3]:
def call_gc():
    for i in range(3):
        for j in range(3):
            gc.collect(j)

In [4]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Load all datasets only once
    
    We want to load the datasets only once. Once loaded
    serve from cache
    """
    train_datasets = {}
    test_datasets = {}
    for file in glob.glob("*.csv.gz"):
        print(f"Loading {file}")
        df = pd.read_csv(file)
        df = df.sample(frac=1).reset_index(drop=True)
        df["is_encrypted"] = 1 if "encr" in file.lower() else 0
        if file.startswith("n1."):
            test_datasets[file] = df
        else:
            train_datasets[file] = df
    return train_datasets, test_datasets

In [9]:
def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    call_gc()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
        
    call_gc()
    
    trainX = traindf[columns].to_numpy() 
    testX = testdf[columns].to_numpy()
    trainY = traindf["is_encrypted"].to_numpy()
    testY = testdf["is_encrypted"].to_numpy()
    
    estimators = [\
                  ('std,', MinMaxScaler()), \
                  (clfname, clf())]
    pipeline = Pipeline(estimators)
    
    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")
    
    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")
    
    acc = accuracy_score(testY, y_pred)
    f1 = f1_score(testY, y_pred)
    recall = recall_score(testY, y_pred)
    precision = precision_score(testY, y_pred)
    
    return acc, f1, precision, recall

In [10]:
def get_results_for_classifier(clf, clfname="clf"):
    print(f"Evaluating : {clf}")
    results = {
        "Feature Set": [],
        "Accuracy": [],
        "F1-Score": [],
        "Precision": [],
        "Recall": []
    }

    traindf, testdf = load_datasets_once()

    key = str(list(traindf.keys())[0])
    columns = get_columns(traindf[key])

    for desc, cols in columns.items():
        acc, f1, prec, rec = run_model(traindf, testdf, cols, desc, clf, clfname)
        print(f"--- {desc} : acc = {acc}, f1 = {f1}, precision = {prec}, recall = {rec}")
        results["Feature Set"].append(desc)
        results["Accuracy"].append(acc)
        results["F1-Score"].append(f1)
        results["Precision"].append(prec)
        results["Recall"].append(rec)

    rdf = pd.DataFrame(results)
    return rdf
    

In [11]:
rfc_clf = lambda: RandomForestClassifier(n_jobs=5, random_state=42)
lr_clf = lambda: LogisticRegression(\
            n_jobs=8, \
            solver='saga', \
            random_state=42, \
            max_iter=1000, \
            multi_class='ovr')
rf_results = get_results_for_classifier(rfc_clf, "Random Forest")
lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

Evaluating : <function <lambda> at 0x7fe9b3287430>
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading expanded_encrypted_v3.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading expanded_encrypted_v3_base32.csv.gz
Loading n1.expanded.pyencrypted_v3.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.pyencrypted_v3.csv.gz
Loading n1.expanded.plaintext.base32.csv.gz
Loading n1.plaintext.csv.gz
Loading expanded.pyencrypted_v1.b32.csv.gz
Loading n1.expanded.pyencrypted_v1.csv.gz
Training started...
Done.
Prediction started...
Done.
--- Baseline on

In [12]:
rf_results

Unnamed: 0,Feature Set,Accuracy,F1-Score,Precision,Recall
0,Baseline only,0.643925,0.692715,0.717804,0.669321
1,Advanced only,0.647281,0.711734,0.697869,0.726161
2,Fourier only,0.741538,0.80065,0.744789,0.865569
3,Baseline and Fourier,0.798728,0.8338,0.825791,0.841965
4,Advanced and Fourier,0.756356,0.79179,0.811979,0.77258
5,Baseline and Advanced,0.668731,0.726392,0.719579,0.733335
6,"Baseline, Advanced, and Fourier",0.767019,0.799806,0.824974,0.776129


In [13]:
print(rf_results.to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &                      Feature Set &  Accuracy &  F1-Score &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.643925 &  0.692715 &   0.717804 &  0.669321 \\
1 &                    Advanced only &  0.647281 &  0.711734 &   0.697869 &  0.726161 \\
2 &                     Fourier only &  0.741538 &  0.800650 &   0.744789 &  0.865569 \\
3 &             Baseline and Fourier &  0.798728 &  0.833800 &   0.825791 &  0.841965 \\
4 &             Advanced and Fourier &  0.756356 &  0.791790 &   0.811979 &  0.772580 \\
5 &            Baseline and Advanced &  0.668731 &  0.726392 &   0.719579 &  0.733335 \\
6 &  Baseline, Advanced, and Fourier &  0.767019 &  0.799806 &   0.824974 &  0.776129 \\
\bottomrule
\end{tabular}



In [14]:
lr_results

Unnamed: 0,Feature Set,Accuracy,F1-Score,Precision,Recall
0,Baseline only,0.519423,0.531596,0.639631,0.454783
1,Advanced only,0.495226,0.560461,0.586433,0.536692
2,Fourier only,0.615618,0.673164,0.686715,0.660138
3,Baseline and Fourier,0.628229,0.676421,0.707419,0.648026
4,Advanced and Fourier,0.627992,0.68273,0.698671,0.667499
5,Baseline and Advanced,0.522182,0.584325,0.610776,0.560071
6,"Baseline, Advanced, and Fourier",0.637372,0.694655,0.701553,0.687892


In [15]:
print(lr_results.to_latex())

\begin{tabular}{llrrrr}
\toprule
{} &                      Feature Set &  Accuracy &  F1-Score &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.519423 &  0.531596 &   0.639631 &  0.454783 \\
1 &                    Advanced only &  0.495226 &  0.560461 &   0.586433 &  0.536692 \\
2 &                     Fourier only &  0.615618 &  0.673164 &   0.686715 &  0.660138 \\
3 &             Baseline and Fourier &  0.628229 &  0.676421 &   0.707419 &  0.648026 \\
4 &             Advanced and Fourier &  0.627992 &  0.682730 &   0.698671 &  0.667499 \\
5 &            Baseline and Advanced &  0.522182 &  0.584325 &   0.610776 &  0.560071 \\
6 &  Baseline, Advanced, and Fourier &  0.637372 &  0.694655 &   0.701553 &  0.687892 \\
\bottomrule
\end{tabular}

