## Train with one dataset and test with another

### In this one, we make one modification. We report statistics for ransomware separately

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

DEBUG = False
N_JOBS = 8

In [2]:
!ls

NapierOneRansomware_1.1.ipynb
expanded.base32.des3.csv.gz
expanded.des3.csv.gz
expanded.plaintext.base32.csv.gz
expanded.pyencrypted_v1.b32.csv.gz
expanded.pyencrypted_v1.csv.gz
expanded.pyencrypted_v2.base32.csv.gz
expanded.pyencrypted_v2.csv.gz
n1.encrypted.expanded.ransomware.csv.gz
n1.expanded.plaintext.base32.csv.gz
n1.expanded.plaintext.csv.gz
n1.expanded.pyencrypted_v1.base32.csv.gz
n1.expanded.pyencrypted_v1.csv.gz
n1.expanded.pyencrypted_v2.base32.csv.gz
n1.expanded.pyencrypted_v2.csv.gz
n1.plaintext.base32.csv.gz
n1.plaintext.csv.gz
n1.zip.expanded.encrypted.base32.csv.gz
n1.zip.expanded.encrypted.csv.gz
n1.zip.expanded.encrypted.v2.base32.csv.gz
n1.zip.expanded.encrypted.v2.csv.gz
n1.zip.expanded.plaintext.base32.csv.gz
n1.zip.expanded.plaintext.csv.gz
napierone_0.1.ipynb
napierone_1.0_rf_ransomware_focus.ipynb
napierone_logistic_regression_1.1.ipynb
napierone_randomforest_1.1.ipynb
plaintext.base32.combined.csv.gz
plaintext.combined.csv.gz
plain

In [3]:
def call_gc():
    for i in range(3):
        for j in range(3):
            gc.collect(j)

In [4]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }

In [5]:
@lru_cache
def load_datasets_once():
    """Load all datasets only once
    
    We want to load the datasets only once. Once loaded
    serve from cache
    """
    train_datasets = {}
    test_datasets = {}
    for file in glob.glob("*.csv.gz"):
        print(f"Loading {file}")
        df = pd.read_csv(file)
        df = df.sample(frac=1).reset_index(drop=True)
        df["is_encrypted"] = 1 if "encr" in file.lower() else 0
        if file.startswith("n1."):
            test_datasets[file] = df
        else:
            train_datasets[file] = df
        if DEBUG and len(test_datasets) > 2 and len(train_datasets) > 2:
            break
    return train_datasets, test_datasets

In [6]:
_, test_datasets = load_datasets_once()

Loading n1.zip.expanded.plaintext.csv.gz
Loading plaintext.base32.combined.csv.gz
Loading expanded.base32.des3.csv.gz
Loading n1.plaintext.base32.csv.gz
Loading n1.expanded.plaintext.csv.gz
Loading n1.expanded.pyencrypted_v2.csv.gz
Loading n1.expanded.pyencrypted_v1.base32.csv.gz
Loading n1.encrypted.expanded.ransomware.csv.gz
Loading expanded.des3.csv.gz
Loading n1.expanded.pyencrypted_v2.base32.csv.gz
Loading expanded.pyencrypted_v1.csv.gz
Loading expanded.pyencrypted_v2.base32.csv.gz
Loading n1.zip.expanded.encrypted.v2.csv.gz
Loading n1.zip.expanded.encrypted.v2.base32.csv.gz
Loading n1.zip.expanded.plaintext.base32.csv.gz
Loading plaintext.combined.csv.gz
Loading plaintext.expanded.csv.gz
Loading expanded.pyencrypted_v2.csv.gz
Loading expanded.plaintext.base32.csv.gz
Loading n1.expanded.plaintext.base32.csv.gz
Loading n1.plaintext.csv.gz
Loading expanded.pyencrypted_v1.b32.csv.gz
Loading n1.expanded.pyencrypted_v1.csv.gz
Loading n1.zip.expanded.encrypted.csv.gz
Loading n1.zip.expa

In [7]:
def run_model(traindf, testdf, columns, description, clf, clfname="clf"):
    call_gc()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
    if (isinstance(testdf, dict)):
        testdf = [df for df in testdf.values()]
    if (isinstance(testdf, list)):
        testdf = pd.concat(testdf)
        
    call_gc()
    
    trainX = traindf[columns].to_numpy() 
    testX = testdf[columns].to_numpy()
    trainY = traindf["is_encrypted"].to_numpy()
    testY = testdf["is_encrypted"].to_numpy()
    
    estimators = [\
                  ('std,', MinMaxScaler()), \
                  (clfname, clf())]
    pipeline = Pipeline(estimators)
    
    print("Training started...")
    pipeline.fit(trainX, trainY)
    print("Done.")
    
    print("Prediction started...")
    y_pred = pipeline.predict(testX)
    print("Done.")
    
    return y_pred

In [8]:
def get_results_for_classifier(clf, clfname="clf"):
    print(f"Evaluating : {clf}")
    results = {
        "Feature Set": [],
        "Accuracy": [],
        "F1-Score": [],
        "Precision": [],
        "Recall": []
    }

    traindf, testdf = load_datasets_once()
    
    if (isinstance(traindf, dict)):
        traindf = [df for df in traindf.values()]
    if isinstance(traindf, list):
        traindf = pd.concat(traindf)
        
    def get_test_df():
        array = []
        for filename, df in testdf.items():
            df["is_ransomware"] = 1 if "ransomware" in filename else 0
            array.append(df)
        df = pd.concat(array)
        names = [str(c) for c in df.columns if "is_ransomware" != str(c)]
        return df[names], df[["is_ransomware"]]
          
    testdf, is_ransomware = get_test_df()
    
    if DEBUG:
        qraindf = traindf.head(5000)
        testdf = testdf.head(5000)
    
    testdf_copy = testdf.copy()

    columns = get_columns(traindf)

    for desc, cols in columns.items():
        y_pred = run_model(traindf, testdf, cols, desc, clf, clfname)
        testdf_copy[f"Prediction:{desc}"] = y_pred
    return testdf_copy, is_ransomware
    

In [9]:
rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
lr_clf = lambda: LogisticRegression(\
            n_jobs=8, \
            solver='saga', \
            random_state=42, \
            max_iter=1000, \
            multi_class='ovr')
rf_results, is_ransomware = get_results_for_classifier(rfc_clf, "Random Forest")
#lr_results = get_results_for_classifier(lr_clf, "Logistic Regression")

Evaluating : <function <lambda> at 0x7f7766a7a160>
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.
Training started...
Done.
Prediction started...
Done.


In [19]:
def custom_result_print(rf_results, match_string=None, notmatches=None):
    colnames = rf_results.columns
    colnames = [c for c in colnames if "name" in c or "is_encrypted" == c or "Prediction:" in c]

    rf_results2 = rf_results[colnames]

    def is_password_protected(x):
        if not x:
            return False
        if notmatches is not None:
            if isinstance(notmatches, list):
                for x1 in notmatches:
                    if x1.lower() in x:
                        return False
            else:
                if notmatches.lower() in x:
                    return False
        if (isinstance(match_string, list)):
            for x1 in match_string:
                if x1.lower() in x:
                    return True
        else:
            if match_string.lower() in x.lower():
                return True
        return False

    if match_string:
        ppr = rf_results2[rf_results2["extended.base_filename"].map(is_password_protected)]
    else:
        ppr = rf_results2

    dfdict = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    for colname in ppr.columns:
        if "Prediction" in colname:
            y_pred = ppr[colname]
            y_true = ppr["is_encrypted"]
            f1 = f1_score(y_true, y_pred)
            acc = accuracy_score(y_true, y_pred)
            prec = precision_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            colname = colname.split(":")[1]
            
            dfdict["FeatureSet"].append(colname)
            dfdict["Accuracy"].append(acc)
            dfdict["F1"].append(f1)
            dfdict["Precision"].append(prec)
            dfdict["Recall"].append(recall)
            print(f"{colname:>60s}: \t\t {f1:1.3f} \t\t {acc:1.3f} \t\t{len(ppr)}")
    return pd.DataFrame(dfdict)

In [31]:
rdfxx = custom_result_print(rf_results[is_ransomware["is_ransomware"]==1])
print("\n\n\n")
print(rdfxx.round(3).to_latex(index=False))
print("\n\n\n")
print(is_ransomware["is_ransomware"].value_counts())

                                               Baseline only: 		 0.572 		 0.400 		707
                                               Advanced only: 		 0.626 		 0.455 		707
                                                Fourier only: 		 0.017 		 0.008 		707
                                        Baseline and Fourier: 		 0.161 		 0.088 		707
                                        Advanced and Fourier: 		 0.522 		 0.354 		707
                                       Baseline and Advanced: 		 0.599 		 0.427 		707
                             Baseline, Advanced, and Fourier: 		 0.548 		 0.378 		707




\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &    F1 &  Precision &  Recall \\
\midrule
                  Baseline only &     0.400 & 0.572 &        1.0 &   0.400 \\
                  Advanced only &     0.455 & 0.626 &        1.0 &   0.455 \\
                   Fourier only &     0.008 & 0.017 &        1.0 &   0.008 \\
           Baseline and Fourier &     0.0

In [32]:
rdfuu = custom_result_print(rf_results[is_ransomware["is_ransomware"]==0])
print("\n\n\n")
print(rdfuu.round(3).to_latex(index=False))
print("\n\n\n")

                                               Baseline only: 		 0.578 		 0.612 		72284
                                               Advanced only: 		 0.577 		 0.602 		72284
                                                Fourier only: 		 0.242 		 0.550 		72284
                                        Baseline and Fourier: 		 0.484 		 0.621 		72284
                                        Advanced and Fourier: 		 0.669 		 0.707 		72284
                                       Baseline and Advanced: 		 0.602 		 0.623 		72284
                             Baseline, Advanced, and Fourier: 		 0.686 		 0.720 		72284




\begin{tabular}{lrrrr}
\toprule
                     FeatureSet &  Accuracy &    F1 &  Precision &  Recall \\
\midrule
                  Baseline only &     0.612 & 0.578 &      0.637 &   0.530 \\
                  Advanced only &     0.602 & 0.577 &      0.619 &   0.541 \\
                   Fourier only &     0.550 & 0.242 &      0.792 &   0.143 \\
           Baseline and Fou

In [40]:
def is_interesting(x):
    return True if x.lower().endswith("ryk") else False
rf2 = rf_results[rf_results["extended.extension"].map(is_interesting)]
rf2["Prediction:Fourier only"].sum()

0