## Train with one dataset and test with another

In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SVMSMOTE as smote
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score

import matplotlib.pyplot as plt
import glob

from tqdm import notebook as tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

import random
random.seed(42)

DEBUG = False
N_JOBS = 8

In [2]:
def call_gc():
    for i in range(3):
        for j in range(3):
            gc.collect(j)

In [3]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }

In [4]:
@lru_cache
def load_datasets_once():
    """Load all datasets only once
    
    We want to load the datasets only once. Once loaded
    serve from cache
    """
    nonransomware = []
    ransomware = None
    
    for file in glob.glob("*.csv.gz"):
        print(f"Loading {file}")
        df = pd.read_csv(file)
        df["csv_filename"] = file
        df["is_encrypted"] = 1 if "encr" in file.lower() else 0
        df["is_ransomware"] = 1 if "ransom" in file.lower() else 0
        df["is_encrypted"] = df["is_encrypted"].astype("int8")
        df["is_ransomware"] = df["is_ransomware"].astype("int8")
        
        if "ransom" in file.lower():
            df = df.sample(frac=1).reset_index(drop=True)
            ransomware = df
        else:
            nonransomware.append(df)
        
    nonransomware = pd.concat(nonransomware).sample(frac=1).reset_index(drop=True)
    return ransomware, nonransomware

# First Step

First step is to get an overall 5 fold cross validation score based
for
1. Ransomware
2. Non ransomware

In [5]:
DEBUG_SMALL = False
def run_cross_validated(nonransomDF, ransomDF, clf_gen, n_splits=2):
    def run_once(traindf, nonransomtestdf, ransomtestdf, column_list):
        traindf = traindf.sample(frac=1).reset_index(drop=True)
        trainX = traindf[column_list].to_numpy()
        trainY = traindf["is_encrypted"].to_numpy()
        ransomX = ransomtestdf[column_list].to_numpy()
        nonransomX = nonransomtestdf[column_list].to_numpy()
        
        pipeline = Pipeline(steps=\
                            [('scaler', StandardScaler()), \
                             ('smote', smote()), \
                             ('clf', clf_gen())])
        pipeline.fit(trainX, trainY)
        
        return pipeline.predict(nonransomX), pipeline.predict(ransomX)
    
    nonransomDF = nonransomDF.dropna()
    ransomDF = ransomDF.dropna()
    
    if DEBUG_SMALL:
        nonransomDF = nonransomDF.head(500)
        ransomDF = ransomDF.head(500)
        
    skf = StratifiedKFold(n_splits=n_splits)
    # We reverse test and train indices for ransomware samples
    if False:
        ransomDF_indices = [(test_indices, train_indices) \
                                for train_indices, test_indices \
                                in skf.split(ransomDF, ransomDF["is_encrypted"])]
    else:
        ransomDF_indices = [(train_indices, test_indices) \
                                for train_indices, test_indices \
                                in skf.split(ransomDF, ransomDF["is_encrypted"])]
    
    skf = StratifiedKFold(n_splits=n_splits)
    nonransomDF_indices = [(train_indices, test_indices) \
                              for train_indices, test_indices \
                              in skf.split(nonransomDF, nonransomDF["is_encrypted"])]
    
    ransomDF = ransomDF.reset_index(drop=True)
    nonransomDF = nonransomDF.reset_index(drop=True)
    
    
    
    returndf = []
    for run in tqdm.tqdm(range(len(ransomDF_indices))):
        ransom_train_indices, ransom_test_indices = ransomDF_indices[run]
        non_ransom_train_indices, non_ransom_test_indices = nonransomDF_indices[run]
        
        ransom_train, ransom_test = \
            ransomDF.iloc[ransom_train_indices], ransomDF.iloc[ransom_test_indices]
        #print(f"{run}. Non Ransom", ransom_train.shape, ransom_test.shape)
        
        nonransom_train, nonransom_test = \
            nonransomDF.iloc[non_ransom_train_indices], nonransomDF.iloc[non_ransom_test_indices]
        #print(f"{run}. Non Ransom", nonransom_train.shape, nonransom_test.shape)
        
        traindf = pd.concat([ransom_train, nonransom_train]).sample(frac=1).reset_index(drop=True)
        
        resultdf = pd.concat([nonransom_test, ransom_test])
        resultdf = resultdf[["extended.base_filename", "is_ransomware", "is_encrypted"]]
        
        for description, column_list in tqdm.tqdm(get_columns(ransomDF).items()):
            nonransom_pred, ransom_pred = run_once(traindf, nonransom_test, ransom_test, column_list)
            resultdf[f"{description}: pred"] = np.append(nonransom_pred, ransom_pred)
        
        returndf.append(resultdf)
    
    return pd.concat(returndf)

In [6]:
def format_result(result):
    dfdict = {
        "run": [],
        "accuracy": [],
        "f1": [],
        "precision": [],
        "recall": []
    }

    for c in result.columns:
        if c.lower().endswith("pred"):
            desc = c.split(":")[0]
            dfdict["run"].append(desc)

            x = result
            dfdict["accuracy"].append(accuracy_score(x["is_encrypted"], x[c]))
            dfdict["f1"].append(f1_score(x["is_encrypted"], x[c]))
            dfdict["precision"].append(accuracy_score(x["is_encrypted"], x[c]))
            dfdict["recall"].append(accuracy_score(x["is_encrypted"], x[c]))

    summarydf = pd.DataFrame(dfdict)
    return summarydf

In [7]:
rfc_gen = lambda: RandomForestClassifier(n_jobs=10, random_state=42)
rdf, nrdf = load_datasets_once()
result_rfc = run_cross_validated(nrdf.copy(), rdf.copy(), rfc_gen,  n_splits=5)
rfc_summary = format_result(result_rfc)

result_rfc.to_csv("rfc_result.csv")
rfc_summary.to_csv("rfc_summary.csv")


print(rfc_summary.to_latex())
rfc_summary


Loading n1.sampled.encrypted.v1.csv.gz
Loading n1.sampled.pt.base32.csv.gz
Loading n1.sampled.encrypted.v1.base32.csv.gz
Loading n1.sampled.ransomware.encrypted.csv.gz
Loading n1.sampled.encrypted.v2.base32.csv.gz
Loading n1.sampled.encrypted.v2.csv.gz
Loading n1.sampled.pt.csv.gz


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

\begin{tabular}{llrrrr}
\toprule
{} &                              run &  accuracy &        f1 &  precision &    recall \\
\midrule
0 &                    Baseline only &  0.874125 &  0.906194 &   0.874125 &  0.874125 \\
1 &                    Advanced only &  0.850181 &  0.887752 &   0.850181 &  0.850181 \\
2 &                     Fourier only &  0.846272 &  0.884368 &   0.846272 &  0.846272 \\
3 &             Baseline and Fourier &  0.910490 &  0.934065 &   0.910490 &  0.910490 \\
4 &             Advanced and Fourier &  0.908040 &  0.931877 &   0.908040 &  0.908040 \\
5 &            Baseline and Advanced &  0.884732 &  0.914135 &   0.884732 &  0.884732 \\
6 &  Baseline, Advanced, and Fourier &  0.915074 &  0.937274 &   0.915074 &  0.915074 \\
\bottomrule
\end{tabular}



Unnamed: 0,run,accuracy,f1,precision,recall
0,Baseline only,0.874125,0.906194,0.874125,0.874125
1,Advanced only,0.850181,0.887752,0.850181,0.850181
2,Fourier only,0.846272,0.884368,0.846272,0.846272
3,Baseline and Fourier,0.91049,0.934065,0.91049,0.91049
4,Advanced and Fourier,0.90804,0.931877,0.90804,0.90804
5,Baseline and Advanced,0.884732,0.914135,0.884732,0.884732
6,"Baseline, Advanced, and Fourier",0.915074,0.937274,0.915074,0.915074


In [8]:
lr_gen = lambda: LogisticRegression(n_jobs=8, \
                        solver='saga', \
                        random_state=42, \
                        max_iter=3000, \
                        multi_class='ovr')
lr_result = run_cross_validated(nrdf.copy(), rdf.copy(), lr_gen,  n_splits=5)
lr_summary = format_result(lr_result)

lr_result.to_csv("lr_result.csv")
lr_summary.to_csv("lr_summary.csv")

print(lr_summary.to_latex())
lr_summary

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]



  0%|          | 0/7 [00:00<?, ?it/s]



\begin{tabular}{llrrrr}
\toprule
{} &                              run &  accuracy &        f1 &  precision &    recall \\
\midrule
0 &                    Baseline only &  0.671237 &  0.751895 &   0.671237 &  0.671237 \\
1 &                    Advanced only &  0.707266 &  0.768954 &   0.707266 &  0.707266 \\
2 &                     Fourier only &  0.703618 &  0.781251 &   0.703618 &  0.703618 \\
3 &             Baseline and Fourier &  0.728890 &  0.801076 &   0.728890 &  0.728890 \\
4 &             Advanced and Fourier &  0.740170 &  0.797911 &   0.740170 &  0.740170 \\
5 &            Baseline and Advanced &  0.710895 &  0.772964 &   0.710895 &  0.710895 \\
6 &  Baseline, Advanced, and Fourier &  0.738879 &  0.797552 &   0.738879 &  0.738879 \\
\bottomrule
\end{tabular}



Unnamed: 0,run,accuracy,f1,precision,recall
0,Baseline only,0.671237,0.751895,0.671237,0.671237
1,Advanced only,0.707266,0.768954,0.707266,0.707266
2,Fourier only,0.703618,0.781251,0.703618,0.703618
3,Baseline and Fourier,0.72889,0.801076,0.72889,0.72889
4,Advanced and Fourier,0.74017,0.797911,0.74017,0.74017
5,Baseline and Advanced,0.710895,0.772964,0.710895,0.710895
6,"Baseline, Advanced, and Fourier",0.738879,0.797552,0.738879,0.738879


In [9]:
lr_summary

Unnamed: 0,run,accuracy,f1,precision,recall
0,Baseline only,0.671237,0.751895,0.671237,0.671237
1,Advanced only,0.707266,0.768954,0.707266,0.707266
2,Fourier only,0.703618,0.781251,0.703618,0.703618
3,Baseline and Fourier,0.72889,0.801076,0.72889,0.72889
4,Advanced and Fourier,0.74017,0.797911,0.74017,0.74017
5,Baseline and Advanced,0.710895,0.772964,0.710895,0.710895
6,"Baseline, Advanced, and Fourier",0.738879,0.797552,0.738879,0.738879


In [10]:
format_result(lr_result)

Unnamed: 0,run,accuracy,f1,precision,recall
0,Baseline only,0.671237,0.751895,0.671237,0.671237
1,Advanced only,0.707266,0.768954,0.707266,0.707266
2,Fourier only,0.703618,0.781251,0.703618,0.703618
3,Baseline and Fourier,0.72889,0.801076,0.72889,0.72889
4,Advanced and Fourier,0.74017,0.797911,0.74017,0.74017
5,Baseline and Advanced,0.710895,0.772964,0.710895,0.710895
6,"Baseline, Advanced, and Fourier",0.738879,0.797552,0.738879,0.738879


In [11]:
!pwd


/Users/phantom/mscwork/processed_save/csvwork
