In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
#from imblearn.pipeline import Pipeline
#from imblearn.over_sampling import SVMSMOTE as SMOTE

import numpy as np
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score
import imblearn

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool
import pickle

# Default matplotlib figure size (width, height) in inches.
plt.rcParams["figure.figsize"] = (20,20)

# NOTE(review): DEBUG is not read by any of the visible cells — confirm it is still needed.
DEBUG = False
# Parallel worker count; same value as the n_jobs given to the classifiers in this notebook.
N_JOBS = 8


# NOTE(review): no visible cell reads USE_SMOTE, and both the SMOTE import and
# the 'smote' pipeline step are commented out — confirm whether this flag is stale.
USE_SMOTE = True

# Gates the cells that print the accuracy tables.
PRINT_ACCURACY = True

# Seed the global NumPy and stdlib random generators.
np.random.seed(42)
random.seed(42)

!pwd

/Users/phantom/tempwork


In [2]:

def get_columns(thisdf):
    """Group the feature columns of ``thisdf`` into named feature sets.

    Columns are classified purely by substring matching on their names:

    * baseline: name starts with ``'baseline'`` and contains none of
      ``head``, ``tail``, ``filesize``, ``begin``, ``end``.
    * advanced: name contains ``'advanced'`` and none of ``begin``, ``end``,
      ``head``, ``tail``, ``start``.
    * Fourier: name contains both ``'fourier'`` and ``'1byte'``, does not
      contain ``'value'``, and contains none of ``begin``, ``end``, ``head``,
      ``tail``, ``start``.

    Parameters
    ----------
    thisdf : object exposing ``columns``
        An iterable of string column names is read from ``thisdf.columns``
        (e.g. a pandas DataFrame).

    Returns
    -------
    dict of str -> list of str
        Seven feature sets keyed by a display name. Combined sets contain each
        column once. The order of every list is deterministic: columns appear
        in the order they occur in ``thisdf.columns`` within each group, and
        groups are concatenated baseline, then advanced, then Fourier.
    """
    def _without(names, *banned):
        """Keep the names that contain none of the ``banned`` substrings."""
        return [c for c in names if not any(b in c for b in banned)]

    def _union(*groups):
        """Concatenate ``groups`` and drop repeats, keeping first-seen order."""
        return list(dict.fromkeys(c for group in groups for c in group))

    all_columns = list(thisdf.columns)
    positional = ("begin", "end", "head", "tail")

    baseline_columns = _without(
        [c for c in all_columns if c.startswith("baseline")],
        "filesize", *positional)
    advanced_columns_only = _union(_without(
        [c for c in all_columns if "advanced" in c],
        "start", *positional))
    fourier_columns_only = _union(_without(
        [c for c in all_columns if "fourier" in c and "1byte" in c],
        "value", "start", *positional))

    # dict.fromkeys-based unions replace list(set(...)): iteration order of a
    # set of strings can change between interpreter runs, which would silently
    # change the feature order handed to the estimators.
    return {
        "Baseline only": baseline_columns,
        "Advanced only": advanced_columns_only,
        "Fourier only": fourier_columns_only,
        "Baseline and advanced": _union(baseline_columns, advanced_columns_only),
        "Baseline and Fourier": _union(baseline_columns, fourier_columns_only),
        "Advanced and Fourier": _union(advanced_columns_only, fourier_columns_only),
        "Baseline, advanced, and Fourier": _union(
            baseline_columns, advanced_columns_only, fourier_columns_only),
    }



In [3]:
!ls

NapierOneRansomware_1.1.ipynb         s9.n1.base.password.csv.gz
exclude_one_ransomware_results.pickle s9.n1.ransomware.DHARMA.csv.gz
n1.expanded.pyencrypted_v1.csv.gz     s9.n1.ransomware.MAZE.csv.gz
n1.expanded.pyencrypted_v2.csv.gz     s9.n1.ransomware.NETWALKER.csv.gz
n1.expanded.pyencrypted_v3.csv.gz     s9.n1.ransomware.NOTPETYA.csv.gz
s9.n1.base.archive.csv.gz             s9.n1.ransomware.PHOBOS.csv.gz
s9.n1.base.csv.gz                     s9.n1.ransomware.RYUK.csv.gz
s9.n1.base.encrypted.csv.gz           s9.n1.ransomware.SODINOKIBI.csv.gz


In [4]:
def rfc_gen_function():
    """Build a fresh, unfitted random-forest classifier.

    Returns
    -------
    RandomForestClassifier
        Constructed with ``n_jobs=N_JOBS`` and ``random_state=42``.
    """
    # Use the notebook-level N_JOBS setting instead of repeating the literal 8.
    return RandomForestClassifier(n_jobs=N_JOBS, random_state=42)


def lr_gen_function():
    """Build a fresh, unfitted logistic-regression classifier.

    Returns
    -------
    LogisticRegression
        Constructed with the ``saga`` solver, ``max_iter=5000``,
        ``multi_class='ovr'``, ``random_state=42`` and ``n_jobs=N_JOBS``.
    """
    # Use the notebook-level N_JOBS setting instead of repeating the literal 8.
    # NOTE(review): `multi_class` is reported as deprecated in recent
    # scikit-learn releases — confirm against the installed version before upgrading.
    return LogisticRegression(
        n_jobs=N_JOBS,
        solver='saga',
        random_state=42,
        max_iter=5000,
        multi_class='ovr',
    )

def reformat_df(df, metric_score):
    """Pivot a long results table into a feature-set x ransomware grid.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``ransomware``, ``feature_set`` and the
        column named by ``metric_score``.
    metric_score : str
        Name of the metric column whose values fill the grid.

    Returns
    -------
    DataFrame
        Indexed by ``feature_set`` with one column per ``ransomware`` value.
    """
    wanted = ["ransomware", "feature_set", metric_score]
    long_form = df.loc[:, wanted]
    wide = long_form.pivot(
        index="feature_set",
        columns="ransomware",
        values=metric_score,
    )
    return wide

@lru_cache(maxsize=5)
def get_plaintext():
    """Load the four ``s9.n1.base*`` CSV files and return a (train, test) pair.

    The files are read from the current working directory and concatenated.
    A fraction ``1400 / n_rows`` of the pooled rows is drawn with
    ``random_state=42``, the index is reset, and every drawn row is labelled
    ``encrypted = 0``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The first 1200 rows and the last 200 rows of the labelled sample.

    Notes
    -----
    The result is memoised by ``lru_cache``, so repeated calls return the
    same objects.
    """
    source_files = (
        "s9.n1.base.csv.gz",
        "s9.n1.base.archive.csv.gz",
        "s9.n1.base.encrypted.csv.gz",
        "s9.n1.base.password.csv.gz",
    )
    pooled = pd.concat([pd.read_csv(path) for path in source_files]).copy()

    # Express the 1400-row target as a fraction of the pooled row count.
    keep_fraction = 1400.0 / pooled.shape[0]
    sampled = pooled.sample(frac=keep_fraction, random_state=42)
    sampled = sampled.reset_index(drop=True)
    sampled["encrypted"] = 0

    train_part = sampled.head(1200)
    test_part = sampled.tail(200)
    return train_part, test_part

@lru_cache(maxsize=5)
def get_extra():
    """Load the three ``n1.expanded.pyencrypted_v*`` CSV files as one sample.

    The files are read from the current working directory and concatenated.
    A fraction ``800 / n_rows`` of the pooled rows is drawn with
    ``random_state=42``, the index is reset, and every drawn row is labelled
    ``encrypted = 1``.

    Returns
    -------
    DataFrame
        The labelled sample. Memoised by ``lru_cache``, so repeated calls
        return the same object.
    """
    loaded = []
    for version in ("v1", "v2", "v3"):
        loaded.append(pd.read_csv(f"n1.expanded.pyencrypted_{version}.csv.gz"))
    pooled = pd.concat(loaded).copy()

    # Express the 800-row target as a fraction of the pooled row count.
    keep_fraction = 800.0 / pooled.shape[0]
    sampled = pooled.sample(frac=keep_fraction, random_state=42)
    sampled = sampled.reset_index(drop=True)
    sampled["encrypted"] = 1
    return sampled

# Ransomware family names. get_ransomware() reads one file
# s9.n1.ransomware.<NAME>.csv.gz per entry, and compare_feature_sets() uses
# each entry in turn as the held-out test family.
ransomware_names = ["DHARMA", "MAZE", "NETWALKER", "NOTPETYA", "PHOBOS", "RYUK", "SODINOKIBI"]

@lru_cache(maxsize=5)
def get_ransomware():
    """Load one DataFrame per entry of ``ransomware_names``.

    Each family is read from ``s9.n1.ransomware.<NAME>.csv.gz`` in the current
    working directory and every row is labelled ``encrypted = 1``.

    Returns
    -------
    dict of str -> DataFrame
        Keyed by family name, in the order of ``ransomware_names``. Memoised
        by ``lru_cache``, so repeated calls return the same dict.
    """
    def _load_family(family):
        """Read a single family's CSV and attach the positive label."""
        frame = pd.read_csv(f"s9.n1.ransomware.{family}.csv.gz")
        frame["encrypted"] = 1
        return frame

    return {family: _load_family(family) for family in ransomware_names}

@lru_cache(maxsize=40)
def get_train_test(test_ransomware_name, include_extra=False):
    """Build the leave-one-family-out train and test frames.

    Parameters
    ----------
    test_ransomware_name : str
        Family held out for testing; must be a key of ``get_ransomware()``.
    include_extra : bool, default False
        When true, the frame returned by ``get_extra()`` is appended to the
        training data. The default reproduces the previous behaviour, in
        which this step was disabled by a hard-coded ``if False``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        ``traindf``: every other family in ``ransomware_names`` plus the
        plaintext training split (and optionally the extra set).
        ``testdf``: the held-out family, shuffled with ``random_state=42``,
        followed by the plaintext test split.

    Raises
    ------
    KeyError
        If ``test_ransomware_name`` is not a loaded family.

    Notes
    -----
    Results are memoised by ``lru_cache``; the returned frames are shared
    between callers, so they should not be modified in place.
    """
    train_pt, test_pt = get_plaintext()
    families = get_ransomware()

    # frac=1.0 keeps every row and only shuffles the held-out family.
    test_ransomware = (
        families[test_ransomware_name]
        .sample(frac=1.0, random_state=42)
        .reset_index(drop=True)
    )

    train_parts = [families[name] for name in ransomware_names if name != test_ransomware_name]
    train_parts.append(train_pt)
    if include_extra:
        train_parts.append(get_extra())

    traindf = pd.concat(train_parts).copy()
    testdf = pd.concat([test_ransomware, test_pt]).copy()
    return traindf, testdf


def run_classifier(test_ransomware_name, classif_gen_fn):
    """Fit and score one classifier per feature set for a held-out family.

    Parameters
    ----------
    test_ransomware_name : str
        Family passed to ``get_train_test`` as the held-out test family.
    classif_gen_fn : callable
        Zero-argument factory; it is called once per feature set to obtain a
        new estimator, which becomes the final step of a pipeline after a
        ``MinMaxScaler``.

    Returns
    -------
    DataFrame
        One row per feature set returned by ``get_columns``, with columns
        ``ransomware``, ``feature_set``, ``accuracy``, ``f1_score``,
        ``precision`` and ``recall``. The three F-type metrics are computed
        with ``zero_division=0``.
    """
    result_columns = ["ransomware", "feature_set", "accuracy", "f1_score", "precision", "recall"]

    traindf, testdf = get_train_test(test_ransomware_name)
    feature_sets = get_columns(traindf)

    # The labels do not depend on the feature set, so extract them once.
    train_y = traindf["encrypted"].to_numpy().flatten()
    test_y = testdf["encrypted"].to_numpy().flatten()

    records = []
    for feature_set_name, columns in feature_sets.items():
        train_x = traindf[columns].to_numpy()
        test_x = testdf[columns].to_numpy()

        pipeline = Pipeline([
            ('scaler', MinMaxScaler()),
            ('classifier', classif_gen_fn()),
        ])
        pipeline.fit(train_x, train_y)
        y_pred = pipeline.predict(test_x)

        records.append({
            "ransomware": test_ransomware_name,
            "feature_set": feature_set_name,
            "accuracy": accuracy_score(test_y, y_pred),
            "f1_score": f1_score(test_y, y_pred, zero_division=0),
            "precision": precision_score(test_y, y_pred, zero_division=0),
            "recall": recall_score(test_y, y_pred, zero_division=0),
        })

    return pd.DataFrame(records, columns=result_columns)

def compare_feature_sets(gen_fn):
    """Run ``run_classifier`` with each family in turn as the test family.

    Parameters
    ----------
    gen_fn : callable
        Zero-argument estimator factory forwarded to ``run_classifier``.

    Returns
    -------
    DataFrame
        The per-family result frames stacked in ``ransomware_names`` order.
    """
    per_family = [run_classifier(family, gen_fn) for family in ransomware_names]
    stacked = pd.concat(per_family)
    return stacked.copy()

# Run the leave-one-family-out comparison once per classifier factory:
# random forest first, then logistic regression.
rfc_comparison = compare_feature_sets(rfc_gen_function)
lr_comparison = compare_feature_sets(lr_gen_function)

In [14]:
# Logistic regression, F1: print the feature-set x family grid as LaTeX, then
# display the same grid with a per-row mean, sorted by that mean (ascending).
df = reformat_df(lr_comparison.round(3), "f1_score")
print(df.to_latex())
df["mean"] = df.mean(axis=1)
df.sort_values(by="mean").round(3)

\begin{tabular}{lrrrrrrr}
\toprule
ransomware &  DHARMA &   MAZE &  NETWALKER &  NOTPETYA &  PHOBOS &   RYUK &  SODINOKIBI \\
feature\_set                     &         &        &            &           &         &        &             \\
\midrule
Advanced and Fourier            &   0.767 &  0.880 &      0.091 &     0.691 &   0.773 &  0.885 &       0.870 \\
Advanced only                   &   0.604 &  0.690 &      0.028 &     0.564 &   0.626 &  0.710 &       0.684 \\
Baseline and Fourier            &   0.686 &  0.782 &      0.123 &     0.608 &   0.686 &  0.782 &       0.782 \\
Baseline and advanced           &   0.628 &  0.771 &      0.043 &     0.677 &   0.677 &  0.777 &       0.788 \\
Baseline only                   &   0.000 &  0.000 &      0.000 &     0.000 &   0.000 &  0.000 &       0.000 \\
Baseline, advanced, and Fourier &   0.767 &  0.876 &      0.091 &     0.701 &   0.773 &  0.881 &       0.866 \\
Fourier only                    &   0.683 &  0.798 &      0.126 &     0.598 &   

ransomware,DHARMA,MAZE,NETWALKER,NOTPETYA,PHOBOS,RYUK,SODINOKIBI,mean
feature_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Baseline only,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Advanced only,0.604,0.69,0.028,0.564,0.626,0.71,0.684,0.558
Baseline and advanced,0.628,0.771,0.043,0.677,0.677,0.777,0.788,0.623
Baseline and Fourier,0.686,0.782,0.123,0.608,0.686,0.782,0.782,0.636
Fourier only,0.683,0.798,0.126,0.598,0.676,0.793,0.798,0.639
"Baseline, advanced, and Fourier",0.767,0.876,0.091,0.701,0.773,0.881,0.866,0.708
Advanced and Fourier,0.767,0.88,0.091,0.691,0.773,0.885,0.87,0.708


In [19]:
# Logistic regression, accuracy: only emitted when PRINT_ACCURACY is set.
if PRINT_ACCURACY:
    df = reformat_df(lr_comparison.round(3), "accuracy")
    print(df.to_latex())
    df["mean"] = df.mean(axis=1)
    # Two blank lines separate the raw grid from the mean-sorted one.
    print("\n")
    print(df.sort_values(by="mean").round(3).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
ransomware &  DHARMA &   MAZE &  NETWALKER &  NOTPETYA &  PHOBOS &   RYUK &  SODINOKIBI \\
feature\_set                     &         &        &            &           &         &        &             \\
\midrule
Advanced and Fourier            &   0.840 &  0.910 &      0.600 &     0.800 &   0.843 &  0.913 &       0.903 \\
Advanced only                   &   0.747 &  0.797 &      0.540 &     0.727 &   0.757 &  0.807 &       0.793 \\
Baseline and Fourier            &   0.783 &  0.837 &      0.573 &     0.747 &   0.783 &  0.837 &       0.837 \\
Baseline and advanced           &   0.763 &  0.843 &      0.553 &     0.787 &   0.787 &  0.847 &       0.853 \\
Baseline only                   &   0.667 &  0.667 &      0.667 &     0.667 &   0.667 &  0.667 &       0.667 \\
Baseline, advanced, and Fourier &   0.840 &  0.907 &      0.600 &     0.807 &   0.843 &  0.910 &       0.900 \\
Fourier only                    &   0.783 &  0.850 &      0.583 &     0.740 &   

In [16]:
# Random forest, F1: print the feature-set x family grid as LaTeX, then
# display the same grid with a per-row mean, sorted by that mean (ascending).
df = reformat_df(rfc_comparison.round(3), "f1_score")
print(df.to_latex())
df["mean"] = df.mean(axis=1)
df.sort_values(by="mean").round(3)

\begin{tabular}{lrrrrrrr}
\toprule
ransomware &  DHARMA &   MAZE &  NETWALKER &  NOTPETYA &  PHOBOS &   RYUK &  SODINOKIBI \\
feature\_set                     &         &        &            &           &         &        &             \\
\midrule
Advanced and Fourier            &   0.868 &  0.876 &      0.083 &     0.746 &   0.912 &  0.895 &       0.885 \\
Advanced only                   &   0.835 &  0.853 &      0.033 &     0.738 &   0.893 &  0.856 &       0.867 \\
Baseline and Fourier            &   0.884 &  0.877 &      0.066 &     0.731 &   0.889 &  0.867 &       0.889 \\
Baseline and advanced           &   0.875 &  0.870 &      0.098 &     0.768 &   0.892 &  0.900 &       0.892 \\
Baseline only                   &   0.877 &  0.852 &      0.128 &     0.768 &   0.884 &  0.863 &       0.869 \\
Baseline, advanced, and Fourier &   0.870 &  0.876 &      0.083 &     0.768 &   0.921 &  0.889 &       0.886 \\
Fourier only                    &   0.849 &  0.848 &      0.063 &     0.706 &   

ransomware,DHARMA,MAZE,NETWALKER,NOTPETYA,PHOBOS,RYUK,SODINOKIBI,mean
feature_set,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Fourier only,0.849,0.848,0.063,0.706,0.847,0.811,0.822,0.707
Advanced only,0.835,0.853,0.033,0.738,0.893,0.856,0.867,0.725
Baseline and Fourier,0.884,0.877,0.066,0.731,0.889,0.867,0.889,0.743
Baseline only,0.877,0.852,0.128,0.768,0.884,0.863,0.869,0.749
Advanced and Fourier,0.868,0.876,0.083,0.746,0.912,0.895,0.885,0.752
"Baseline, advanced, and Fourier",0.87,0.876,0.083,0.768,0.921,0.889,0.886,0.756
Baseline and advanced,0.875,0.87,0.098,0.768,0.892,0.9,0.892,0.756


In [20]:
# Random forest, accuracy: only emitted when PRINT_ACCURACY is set.
if PRINT_ACCURACY:
    df = reformat_df(rfc_comparison.round(3), "accuracy")
    print(df.to_latex())
    df["mean"] = df.mean(axis=1)
    # Two blank lines separate the raw grid from the mean-sorted one.
    print("\n")
    print(df.sort_values(by="mean").round(3).to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
ransomware &  DHARMA &   MAZE &  NETWALKER &  NOTPETYA &  PHOBOS &   RYUK &  SODINOKIBI \\
feature\_set                     &         &        &            &           &         &        &             \\
\midrule
Advanced and Fourier            &   0.910 &  0.913 &      0.633 &     0.843 &   0.937 &  0.927 &       0.920 \\
Advanced only                   &   0.887 &  0.897 &      0.610 &     0.837 &   0.923 &  0.900 &       0.907 \\
Baseline and Fourier            &   0.917 &  0.913 &      0.623 &     0.833 &   0.920 &  0.907 &       0.920 \\
Baseline and advanced           &   0.913 &  0.910 &      0.630 &     0.853 &   0.923 &  0.930 &       0.923 \\
Baseline only                   &   0.913 &  0.897 &      0.637 &     0.853 &   0.917 &  0.903 &       0.907 \\
Baseline, advanced, and Fourier &   0.910 &  0.913 &      0.633 &     0.853 &   0.943 &  0.923 &       0.920 \\
Fourier only                    &   0.890 &  0.890 &      0.607 &     0.817 &   

In [18]:
# Bundle both long-format result tables under a readable classifier name.
results = {
    "logistic regression": lr_comparison,
    "random forest": rfc_comparison
}

# Write the bundle to the working directory with the highest pickle protocol
# available; an existing file of the same name is overwritten.
with open("exclude_one_ransomware_results.pickle", "wb") as f:
    pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)

In [21]:
!pwd


/Users/phantom/tempwork
