In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score
import imblearn

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

DEBUG = False
N_JOBS = 8

USE_SMOTE = True

!pwd

/Users/phantom/mscwork/processed_save/csv


In [2]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }



In [3]:
def read_data():
    df = pd.read_csv("n1.encrypted.expanded.ransomware.csv.gz")
    df["is_encrypted"] = 1
    print(df.shape)
    df2 = pd.read_csv("n1.plaintext.csv.gz")
    df2["is_encrypted"] = 0
    df3 = pd.read_csv("n1.zip.expanded.plaintext.csv.gz")
    df3["is_encrypted"] = 0

    df2 = pd.concat([df2, df3])
    if not USE_SMOTE:
        df2 = df2.sample(frac=0.2).reset_index(drop=True)
    print(df2.shape)
    df = pd.concat([df, df2])

    df = df.sample(frac=1).reset_index(drop=True)
    featuresets = get_columns(df)

    y = df["is_encrypted"]
    X = df[[c for c in df.columns if "is_encrypted" != c]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    
    return df, X_train, X_test, y_train, y_test

In [4]:
def run_algorithms(X_train, X_test, y_train, y_test, clf):
    result_df = X_test.copy()
    result_df["y_true"] = y_test

    np.random.seed(42)
    random.seed(42)
    
    featuresets = get_columns(X_train)
    for key, value in featuresets.items():
        XX_train = X_train[value]
        XX_test = X_test[value]
        
        if USE_SMOTE:
            estimators = [\
                  ('smote', imblearn.over_sampling.SMOTE()),
                  ('std,', MinMaxScaler()), \
                  ("classifier", clf())]
        else:
            estimators = [\
                  ('std,', MinMaxScaler()), \
                  ("classifier", clf())]
            
        if USE_SMOTE:
            pipeline = imblearn.pipeline.Pipeline(estimators)
        else:
            pipeline = Pipeline(estimators)
            
        pipeline.fit(XX_train, y_train)
        y_pred = pipeline.predict(XX_test)
        result_df[key] = y_pred
        
        print(key, f1_score(y_test, y_pred))
    return result_df



In [5]:
def classify_and_compare():
    df, X_train, X_test, y_train, y_test = read_data()
    
    print("Logistic Regression")
    print("------------------------")
    lr_clf = lambda: LogisticRegression(\
                n_jobs=8, \
                solver='saga', \
                random_state=42, \
                max_iter=1000, \
                multi_class='ovr')
    lr_result = run_algorithms(X_train, X_test, y_train, y_test, lr_clf)
    
    print("Random Forest")
    print("------------------------")
    rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
    rfc_result = run_algorithms(X_train, X_test, y_train, y_test, rfc_clf)
    return lr_result, rfc_result

In [6]:
def format_results_as_dataframe0(df):
    dictdf = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    
    featuresets = get_columns(df)
    
    for feature in featuresets.keys():
        if feature in df.keys():
            y_true = df["y_true"]
            y_pred = df[str(feature)]
            dictdf["FeatureSet"].append(feature)
            dictdf["Accuracy"].append(accuracy_score(y_true, y_pred))
            dictdf["F1"].append(f1_score(y_true, y_pred))
            dictdf["Precision"].append(f1_score(y_true, y_pred))
            dictdf["Recall"].append(f1_score(y_true, y_pred))

    return pd.DataFrame(dictdf)

def format_results_as_dataframe(df, include=None, exclude=None):
    if include and not (isinstance(include, list)):
        include = [include]
    if exclude and not isinstance(exclude, list):
        exclude = [exclude]
    if include:
        include = [x.lower() for x in include]
    if exclude:
        exclude = [x.lower() for x in exclude]
        
    def is_interesting(item):
        if exclude:
            for x in exclude:
                if x.lower() in item.lower():
                    return False
        if include is None or 0 == len(include):
            return True
        if include:
            for x in include:
                if x.lower() in item.lower():
                    return True
        if (include is None or 0 == len(include)) \
            and (exclude is None or 0 == len(exclude)):
            return True
        return False

    dictdf = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    
    featuresets = get_columns(df)
    df = df[df["extended.base_filename"].map(is_interesting)]
    
    for feature in featuresets.keys():
        if feature in df.keys():
            y_true = df["y_true"]
            y_pred = df[str(feature)]
            dictdf["FeatureSet"].append(feature)
            dictdf["Accuracy"].append(accuracy_score(y_true, y_pred))
            dictdf["F1"].append(f1_score(y_true, y_pred))
            dictdf["Precision"].append(f1_score(y_true, y_pred))
            dictdf["Recall"].append(f1_score(y_true, y_pred))

    return pd.DataFrame(dictdf)

# Using a subset of data (no SMOTE)

In [7]:
USE_SMOTE = False
lr_result, rfc_result = classify_and_compare()

(707, 677)
(1818, 677)
Logistic Regression
------------------------
Baseline only 0.0
Advanced only 0.0
Fourier only 0.48066298342541436
Baseline and Fourier 0.5104166666666666
Advanced and Fourier 0.6907449209932279
Baseline and Advanced 0.0
Baseline, Advanced, and Fourier 0.6848072562358277
Random Forest
------------------------
Baseline only 0.7982261640798227
Advanced only 0.731934731934732
Fourier only 0.7072599531615924
Baseline and Fourier 0.811659192825112
Advanced and Fourier 0.7813953488372094
Baseline and Advanced 0.8009153318077803
Baseline, Advanced, and Fourier 0.8137931034482758


In [8]:
results = format_results_as_dataframe0(lr_result)

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples:")
print("--------------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples:
--------------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.723022 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.723022 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.774580 &  0.480663 &   0.480663 &  0.480663 \\
3 &             Baseline and Fourier &  0.774580 &  0.510417 &   0.510417 &  0.510417 \\
4 &             Advanced and Fourier &  0.835731 &  0.690745 &   0.690745 &  0.690745 \\
5 &            Baseline and Advanced &  0.720624 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.833333 &  0.684807 &   0.684807 &  0.684807 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.723022,0.0,0.0,0.0
1,Advanced only,0.723022,0.0,0.0,0.0
5,Baseline and Advanced,0.720624,0.0,0.0,0.0
2,Fourier only,0.77458,0.480663,0.480663,0.480663
3,Baseline and Fourier,0.77458,0.510417,0.510417,0.510417
6,"Baseline, Advanced, and Fourier",0.833333,0.684807,0.684807,0.684807
4,Advanced and Fourier,0.835731,0.690745,0.690745,0.690745


In [9]:
results = format_results_as_dataframe0(rfc_result)

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples:")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples:
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.890887 &  0.798226 &   0.798226 &  0.798226 \\
1 &                    Advanced only &  0.862110 &  0.731935 &   0.731935 &  0.731935 \\
2 &                     Fourier only &  0.850120 &  0.707260 &   0.707260 &  0.707260 \\
3 &             Baseline and Fourier &  0.899281 &  0.811659 &   0.811659 &  0.811659 \\
4 &             Advanced and Fourier &  0.887290 &  0.781395 &   0.781395 &  0.781395 \\
5 &            Baseline and Advanced &  0.895683 &  0.800915 &   0.800915 &  0.800915 \\
6 &  Baseline, Advanced, and Fourier &  0.902878 &  0.813793 &   0.813793 &  0.813793 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.85012,0.70726,0.70726,0.70726
1,Advanced only,0.86211,0.731935,0.731935,0.731935
4,Advanced and Fourier,0.88729,0.781395,0.781395,0.781395
0,Baseline only,0.890887,0.798226,0.798226,0.798226
5,Baseline and Advanced,0.895683,0.800915,0.800915,0.800915
3,Baseline and Fourier,0.899281,0.811659,0.811659,0.811659
6,"Baseline, Advanced, and Fourier",0.902878,0.813793,0.813793,0.813793


In [10]:
results = format_results_as_dataframe(lr_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.648585 &  0.538700 &   0.538700 &  0.538700 \\
3 &             Baseline and Fourier &  0.669811 &  0.583333 &   0.583333 &  0.583333 \\
4 &             Advanced and Fourier &  0.813679 &  0.794805 &   0.794805 &  0.794805 \\
5 &            Baseline and Advanced &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.808962 &  0.788512 &   0.788512 &  0.788512 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.455189,0.0,0.0,0.0
1,Advanced only,0.455189,0.0,0.0,0.0
5,Baseline and Advanced,0.455189,0.0,0.0,0.0
2,Fourier only,0.648585,0.5387,0.5387,0.5387
3,Baseline and Fourier,0.669811,0.583333,0.583333,0.583333
6,"Baseline, Advanced, and Fourier",0.808962,0.788512,0.788512,0.788512
4,Advanced and Fourier,0.813679,0.794805,0.794805,0.794805


In [11]:
results = format_results_as_dataframe(rfc_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.865566 &  0.863309 &   0.863309 &  0.863309 \\
1 &                    Advanced only &  0.808962 &  0.794937 &   0.794937 &  0.794937 \\
2 &                     Fourier only &  0.799528 &  0.780362 &   0.780362 &  0.780362 \\
3 &             Baseline and Fourier &  0.872642 &  0.870192 &   0.870192 &  0.870192 \\
4 &             Advanced and Fourier &  0.841981 &  0.833747 &   0.833747 &  0.833747 \\
5 &            Baseline and Advanced &  0.853774 &  0.849515 &   0.849515 &  0.849515 \\
6 &  Baseline, Advanced, and Fourier &  0.867925 &  0.863415 &   0.863415 &  0.863415 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.799528,0.780362,0.780362,0.780362
1,Advanced only,0.808962,0.794937,0.794937,0.794937
4,Advanced and Fourier,0.841981,0.833747,0.833747,0.833747
5,Baseline and Advanced,0.853774,0.849515,0.849515,0.849515
0,Baseline only,0.865566,0.863309,0.863309,0.863309
6,"Baseline, Advanced, and Fourier",0.867925,0.863415,0.863415,0.863415
3,Baseline and Fourier,0.872642,0.870192,0.870192,0.870192


In [12]:
results = format_results_as_dataframe(lr_result, 
                    ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.648585 &  0.538700 &   0.538700 &  0.538700 \\
3 &             Baseline and Fourier &  0.669811 &  0.583333 &   0.583333 &  0.583333 \\
4 &             Advanced and Fourier &  0.813679 &  0.794805 &   0.794805 &  0.794805 \\
5 &            Baseline and Advanced &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.808962 &  0.788512 &   0.788512 &  0.788512 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.455189,0.0,0.0,0.0
1,Advanced only,0.455189,0.0,0.0,0.0
5,Baseline and Advanced,0.455189,0.0,0.0,0.0
2,Fourier only,0.648585,0.5387,0.5387,0.5387
3,Baseline and Fourier,0.669811,0.583333,0.583333,0.583333
6,"Baseline, Advanced, and Fourier",0.808962,0.788512,0.788512,0.788512
4,Advanced and Fourier,0.813679,0.794805,0.794805,0.794805


In [13]:
results = format_results_as_dataframe(rfc_result, 
                    ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.865566 &  0.863309 &   0.863309 &  0.863309 \\
1 &                    Advanced only &  0.808962 &  0.794937 &   0.794937 &  0.794937 \\
2 &                     Fourier only &  0.799528 &  0.780362 &   0.780362 &  0.780362 \\
3 &             Baseline and Fourier &  0.872642 &  0.870192 &   0.870192 &  0.870192 \\
4 &             Advanced and Fourier &  0.841981 &  0.833747 &   0.833747 &  0.833747 \\
5 &            Baseline and Advanced &  0.853774 &  0.849515 &   0.849515 &  0.849515 \\
6 &  Baseline, Advanced, and Fourier &  0.867925 &  0.863415 &   0.863415 &  0.863415 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.799528,0.780362,0.780362,0.780362
1,Advanced only,0.808962,0.794937,0.794937,0.794937
4,Advanced and Fourier,0.841981,0.833747,0.833747,0.833747
5,Baseline and Advanced,0.853774,0.849515,0.849515,0.849515
0,Baseline only,0.865566,0.863309,0.863309,0.863309
6,"Baseline, Advanced, and Fourier",0.867925,0.863415,0.863415,0.863415
3,Baseline and Fourier,0.872642,0.870192,0.870192,0.870192


In [14]:
results = format_results_as_dataframe(lr_result, 
                    ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.653277 &  0.514793 &   0.514793 &  0.514793 \\
3 &             Baseline and Fourier &  0.674419 &  0.560000 &   0.560000 &  0.560000 \\
4 &             Advanced and Fourier &  0.790698 &  0.755556 &   0.755556 &  0.755556 \\
5 &            Baseline and Advanced &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.786469 &  0.749380 &   0.749380 &  0.749380 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.511628,0.0,0.0,0.0
1,Advanced only,0.511628,0.0,0.0,0.0
5,Baseline and Advanced,0.511628,0.0,0.0,0.0
2,Fourier only,0.653277,0.514793,0.514793,0.514793
3,Baseline and Fourier,0.674419,0.56,0.56,0.56
6,"Baseline, Advanced, and Fourier",0.786469,0.74938,0.74938,0.74938
4,Advanced and Fourier,0.790698,0.755556,0.755556,0.755556


In [15]:
results = format_results_as_dataframe(rfc_result, 
                    ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.849894 &  0.835267 &   0.835267 &  0.835267 \\
1 &                    Advanced only &  0.807611 &  0.775309 &   0.775309 &  0.775309 \\
2 &                     Fourier only &  0.788584 &  0.751244 &   0.751244 &  0.751244 \\
3 &             Baseline and Fourier &  0.858351 &  0.843823 &   0.843823 &  0.843823 \\
4 &             Advanced and Fourier &  0.841438 &  0.817518 &   0.817518 &  0.817518 \\
5 &            Baseline and Advanced &  0.852008 &  0.833333 &   0.833333 &  0.833333 \\
6 &  Baseline, Advanced, and Fourier &  0.862579 &  0.844869 &   0.844869 &  0.844869 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.788584,0.751244,0.751244,0.751244
1,Advanced only,0.807611,0.775309,0.775309,0.775309
4,Advanced and Fourier,0.841438,0.817518,0.817518,0.817518
5,Baseline and Advanced,0.852008,0.833333,0.833333,0.833333
0,Baseline only,0.849894,0.835267,0.835267,0.835267
3,Baseline and Fourier,0.858351,0.843823,0.843823,0.843823
6,"Baseline, Advanced, and Fourier",0.862579,0.844869,0.844869,0.844869


In [16]:
lr_result.to_csv("../n1_ransomware_result_lr_nosmote.csv.gz")
rfc_result.to_csv("../n1_ransomware_result_rfc_nosmote.csv.gz")

# Using all data (SMOTE)

In [17]:
USE_SMOTE = True
lr_result2, rfc_result2 = classify_and_compare()

(707, 677)
(9091, 677)
Logistic Regression
------------------------
Baseline only 0.19443087445041526
Advanced only 0.28269085411942557
Fourier only 0.2975338106603023
Baseline and Fourier 0.2974683544303798
Advanced and Fourier 0.375
Baseline and Advanced 0.30438247011952196
Baseline, Advanced, and Fourier 0.38515546639919757
Random Forest
------------------------
Baseline only 0.5880398671096345
Advanced only 0.5818181818181818
Fourier only 0.4747847478474785
Baseline and Fourier 0.5549132947976879
Advanced and Fourier 0.6148760330578512
Baseline and Advanced 0.6195286195286196
Baseline, Advanced, and Fourier 0.6206896551724138


In [18]:
results = format_results_as_dataframe0(lr_result2)

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples:")
print("--------------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples:
--------------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.490105 &  0.194431 &   0.194431 &  0.194431 \\
1 &                    Advanced only &  0.706555 &  0.282691 &   0.282691 &  0.282691 \\
2 &                     Fourier only &  0.726964 &  0.297534 &   0.297534 &  0.297534 \\
3 &             Baseline and Fourier &  0.725417 &  0.297468 &   0.297468 &  0.297468 \\
4 &             Advanced and Fourier &  0.802103 &  0.375000 &   0.375000 &  0.375000 \\
5 &            Baseline and Advanced &  0.730056 &  0.304382 &   0.304382 &  0.304382 \\
6 &  Baseline, Advanced, and Fourier &  0.810451 &  0.385155 &   0.385155 &  0.385155 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.490105,0.194431,0.194431,0.194431
1,Advanced only,0.706555,0.282691,0.282691,0.282691
3,Baseline and Fourier,0.725417,0.297468,0.297468,0.297468
2,Fourier only,0.726964,0.297534,0.297534,0.297534
5,Baseline and Advanced,0.730056,0.304382,0.304382,0.304382
4,Advanced and Fourier,0.802103,0.375,0.375,0.375
6,"Baseline, Advanced, and Fourier",0.810451,0.385155,0.385155,0.385155


In [19]:
results = format_results_as_dataframe0(rfc_result2)

print()
print()
print("SMOTE: Random Forest with Ransomware Samples:")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples:
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.923315 &  0.588040 &   0.588040 &  0.588040 \\
1 &                    Advanced only &  0.921769 &  0.581818 &   0.581818 &  0.581818 \\
2 &                     Fourier only &  0.867965 &  0.474785 &   0.474785 &  0.474785 \\
3 &             Baseline and Fourier &  0.904762 &  0.554913 &   0.554913 &  0.554913 \\
4 &             Advanced and Fourier &  0.927953 &  0.614876 &   0.614876 &  0.614876 \\
5 &            Baseline and Advanced &  0.930118 &  0.619529 &   0.619529 &  0.619529 \\
6 &  Baseline, Advanced, and Fourier &  0.928571 &  0.620690 &   0.620690 &  0.620690 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.867965,0.474785,0.474785,0.474785
3,Baseline and Fourier,0.904762,0.554913,0.554913,0.554913
1,Advanced only,0.921769,0.581818,0.581818,0.581818
0,Baseline only,0.923315,0.58804,0.58804,0.58804
4,Advanced and Fourier,0.927953,0.614876,0.614876,0.614876
5,Baseline and Advanced,0.930118,0.619529,0.619529,0.619529
6,"Baseline, Advanced, and Fourier",0.928571,0.62069,0.62069,0.62069


In [20]:
results = format_results_as_dataframe(lr_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.648585 &  0.538700 &   0.538700 &  0.538700 \\
3 &             Baseline and Fourier &  0.669811 &  0.583333 &   0.583333 &  0.583333 \\
4 &             Advanced and Fourier &  0.813679 &  0.794805 &   0.794805 &  0.794805 \\
5 &            Baseline and Advanced &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.808962 &  0.788512 &   0.788512 &  0.788512 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.455189,0.0,0.0,0.0
1,Advanced only,0.455189,0.0,0.0,0.0
5,Baseline and Advanced,0.455189,0.0,0.0,0.0
2,Fourier only,0.648585,0.5387,0.5387,0.5387
3,Baseline and Fourier,0.669811,0.583333,0.583333,0.583333
6,"Baseline, Advanced, and Fourier",0.808962,0.788512,0.788512,0.788512
4,Advanced and Fourier,0.813679,0.794805,0.794805,0.794805


In [21]:
results = format_results_as_dataframe(rfc_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.865566 &  0.863309 &   0.863309 &  0.863309 \\
1 &                    Advanced only &  0.808962 &  0.794937 &   0.794937 &  0.794937 \\
2 &                     Fourier only &  0.799528 &  0.780362 &   0.780362 &  0.780362 \\
3 &             Baseline and Fourier &  0.872642 &  0.870192 &   0.870192 &  0.870192 \\
4 &             Advanced and Fourier &  0.841981 &  0.833747 &   0.833747 &  0.833747 \\
5 &            Baseline and Advanced &  0.853774 &  0.849515 &   0.849515 &  0.849515 \\
6 &  Baseline, Advanced, and Fourier &  0.867925 &  0.863415 &   0.863415 &  0.863415 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.799528,0.780362,0.780362,0.780362
1,Advanced only,0.808962,0.794937,0.794937,0.794937
4,Advanced and Fourier,0.841981,0.833747,0.833747,0.833747
5,Baseline and Advanced,0.853774,0.849515,0.849515,0.849515
0,Baseline only,0.865566,0.863309,0.863309,0.863309
6,"Baseline, Advanced, and Fourier",0.867925,0.863415,0.863415,0.863415
3,Baseline and Fourier,0.872642,0.870192,0.870192,0.870192


In [22]:
results = format_results_as_dataframe(lr_result, 
                ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.648585 &  0.538700 &   0.538700 &  0.538700 \\
3 &             Baseline and Fourier &  0.669811 &  0.583333 &   0.583333 &  0.583333 \\
4 &             Advanced and Fourier &  0.813679 &  0.794805 &   0.794805 &  0.794805 \\
5 &            Baseline and Advanced &  0.455189 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.808962 &  0.788512 &   0.788512 &  0.788512 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.455189,0.0,0.0,0.0
1,Advanced only,0.455189,0.0,0.0,0.0
5,Baseline and Advanced,0.455189,0.0,0.0,0.0
2,Fourier only,0.648585,0.5387,0.5387,0.5387
3,Baseline and Fourier,0.669811,0.583333,0.583333,0.583333
6,"Baseline, Advanced, and Fourier",0.808962,0.788512,0.788512,0.788512
4,Advanced and Fourier,0.813679,0.794805,0.794805,0.794805


In [23]:
results = format_results_as_dataframe(rfc_result, 
                ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.865566 &  0.863309 &   0.863309 &  0.863309 \\
1 &                    Advanced only &  0.808962 &  0.794937 &   0.794937 &  0.794937 \\
2 &                     Fourier only &  0.799528 &  0.780362 &   0.780362 &  0.780362 \\
3 &             Baseline and Fourier &  0.872642 &  0.870192 &   0.870192 &  0.870192 \\
4 &             Advanced and Fourier &  0.841981 &  0.833747 &   0.833747 &  0.833747 \\
5 &            Baseline and Advanced &  0.853774 &  0.849515 &   0.849515 &  0.849515 \\
6 &  Baseline, Advanced, and Fourier &  0.867925 &  0.863415 &   0.863415 &  0.863415 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.799528,0.780362,0.780362,0.780362
1,Advanced only,0.808962,0.794937,0.794937,0.794937
4,Advanced and Fourier,0.841981,0.833747,0.833747,0.833747
5,Baseline and Advanced,0.853774,0.849515,0.849515,0.849515
0,Baseline only,0.865566,0.863309,0.863309,0.863309
6,"Baseline, Advanced, and Fourier",0.867925,0.863415,0.863415,0.863415
3,Baseline and Fourier,0.872642,0.870192,0.870192,0.870192


In [24]:
results = format_results_as_dataframe(lr_result, 
                ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.653277 &  0.514793 &   0.514793 &  0.514793 \\
3 &             Baseline and Fourier &  0.674419 &  0.560000 &   0.560000 &  0.560000 \\
4 &             Advanced and Fourier &  0.790698 &  0.755556 &   0.755556 &  0.755556 \\
5 &            Baseline and Advanced &  0.511628 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.786469 &  0.749380 &   0.749380 &  0.749380 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.511628,0.0,0.0,0.0
1,Advanced only,0.511628,0.0,0.0,0.0
5,Baseline and Advanced,0.511628,0.0,0.0,0.0
2,Fourier only,0.653277,0.514793,0.514793,0.514793
3,Baseline and Fourier,0.674419,0.56,0.56,0.56
6,"Baseline, Advanced, and Fourier",0.786469,0.74938,0.74938,0.74938
4,Advanced and Fourier,0.790698,0.755556,0.755556,0.755556


In [25]:
results = format_results_as_dataframe(rfc_result, 
                ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.849894 &  0.835267 &   0.835267 &  0.835267 \\
1 &                    Advanced only &  0.807611 &  0.775309 &   0.775309 &  0.775309 \\
2 &                     Fourier only &  0.788584 &  0.751244 &   0.751244 &  0.751244 \\
3 &             Baseline and Fourier &  0.858351 &  0.843823 &   0.843823 &  0.843823 \\
4 &             Advanced and Fourier &  0.841438 &  0.817518 &   0.817518 &  0.817518 \\
5 &            Baseline and Advanced &  0.852008 &  0.833333 &   0.833333 &  0.833333 \\
6 &  Baseline, Advanced, and Fourier &  0.862579 &  0.844869 &   0.844869 &  0.844869 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.788584,0.751244,0.751244,0.751244
1,Advanced only,0.807611,0.775309,0.775309,0.775309
4,Advanced and Fourier,0.841438,0.817518,0.817518,0.817518
5,Baseline and Advanced,0.852008,0.833333,0.833333,0.833333
0,Baseline only,0.849894,0.835267,0.835267,0.835267
3,Baseline and Fourier,0.858351,0.843823,0.843823,0.843823
6,"Baseline, Advanced, and Fourier",0.862579,0.844869,0.844869,0.844869


In [26]:
lr_result.to_csv("../n1_ransomware_result_lr_smote.csv.gz")
rfc_result.to_csv("../n1_ransomware_result_rfc_smote.csv.gz")

In [27]:
!pwd

/Users/phantom/mscwork/processed_save/csv
