In [1]:
import pandas as pd
import numpy as np
import scipy
from scipy import stats

from functools import lru_cache

import gc

import sklearn
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
import numpy as np
import random

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, accuracy_score, recall_score
import imblearn

import matplotlib.pyplot as plt
import glob

import tqdm
from multiprocessing.pool import ThreadPool, Pool

plt.rcParams["figure.figsize"] = (20,20)

DEBUG = False
N_JOBS = 8

USE_SMOTE = True

np.random.seed(42)
random.seed(42)

!pwd

/Users/phantom/mscwork/processed_save/csv


In [2]:
def get_columns(thisdf):
    baseline_columns = [c for c in thisdf.columns if c.startswith('baseline') and "head" not in c and "tail" not in c]
    baseline_columns = [c for c in baseline_columns if "filesize" not in c]
    baseline_columns = [c for c in baseline_columns if "begin" not in c and "end" not in c]

    advanced_columns = [c for c in thisdf.columns if "advanced" in c]
    advanced_columns = [c for c in advanced_columns if "begin" not in c and "end" not in c]
    advanced_columns = [c for c in advanced_columns if "head" not in c and "tail" not in c]
    advanced_columns = [c for c in advanced_columns if "start" not in c]
    advanced_columns_only = list(set(advanced_columns))
    advanced_columns = list(set(advanced_columns + baseline_columns))

    fourier_columns = [c for c in thisdf.columns if "fourier" in c and "value" not in c]
    fourier_columns = [c for c in fourier_columns if "1byte" in c]
    fourier_columns = [c for c in fourier_columns if "begin" not in c and "end" not in c]
    fourier_columns = [c for c in fourier_columns if "head" not in c and "tail" not in c]
    fourier_columns = [c for c in fourier_columns if "start" not in c]
    fourier_columns_only = list(set(fourier_columns))
    fourier_columns = list(set(advanced_columns + fourier_columns))
    
    baseline_and_advanced = list(set(baseline_columns + advanced_columns_only))
    baseline_and_fourier = list(set(baseline_columns + fourier_columns_only))
    advanced_and_fourier = list(set(advanced_columns_only + fourier_columns_only))
    
    return {\
        "Baseline only": baseline_columns,\
        "Advanced only": advanced_columns_only,\
        "Fourier only": fourier_columns_only,\
        "Baseline and Fourier": baseline_and_fourier,\
        "Advanced and Fourier": advanced_and_fourier,\
        "Baseline and Advanced": advanced_columns,\
        "Baseline, Advanced, and Fourier": fourier_columns,\
    }



In [3]:
def read_data():
    df = pd.read_csv("n1.encrypted.expanded.ransomware.csv.gz")
    df["is_encrypted"] = 1
    print(df.shape)
    df2 = pd.read_csv("n1.plaintext.csv.gz")
    df2["is_encrypted"] = 0
    df3 = pd.read_csv("n1.zip.expanded.plaintext.csv.gz")
    df3["is_encrypted"] = 0

    df2 = pd.concat([df2, df3])
    if not USE_SMOTE:
        df2 = df2.sample(frac=0.2).reset_index(drop=True)
    print(df2.shape)
    df = pd.concat([df, df2])

    df = df.sample(frac=1).reset_index(drop=True)
    featuresets = get_columns(df)

    y = df["is_encrypted"]
    X = df[[c for c in df.columns if "is_encrypted" != c]]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    
    return df, X_train, X_test, y_train, y_test

In [4]:
def run_algorithms(X_train, X_test, y_train, y_test, clf):
    result_df = X_test.copy()
    result_df["y_true"] = y_test

    np.random.seed(42)
    random.seed(42)
    
    featuresets = get_columns(X_train)
    for key, value in featuresets.items():
        XX_train = X_train[value]
        XX_test = X_test[value]
        
        if USE_SMOTE:
            estimators = [\
                  ('smote', imblearn.over_sampling.SVMSMOTE()),
                  ('std,', MinMaxScaler()), \
                  ("classifier", clf())]
        else:
            estimators = [\
                  ('std,', MinMaxScaler()), \
                  ("classifier", clf())]
            
        if USE_SMOTE:
            pipeline = imblearn.pipeline.Pipeline(estimators)
        else:
            pipeline = Pipeline(estimators)
            
        pipeline.fit(XX_train, y_train)
        y_pred = pipeline.predict(XX_test)
        result_df[key] = y_pred
        
        print(key, f1_score(y_test, y_pred))
    return result_df



In [5]:
def classify_and_compare():
    df, X_train, X_test, y_train, y_test = read_data()
    
    print("Train Set", 
          " plaintext = ", y_train[y_train != 1].count(), 
          " encrypted = ", y_train[y_train == 1].count(), 
          " total = ", len(y_train))
    print("Test Set ", 
          " plaintext = ", y_test[y_test != 1].count(), 
          " encrypted = ", y_test[y_test == 1].count(), 
          " total = ", len(y_test))

    
    print("Logistic Regression")
    print("------------------------")
    lr_clf = lambda: LogisticRegression(\
                n_jobs=8, \
                solver='saga', \
                random_state=42, \
                max_iter=1000, \
                multi_class='ovr')
    lr_result = run_algorithms(X_train, X_test, y_train, y_test, lr_clf)
    
    print("Random Forest")
    print("------------------------")
    rfc_clf = lambda: RandomForestClassifier(n_jobs=N_JOBS, random_state=42)
    rfc_result = run_algorithms(X_train, X_test, y_train, y_test, rfc_clf)
    return lr_result, rfc_result

In [6]:
def format_results_as_dataframe0(df):
    dictdf = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    
    featuresets = get_columns(df)
    
    for feature in featuresets.keys():
        if feature in df.keys():
            y_true = df["y_true"]
            y_pred = df[str(feature)]
            dictdf["FeatureSet"].append(feature)
            dictdf["Accuracy"].append(accuracy_score(y_true, y_pred))
            dictdf["F1"].append(f1_score(y_true, y_pred))
            dictdf["Precision"].append(f1_score(y_true, y_pred))
            dictdf["Recall"].append(f1_score(y_true, y_pred))

    return pd.DataFrame(dictdf)

def format_results_as_dataframe(df, include=None, exclude=None):
    if include and not (isinstance(include, list)):
        include = [include]
    if exclude and not isinstance(exclude, list):
        exclude = [exclude]
    if include:
        include = [x.lower() for x in include]
    if exclude:
        exclude = [x.lower() for x in exclude]
        
    def is_interesting(item):
        if exclude:
            for x in exclude:
                if x.lower() in item.lower():
                    return False
        if include is None or 0 == len(include):
            return True
        if include:
            for x in include:
                if x.lower() in item.lower():
                    return True
        if (include is None or 0 == len(include)) \
            and (exclude is None or 0 == len(exclude)):
            return True
        return False

    dictdf = {
        "FeatureSet": [],
        "Accuracy": [],
        "F1": [],
        "Precision": [],
        "Recall": []
    }
    
    featuresets = get_columns(df)
    df = df[df["extended.base_filename"].map(is_interesting)]
    
    for feature in featuresets.keys():
        if feature in df.keys():
            y_true = df["y_true"]
            y_pred = df[str(feature)]
            dictdf["FeatureSet"].append(feature)
            dictdf["Accuracy"].append(accuracy_score(y_true, y_pred))
            dictdf["F1"].append(f1_score(y_true, y_pred))
            dictdf["Precision"].append(f1_score(y_true, y_pred))
            dictdf["Recall"].append(f1_score(y_true, y_pred))

    return pd.DataFrame(dictdf)

# Using a subset of data (no SMOTE)

In [7]:
USE_SMOTE = False
lr_result, rfc_result = classify_and_compare()

(707, 677)
(1818, 677)
Train Set  plaintext =  1223  encrypted =  468  total =  1691
Test Set   plaintext =  595  encrypted =  239  total =  834
Logistic Regression
------------------------
Baseline only 0.0
Advanced only 0.0
Fourier only 0.5174129353233831
Baseline and Fourier 0.5219512195121951
Advanced and Fourier 0.6881720430107527
Baseline and Advanced 0.0
Baseline, Advanced, and Fourier 0.685344827586207
Random Forest
------------------------
Baseline only 0.7621052631578947
Advanced only 0.7304347826086957
Fourier only 0.6919831223628693
Baseline and Fourier 0.767590618336887
Advanced and Fourier 0.7829787234042553
Baseline and Advanced 0.7633262260127931
Baseline, Advanced, and Fourier 0.7708779443254818


In [8]:
results = format_results_as_dataframe0(lr_result)

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples:")
print("--------------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples:
--------------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.712230 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.713429 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.767386 &  0.517413 &   0.517413 &  0.517413 \\
3 &             Baseline and Fourier &  0.764988 &  0.521951 &   0.521951 &  0.521951 \\
4 &             Advanced and Fourier &  0.826139 &  0.688172 &   0.688172 &  0.688172 \\
5 &            Baseline and Advanced &  0.713429 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.824940 &  0.685345 &   0.685345 &  0.685345 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.71223,0.0,0.0,0.0
1,Advanced only,0.713429,0.0,0.0,0.0
5,Baseline and Advanced,0.713429,0.0,0.0,0.0
2,Fourier only,0.767386,0.517413,0.517413,0.517413
3,Baseline and Fourier,0.764988,0.521951,0.521951,0.521951
6,"Baseline, Advanced, and Fourier",0.82494,0.685345,0.685345,0.685345
4,Advanced and Fourier,0.826139,0.688172,0.688172,0.688172


In [9]:
results = format_results_as_dataframe0(rfc_result)

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples:")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples:
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.864508 &  0.762105 &   0.762105 &  0.762105 \\
1 &                    Advanced only &  0.851319 &  0.730435 &   0.730435 &  0.730435 \\
2 &                     Fourier only &  0.824940 &  0.691983 &   0.691983 &  0.691983 \\
3 &             Baseline and Fourier &  0.869305 &  0.767591 &   0.767591 &  0.767591 \\
4 &             Advanced and Fourier &  0.877698 &  0.782979 &   0.782979 &  0.782979 \\
5 &            Baseline and Advanced &  0.866906 &  0.763326 &   0.763326 &  0.763326 \\
6 &  Baseline, Advanced, and Fourier &  0.871703 &  0.770878 &   0.770878 &  0.770878 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.82494,0.691983,0.691983,0.691983
1,Advanced only,0.851319,0.730435,0.730435,0.730435
0,Baseline only,0.864508,0.762105,0.762105,0.762105
5,Baseline and Advanced,0.866906,0.763326,0.763326,0.763326
3,Baseline and Fourier,0.869305,0.767591,0.767591,0.767591
6,"Baseline, Advanced, and Fourier",0.871703,0.770878,0.770878,0.770878
4,Advanced and Fourier,0.877698,0.782979,0.782979,0.782979


In [10]:
results = format_results_as_dataframe(lr_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.664198 &  0.604651 &   0.604651 &  0.604651 \\
3 &             Baseline and Fourier &  0.669136 &  0.614943 &   0.614943 &  0.614943 \\
4 &             Advanced and Fourier &  0.804938 &  0.802005 &   0.802005 &  0.802005 \\
5 &            Baseline and Advanced &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.802469 &  0.798995 &   0.798995 &  0.798995 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.409877,0.0,0.0,0.0
1,Advanced only,0.409877,0.0,0.0,0.0
5,Baseline and Advanced,0.409877,0.0,0.0,0.0
2,Fourier only,0.664198,0.604651,0.604651,0.604651
3,Baseline and Fourier,0.669136,0.614943,0.614943,0.614943
6,"Baseline, Advanced, and Fourier",0.802469,0.798995,0.798995,0.798995
4,Advanced and Fourier,0.804938,0.802005,0.802005,0.802005


In [11]:
results = format_results_as_dataframe(rfc_result, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.841975 &  0.849765 &   0.849765 &  0.849765 \\
1 &                    Advanced only &  0.809877 &  0.813559 &   0.813559 &  0.813559 \\
2 &                     Fourier only &  0.809877 &  0.809877 &   0.809877 &  0.809877 \\
3 &             Baseline and Fourier &  0.846914 &  0.853081 &   0.853081 &  0.853081 \\
4 &             Advanced and Fourier &  0.856790 &  0.863850 &   0.863850 &  0.863850 \\
5 &            Baseline and Advanced &  0.837037 &  0.844340 &   0.844340 &  0.844340 \\
6 &  Baseline, Advanced, and Fourier &  0.846914 &  0.853081 &   0.853081 &  0.853081 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.809877,0.809877,0.809877,0.809877
1,Advanced only,0.809877,0.813559,0.813559,0.813559
5,Baseline and Advanced,0.837037,0.84434,0.84434,0.84434
0,Baseline only,0.841975,0.849765,0.849765,0.849765
3,Baseline and Fourier,0.846914,0.853081,0.853081,0.853081
6,"Baseline, Advanced, and Fourier",0.846914,0.853081,0.853081,0.853081
4,Advanced and Fourier,0.85679,0.86385,0.86385,0.86385


In [12]:
results = format_results_as_dataframe(lr_result, 
                    ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.664198 &  0.604651 &   0.604651 &  0.604651 \\
3 &             Baseline and Fourier &  0.669136 &  0.614943 &   0.614943 &  0.614943 \\
4 &             Advanced and Fourier &  0.804938 &  0.802005 &   0.802005 &  0.802005 \\
5 &            Baseline and Advanced &  0.409877 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.802469 &  0.798995 &   0.798995 &  0.798995 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.409877,0.0,0.0,0.0
1,Advanced only,0.409877,0.0,0.0,0.0
5,Baseline and Advanced,0.409877,0.0,0.0,0.0
2,Fourier only,0.664198,0.604651,0.604651,0.604651
3,Baseline and Fourier,0.669136,0.614943,0.614943,0.614943
6,"Baseline, Advanced, and Fourier",0.802469,0.798995,0.798995,0.798995
4,Advanced and Fourier,0.804938,0.802005,0.802005,0.802005


In [13]:
results = format_results_as_dataframe(rfc_result, 
                    ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.841975 &  0.849765 &   0.849765 &  0.849765 \\
1 &                    Advanced only &  0.809877 &  0.813559 &   0.813559 &  0.813559 \\
2 &                     Fourier only &  0.809877 &  0.809877 &   0.809877 &  0.809877 \\
3 &             Baseline and Fourier &  0.846914 &  0.853081 &   0.853081 &  0.853081 \\
4 &             Advanced and Fourier &  0.856790 &  0.863850 &   0.863850 &  0.863850 \\
5 &            Baseline and Advanced &  0.837037 &  0.844340 &   0.844340 &  0.844340 \\
6 &  Baseline, Advanced, and Fourier &  0.846914 &  0.853081 &   0.853081 &  0.853081 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.809877,0.809877,0.809877,0.809877
1,Advanced only,0.809877,0.813559,0.813559,0.813559
5,Baseline and Advanced,0.837037,0.84434,0.84434,0.84434
0,Baseline only,0.841975,0.849765,0.849765,0.849765
3,Baseline and Fourier,0.846914,0.853081,0.853081,0.853081
6,"Baseline, Advanced, and Fourier",0.846914,0.853081,0.853081,0.853081
4,Advanced and Fourier,0.85679,0.86385,0.86385,0.86385


In [14]:
results = format_results_as_dataframe(lr_result, 
                    ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.492569 &  0.000000 &   0.000000 &  0.000000 \\
1 &                    Advanced only &  0.492569 &  0.000000 &   0.000000 &  0.000000 \\
2 &                     Fourier only &  0.673036 &  0.574586 &   0.574586 &  0.574586 \\
3 &             Baseline and Fourier &  0.677282 &  0.584699 &   0.584699 &  0.584699 \\
4 &             Advanced and Fourier &  0.785563 &  0.760095 &   0.760095 &  0.760095 \\
5 &            Baseline and Advanced &  0.492569 &  0.000000 &   0.000000 &  0.000000 \\
6 &  Baseline, Advanced, and Fourier &  0.783439 &  0.757143 &   0.757143 &  0.757143 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.492569,0.0,0.0,0.0
1,Advanced only,0.492569,0.0,0.0,0.0
5,Baseline and Advanced,0.492569,0.0,0.0,0.0
2,Fourier only,0.673036,0.574586,0.574586,0.574586
3,Baseline and Fourier,0.677282,0.584699,0.584699,0.584699
6,"Baseline, Advanced, and Fourier",0.783439,0.757143,0.757143,0.757143
4,Advanced and Fourier,0.785563,0.760095,0.760095,0.760095


In [15]:
results = format_results_as_dataframe(rfc_result, 
                    ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("NO SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



NO SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.825902 &  0.815315 &   0.815315 &  0.815315 \\
1 &                    Advanced only &  0.806794 &  0.786885 &   0.786885 &  0.786885 \\
2 &                     Fourier only &  0.791932 &  0.769953 &   0.769953 &  0.769953 \\
3 &             Baseline and Fourier &  0.832272 &  0.820046 &   0.820046 &  0.820046 \\
4 &             Advanced and Fourier &  0.855626 &  0.844037 &   0.844037 &  0.844037 \\
5 &            Baseline and Advanced &  0.836518 &  0.822989 &   0.822989 &  0.822989 \\
6 &  Baseline, Advanced, and Fourier &  0.845011 &  0.831409 &   0.831409 &  0.831409 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.791932,0.769953,0.769953,0.769953
1,Advanced only,0.806794,0.786885,0.786885,0.786885
0,Baseline only,0.825902,0.815315,0.815315,0.815315
3,Baseline and Fourier,0.832272,0.820046,0.820046,0.820046
5,Baseline and Advanced,0.836518,0.822989,0.822989,0.822989
6,"Baseline, Advanced, and Fourier",0.845011,0.831409,0.831409,0.831409
4,Advanced and Fourier,0.855626,0.844037,0.844037,0.844037


In [16]:
lr_result.to_csv("../n1_ransomware_result_lr_nosmote.csv.gz")
rfc_result.to_csv("../n1_ransomware_result_rfc_nosmote.csv.gz")

# Using all data (SMOTE)

In [17]:
USE_SMOTE = True
lr_result2, rfc_result2 = classify_and_compare()

(707, 677)
(9091, 677)
Train Set  plaintext =  6090  encrypted =  474  total =  6564
Test Set   plaintext =  3001  encrypted =  233  total =  3234
Logistic Regression
------------------------
Baseline only 0.20788912579957355
Advanced only 0.2948609941027801
Fourier only 0.31217838765008576
Baseline and Fourier 0.31478260869565217
Advanced and Fourier 0.404040404040404
Baseline and Advanced 0.3112639724849527
Baseline, Advanced, and Fourier 0.40979955456570155
Random Forest
------------------------
Baseline only 0.6056338028169014
Advanced only 0.578096947935368
Fourier only 0.4782608695652174
Baseline and Fourier 0.5933333333333334
Advanced and Fourier 0.6308243727598567
Baseline and Advanced 0.6263736263736263
Baseline, Advanced, and Fourier 0.6252158894645942


In [18]:
results = format_results_as_dataframe0(lr_result2)

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples:")
print("--------------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples:
--------------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.540507 &  0.207889 &   0.207889 &  0.207889 \\
1 &                    Advanced only &  0.741187 &  0.294861 &   0.294861 &  0.294861 \\
2 &                     Fourier only &  0.752010 &  0.312178 &   0.312178 &  0.312178 \\
3 &             Baseline and Fourier &  0.756339 &  0.314783 &   0.314783 &  0.314783 \\
4 &             Advanced and Fourier &  0.835807 &  0.404040 &   0.404040 &  0.404040 \\
5 &            Baseline and Advanced &  0.752319 &  0.311264 &   0.311264 &  0.311264 \\
6 &  Baseline, Advanced, and Fourier &  0.836116 &  0.409800 &   0.409800 &  0.409800 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.540507,0.207889,0.207889,0.207889
1,Advanced only,0.741187,0.294861,0.294861,0.294861
5,Baseline and Advanced,0.752319,0.311264,0.311264,0.311264
2,Fourier only,0.75201,0.312178,0.312178,0.312178
3,Baseline and Fourier,0.756339,0.314783,0.314783,0.314783
4,Advanced and Fourier,0.835807,0.40404,0.40404,0.40404
6,"Baseline, Advanced, and Fourier",0.836116,0.4098,0.4098,0.4098


In [19]:
results = format_results_as_dataframe0(rfc_result2)

print()
print()
print("SMOTE: Random Forest with Ransomware Samples:")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples:
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.930736 &  0.605634 &   0.605634 &  0.605634 \\
1 &                    Advanced only &  0.927335 &  0.578097 &   0.578097 &  0.578097 \\
2 &                     Fourier only &  0.881262 &  0.478261 &   0.478261 &  0.478261 \\
3 &             Baseline and Fourier &  0.924552 &  0.593333 &   0.593333 &  0.593333 \\
4 &             Advanced and Fourier &  0.936302 &  0.630824 &   0.630824 &  0.630824 \\
5 &            Baseline and Advanced &  0.936920 &  0.626374 &   0.626374 &  0.626374 \\
6 &  Baseline, Advanced, and Fourier &  0.932900 &  0.625216 &   0.625216 &  0.625216 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.881262,0.478261,0.478261,0.478261
1,Advanced only,0.927335,0.578097,0.578097,0.578097
3,Baseline and Fourier,0.924552,0.593333,0.593333,0.593333
0,Baseline only,0.930736,0.605634,0.605634,0.605634
6,"Baseline, Advanced, and Fourier",0.9329,0.625216,0.625216,0.625216
5,Baseline and Advanced,0.93692,0.626374,0.626374,0.626374
4,Advanced and Fourier,0.936302,0.630824,0.630824,0.630824


In [20]:
results = format_results_as_dataframe(lr_result2, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.664691 &  0.496183 &   0.496183 &  0.496183 \\
1 &                    Advanced only &  0.944115 &  0.841346 &   0.841346 &  0.841346 \\
2 &                     Fourier only &  0.822185 &  0.634146 &   0.634146 &  0.634146 \\
3 &             Baseline and Fourier &  0.828959 &  0.641844 &   0.641844 &  0.641844 \\
4 &             Advanced and Fourier &  0.906012 &  0.764331 &   0.764331 &  0.764331 \\
5 &            Baseline and Advanced &  0.941575 &  0.839907 &   0.839907 &  0.839907 \\
6 &  Baseline, Advanced, and Fourier &  0.908552 &  0.773109 &   0.773109 &  0.773109 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.664691,0.496183,0.496183,0.496183
2,Fourier only,0.822185,0.634146,0.634146,0.634146
3,Baseline and Fourier,0.828959,0.641844,0.641844,0.641844
4,Advanced and Fourier,0.906012,0.764331,0.764331,0.764331
6,"Baseline, Advanced, and Fourier",0.908552,0.773109,0.773109,0.773109
5,Baseline and Advanced,0.941575,0.839907,0.839907,0.839907
1,Advanced only,0.944115,0.841346,0.841346,0.841346


In [21]:
results = format_results_as_dataframe(rfc_result2, 
                    ["password", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files, password protected and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.937341 &  0.822967 &   0.822967 &  0.822967 \\
1 &                    Advanced only &  0.933954 &  0.805000 &   0.805000 &  0.805000 \\
2 &                     Fourier only &  0.933108 &  0.816705 &   0.816705 &  0.816705 \\
3 &             Baseline and Fourier &  0.950889 &  0.859903 &   0.859903 &  0.859903 \\
4 &             Advanced and Fourier &  0.948349 &  0.852300 &   0.852300 &  0.852300 \\
5 &            Baseline and Advanced &  0.944115 &  0.838235 &   0.838235 &  0.838235 \\
6 &  Baseline, Advanced, and Fourier &  0.950042 &  0.859857 &   0.859857 &  0.859857 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.933954,0.805,0.805,0.805
2,Fourier only,0.933108,0.816705,0.816705,0.816705
0,Baseline only,0.937341,0.822967,0.822967,0.822967
5,Baseline and Advanced,0.944115,0.838235,0.838235,0.838235
4,Advanced and Fourier,0.948349,0.8523,0.8523,0.8523
6,"Baseline, Advanced, and Fourier",0.950042,0.859857,0.859857,0.859857
3,Baseline and Fourier,0.950889,0.859903,0.859903,0.859903


In [22]:
results = format_results_as_dataframe(lr_result2, 
                ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.664691 &  0.496183 &   0.496183 &  0.496183 \\
1 &                    Advanced only &  0.944115 &  0.841346 &   0.841346 &  0.841346 \\
2 &                     Fourier only &  0.822185 &  0.634146 &   0.634146 &  0.634146 \\
3 &             Baseline and Fourier &  0.828959 &  0.641844 &   0.641844 &  0.641844 \\
4 &             Advanced and Fourier &  0.906012 &  0.764331 &   0.764331 &  0.764331 \\
5 &            Baseline and Advanced &  0.941575 &  0.839907 &   0.839907 &  0.839907 \\
6 &  Baseline, Advanced, and Fourier &  0.908552 &  0.773109 &   0.773109 &  0.773109 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.664691,0.496183,0.496183,0.496183
2,Fourier only,0.822185,0.634146,0.634146,0.634146
3,Baseline and Fourier,0.828959,0.641844,0.641844,0.641844
4,Advanced and Fourier,0.906012,0.764331,0.764331,0.764331
6,"Baseline, Advanced, and Fourier",0.908552,0.773109,0.773109,0.773109
5,Baseline and Advanced,0.941575,0.839907,0.839907,0.839907
1,Advanced only,0.944115,0.841346,0.841346,0.841346


In [23]:
results = format_results_as_dataframe(rfc_result2, 
                ["doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.937341 &  0.822967 &   0.822967 &  0.822967 \\
1 &                    Advanced only &  0.933954 &  0.805000 &   0.805000 &  0.805000 \\
2 &                     Fourier only &  0.933108 &  0.816705 &   0.816705 &  0.816705 \\
3 &             Baseline and Fourier &  0.950889 &  0.859903 &   0.859903 &  0.859903 \\
4 &             Advanced and Fourier &  0.948349 &  0.852300 &   0.852300 &  0.852300 \\
5 &            Baseline and Advanced &  0.944115 &  0.838235 &   0.838235 &  0.838235 \\
6 &  Baseline, Advanced, and Fourier &  0.950042 &  0.859857 &   0.859857 &  0.859857 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
1,Advanced only,0.933954,0.805,0.805,0.805
2,Fourier only,0.933108,0.816705,0.816705,0.816705
0,Baseline only,0.937341,0.822967,0.822967,0.822967
5,Baseline and Advanced,0.944115,0.838235,0.838235,0.838235
4,Advanced and Fourier,0.948349,0.8523,0.8523,0.8523
6,"Baseline, Advanced, and Fourier",0.950042,0.859857,0.859857,0.859857
3,Baseline and Fourier,0.950889,0.859903,0.859903,0.859903


In [24]:
results = format_results_as_dataframe(lr_result2, 
                ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Logistic Regression with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.586604 &  0.389610 &   0.389610 &  0.389610 \\
1 &                    Advanced only &  0.811231 &  0.556439 &   0.556439 &  0.556439 \\
2 &                     Fourier only &  0.746955 &  0.493225 &   0.493225 &  0.493225 \\
3 &             Baseline and Fourier &  0.752368 &  0.497253 &   0.497253 &  0.497253 \\
4 &             Advanced and Fourier &  0.817997 &  0.572337 &   0.572337 &  0.572337 \\
5 &            Baseline and Advanced &  0.819350 &  0.575517 &   0.575517 &  0.575517 \\
6 &  Baseline, Advanced, and Fourier &  0.820704 &  0.581359 &   0.581359 &  0.581359 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
0,Baseline only,0.586604,0.38961,0.38961,0.38961
2,Fourier only,0.746955,0.493225,0.493225,0.493225
3,Baseline and Fourier,0.752368,0.497253,0.497253,0.497253
1,Advanced only,0.811231,0.556439,0.556439,0.556439
4,Advanced and Fourier,0.817997,0.572337,0.572337,0.572337
5,Baseline and Advanced,0.81935,0.575517,0.575517,0.575517
6,"Baseline, Advanced, and Fourier",0.820704,0.581359,0.581359,0.581359


In [25]:
results = format_results_as_dataframe(rfc_result2, 
                ["webp", "doc", "docx", "jpg", "pdf", "pptx", "xls", "xlsx", "ppt", "pptx"])

print()
print()
print("SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware")
print("--------------------------------------")
print()
print()

print(results.to_latex())
results.sort_values(by="F1")



SMOTE: Random Forest with Ransomware Samples: Office files, webp and ransomware
--------------------------------------


\begin{tabular}{llrrrr}
\toprule
{} &                       FeatureSet &  Accuracy &        F1 &  Precision &    Recall \\
\midrule
0 &                    Baseline only &  0.903924 &  0.707819 &   0.707819 &  0.707819 \\
1 &                    Advanced only &  0.915426 &  0.720358 &   0.720358 &  0.720358 \\
2 &                     Fourier only &  0.880920 &  0.666667 &   0.666667 &  0.666667 \\
3 &             Baseline and Fourier &  0.911367 &  0.731006 &   0.731006 &  0.731006 \\
4 &             Advanced and Fourier &  0.922192 &  0.753747 &   0.753747 &  0.753747 \\
5 &            Baseline and Advanced &  0.926252 &  0.758315 &   0.758315 &  0.758315 \\
6 &  Baseline, Advanced, and Fourier &  0.922192 &  0.758910 &   0.758910 &  0.758910 \\
\bottomrule
\end{tabular}



Unnamed: 0,FeatureSet,Accuracy,F1,Precision,Recall
2,Fourier only,0.88092,0.666667,0.666667,0.666667
0,Baseline only,0.903924,0.707819,0.707819,0.707819
1,Advanced only,0.915426,0.720358,0.720358,0.720358
3,Baseline and Fourier,0.911367,0.731006,0.731006,0.731006
4,Advanced and Fourier,0.922192,0.753747,0.753747,0.753747
5,Baseline and Advanced,0.926252,0.758315,0.758315,0.758315
6,"Baseline, Advanced, and Fourier",0.922192,0.75891,0.75891,0.75891


In [26]:
lr_result2.to_csv("../n1_ransomware_result_lr_smote.csv.gz")
rfc_result2.to_csv("../n1_ransomware_result_rfc_smote.csv.gz")

In [27]:
!pwd

/Users/phantom/mscwork/processed_save/csv
