# Imports / Parameters

In [1]:
import os
import pickle
import random

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

# from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', None)

In [2]:
# Location of the pickled Dike feature table. Set the DIKE_DATA_PATH environment
# variable to run the notebook on another machine; the original author's path is
# kept as the fallback so existing setups keep working unchanged.
data_path = os.environ.get(
    "DIKE_DATA_PATH",
    r"C:\Users\caleb\PycharmProjects\security_project\security_project\Data\extended_dike_modeling.p",
)

# Functions

In [3]:
def set_label(vec, grouping = "generic"):
    """Collapse one row of source + family scores into a single class label.

    Parameters
    ----------
    vec : sequence
        Row whose first element is the file source and whose remaining
        elements are the per-family scores, positionally ordered as
        generic, trojan, ransomware, worm, backdoor, spyware, rootkit,
        encrypter, downloader. Any iterable works (list, tuple, pandas
        Series); it is read by position, never by label.
    grouping : str, default "generic"
        How the highest-scoring family is turned into a label:

        * "raw"             - use the family name as is.
        * "generic"         - keep "trojan", map every other family to "generic".
        * "generic_all"     - map every family to "generic".
        * "exclude_generic" - ignore the generic score and pick the best of
          the remaining eight families.

    Returns
    -------
    str
        "benign" when the source is "benign"; otherwise the label selected
        by ``grouping``. Ties go to the family that appears first.

    Raises
    ------
    ValueError
        If ``grouping`` is not one of the supported modes (only checked for
        non-benign rows, so benign rows keep returning "benign").
    """
    families = (
        "generic", "trojan", "ransomware", "worm", "backdoor",
        "spyware", "rootkit", "encrypter", "downloader",
    )

    # Work on a plain list so every access is positional. Integer indexing on a
    # string-labelled pandas Series is deprecated and would otherwise be used here.
    values = list(vec)

    if values[0] == "benign":
        return "benign"

    if grouping == "raw":
        # Use the labels provided in the Dike dataset
        scores = values[1:]
        names = families
    elif grouping == "generic":
        # If the file is trojan, label as trojan. Otherwise, label as generic
        scores = values[1:]
        names = tuple(name if name == "trojan" else "generic" for name in families)
    elif grouping == "generic_all":
        # Return all malicious files as "generic"
        scores = values[1:]
        names = ("generic",) * len(families)
    elif grouping == "exclude_generic":
        # Use raw labels but leave the generic score out of the comparison
        scores = values[2:]
        names = families[1:]
    else:
        raise ValueError(
            f"Unknown grouping {grouping!r}; expected one of "
            "'raw', 'generic', 'generic_all', 'exclude_generic'"
        )

    # Position of the highest score; max() keeps the first one on ties.
    index_max = max(range(len(scores)), key=scores.__getitem__)

    return names[index_max]

# Get Data

In [4]:
# NOTE: unpickling can execute arbitrary code, so only load a file that comes
# from a trusted source.
base_df = pd.read_pickle(data_path)

In [5]:
# Preview the first rows to confirm the columns loaded as expected.
base_df.head(3)

Unnamed: 0,hash,source,type,ID,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,AddressOfEntryPoint,BaseOfCode,BaseOfData,ImageBase,SectionAlignment,FileAlignment,MajorOperatingSystemVersion,MinorOperatingSystemVersion,MajorImageVersion,MinorImageVersion,MajorSubsystemVersion,MinorSubsystemVersion,SizeOfImage,SizeOfHeaders,CheckSum,Subsystem,DllCharacteristics,SizeOfStackReserve,SizeOfStackCommit,SizeOfHeapReserve,SizeOfHeapCommit,LoaderFlags,NumberOfRvaAndSizes,SectionsNb,SectionsMeanEntropy,SectionsMinEntropy,SectionsMaxEntropy,SectionsMeanRawsize,SectionsMinRawsize,SectionMaxRawsize,SectionsMeanVirtualsize,SectionsMinVirtualsize,SectionMaxVirtualsize,ImportsNbDLL,ImportsNb,ImportsNbOrdinal,ExportNb,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate,malice,generic,trojan,ransomware,worm,backdoor,spyware,rootkit,encrypter,downloader
0,012a50629c3cf1e2a2e5133b729c2ac7765cf4115187b1...,malware,0,1,dd7b1c66faa1bde0a42a868acafefcb3,332,224,271,6,0,36864,380928,0,5616,4096,40960,4194304,4096,4096,4,0,0,0,4,0,823296,4096,0,2,0,1048576,4096,1048576,4096,0,16,4,4.804751,0.707497,7.904654,104448.0,8192,360448,202528.5,14300,402506,4,62,0,0,5,3.041344,1.781496,6.505338,963.6,20,4264,0,0,,0.934579,0.32,0.56,0.0,0.02,0.0,0.0,0.0,0.1,0.0
1,018b4029f46e529da8dc57b8bc6245644ed59f63b4f8ea...,malware,0,1,6f6d36db833a383f3836639cad4ddbb1,332,224,271,7,0,24576,420539,397312,6469,4096,28672,4259840,4096,4096,4,0,0,0,4,0,856064,4096,518968,2,0,1048576,4096,1048576,4096,0,16,4,5.768973,2.662357,7.637939,112640.0,16384,376832,211277.5,20756,415160,5,85,0,0,4,5.667223,1.919241,7.95996,8029.5,20,16936,0,0,,0.923077,0.493506,0.415584,0.0,0.012987,0.0,0.0,0.0,0.077922,0.0
2,01a0bea806384827759ae032031516bfe843285cac3980...,malware,0,1,4f105bf3461951a61a770ff1c9577107,332,224,271,7,0,20480,390466,430080,5873,4096,24576,4259840,4096,4096,4,0,0,0,4,0,856064,4096,437261,2,0,1048576,4096,1048576,4096,0,16,4,7.212979,6.636076,7.876911,104448.0,20480,262144,210782.75,20164,532664,4,63,0,0,4,5.648782,1.919241,7.961305,7916.75,20,16936,0,0,,0.931298,0.452055,0.39726,0.0,0.0,0.0,0.0,0.0,0.123288,0.027397


In [6]:
# "source" is the first value handed to set_label, which compares it to "benign".
base_df["source"].unique()

array(['malware', 'benign'], dtype=object)

# Standardize Labels 

In [7]:
# Binary view of the data: every non-benign file is labelled "generic".
base_df["label"] = base_df[
    [
        "source",
        "generic", "trojan", "ransomware", "worm", "backdoor",
        "spyware", "rootkit", "encrypter", "downloader",
    ]
].apply(lambda row: set_label(row, grouping="generic_all"), axis=1)

In [8]:
# Family view of the data: each non-benign file gets its highest-scoring family.
base_df["raw_label"] = base_df[
    [
        "source",
        "generic", "trojan", "ransomware", "worm", "backdoor",
        "spyware", "rootkit", "encrypter", "downloader",
    ]
].apply(lambda row: set_label(row, grouping="raw"), axis=1)

In [9]:
# Number of distinct file hashes per binary label.
base_df.groupby("label")["hash"].nunique()

label
benign      962
generic    6016
Name: hash, dtype: int64

In [10]:
# Number of distinct file hashes per family label.
base_df.groupby("raw_label")["hash"].nunique()

raw_label
benign      962
generic    1770
trojan     4238
worm          8
Name: hash, dtype: int64

# Final Cleaning Before Modeling

## Remove unneeded Columns

In [11]:
# Report columns with fewer than 5 distinct values - candidates for exclusion
unique_counts = base_df.nunique()
for col, nunique in unique_counts[unique_counts < 5].items():
    print(f"{col} has {nunique} unique values")

source has 2 unique values
type has 1 unique values
ID has 1 unique values
Machine has 3 unique values
SizeOfOptionalHeader has 2 unique values
SectionAlignment has 4 unique values
FileAlignment has 3 unique values
Subsystem has 3 unique values
SizeOfHeapReserve has 4 unique values
SizeOfHeapCommit has 3 unique values
LoaderFlags has 2 unique values
NumberOfRvaAndSizes has 2 unique values
VersionInformationSize has 1 unique values
legitimate has 0 unique values
label has 2 unique values
raw_label has 4 unique values


In [12]:
modeling_data = base_df.drop(
    columns=[
        # source is already encoded in the label columns
        "source",
        # columns with at most one distinct value
        "type", "ID", "VersionInformationSize", "legitimate",
        # malice score and the family scores the labels were derived from
        "malice",
        "generic", "trojan", "ransomware", "worm", "backdoor",
        "spyware", "rootkit", "encrypter", "downloader",
    ]
)

In [13]:
# Make it explicit that the hash is an identifier, not a feature.
modeling_data = modeling_data.rename(columns={"hash": "hash_ID"})

## Check Data Types

In [14]:
# List the columns whose first value is a Python str; these cannot be fed to
# the model as-is.
for col, column in modeling_data.items():
    d_type = type(column.values[0])
    if d_type is str:
        print(col, ":", d_type)

hash_ID : <class 'str'>
md5 : <class 'str'>
label : <class 'str'>
raw_label : <class 'str'>


In [15]:
# md5 is one of the string columns found above; drop it here. hash_ID is kept
# for now and removed when the feature matrices are built.
# NOTE: re-running this cell raises KeyError because md5 is already gone.
modeling_data = modeling_data.drop(columns=["md5"])

# Split Generic & Trojan

In [16]:
# Boolean target: True for every row whose binary label is not "benign".
modeling_data["is_malicious"] = modeling_data["label"] != "benign"

In [17]:
# Benign files plus every malicious file (all of them carry the "generic" label).
modeling_generic = (
    modeling_data
    .loc[modeling_data["label"].isin(["benign", "generic"])]
    .drop(columns=["label", "raw_label"])
)

In [18]:
# Benign files plus only the malicious files whose top family is trojan.
modeling_trojan = (
    modeling_data
    .loc[modeling_data["raw_label"].isin(["benign", "trojan"])]
    .drop(columns=["label", "raw_label"])
)

# Prepare for Modeling

## Generic

In [19]:
# Target vector for the generic model (True = malicious).
generic_labels = np.array(modeling_generic['is_malicious'])

In [20]:
# Everything except the identifier and the target is a model input.
# (`columns=` already implies the column axis, so no `axis` argument is needed.)
generic_features = modeling_generic.drop(columns=["hash_ID", "is_malicious"])

In [21]:
# Keep the column names before the frame is converted to a bare array.
generic_feature_list = generic_features.columns.tolist()

In [22]:
# NOTE: this rebinds generic_features from a DataFrame to an ndarray, so the
# cell that reads generic_features.columns cannot be re-run after this one.
generic_features = np.array(generic_features)

## Trojan

In [23]:
# Target vector for the trojan model (True = malicious).
trojan_labels = np.array(modeling_trojan['is_malicious'])

In [24]:
# Everything except the identifier and the target is a model input.
# (`columns=` already implies the column axis, so no `axis` argument is needed.)
trojan_features = modeling_trojan.drop(columns=["hash_ID", "is_malicious"])

In [25]:
# Keep the column names; they are paired with the feature importances later.
trojan_feature_list = trojan_features.columns.tolist()

In [26]:
# NOTE: this rebinds trojan_features from a DataFrame to an ndarray, so the
# cell that reads trojan_features.columns cannot be re-run after this one.
trojan_features = np.array(trojan_features)

## Split Train/Test

In [27]:
# Hold out 25% of the generic data for testing; the seed fixes the shuffle.
(
    generic_train_features,
    generic_test_features,
    generic_train_labels,
    generic_test_labels,
) = train_test_split(
    generic_features,
    generic_labels,
    test_size=0.25,
    random_state=1202,
)

In [28]:
# Hold out 25% of the trojan data for testing; the seed fixes the shuffle.
(
    trojan_train_features,
    trojan_test_features,
    trojan_train_labels,
    trojan_test_labels,
) = train_test_split(
    trojan_features,
    trojan_labels,
    test_size=0.25,
    random_state=1202,
)

# Create the Models

## Generic

In [29]:
%%time
# 100-tree forest; the fixed random_state keeps the fitted model reproducible.
generic_rf = RandomForestClassifier(n_estimators = 100, random_state = 1202)

Wall time: 0 ns


In [30]:
%%time
# Fit on the training portion of the generic split only.
generic_rf.fit(generic_train_features, generic_train_labels)

Wall time: 556 ms


RandomForestClassifier(random_state=1202)

## Trojan

In [31]:
%%time
# Same configuration as the generic forest so the two models are comparable.
trojan_rf = RandomForestClassifier(n_estimators = 100, random_state = 1202)

Wall time: 0 ns


In [32]:
%%time
# Fit on the training portion of the trojan split only.
trojan_rf.fit(trojan_train_features, trojan_train_labels)

Wall time: 395 ms


RandomForestClassifier(random_state=1202)

# Test Models
Compare actual labels against predicted labels.

## Generic

In [33]:
# Test the generic model on the held-out generic rows (never seen during fit)
generic_predictions = generic_rf.predict(generic_test_features)

In [34]:
# Sanity check: exactly one prediction per held-out label.
len(generic_predictions) == len(generic_test_labels)

True

In [35]:
# Size of the generic test set.
len(generic_predictions)

1745

In [36]:
# Collect every [actual, predicted] pair the generic model got wrong; the rest
# of the test set counts as correct.
wrong = [
    [actual, predicted]
    for actual, predicted in zip(generic_test_labels, generic_predictions)
    if predicted != actual
]
correct = len(generic_predictions) - len(wrong)

In [37]:
# Accuracy on the generic test set as a percentage, rounded to two decimals.
percent_correct = round((correct/len(generic_predictions))*100,2)

In [38]:
# Number of generic test rows predicted correctly.
correct

1736

In [39]:
# Generic model accuracy in percent.
percent_correct

99.48

In [40]:
# Split the generic model's mistakes by direction. Each entry of `wrong` is
# [actual, predicted].
flagged_benign = 0
ignored_malicious = 0
for actual, predicted in wrong:
    if actual == 1 and predicted == 0:
        # malicious file predicted as benign
        ignored_malicious += 1
    elif actual == 0 and predicted == 1:
        # benign file predicted as malicious
        flagged_benign += 1
    else:
        # Any other combination means the inputs are not the expected 0/1 pair.
        print("ISSUE: ", [actual, predicted])
        break

In [41]:
# Summary for the generic model: accuracy, error count, and error direction.
print(f"Accuracy of Classifier is: {percent_correct}%")
print(f"There were {len(wrong)} units wrongly classified out of {len(generic_predictions)}")
print(f"We flagged {flagged_benign} benign files and let {ignored_malicious} malicious files slip by")

Accuracy of Classifier is: 99.48%
There were 9 units wrongly classified out of 1745
We flagged 3 benign files and let 6 malicious files slip by


## Trojan

In [42]:
# Test the trojan model on the held-out trojan rows (never seen during fit)
trojan_predictions = trojan_rf.predict(trojan_test_features)

In [43]:
# Sanity check: exactly one prediction per held-out label.
len(trojan_predictions) == len(trojan_test_labels)

True

In [44]:
# Size of the trojan test set.
len(trojan_predictions)

1300

In [45]:
# Collect every [actual, predicted] pair the trojan model got wrong; the rest
# of the test set counts as correct. This overwrites the generic model's tallies.
wrong = [
    [actual, predicted]
    for actual, predicted in zip(trojan_test_labels, trojan_predictions)
    if predicted != actual
]
correct = len(trojan_predictions) - len(wrong)

In [46]:
# Accuracy on the trojan test set as a percentage, rounded to two decimals.
percent_correct = round((correct/len(trojan_predictions))*100,2)

In [47]:
# Number of trojan test rows predicted correctly.
correct

1298

In [48]:
# Trojan model accuracy in percent.
percent_correct

99.85

In [49]:
# Split the trojan model's mistakes by direction. Each entry of `wrong` is
# [actual, predicted].
flagged_benign = 0
ignored_malicious = 0
for actual, predicted in wrong:
    if actual == 1 and predicted == 0:
        # malicious file predicted as benign
        ignored_malicious += 1
    elif actual == 0 and predicted == 1:
        # benign file predicted as malicious
        flagged_benign += 1
    else:
        # Any other combination means the inputs are not the expected 0/1 pair.
        print("ISSUE: ", [actual, predicted])
        break

In [50]:
# Summary for the trojan model: accuracy, error count, and error direction.
print(f"Accuracy of Classifier is: {percent_correct}%")
print(f"There were {len(wrong)} units wrongly classified out of {len(trojan_predictions)}")
print(f"We flagged {flagged_benign} benign files and let {ignored_malicious} malicious files slip by")

Accuracy of Classifier is: 99.85%
There were 2 units wrongly classified out of 1300
We flagged 0 benign files and let 2 malicious files slip by


# Feature Importance
Leverage the secondary model, which focuses on Trojan malware, to determine feature importance.

In [51]:
# Importance scores from the fitted trojan forest, in feature-column order
trojan_importances = list(trojan_rf.feature_importances_)
# (feature, importance rounded to 5 decimals) pairs, most important first
trojan_feature_importances = sorted(
    (
        (name, round(score, 5))
        for name, score in zip(trojan_feature_list, trojan_importances)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
# Same ranking as a table for display
trojan_feature_importances_df = pd.DataFrame(
    trojan_feature_importances,
    columns=["feature", "trojan_feature_importance"],
)

In [53]:
# Top of the ranking: the features the trojan model relies on most.
trojan_feature_importances_df.head(15)

Unnamed: 0,feature,trojan_feature_importance
0,SectionsMaxEntropy,0.12495
1,SizeOfImage,0.09079
2,DllCharacteristics,0.07393
3,BaseOfData,0.06957
4,SizeOfOptionalHeader,0.06086
5,Subsystem,0.05687
6,SectionMaxVirtualsize,0.05299
7,SectionMaxRawsize,0.04917
8,MajorSubsystemVersion,0.04789
9,SizeOfStackReserve,0.03411
