In [1]:
import os
import json
import pandas as pd
import numpy as np
import statistics
from helpers import *
import re

In [2]:
json_file = r"D:\ClassWork\anti_virus\Vigil-Anti\EXE_Dataset\ember2018\train_features_1.jsonl"

with open(json_file, 'r') as f:
    json_ds_list = list(f)

DataSet = []
for i,ds in enumerate(json_ds_list):
    if( i > 2500):
        break
    DataSet.append(json.loads(ds))

# to free some of the precious memory
del json_ds_list

print (DataSet[0])

simple_ds = DataSet[5]

{'sha256': '2ef9a92ee6c955364564b0df75ee3753473014b2ba162b9df90afe6df9dbb256', 'md5': '7e39aeea7bc21d16b8652516a150b282', 'appeared': '2018-01', 'label': 1, 'avclass': 'sivis', 'histogram': [60782, 5895, 2020, 1487, 2075, 1367, 1145, 856, 2037, 725, 2027, 716, 1418, 903, 672, 1014, 1605, 652, 702, 691, 1048, 927, 641, 599, 795, 636, 598, 598, 677, 629, 597, 571, 8564, 738, 921, 600, 1253, 835, 645, 565, 1015, 919, 958, 868, 917, 784, 1435, 1307, 1470, 1081, 903, 1380, 913, 914, 872, 823, 1013, 1048, 1001, 1289, 1063, 1261, 792, 771, 1852, 3074, 928, 1346, 1238, 1786, 1036, 857, 1028, 1149, 902, 749, 1003, 1101, 1014, 883, 2012, 1152, 1374, 1468, 1242, 1374, 1312, 1447, 975, 848, 716, 1067, 940, 1566, 1298, 1468, 897, 3196, 1406, 2574, 2206, 5376, 2771, 1455, 2052, 2923, 1401, 908, 2522, 1562, 3768, 3473, 2336, 813, 3879, 2968, 5270, 2441, 1323, 1398, 1176, 1245, 843, 944, 984, 1172, 878, 851, 1168, 1116, 1029, 2612, 900, 1471, 827, 767, 953, 1479, 908, 4228, 772, 1342, 753, 719, 828, 7

### Exploring the unique section names

In [3]:
"""
all_sectionNames = set()
for ds_obj in DataSet:
    for dic_elm in ds_obj['section']['sections']:
        all_sectionNames.add(dic_elm['name'])

with open('sectionNames.txt', 'w') as f:
    f.write('\n'.join(all_sectionNames))

correct_sec_names = []
for n in all_sectionNames:
    if(n and n[0] == "."):
        correct_sec_names.append(n)

with open('sectionNames_correct.txt', 'w') as f:
    f.write('\n'.join(correct_sec_names))
"""

'\nall_sectionNames = set()\nfor ds_obj in DataSet:\n    for dic_elm in ds_obj[\'section\'][\'sections\']:\n        all_sectionNames.add(dic_elm[\'name\'])\n\nwith open(\'sectionNames.txt\', \'w\') as f:\n    f.write(\'\n\'.join(all_sectionNames))\n\ncorrect_sec_names = []\nfor n in all_sectionNames:\n    if(n and n[0] == "."):\n        correct_sec_names.append(n)\n\nwith open(\'sectionNames_correct.txt\', \'w\') as f:\n    f.write(\'\n\'.join(correct_sec_names))\n'

#### spoiler: there are lots of malicious section names
#### so I just extracted the most common and correct section names and then wrote them into "common_section_names.txt"
#### any other section names will be considered "UNKNOWN"

In [4]:
# Saving the most common section names

with open('common_section_names.txt', 'r') as f:
    Common_section_names = f.readlines()

Common_section_names = [re.sub(r'\n', '', i) for i in Common_section_names]

# Explore all the possible imports

In [5]:
"""
from tqdm import tqdm
all_imports = set()
for obj in tqdm(DataSet):
    import_DLL_dict = obj['imports']
    DLL_list = list(import_DLL_dict.keys())
    for elm in DLL_list:
        if(elm.endswith('.dll')):
            all_imports.add(elm)
    #all_imports = set(all_imports)

with open('all_imports_cleansed.txt', 'w') as f:
    f.write('\n'.join(all_imports))
"""

"\nfrom tqdm import tqdm\nall_imports = set()\nfor obj in tqdm(DataSet):\n    import_DLL_dict = obj['imports']\n    DLL_list = list(import_DLL_dict.keys())\n    for elm in DLL_list:\n        if(elm.endswith('.dll')):\n            all_imports.add(elm)\n    #all_imports = set(all_imports)\n\nwith open('all_imports_cleansed.txt', 'w') as f:\n    f.write('\n'.join(all_imports))\n"


### Same problem with DLL imports, there are numerous different DLLs
### and I cannot really filter all of them, so I will just grab the most common DLLs that are associated with most malwares
### and another feature which will be the number of imported DLLs

# Let's just cleanse the data

In [6]:
new_Dataset = []

for simple_ds in tqdm(DataSet, desc='cleansing the dataset'):
    try:
        # add reduced features of byteentropy distribution
        simple_ds.update(Interpret_Histogram(simple_ds['byteentropy'], 'byteentropy'))

        # add reduced features of byte histogram distribution
        simple_ds.update(Interpret_Histogram(simple_ds['histogram'], 'bytehistogram'))

        # reduce strings field
        simple_ds = extract_subfields_from_fields(simple_ds, 'strings', normalize_names=True, delete_field=True)

        # flatten the strings printables distribution field
        simple_ds = flatten_strings_printable_distribution(simple_ds, delete_field=True)

        # reduce general field
        simple_ds = extract_subfields_from_fields(simple_ds, 'general', normalize_names=True, delete_field=True)

        # reduce header field
        simple_ds = extract_subfields_from_fields(simple_ds, 'header', normalize_names=True, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_optional', normalize_names=False, delete_field=True)
        simple_ds = extract_subfields_from_fields(simple_ds, 'header_coff', normalize_names=False, delete_field=True)


        # handle data directories field
        simple_ds = handle_data_directories_field(simple_ds)


        # handle sections fields
        simple_ds = handle_section_names(simple_ds, Common_section_names, delete_field=True)

        # handle imports fields
        simple_ds = handle_DLL_imports(simple_ds, delete_field=False)

        # Remove the useless columns for now (they are not entirely useless but they will make the training process very complex for me :(( )
        useless_columns = ['sha256'
            ,'md5'
            ,'appeared'
            ,'avclass'
            ,'histogram'
            ,'byteentropy'
            ,'imports'
            ,'exports'
            ,'dll_characteristics'
            ,'characteristics']

        for useless_col in useless_columns:
            del simple_ds[useless_col]
        
        new_Dataset.append(simple_ds)
    except:
        continue


# Finally, free the original dataset from our precious memory
del DataSet

#print(simple_ds)

with open('lol.json', 'w') as f:
     json.dump(new_Dataset[5], f, indent=6)


cleansing the dataset:   0%|          | 0/2501 [00:00<?, ?it/s]

cleansing the dataset: 100%|██████████| 2501/2501 [00:01<00:00, 1457.46it/s]


# Let's prepare our Pandas DataFrame

In [7]:
# df= pd.DataFrame()
# i = 0
# for dic in new_Dataset:
#     df = pd.concat([df, pd.DataFrame([0]*len(df.columns))], axis=0)
#     for k in dic.keys():
#         if k in df.columns:
#             try:
#                 df.loc[i, k] = dic[k]
#             except:
#                 print(k)
#                 print(df)
#         else:
#             dummy_list = pd.DataFrame([0]*len(df) if len(df) > 0 else [0])
#             df.insert(0, k, dummy_list)
#             #print(df.columns)
#             df.loc[i, k] = dic[k]
    
#     #print(df.head())
#     i+=1


# df.fillna(0)
# print(df)

# df.to_csv('lol.csv')

In [8]:
#df = pd.DataFrame().from_dict(DataSet_Dict)

#print(df.head())



df = pd.DataFrame()

for dictionary_obj in tqdm(new_Dataset, desc="constructing a pandas dataframe..."):
    df_row = pd.DataFrame().from_dict(dictionary_obj, orient='index').transpose()
    df = pd.concat([df, df_row], axis=0, join='outer',ignore_index=False)

df.fillna(0, inplace=True)
df.to_csv('Dataset.csv')
df.describe()

constructing a pandas dataframe...:   0%|          | 0/2501 [00:00<?, ?it/s]

constructing a pandas dataframe...: 100%|██████████| 2501/2501 [00:42<00:00, 58.65it/s]


Unnamed: 0,.code_size,.code_entropy,.code_vsize,.code_props_len,.text_size,.text_entropy,.text_vsize,.text_props_len,.rdata_size,.rdata_entropy,...,.debug_props_len,.aspack_size,.aspack_entropy,.aspack_vsize,.aspack_props_len,Bcrypt.dll_num_funcs,.sxdata_size,.sxdata_entropy,.sxdata_vsize,.sxdata_props_len
count,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,...,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0,2501.0
mean,305.191124,0.07623,306.287885,0.045582,359112.6,4.996167,365193.4,2.676929,87295.92,2.905195,...,0.002399,5.936825,0.004607,6.55098,0.002399,0.005598,0.204718,0.000324,0.001599,0.001599
std,4547.115373,0.64488,4539.330384,0.383042,1097485.0,2.705754,1107032.0,2.035579,993491.8,2.692505,...,0.084819,211.019478,0.162869,231.612087,0.084819,0.260708,10.237953,0.016222,0.079984,0.079984
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,2560.0,4.766444,2680.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,74240.0,6.361935,76324.0,3.0,512.0,2.418296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,262144.0,6.64059,261943.0,3.0,35840.0,5.262078,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,116224.0,7.469426,116159.0,6.0,21253630.0,7.999856,21253370.0,16.0,44892670.0,7.9978,...,3.0,8192.0,5.776264,8192.0,3.0,13.0,512.0,0.811278,4.0,4.0


In [9]:


with open('suspicious_imports.txt', 'r') as f:
    sus_imports = f.readlines()
sus_imports = [re.sub(r'\n', '', i) for i in sus_imports]

boolean_columns = sus_imports + []
categorical_columns = ["subsystem", "magic", "machine"]


for col in df.columns:
    if col in boolean_columns:
        df[col] = df[col].astype(bool)
        df[col].fillna(False)
        continue

    if col in categorical_columns:
        df[col].replace(0, 'UNKNOWN', inplace=True)
        continue
    df[col].fillna(0)
    df[col] = df[col].astype(np.int64)
    df[col].fillna(0)

for col in df.columns:
    print(f"{col}:        {df[col].dtype}")

df.to_csv('Dataset.csv')

label:        int64
zero_bytes_byteentropy:        int64
full_bytes_byteentropy:        int64
mean_of_bytes_byteentropy:        int64
standard_dev_byteentropy:        int64
total_bytes_byteentropy:        int64
mean_of_first_tertile_byteentropy:        int64
mean_of_second_tertile_byteentropy:        int64
mean_of_third_tertile_byteentropy:        int64
zero_bytes_bytehistogram:        int64
full_bytes_bytehistogram:        int64
mean_of_bytes_bytehistogram:        int64
standard_dev_bytehistogram:        int64
total_bytes_bytehistogram:        int64
mean_of_first_tertile_bytehistogram:        int64
mean_of_second_tertile_bytehistogram:        int64
mean_of_third_tertile_bytehistogram:        int64
strings_numstrings:        int64
strings_avlength:        int64
strings_printables:        int64
strings_entropy:        int64
strings_paths:        int64
strings_urls:        int64
strings_registry:        int64
strings_MZ:        int64
strings_printabledist_0:        int64
strings_printabl

# Remove the -1 tuples!

In [10]:
df = df[df['label'] != -1]

# Let's make our classifier

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

df_train_1 = df.copy()
for col in categorical_columns:
    df_train_1[col] = LabelEncoder().fit_transform(df_train_1[col])

feature_columns = list(df_train_1.columns)
feature_columns.pop(feature_columns.index("label"))

x_train, x_test, y_train, y_test = train_test_split(df_train_1[feature_columns], df_train_1['label'], test_size=0.3, shuffle=True)

svm_model = SVC(kernel='poly', degree= 3, verbose=True).fit(x_train, y_train)

y_pred = svm_model.predict(x_test)
print(classification_report(y_test, y_pred))


[LibSVM]              precision    recall  f1-score   support

           0       0.00      0.00      0.00       270
           1       0.51      1.00      0.68       287

    accuracy                           0.51       557
   macro avg       0.26      0.50      0.34       557
weighted avg       0.27      0.51      0.35       557



In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

df_train_2 = df.copy()

for col in categorical_columns:
    df_train_2[col] = LabelEncoder().fit_transform(df_train_2[col])

feature_columns = list(df_train_2.columns)
feature_columns.pop(feature_columns.index("label"))

x_train, x_test, y_train, y_test = train_test_split(df_train_2[feature_columns], df_train_2['label'], test_size=0.3, shuffle=True)

rf_model = RandomForestClassifier().fit(x_train, y_train)

y_pred = rf_model.predict(x_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.86      0.91      0.88       256
           1       0.92      0.88      0.90       301

    accuracy                           0.89       557
   macro avg       0.89      0.89      0.89       557
weighted avg       0.89      0.89      0.89       557



In [13]:
# Neural Networks suck


from sklearn.neural_network import MLPClassifier

MLP_classifier = MLPClassifier(hidden_layer_sizes=[120, 120, 30], solver='sgd', alpha=1, random_state=1)

df_train_3 = df.copy()

for col in categorical_columns:
    df_train_3[col] = LabelEncoder().fit_transform(df_train_3[col])

feature_columns = list(df_train_3.columns)
feature_columns.pop(feature_columns.index("label"))

x_train, x_test, y_train, y_test = train_test_split(df_train_3[feature_columns], df_train_3['label'], test_size=0.3, shuffle=True)

for i in range(10):
    MLP_classifier.fit(x_train, y_train)

y_pred = MLP_classifier.predict(x_test)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.46      1.00      0.63       257
           1       1.00      0.00      0.00       300

    accuracy                           0.46       557
   macro avg       0.73      0.50      0.32       557
weighted avg       0.75      0.46      0.29       557

