# Biblioteka ne radi jer nema dataseta na githubu zbog velicine od 20GB

In [1]:
import os
import math
import numpy as np
import scipy as sp
import pandas as pd
import ujson as json
import matplotlib.pyplot as plt
%matplotlib notebook

from tqdm import tqdm
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

### Read data

In [3]:
shas, groups = zip(*[x.split(',') for x in open('samples_by_packer_group.txt').read().splitlines()])
sha_group = {a: int(b) for a, b in zip(shas, groups)}
sha_label = {a: 1 if int(b) else 0 for a, b in zip(shas, groups)}

In [4]:
shas = list(sha_label.keys())

In [5]:
# Extracting named features from TC report
section_names = []
data_size = []
entropies = [] 
section_sizes = []
dates = [] 
import_names = [] 
max_sec_ent = [] # highest section entropy in section_list
api_list = [] # list of api_lists in import_list
warnings = [] # names of warnings in warning_list
resources = [] # names in resource_list
opH_size = [] # size of Optional Header
mean_sec_ent = [] # mean of entropies of sections in section_list
min_sec_ent = [] # lowest section entropy in section_list

# appending all needed features form TC report
for sha in tqdm(shas):
    report = json.load(
        open("reports/{}/{}.json".format(sha[-2:], sha), 'rb')
    )
    entry = report['coreReport']['entries']['entry_list'][0]
    
    # It is possible sections do not exist!
    if 'sections' not in entry['metadata']['application']['pe']:
        section_sizes.append(0)
        section_names.append([])
    else:
        section_names.append(
            [sec.get('name', 'NO_NAME')
             for sec in entry['metadata']['application']['pe']['sections']['section_list']])

        section_sizes.append(sum([int(sec.get('size'),0)
             for sec in entry['metadata']['application']['pe']['sections']['section_list']]))
    
    # It is possible imports do not exist!
    if 'imports' not in entry['metadata']['application']['pe']:
        import_names.append([])
        api_list.append([])
    else:
        import_names.append(
            [imp.get('name', 'NO_NAME')
             for imp in entry['metadata']['application']['pe']['imports']['import_list']])
        api_list.append([imp.get('api_list', '[]')
             for imp in entry['metadata']['application']['pe']['imports']['import_list']])
    
    # It is possible warnings do not exist!
    if 'warnings' not in entry['info']['validation']:
        warnings.append([])
    else:
        warnings.append(entry['info']['validation']['warnings']['warning_list'])
    
    if 'resources' not in entry['metadata']['application']['pe']:
        resources.append([])
    else:
        resources.append(
            [imp.get('type', 'NO_TYPE')
             for imp in entry['metadata']['application']['pe']['resources']['resource_list']])
        
    data_size.append(entry['info']['file']['size'])
    entropies.append(entry['info']['file']['entropy'])
    dates.append(entry['metadata']['application']['pe']['fileHeader']['timeDateStamp'])
    
    max_sec_ent.append(
        max([float(sec.get('entropy'))
             for sec in entry['metadata']['application']['pe']['sections']['section_list']]))
    
    opH_size.append(entry['metadata']['application']['pe']['optionalHeader']['sizeOfHeaders'])
    
    mean_sec_ent.append(
          np.mean([float(sec.get('entropy'))
             for sec in entry['metadata']['application']['pe']['sections']['section_list']]))
    
    min_sec_ent.append(
          min([float(sec.get('entropy'))
             for sec in entry['metadata']['application']['pe']['sections']['section_list']]))

100%|██████████| 43784/43784 [08:05<00:00, 90.14it/s] 


In [6]:
# Extracting apis from api_list
konacna=[]
for lista in api_list:
    pomocna=[]
    for lis in lista:
        for li in lis:
            if(li not in pomocna):
                pomocna.append(li)
    konacna.append(pomocna)

# Removing repeating names in resources
resource_list= []
for lista in resources:
    lis=[]
    for li in lista:
        if li not in lis:
            lis.append(li)
    resource_list.append(lis)        

In [7]:
import calendar
d={v: k for k,v in enumerate(calendar.month_abbr)}

In [8]:
# Converting from hexadecimal to decimal
data_size_ints = [int(x,0) for x in data_size]
opH_size_ints = [int(x, 0) for x in opH_size]

entropies_f = [round(float(x),2) for x in entropies]
max_sec_ent_f = [round(float(x),2) for x in max_sec_ent]
mean_sec_ent_f = [round(float(x),2) for x in mean_sec_ent]
min_sec_ent_f = [round(float(x),2) for x in min_sec_ent]


In [9]:
# Converting the size of Optional Header from bytes to kilobytes
opH_size_ints=[i/1024 for i in opH_size_ints]

In [10]:
from functools import reduce

# If size of file is smaller than sum of all sections of a file, than that is suspicious
# So we have put that as a feature
diff=[]
for i, size in enumerate(data_size_ints):
    if(size-section_sizes[i] < 0):
        diff.append(1)
    else:
        diff.append(0)

# Comparison of size of Optional Header to the mean of sizes of all Optional Headers
opH=[]
mean = reduce(lambda a, b: a + b, opH_size_ints) / len(opH_size_ints)
for i, size in enumerate(opH_size_ints):
    if(size < mean):
        opH.append(0)
    else:
        opH.append(1)


In [11]:
# Converting the size of Optional Header from kilobytes to megabytes
opH_size_ints_2=[i/1024 for i in opH_size_ints]

In [12]:
# If the date says that a file was made before 1985, or later than now, that is suspicious
# We made that a feature here

import datetime
import time

dates_without_day=[date[4:] for date in dates]
dates_num = [str(d[p[:3]])+p[3:] for p in dates_without_day]

dates_final = [datetime.datetime.strptime(dat, "%m %d %H:%M:%S %Y").strftime("%Y-%m-%d %H:%M:%S") for dat in dates_num]

tmpTime = time.strftime("%Y-%m-%d %I:%M:%S")
timee = "1985-1-1 00:00:00"
dates_bin = [1 if (tim > tmpTime or timee>tim) else 0 for tim in dates_final]


In [13]:
sections = pd.DataFrame(data={'sha1': shas, 'section_names': section_names,
                              'import_names':import_names,'size':diff, 'entropy':entropies_f,
                              'date':dates_bin, 'max_section_entropy':max_sec_ent_f,
                              'mean_section_entropy':mean_sec_ent_f,'min_sec_ent':min_sec_ent,
                             'import_apis':konacna,'warnings':warnings,
                             'resource_list':resource_list, 'opH':opH_size_ints_2})
sections = sections[['sha1', 'section_names','import_names','import_apis','warnings',
                     'resource_list','size','entropy','max_section_entropy',
                     'min_sec_ent','mean_section_entropy','date','opH']]

In [14]:
sections['group'] = [sha_group[sha1] for sha1 in tqdm(sections.sha1.values)]
sections['label'] = [sha_label[sha1] for sha1 in tqdm(sections.sha1.values)]
sections

100%|██████████| 43784/43784 [00:00<00:00, 966992.11it/s]
100%|██████████| 43784/43784 [00:00<00:00, 685441.20it/s]


Unnamed: 0,sha1,section_names,import_names,import_apis,warnings,resource_list,size,entropy,max_section_entropy,min_sec_ent,mean_section_entropy,date,opH,group,label
0,000003cf1b52a990d99352da24aa73585b97647c,"[.text, .rdata, .data, .rsrc]","[ADVAPI32.dll, COMCTL32.dll, GDI32.dll, KERNEL...","[LookupPrivilegeValueA, OpenProcessToken, Copy...",[Whitelisted certificate: Microsoft Corporatio...,[RT_VERSION],0,6.20,6.51,1.044567,4.24,0,0.003906,1,1
1,000007e5aa61eff1435edf63459d70a99441109d,"[.text, .rsrc, .reloc]",[mscoree.dll],[_CorExeMain],[],"[RT_ICON, RT_GROUP_ICON, RT_VERSION]",0,4.51,5.40,0.013127,2.75,0,0.007812,0,0
2,000023255f375fb8397182833c8dd660fd760202,"[.text, .rdata, .data, .idata, .rsrc, .reloc]","[KERNEL32.dll, MSVCR100.dll, QtCore4.dll, QtGu...","[GetSystemTimeAsFileTime, GetCurrentProcessId,...",[PEHeader.SizeOfStackReserve is fixable],[RT_MANIFEST],0,5.59,6.29,3.787657,4.92,0,0.003906,0,0
3,000023cf6df3479eef56dbdd5152382a9c22a4e8,"[.text, .rdata, .data, .pdata, .gfids, .reloc]","[KERNEL32.dll, VCRUNTIME140.dll, api-ms-win-cr...","[GetModuleHandleW, IsProcessorFeaturePresent, ...",[],[],0,5.45,5.86,0.159084,2.47,0,0.003906,0,0
4,0000379bd1ea62b8147bba528f1a186424ca43b3,"[UPX0, UPX1, .rsrc]","[ADVAPI32.dll, COMCTL32.dll, GDI32.dll, KERNEL...","[RegEnumKeyA, InitCommonControlsEx, Escape, Lo...","[Whitelisted certificate: Digital River, Inc. ...","[RT_CURSOR, RT_BITMAP, RT_ICON, RT_DIALOG, RT_...",0,7.86,7.93,0.000000,4.37,0,0.003906,3,1
5,000045c34059978f905862f0878b7e583c4f4561,"[UPX0, UPX1, UPX2]","[KERNEL32.DLL, libcairo-2.dll, libcroco-0.6-3....","[LoadLibraryA, GetProcAddress, VirtualProtect,...",[PEHeader.AddressOfEntryPoint is fixable],[],0,7.59,7.92,0.000000,4.38,0,0.003906,3,1
6,00004a9067a95c6a85618aaa061f7af85abef948,"[UPX0, UPX1, .rsrc]","[KERNEL32.DLL, advapi32.dll, comctl32.dll, gdi...","[LoadLibraryA, GetProcAddress, VirtualProtect,...","[ImageHeader.Characteristics is fixable, PEHea...","[RT_CURSOR, RT_BITMAP, RT_ICON, RT_DIALOG, RT_...",0,7.95,8.00,0.000000,4.11,0,0.003906,3,1
7,000057e8d0be48f95181365c28dc9b0d25dd1771,"[.text, .rdata, .data, .rsrc, .reloc]","[ACE.dll, ADVAPI32.dll, AutomationServer.dll, ...","[?get_handle@ACE_Event_Handler@@UBEPAXXZ, ?set...",[],[RT_MANIFEST],0,5.54,5.74,3.935130,4.92,0,0.003906,0,0
8,0000618bda52300218d223f02e3323406d9e1640,"[.text, .data, .rsrc]","[ADVAPI32.dll, KERNEL32.dll, OLEAUT32.dll, SHL...","[RegSetValueExW, RegDeleteKeyW, RegCreateKeyEx...",[],"[RT_ICON, RT_MENU, RT_STRING, RT_GROUP_ICON, R...",0,5.19,5.32,2.712999,4.30,0,0.003906,0,0
9,00006234076ecb1fea4ee2df95f9470b00ca043c,"[.text, .rdata, .data, .pdata, .tls, .rsrc, .r...","[KERNEL32.dll, VCRUNTIME140.dll, api-ms-win-cr...","[TerminateProcess, RtlLookupFunctionEntry, Rtl...",[],"[RT_VERSION, RT_MANIFEST]",0,5.62,6.15,0.000000,3.76,0,0.003906,0,0


In [15]:
labels = sections['label'].values
groups = sections['group'].values
shas = sections['sha1'].values

In [16]:
X_train, X_val, y_train, y_val = train_test_split(
    shas, labels, test_size=0.2, random_state=42, stratify=groups
)

In [17]:
m_train = np.where(sections['sha1'].isin(X_train).values)[0]
m_val = np.where(sections['sha1'].isin(X_val).values)[0]

In [18]:
# Dataset size and number of packer samples
len(X_train), y_train.sum()

(35027, 17458)

In [19]:
# Packer percentage in the dataset
(y_train.sum() / X_train.shape[0] * 100)

49.84155080366575

In [20]:
# Check the dataset structure!
Counter(groups).most_common()

[(0, 21962), (3, 14604), (4, 3336), (1, 1956), (2, 1926)]

### Create features from section names

#### Approach 1 - most used names

In [21]:
# Count all features for each given feature type from train dataset!
cnt = Counter()
for names in tqdm(sections.iloc[m_train].section_names.values):
    cnt += Counter(names)
    
cnt_imp = Counter()
for names in tqdm(sections.iloc[m_train].import_names.values):
    cnt_imp += Counter(names)

cnt_imp_api = Counter()
for names in tqdm(sections.iloc[m_train].import_apis.values):
    cnt_imp_api += Counter(names)
    
cnt_warnings = Counter()
for names in tqdm(sections.iloc[m_train].warnings.values):
    cnt_warnings += Counter(names)
    
cnt_resources = Counter()
for names in tqdm(sections.iloc[m_train].resource_list.values):
    cnt_resources += Counter(names)

100%|██████████| 35027/35027 [00:04<00:00, 8566.03it/s] 
100%|██████████| 35027/35027 [00:05<00:00, 5998.87it/s] 
100%|██████████| 35027/35027 [08:16<00:00, 70.49it/s] 
100%|██████████| 35027/35027 [00:01<00:00, 26277.84it/s]
100%|██████████| 35027/35027 [00:02<00:00, 17020.42it/s]


In [22]:
# Pick top x for given feature types
print("Most common section names")
num_used = 16
for a, b in cnt.most_common()[:num_used]:
    print("{:10}\t{:5}".format(a, b))

print("Most common import names")
num_used_imp = 16
for a, b in cnt_imp.most_common()[:num_used_imp]:
    print("{:10}\t{:5}".format(a, b))

print("Most common apis in import_list")
num_used_imp_api = 25
for a, b in cnt_imp_api.most_common()[:num_used_imp_api]:
    print("{:10}\t\t{:5}".format(a, b))
    
print("Most common warnings")    
num_used_war = 25
for a, b in cnt_warnings.most_common()[:num_used_war]:
    print("{:10}\t\t{:5}".format(a, b))

print("Most common resource names in resource list")
num_used_res = 16
for a, b in cnt_resources.most_common()[:num_used_res]:
    print("{:10}\t\t{:5}".format(a, b))

Most common section names
.rsrc     	30385
.text     	14879
.data     	12428
.reloc    	12245
.rdata    	11621
NO_NAME   	10726
UPX0      	 6881
UPX1      	 6881
.idata    	 3249
.pdata    	 3180
pec1      	 2373
.imports  	 2260
pec2      	 1743
UPX2      	 1618
.adata    	 1303
.tls      	 1277
Most common import names
KERNEL32.DLL	10211
KERNEL32.dll	 9070
USER32.dll	 7990
user32.dll	 7241
ole32.dll 	 6310
ADVAPI32.dll	 5893
kernel32.dll	 5419
oleaut32.dll	 4491
advapi32.dll	 4404
SHELL32.dll	 4332
GDI32.dll 	 3976
comdlg32.dll	 3711
OLEAUT32.dll	 3625
COMCTL32.dll	 2787
gdi32.dll 	 2754
msvcrt.dll	 2690
Most common apis in import_list
GetProcAddress		20331
LoadLibraryA		16854
ExitProcess		12033
GetModuleHandleA		 9451
VirtualAlloc		 8555
VirtualFree		 8333
RegCloseKey		 8216
UnhandledExceptionFilter		 8175
GetCurrentThreadId		 8119
MessageBoxA		 7844
GetCurrentProcess		 7525
TerminateProcess		 7260
CloseHandle		 7218
GetCurrentProcessId		 7190
Sleep     		 6970
GetLastError		 6741
W

In [23]:
# Enumerating features and their frequencies for each given feature type
most_used_war = [x[0] for x in cnt_warnings.most_common()[:num_used_war]]
name_ind_war = {name: i for i, name in enumerate(most_used_war)}

most_used_imp = [x[0] for x in cnt_imp.most_common()[:num_used_imp]]
name_ind_imp = {name: i for i, name in enumerate(most_used_imp)}

most_used_imp_api = [x[0] for x in cnt_imp_api.most_common()[:num_used_imp_api]]
name_ind_imp_api = {name: i for i, name in enumerate(most_used_imp_api)}

most_used_res = [x[0] for x in cnt_resources.most_common()[:num_used_res]]
name_res = {name: i for i, name in enumerate(most_used_res)}

most_used_all = [x[0] for x in cnt.most_common()[:len(cnt)]]
name_ind_all = {name: i for i, name in enumerate(most_used_all)}

inv_name_ind_all = {v: k for k, v in name_ind_all.items()}

most_used = [x[0] for x in cnt.most_common()[:num_used]]
name_ind = {name: i for i, name in enumerate(most_used)}

In [24]:
data_analysis = np.zeros((sections.shape[0], len(cnt)))
data_analysis.shape

pos = np.zeros(len(cnt))
neg = np.zeros(len(cnt))

for i, names in enumerate(tqdm(sections.iloc[m_train].section_names.values)):
    for name in names:
        if name in name_ind_all:
            data_analysis[i, name_ind_all[name]] += 1
            
            if(sections['label'].values[i] == 1):
                pos[name_ind_all[name]] += 1
            else:
                neg[name_ind_all[name]] += 1
                
suspicious_sections = []
suspicious_sections_80 = []
perc = pos / (pos+neg) *100
print(perc)
for i,sec in enumerate(perc):
    if(sec > 95 ):
        suspicious_sections.append(inv_name_ind_all[i])
    if(sec > 80 ):
        suspicious_sections_80.append(inv_name_ind_all[i])


vec=[]
for sec_li in sections.iloc[m_train].section_names.values:
    flag =0
    for name in sec_li:
        if name in suspicious_sections:
            flag = 1
            vec.append(1)
            break
    if(flag == 0):
        vec.append(0)

vec_80 =[]
for sec_li in sections.iloc[m_train].section_names.values:
    flag =0
    for name in sec_li:
        if name in suspicious_sections_80:
            flag = 1
            vec_80.append(1)
            break
    if(flag == 0):
        vec_80.append(0)
        

100%|██████████| 35027/35027 [00:02<00:00, 17109.07it/s]


[59.30557841 58.33725385 59.30157708 ...  0.          0.
  0.        ]


In [25]:
# Count the top 16 section names, all the rest goes in the last column (if you want)
data = np.zeros((sections.shape[0], num_used+num_used_war+ num_used_res+num_used_imp+ num_used_imp_api + 13))
#data_for_analasis = np.zeros((sections.shape[0], num_used ))
for i, names in enumerate(tqdm(sections.section_names.values)):
    for name in names:
        if name in name_ind:
            data[i, name_ind[name]] += 1
            #data_for_analasis[i, name_ind[name]]+=1
        else:
            data[i, num_used] += 1
            
for i, names in enumerate(tqdm(sections.import_names.values)):
    for imp in names:
        if imp in name_ind_imp:
            data[i,num_used+1+ name_ind_imp[imp]] += 1
        else:
            data[i,num_used+1+ num_used_imp] += 1
            
for i, names in enumerate(tqdm(sections.import_apis.values)):
    for imp in names:
        if imp in name_ind_imp_api:
            data[i,num_used+num_used_imp+1+ name_ind_imp_api[imp]] += 1
            
for i, names in enumerate(tqdm(sections.warnings.values)):
    for imp in names:
        if imp in name_ind_war:
            data[i,num_used+num_used_imp+num_used_imp_api+1+ name_ind_war[imp]] += 1

for i, names in enumerate(tqdm(sections.resource_list.values)):
    for imp in names:
        if imp in name_res:
            data[i,num_used+num_used_imp+num_used_imp_api+ num_used_war+1+ name_res[imp]] += 1
            
for i, size in enumerate(tqdm(sections['size'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 2] = size
for i, ent in enumerate(tqdm(sections['entropy'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 3] = ent
for i, dat in enumerate(tqdm(sections['date'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 4] = dat
for i, v in enumerate(tqdm(vec)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 5] = v
for i, v in enumerate(tqdm(vec_80)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 6] = v
for i, ent_m in enumerate(tqdm(sections['max_section_entropy'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 7] = ent_m
for i, ent_m in enumerate(tqdm(sections['mean_section_entropy'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 8] = ent_m
for i, ent_m in enumerate(tqdm(sections['min_sec_ent'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 9] = ent_m
for i, imp in enumerate(tqdm(sections['import_names'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 10] = len(imp)
for i, imp in enumerate(tqdm(sections['import_apis'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 11] = len(imp)
for i, size in enumerate(tqdm(sections['opH'].values)):
    data[i][num_used+ num_used_imp+num_used_war+ num_used_imp_api+num_used_res + 12] = size
#p = sections['label'].values
#data_for_analasis

100%|██████████| 43784/43784 [00:00<00:00, 84448.27it/s]
100%|██████████| 43784/43784 [00:00<00:00, 84254.59it/s]
100%|██████████| 43784/43784 [00:01<00:00, 26086.50it/s]
100%|██████████| 43784/43784 [00:00<00:00, 338601.20it/s]
100%|██████████| 43784/43784 [00:00<00:00, 100916.94it/s]
100%|██████████| 43784/43784 [00:00<00:00, 345408.63it/s]
100%|██████████| 43784/43784 [00:00<00:00, 220633.49it/s]
100%|██████████| 43784/43784 [00:00<00:00, 177241.96it/s]
100%|██████████| 35027/35027 [00:00<00:00, 171331.34it/s]
100%|██████████| 35027/35027 [00:00<00:00, 233348.51it/s]
100%|██████████| 43784/43784 [00:00<00:00, 364670.67it/s]
100%|██████████| 43784/43784 [00:00<00:00, 344176.72it/s]
100%|██████████| 43784/43784 [00:00<00:00, 132960.47it/s]
100%|██████████| 43784/43784 [00:00<00:00, 167498.25it/s]
100%|██████████| 43784/43784 [00:00<00:00, 213640.45it/s]
100%|██████████| 43784/43784 [00:00<00:00, 185881.04it/s]


## AutoSklearn

In [37]:
os.environ['CUDA_VISIBLE_DEVICES']='0'
import autosklearn.classification
import sklearn.metrics

In [None]:
# In this cell we used autosklearn to find the best parameters for our Random Forest Classifier model
# We ran it for 4 hours
"""
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=14400,per_run_time_limit=1800,
    include_estimators=["random_forest", ], exclude_estimators=None,
include_preprocessors=["no_preprocessing", ], exclude_preprocessors=None)
"""

In [40]:
#print(automl.show_models())
#print(automl.cv_results_)
#automl?

{'mean_test_score': array([0.98961848, 0.98477377, 0.98356259, 0.99065663, 0.9761225 ,
       0.97811229, 0.97214292, 0.98520633, 0.9890994 , 0.98287049,
       0.97646855, 0.98823428, 0.98953197, 0.98511982, 0.96747123,
       0.98667705, 0.9808807 , 0.9822649 , 0.98814776, 0.98217839,
       0.98858033, 0.98494679, 0.98624448, 0.98728264, 0.97837183,
       0.98676356, 0.98425469, 0.98883987, 0.97802578, 0.98693659,
       0.97006661, 0.98762869, 0.98572541, 0.9856389 , 0.9740462 ,
       0.98901289, 0.986331  , 0.98685007, 0.98165931, 0.986331  ,
       0.98823428, 0.98780171, 0.96124232, 0.99048361, 0.98780171,
       0.96833636, 0.95544597, 0.98537936, 0.99091617, 0.98875335,
       0.98736915, 0.98615797, 0.97897742, 0.99074314, 0.98667705,
       0.98589843, 0.95873345, 0.98944545, 0.98460074, 0.99048361,
       0.98607146, 0.98321654, 0.98235141, 0.99065663, 0.98261095,
       0.98875335, 0.96366468, 0.99065663, 0.98901289, 0.96669262,
       0.98555238, 0.98503331, 0.98607146,

In [41]:
# We chose the appropriate model by the maximum precision given in section "mean_test_score"
# It means that we wanted to choose the model by its performance on various test_sets, so we chose the best one
"""
m = max(automl.cv_results_['mean_test_score'])
m = list(automl.cv_results_['mean_test_score']).index(m)
"""

In [42]:
### Parameters for random forest classifier (cv_results)
"""
print(automl.cv_results_['mean_test_score'][m])
print(automl.cv_results_['mean_fit_time'][m])
print(automl.cv_results_['params'][m]) ### Ovdje se vidi koji su sve parametri naseg modela
print(automl.cv_results_['rank_test_scores'][m])
print(automl.cv_results_['status'][m])
print(automl.cv_results_['param_balancing:strategy'][m])
print(automl.cv_results_['param_categorical_encoding:__choice__'][m])
print(automl.cv_results_['param_classifier:__choice__'][m])
print(automl.cv_results_['param_imputation:strategy'][m])
print(automl.cv_results_['param_preprocessor:__choice__'][m])
print(automl.cv_results_['param_rescaling:__choice__'][m])
print(automl.cv_results_['param_categorical_encoding:one_hot_encoding:use_minimum_fraction'][m])
print(automl.cv_results_['param_classifier:random_forest:bootstrap'][m])
print(automl.cv_results_['param_classifier:random_forest:criterion'][m])
print("bootstrap: {}".format(automl.cv_results_['param_classifier:random_forest:bootstrap'][m]))
print("criterion: {}".format(automl.cv_results_['param_classifier:random_forest:criterion'][m]))
print("max depth: {}".format(automl.cv_results_['param_classifier:random_forest:max_depth'][m]))
print("max_features: {}".format(automl.cv_results_['param_classifier:random_forest:max_features'][m]))
print("max_leaf_nodes: {}".format(automl.cv_results_['param_classifier:random_forest:max_leaf_nodes'][m]))
print("min_impurity_decrease: {}".format(automl.cv_results_['param_classifier:random_forest:min_impurity_decrease'][m]))
print("min_samples_leaf: {}".format(automl.cv_results_['param_classifier:random_forest:min_samples_leaf'][m]))
print("min_samples_split: {}".format(automl.cv_results_['param_classifier:random_forest:min_samples_split'][m]))
print("min_weight_fraction_leaf: {}".format(automl.cv_results_['param_classifier:random_forest:min_weight_fraction_leaf'][m]))
print("n_estimators: {}".format(automl.cv_results_['param_classifier:random_forest:n_estimators'][m]))
print(automl.cv_results_['param_rescaling:quantile_transformer:n_quantiles'][m])
print(automl.cv_results_['param_rescaling:quantile_transformer:output_distribution'][m])
print(automl.cv_results_['param_rescaling:robust_scaler:q_max'][m])
print(automl.cv_results_['param_rescaling:robust_scaler:q_min'][m])
print(automl.cv_results_['param_categorical_encoding:one_hot_encoding:minimum_fraction'][m])
"""

0.9909161692187906
7.553250551223755
{'balancing:strategy': 'none', 'categorical_encoding:__choice__': 'one_hot_encoding', 'classifier:__choice__': 'random_forest', 'imputation:strategy': 'mean', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'standardize', 'categorical_encoding:one_hot_encoding:use_minimum_fraction': 'True', 'classifier:random_forest:bootstrap': 'True', 'classifier:random_forest:criterion': 'gini', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.6042050560464365, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 1, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'classifier:random_forest:n_estimators': 100, 'categorical_encoding:one_hot_encoding:minimum_fraction': 0.010000000000000004}
1
Success
none
one_hot_encoding
random_forest
mean
no_preproce

## Model

In [52]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators = 100, 
                             criterion = 'gini', 
                             max_depth = None, 
                             min_samples_split = 2,
                             bootstrap = True,
                             min_samples_leaf = 1,
                             min_weight_fraction_leaf = 0.0,
                             max_features = 0.6042050560464365,
                             max_leaf_nodes = None,
                             min_impurity_decrease = 0.0,
                             random_state=0)
clf.fit(data[m_train], labels[m_train])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.6042050560464365,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=0,
            verbose=0, warm_start=False)

In [41]:
dictionary = {}
lista=[]
i=0
for p in cnt.most_common()[:num_used]:
    #dictionary[i]print(p[0])
    lista.append(p [0])
lista.append("other_sections")
for p in cnt_imp.most_common()[:num_used_imp]:
    lista.append(p [0])
lista.append("other_imports")
for p in cnt_imp_api.most_common()[:num_used_imp_api]:
    lista.append(p [0])
for p in cnt_warnings.most_common()[:num_used_war]:
    lista.append(p [0])
for p in cnt_resources.most_common()[:num_used_res]:
    lista.append(p [0])
lista.append("size")
lista.append("entropy")
lista.append("date")
lista.append("suspicious_sections_95")
lista.append("suspicious_sections_80")
lista.append("max_entropy")
lista.append("mean_entropy")
lista.append("min_entropy")
lista.append("import_names")
lista.append("import_apis")
lista.append("opH")
len(lista)

111

In [43]:
#print(clf.feature_importances_)
import ast
for x,y in zip(lista,clf.feature_importances_):
    print(str(x)+"\t\t"+str(round(float(y)*100,5))+"%")

.rsrc		0.1166%
.text		0.60593%
.data		0.2261%
.reloc		0.22932%
.rdata		0.29308%
NO_NAME		0.09071%
UPX0		0.03469%
UPX1		0.01927%
.idata		0.04795%
.pdata		0.0842%
pec1		0.45643%
.imports		1.82948%
pec2		0.25479%
UPX2		0.00899%
.adata		0.02108%
.tls		0.02671%
other_sections		0.46535%
KERNEL32.DLL		0.04359%
KERNEL32.dll		0.04612%
USER32.dll		0.03691%
user32.dll		0.16782%
ole32.dll		0.0234%
ADVAPI32.dll		0.02149%
kernel32.dll		1.51122%
oleaut32.dll		0.02337%
advapi32.dll		0.00391%
SHELL32.dll		0.27495%
GDI32.dll		0.04699%
comdlg32.dll		0.05234%
OLEAUT32.dll		0.02043%
COMCTL32.dll		0.02003%
gdi32.dll		0.00268%
msvcrt.dll		0.05066%
other_imports		0.3287%
GetProcAddress		3.77151%
LoadLibraryA		0.06238%
ExitProcess		0.04857%
GetModuleHandleA		0.02066%
VirtualAlloc		0.04131%
VirtualFree		0.05011%
RegCloseKey		0.02584%
UnhandledExceptionFilter		0.08141%
GetCurrentThreadId		0.57769%
MessageBoxA		0.0411%
GetCurrentProcess		0.02981%
TerminateProcess		0.03174%
CloseHandle		0.04489%
GetCurrentProcessI

In [53]:
from sklearn.feature_selection import SelectFromModel
model = SelectFromModel(clf, prefit=True)
data_new = model.transform(data)

In [54]:
mask = model.get_support() #list of booleans
new_features = [] # The list of your K best features

for boool, feature in zip(mask, lista):
    if boool:
        new_features.append(feature)
new_features    

['.imports',
 'kernel32.dll',
 'GetProcAddress',
 'MultiByteToWideChar',
 'Whitelisted certificate: Microsoft Corporation (41), trust level: 0',
 'entropy',
 'max_entropy',
 'min_entropy',
 'import_apis']

In [55]:
print("FEATURE\t\t\t\t\tZNAČAJNOST\n")
for x,y in zip(new_features,clf.feature_importances_):
    print(str(x)+"\t\t\t\t\t"+str(round(float(y)*100,5))+"%")

FEATURE					ZNAČAJNOST

.imports					0.1166%
kernel32.dll					0.60593%
GetProcAddress					0.2261%
MultiByteToWideChar					0.22932%
Whitelisted certificate: Microsoft Corporation (41), trust level: 0					0.29308%
entropy					0.09071%
max_entropy					0.03469%
min_entropy					0.01927%
import_apis					0.04795%


In [56]:
y = clf.predict(data)
print("Accuracy score", sklearn.metrics.accuracy_score(labels, y))

Accuracy score 0.9980129727754431


In [59]:
with open("rjesenje.tsv", "w") as record_file:
    for i in range(0, 43784):
        record_file.write(str(shas[i])+"    "+str(labels[i])+"\n")