In [70]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

In [71]:
func_dict = {}

In [72]:
def ld_data(dir_name):
    # Load data in files from directory dir_name in a pandas dataframe
    df = pd.DataFrame()
    for filename in os.listdir(dir_name):
        funcs = []
        with open(os.path.join(dir_name, filename), 'r') as f:
            lines = f.read().splitlines()
            
            for line in lines:
                lib, func = line.split(' ')
                if '@' in func or '?' in func:
                    continue
                
                # TODO: Comentado para testar Vectorizer sem lower()
                # if not func.lower() in func_dict:
                #     func_dict[func.lower()] = [(lib, func)]
                # else:
                #     func_dict[func.lower()].append((lib, func))

                if not func in func_dict:
                    func_dict[func] = [(lib, func)]
                else:
                    # TODO: Melhorar. So dar append se a lib for diferente das atuais
                    func_dict[func].append((lib, func))


                funcs.append(func)

        label = 'MALWARE' if filename.startswith('R') else 'GOODWARE'
        df = pd.concat([df, pd.DataFrame({'filename': filename, 'label': label, 'funcs': ' '.join(funcs)}, index=[0])], ignore_index=True)
        
    return df


In [73]:
# train_dt_dir_name = '../data/arquivos-nilson-filtro-2/'
train_dt_dir_name = '/archive/files/import-small-dir/'
export_dt_dir_name = '/archive/files/export-small-dir/'
test_mw_dt_dir_name = '/archive/files/nastyware-files/import-malware-bazaar-2021-01-to-2021-02-f'
teste_gd_dt_dir_name = '/archive/files/nastyware-files/import-windows-server-2016-f'

In [74]:
train = ld_data(train_dt_dir_name)
test_mw = ld_data(test_mw_dt_dir_name)
test_gw = ld_data(teste_gd_dt_dir_name)

test = pd.concat([test_mw, test_gw], ignore_index=True)

In [75]:
train

Unnamed: 0,filename,label,funcs
0,R-6196c30108fc71ecdab192dfbeda98b51a25a220195a...,MALWARE,HeapFree VirtualFree HeapSize HeapReAlloc Heap...
1,G-pnpmig.dll,GOODWARE,memcpy_s wcstoul swprintf_s memmove toupper wc...
2,G-ConnectedAccountState.dll,GOODWARE,_unlock _lock __dllonexit _amsg_exit _XcptFilt...
3,R-20f02f1e76aa29fb091f954451babf92d4ea31b25d5c...,MALWARE,_CorExeMain
4,R-144c28f64d5e23966923a2a0c779286494f27b3fba1c...,MALWARE,_CorExeMain
...,...,...,...
1027,R-17b4666d69c4c82a89ee9208dfe8ebb84e3f7acdd81d...,MALWARE,FillConsoleOutputCharacterA GetProcessWorkingS...
1028,G-KBDUGHR.DLL,GOODWARE,
1029,G-wpnservice.dll.mui,GOODWARE,
1030,G-csrsrv.dll.mui,GOODWARE,


In [76]:
vectorizer = CountVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(train['funcs'])

In [77]:
# export the ordered list of features used by the vectorizer
with open('feature_names.txt', 'w') as f:
    for feature in vectorizer.get_feature_names():
        f.write(feature + '\n')



In [78]:
# Print train_vectors index that are not zero
print(train_vectors[0].nonzero()[1])
print(vectorizer.get_feature_names()[2800])

print(train.iloc[0]['filename'])


[2403 5766 2407 2405 2399 2140 5771  665 5588 5139 1867 5514 2799 2774
 2205 2065 4116 1868 1871 2237 2606 4685 2626 2014 5051 1140 1188 2959
  945 2601 5521 5523 5524 5522 1537 2136 2978 4132 1368 2064 2059 3299
 5897 1747 2208 1962 2942 1472 1480 1488 2811 2084 1777 1828 1829 1935
 1536 2217 5112 1519 5978 1842 1848 5034  929  443 5977]
IsRectEmpty
R-6196c30108fc71ecdab192dfbeda98b51a25a220195ad64bd73b2a566936fc9b.dll


In [79]:
# For each row in train datatable, write a new file in the export directory

for index, row in train.iterrows():
    filename = row['filename']
    # Create file in export directory 
    with open(os.path.join(export_dt_dir_name, filename), 'w') as f:
        # write train_vectors list as a string in the file
        f.write(''.join([str(x) for x in train_vectors[index].toarray()[0]]))