In [1]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

In [2]:
func_dict = {}

In [3]:
def ld_data(dir_name):
    # Load data in files from directory dir_name in a pandas dataframe
    df = pd.DataFrame()
    for filename in os.listdir(dir_name):
        funcs = []
        with open(os.path.join(dir_name, filename), 'r') as f:
            lines = f.read().splitlines()
            
            for line in lines:
                lib, func = line.split(' ')
                if '@' in func or '?' in func:
                    continue
                
                # TODO: Comentado para testar Vectorizer sem lower()
                # if not func.lower() in func_dict:
                #     func_dict[func.lower()] = [(lib, func)]
                # else:
                #     func_dict[func.lower()].append((lib, func))

                if not func in func_dict:
                    func_dict[func] = [(lib, func)]
                else:
                    # TODO: Melhorar. So dar append se a lib for diferente das atuais
                    func_dict[func].append((lib, func))


                funcs.append(func)

        label = 'MALWARE' if filename.startswith('R') else 'GOODWARE'
        df = pd.concat([df, pd.DataFrame({'filename': filename, 'label': label, 'funcs': ' '.join(funcs)}, index=[0])], ignore_index=True)
        
    return df


In [4]:
# train_dt_dir_name = '../data/arquivos-nilson-filtro-2/'
train_dt_dir_name = '/archive/files/nastyware-files/mix/'
export_dt_dir_name = '/archive/files/nastyware-files/mix-new-format/'

In [5]:
train = ld_data(train_dt_dir_name)

In [6]:
train

Unnamed: 0,filename,label,funcs
0,G-cdp.dll,GOODWARE,_initterm_e _initterm _o__localtime64 _o__lock...
1,R-f0cd7710ff81d06494b7130e510dbdd80503aa290be1...,MALWARE,LoadLibraryA ExitProcess GetProcAddress Virtua...
2,G-ConnectedAccountState.dll,GOODWARE,_unlock __C_specific_handler __dllonexit _amsg...
3,G-MapControlCore.dll,GOODWARE,_initterm _initterm_e _o__purecall _o__registe...
4,G-amsiproxy.dll,GOODWARE,__C_specific_handler malloc free _amsg_exit _i...
...,...,...,...
1148,G-cmd.exe,GOODWARE,__set_app_type __getmainargs _amsg_exit _XcptF...
1149,G-Microsoft.Management.Infrastructure.Native.U...,GOODWARE,_initterm malloc __C_specific_handler _XcptFil...
1150,G-msasn1.dll,GOODWARE,_ui64toa _ultoa _vsnprintf _atoi64 strchr memm...
1151,R-9491c91cae1e9ad06eeba4463f79aaeb35329c72a24d...,MALWARE,RegCreateKeyExA RegEnumKeyA RegQueryValueExA R...


In [7]:
vectorizer = CountVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(train['funcs'])

In [8]:
# export the ordered list of features used by the vectorizer
with open('feature_names.txt', 'w') as f:
    for feature in vectorizer.get_feature_names():
        f.write(feature + '\n')



In [9]:
# Print train_vectors index that are not zero
print(train_vectors[0].nonzero()[1])
print(vectorizer.get_feature_names()[2800])

print(train.iloc[0]['filename'])


[8923 8922 9075 9077 9082 9087 9090 9092 9095 9101 9107 9110 9648 9151
 9154 9155 9156 9164 9168 9172 9174 9189 9197 9199 9200 9201 9210 9215
 9224 9228 9230 9231 9236 9237 9239 9241 9252 8568 9061 9059 9055 9049
 9048 9044 9041 9039 9037 9070 9069 9650 8643 8644 9646 9757 9771 9847
 8571 8527 9032 9031 9030 9029 9027 9021 9020 9019 9067 9066 8581 9647
 2961 4201 2413 1760 4193 2359 4226 2956 2958 3059 4867 4856  622 4890
  626  620 4868 3513  623 3511  618  625 4869 4892  621 3509  619 4859
  624 6301 4870 5818 2741 3958 2742 7277 5383 7678 2745 5372 2747 2707
 2915 3174 3226 3267 3176 3773 6631 6764 6386 5395 3932 7286 7174 7769
 5875 2909 7855 7856 8518 8535 8548 8542 8554 8537 8515 8517 8543 8566
 8522 8534 8521 8516 8524 8520 8519 8523 8553 8536 8514 8552 8551 8555
 8540 8538 8539 9770 9766 9311 9436 2917 3173 2395 7467 2060 1230 4171
 3766  174 1231 5377 6200 6148 8178 7139 1188 6146 6143 8177  173 1657
 1189 3774 6147 1276 6002 6040 6064 6054 6031 1008 1014 1007  931  929
 1006 

In [10]:
# For each row in train datatable, write a new file in the export directory

for index, row in train.iterrows():
    filename = row['filename']
    # Create file in export directory 
    with open(os.path.join(export_dt_dir_name, filename), 'w') as f:
        # write train_vectors list as a string in the file
        f.write(''.join([str(x) * 1 for x in train_vectors[index].toarray()[0]]))