In [10]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
import matplotlib.pyplot as plt

In [11]:
# Module-level registry filled in by ld_data():
# maps a function name to the list of (lib, func) pairs in which it was seen.
func_dict = dict()

In [12]:
def ld_data(dir_name, label_func=None):
    """Load the per-sample import listings found in ``dir_name`` into a DataFrame.

    Every regular file in ``dir_name`` is read as text. Each non-blank line must
    hold exactly two whitespace-separated fields, ``<lib> <func>``. Function
    names containing ``@`` or ``?`` are skipped. Every kept name is appended to
    the file's function list and also recorded in the module-level ``func_dict``
    as ``func -> [(lib, func), ...]``.

    Args:
        dir_name: Directory containing one listing file per sample.
        label_func: Optional callable ``filename -> label``. When omitted, the
            original rule is used: ``'MALWARE'`` if the file name starts with
            ``'R'``, otherwise ``'GOODWARE'``.

    Returns:
        A DataFrame with one row per file and the columns ``filename``,
        ``label`` and ``funcs`` (the kept function names joined by single
        spaces). The columns are present even when the directory is empty.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    records = []
    for filename in os.listdir(dir_name):
        path = os.path.join(dir_name, filename)
        # os.listdir also returns sub-directories (e.g. .ipynb_checkpoints);
        # only regular files are sample listings.
        if not os.path.isfile(path):
            continue

        funcs = []
        with open(path, 'r') as f:
            for line_no, line in enumerate(f.read().splitlines(), start=1):
                fields = line.split()
                if not fields:
                    # Tolerate blank lines instead of failing on the unpack below.
                    continue
                if len(fields) != 2:
                    raise ValueError(
                        f"{path}:{line_no}: expected '<lib> <func>', got {line!r}"
                    )
                lib, func = fields
                if '@' in func or '?' in func:
                    continue

                # TODO: keys used to be func.lower(); that was disabled to test
                # the vectorizer without lowercasing.
                # TODO: improve - only append when lib differs from the libs
                # already recorded for this function.
                func_dict.setdefault(func, []).append((lib, func))

                funcs.append(func)

        if label_func is None:
            # NOTE(review): in the notebook's displayed output every sample,
            # including names such as 'ransomware-gandcrab-639', ends up as
            # GOODWARE under this rule. Confirm the intended rule or pass
            # label_func.
            label = 'MALWARE' if filename.startswith('R') else 'GOODWARE'
        else:
            label = label_func(filename)

        records.append({'filename': filename, 'label': label, 'funcs': ' '.join(funcs)})

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=['filename', 'label', 'funcs'])


In [13]:
# Directory read by ld_data(): one listing file per sample, each line "<lib> <func>".
# Previous dataset location, kept for reference:
# train_dt_dir_name = '../data/arquivos-nilson-filtro-2/'
train_dt_dir_name = '/archive/files/nastyware-files-mix/mix4-format0/'
# Directory that receives one exported count-vector file per sample.
export_dt_dir_name = '/archive/files/nastyware-files-mix/mix4-format1/'

In [14]:
# Parse every listing in the training directory (this also fills func_dict).
train = ld_data(dir_name=train_dt_dir_name)

In [15]:
# Display the loaded frame: one row per input file (filename, label, funcs).
# NOTE(review): the output below shows every row labelled GOODWARE, including
# samples whose file names look like malware families - check the labelling rule.
train

Unnamed: 0,filename,label,funcs
0,informationstealer-yunsip-1026,GOODWARE,CreateEventW WaitForSingleObject SetEvent Load...
1,goodware-calc.exe,GOODWARE,ShellExecuteW GetLastError SetEvent CreateEven...
2,informationstealer-yunsip-1025,GOODWARE,CreateEventW WaitForSingleObject SetEvent Load...
3,ransomware-gandcrab-639,GOODWARE,GetLastError PulseEvent EraseTape GetSystemTim...
4,cryptominer-coinminer-3073,GOODWARE,FindFirstFileA FindNextFileA CreateDirectoryA ...
5,backdoor-berbew-1200,GOODWARE,CoCreateInstance CLSIDFromString CoInitialize ...
6,backdoor-berbew-257,GOODWARE,WSAGetLastError WSAStartup __WSAFDIsSet accept...
7,dropper-gepys-1776,GOODWARE,CryptBinaryToStringA CloseHandle ConnectNamedP...
8,dropper-dinwod-2175,GOODWARE,LoadLibraryA GetProcAddress VirtualProtect Vir...
9,trojan-autoit-2348,GOODWARE,LoadLibraryA GetProcAddress VirtualProtect Vir...


In [16]:
# Bag-of-words over the space-joined function names. lowercase=False keeps the
# tokens case-sensitive, so names differing only in case stay separate features.
func_corpus = train['funcs']
vectorizer = CountVectorizer(lowercase=False)
train_vectors = vectorizer.fit_transform(func_corpus)

In [17]:
# Export the vectorizer's feature names, one per line, in feature (column) order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
_list_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
with open('feature_names.txt', 'w') as f:
    f.writelines(f'{feature}\n' for feature in _list_feature_names())



In [18]:
# Sanity check on the first sample: print the column indices of its non-zero
# features, followed by the file name the row came from.
first_sample_vector = train_vectors[0]
_, nonzero_columns = first_sample_vector.nonzero()
print(nonzero_columns)

first_sample = train.iloc[0]
print(first_sample['filename'])

[ 104 1007  852  607  279  418  164  994  595  213  992   43  766  555
   26  925  168  371   99  352 1165 1170  131 1159  255  263 1161  258
  762  617  615  353  106  719  855 1027  637 1009  179  394  374  397
 1163  464  395  501  510  505  497  509   80  438  689 1006  671  619
  620  140  327  757  666  851  121  199  324  864  854  343  349  477
  276 1168  377  506  313  236  447  146  424  675  878  695  586  923
   47  491  302  588  492  756  328  329  330  883  450  305  663  203
  830   39 1018  626  945  823  174 1253  600   29 1066 1082 1088 1053
 1239 1249 1217 1205 1223 1221 1241 1244 1245 1073 1171 1135 1040 1177
 1042 1079 1126 1180 1178 1182 1214]
informationstealer-yunsip-1026


In [19]:
# Write one file per sample into the export directory; the file is named after
# the sample and holds that sample's feature counts as a single string.

# Create the target directory up front so the first open() cannot fail on it.
os.makedirs(export_dt_dir_name, exist_ok=True)

# Walk the rows by position: train_vectors is indexed positionally, so this stays
# correct even if the DataFrame index is ever not 0..n-1.
for position, filename in enumerate(train['filename']):
    # Dense 1-D array of counts, one entry per vectorizer feature.
    counts = train_vectors[position].toarray()[0]
    with open(os.path.join(export_dt_dir_name, filename), 'w') as f:
        # NOTE(review): counts are concatenated with no separator, so a count
        # of 10 or more takes several characters and shifts every later
        # position. Confirm what the consumer of these files expects.
        f.write(''.join(str(count) for count in counts))