# Vectorization of Samples
To conserve resources and prepare the samples for final ingest, the last thing we need to do is turn all of the samples into vectors. This will be done using a custom tokenizer similar to that in tokenization.ipynb in MalDroid_feature_engineering repo. It will take the sample.apk.json files as a string and convert them to index ints. Due to the large size of our training dataset and the potential for the future computation of additional n-grams, performance and reusability will be optimized. The vocabulary.txt file from the aforementioned repo has been copied to local.
## Algorithm Process (for sample in samples; given: regex delimiter from vocab)
1. Fetch sample and load as dict, extract list of behaviors
2. Call multisort function, ordering by id then ts 
3. For behavior in sorted list, drop unused keys
4. Cast each dict in sorted list to string and add to str list of behaviors
5. Initialize vector with SOA index. For behavior in list, call tokenizer function passing vector:
6. Append vector with SOB index, init prev_matchEnd=0
7. For match in finditer(delim, behavior):
8. If match.start() - prev_matchEnd > 2, append vector with UNK index
9. Append vector with match.lastgroup, set prev_matchEnd to match.end(), next match
10. Append vector with EOB index, return vector, next behavior
11. Append vector with EOA index, save to .npy under *class*/*hash*.npy, next sample
## Current Objectives
1. Fetch a random array of samples. This is intended to account for having to cut down the sample size while minimizing any bias therein. Needs to be done with a random seed so array is the same on each run, making it robust. Additionally, samples need to be interleaved by class so no one class is overrepresented. 
2. Implement resumable infrastructure. This may be in the form of a 'last_token' variable outside of the main loop so it can be fetched if the loop crashes. Sometimes if a large script crashes the kernel requires a restart, clearing all variables. Therefore, this variable needs to be written to a file after each run.
3. Introduce multithreading with the multiprocessing library if possible. 

In [None]:
import ast
import json
import multiprocessing
import os
import random
import re
import time
import timeit
from statistics import mean

import numpy as np
import tensorflow as tf
from joblib import Parallel, delayed

In [None]:
def sort_behaviors(raw_behaviors):
    """Sort behaviors in place by timestamp, breaking ties by id.

    Both keys are read from the first entry of each behavior's ``'low'``
    list. The list is sorted in place and the same list object is returned.
    """
    def first_id(entry):
        return entry['low'][0]['id']

    def first_timestamp(entry):
        return float(entry['low'][0]['ts'])

    # list.sort is stable, so sorting on the secondary key (id) first and the
    # primary key (ts) second yields a ts-major, id-minor ordering.
    for key_func in (first_id, first_timestamp):
        raw_behaviors.sort(key=key_func)
    return raw_behaviors

In [None]:
def strip_unused_keys(behavior_list, unused_keys):
    """Remove unwanted keys from every behavior and from each of its 'low' entries.

    Each element of ``behavior_list`` is replaced by a filtered copy, and each
    dict inside that behavior's ``'low'`` list is likewise replaced by a
    filtered copy. ``behavior_list`` is modified in place and returned.
    """
    for position, entry in enumerate(behavior_list):
        kept = {}
        for name, content in entry.items():
            if name not in unused_keys:
                kept[name] = content

        # 'low' still refers to the original inner list, so its elements are
        # swapped out in place.
        low_entries = kept['low']
        for sub_position, sub_entry in enumerate(low_entries):
            low_entries[sub_position] = {
                name: content
                for name, content in sub_entry.items()
                if name not in unused_keys
            }

        behavior_list[position] = kept
    return behavior_list

In [None]:
def tokenize(behavior, delimiter):
    """Convert one behavior string into a list of integer token indices.

    ``delimiter`` is a regex whose alternatives are named groups; the index
    appended for a match is the group name with its first character dropped.
    The vector opens with 3 and closes with 4 (presumably the start/end of
    behavior markers). A 0 (presumably the unknown token) is inserted before
    a match when the unmatched text preceding it is longer than two
    characters, contains an alphanumeric character, and does not contain
    the substring 'low'.
    """
    vector = [3]
    cursor = 0

    for found in re.finditer(delimiter, behavior):
        # Text skipped between the previous match and this one.
        gap = behavior[cursor:found.start()]
        is_unknown = (
            len(gap) > 2
            and 'low' not in gap
            and any(symbol.isalnum() for symbol in gap)
        )
        if is_unknown:
            vector.append(0)
        vector.append(int(found.lastgroup[1:]))
        cursor = found.end()

    vector.append(4)
    return vector

In [None]:
def vectorize(sample, regex_pattern, parent_dir):
    """Tokenize one sample's behaviors into a flat index vector and print timings.

    Args:
        sample: ``(sample_hash, sample_class)`` pair identifying the sample.
        regex_pattern: Regex of named groups handed to ``tokenize``.
        parent_dir: Path prefix (including trailing separator) under which the
            class and hash sub-directories containing
            ``sample_for_analysis.apk.json`` live.

    Returns:
        None. The vector is built for timing purposes only; saving it to disk
        is currently disabled (see the commented-out ``np.save`` below).

    Side effects:
        Overwrites ``resume_dependencies.txt`` with the current sample and, if
        the sample cannot be loaded, appends its hash to ``error_hashes.txt``.
    """
    # Record which sample is in flight so a crashed run can be resumed.
    with open('resume_dependencies.txt', 'w') as write_resume:
        write_resume.write('Current sample: \n')
        write_resume.write('{0} \n'.format(sample))
    sample_hash, sample_class = sample
    start_sample = time.time()
    with open(parent_dir + sample_class + '\\' + sample_hash + '\\sample_for_analysis.apk.json') as sample_path:
        try:
            behaviors = json.load(sample_path)['behaviors']['dynamic']['host']
        except Exception:
            # Best-effort boundary: any malformed sample is logged and skipped.
            # The hash goes to the error file (the resume file is already
            # closed here), one hash per line.
            with open('error_hashes.txt', 'a') as write_errors:
                write_errors.write(sample_hash + '\n')
            print("Error loading hash {0}".format(sample_hash))
            return None
    load_end = time.time()

    # sort_behaviors and strip_unused_keys both work in place and return the
    # list they were given, so a single name is used throughout.
    sort_start = time.time()
    behaviors = sort_behaviors(behaviors)
    sort_end = time.time()

    strip_start = time.time()
    behaviors = strip_unused_keys(behaviors, ['arguments', 'blob', 'parameters', 'id', 'xref', 'ts', 'tid', 'interfaceGroup', 'methodName'])
    strip_end = time.time()

    cast_str_start = time.time()
    string_behaviors = [json.dumps(behavior) for behavior in behaviors]
    cast_str_end = time.time()
    # Drop the only remaining reference to the dict form to release memory.
    del behaviors

    vector_start = time.time()
    token_times = []
    vectorized_sample = [1]
    for behavior in string_behaviors:
        token_start = time.time()
        behavior_vector = tokenize(behavior, regex_pattern)
        token_times.append(time.time() - token_start)
        vectorized_sample.extend(behavior_vector)
    vectorized_sample.append(2)
    vector_end = time.time()

    write_start = time.time()
    # with open("vectorized_samples/" + sample_class + "/" + sample_hash + ".npy", 'wb') as vector_path:
    #     np.save(vector_path, vectorized_sample, allow_pickle = False)
    end_sample = time.time()

    # mean() raises on an empty list, which happens for a sample without behaviors.
    avg_tokenize = mean(token_times) if token_times else 0.0

    print("full pass: {0}".format(end_sample-start_sample))
    print("load sample: {0}".format(load_end-start_sample))
    print("sort: {0}".format(sort_end-sort_start))
    print("strip: {0}".format(strip_end-strip_start))
    print("cast: {0}".format(cast_str_end-cast_str_start))
    print("vectorize: {0}\n".format(vector_end-vector_start))
    print("avg tokenize: {0}".format(avg_tokenize))
    print("write: {0}".format(end_sample-write_start))

    return None

In [None]:
def shuffle_and_interleave(adware, banking, riskware, sms):
    """Shuffle each class's hashes with a fixed seed and interleave them.

    The four input lists are shuffled in place (global ``random`` seeded with
    42), then merged round-robin in the order adware, banking, riskware, sms.
    Shorter lists simply stop contributing once exhausted.

    Returns:
        A list of ``(hash, class_name)`` tuples.
    """
    labelled = (
        (adware, 'adware'),
        (banking, 'banking'),
        (riskware, 'riskware'),
        (sms, 'sms'),
    )

    # Fixed seed so the shuffled order is repeatable for identical inputs.
    random.seed(42)
    for hashes, _ in labelled:
        random.shuffle(hashes)

    rounds = max(len(hashes) for hashes, _ in labelled)

    interleaved = []
    for position in range(rounds):
        for hashes, label in labelled:
            if position < len(hashes):
                interleaved.append((hashes[position], label))

    return interleaved

In [None]:
# Driver cell: build the tokenizer regex from the vocabulary, collect the
# sample hashes for every class, and run the vectorizer on the first few.
with open('vocabulary.txt') as vocab_path:
    vocab_file = vocab_path.read()

# The vocabulary file holds a Python literal; each item unpacks as
# index -> (delimiter, literal).
vocab = ast.literal_eval(vocab_file)

token_specification = []
delimiter_list = []

for index, (delimiter, literal) in vocab.items():
    # Entries without a delimiter cannot be matched by the regex; skip them.
    if delimiter is not None:
        delimiter_list.append(str(delimiter))
        # Group names must be identifiers, hence the 'I' prefix on the index;
        # tokenize() strips it again to recover the integer.
        token_specification.append(('I' + str(index), delimiter))

regex_pattern = '|'.join('(?P<{0}>{1})'.format(name, delim) for name, delim in token_specification)
# calling re.compile() on the pattern could lead to increased performance, but causes issues with backslashes, therefore it is omitted

parent_dir = 'X:\\MITRE\\MalDroid Data\\MalDroid_feature_engineering\\'
# os.listdir returns entries in arbitrary order; sorting first makes the
# seeded shuffle below produce the same sample order on every run.
adware_hashes = sorted(os.listdir(parent_dir + 'adware\\'))
banking_hashes = sorted(os.listdir(parent_dir + 'banking\\'))
riskware_hashes = sorted(os.listdir(parent_dir + 'riskware\\'))
sms_hashes = sorted(os.listdir(parent_dir + 'sms\\'))

sample_list = shuffle_and_interleave(adware_hashes, banking_hashes, riskware_hashes, sms_hashes)

current_sample = 'Start'
error_hashes = []

# Only the first ten samples are processed for now.
for sample in sample_list[:10]:
    vectorize(sample, regex_pattern, parent_dir)