# Vectorization of Samples
To conserve resources and prepare the samples for final ingest, the last thing we need to do is turn all of the samples into vectors. This will be done using a custom tokenizer similar to the one in tokenization.ipynb in the MalDroid_feature_engineering repo. It will take each sample.apk.json file as a string and convert it to a sequence of integer vocabulary indices. Because of the large size of our training dataset and the potential for future computation of additional n-grams, the implementation is optimized for performance and reusability. The vocabulary.txt file from the aforementioned repo has been copied to the local working directory.
## Algorithm Process (for sample in samples; given: regex delimiter from vocab)
1. Fetch sample and load as dict, extract list of behaviors
2. Call multisort function: sort by id, then stable-sort by ts (the final order is by ts, with ties broken by id)
3. For behavior in sorted list, drop unused keys
4. Cast each dict in sorted list to string and add to str list of behaviors
5. Initialize vector with SOA index. For behavior in list, call tokenizer function passing vector:
6. Append vector with SOB index, init prev_matchEnd=0
7. For match in finditer(delim, behavior):
8. If match.start() - prev_matchEnd > 2, append vector with UNK index
9. Append vector with match.lastgroup, set prev_matchEnd to match.end(), next match
10. Append vector with EOB index, return vector, next behavior
11. Append vector with EOA index, save to .tfrecord under *class*/*hash*.tfrecord, next sample

In [1]:
import ast
import json
import os
import re

import numpy as np
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Build the tokenizer regex from the vocabulary file.
#
# vocabulary.txt holds a Python dict literal mapping a token index to a
# (delimiter, literal) pair. Entries whose delimiter is None cannot be matched
# by the regex and are skipped.
with open('vocabulary.txt') as vocab_path:
    vocab_file = vocab_path.read()

# literal_eval only evaluates Python literals, so the file cannot run code.
vocab = ast.literal_eval(vocab_file)

token_specification = []
delimiter_list = []

for index, (delimiter, literal) in vocab.items():
    if delimiter is not None:
        delimiter_list.append(str(delimiter))
        # Regex group names must start with a letter, hence the 'I' prefix;
        # the tokenizer strips it again to recover the integer index.
        token_specification.append(('I' + str(index), delimiter))

# One named alternative per token. Delimiters are inserted verbatim, so they
# are interpreted as regular-expression source, not as escaped literal text.
regex_pattern = '|'.join(
    '(?P<{0}>{1})'.format(name, delim) for name, delim in token_specification
)

In [3]:
def sort_behaviors(raw_behaviors):
    """Sort a list of behavior dicts in place and return that same list.

    Each behavior is keyed on its first ``'low'`` entry. Two stable passes are
    made: first by ``'id'``, then by ``'ts'`` converted to ``float``. The
    resulting order is therefore by timestamp, with equal timestamps ordered
    by id.

    Note that the returned object is ``raw_behaviors`` itself, not a copy.
    """
    def first_id(entry):
        return entry['low'][0]['id']

    def first_timestamp(entry):
        return float(entry['low'][0]['ts'])

    # Least-significant key first; list.sort is stable, so the earlier
    # ordering survives among entries that tie on the later key.
    for sort_key in (first_id, first_timestamp):
        raw_behaviors.sort(key=sort_key)
    return raw_behaviors

In [4]:
def strip_unused_keys(behavior_list, unused_keys):
    """Remove ``unused_keys`` from every behavior and from its ``'low'`` entries.

    Each element of ``behavior_list`` is replaced by a new dict without the
    unwanted keys, and every dict inside that behavior's ``'low'`` list is
    replaced the same way. Both ``behavior_list`` and the ``'low'`` lists are
    updated in place; the same ``behavior_list`` object is returned.

    Raises ``KeyError`` if a behavior has no ``'low'`` key after filtering.
    """
    def without_unused(mapping):
        return {k: v for k, v in mapping.items() if k not in unused_keys}

    for position, entry in enumerate(behavior_list):
        trimmed = without_unused(entry)
        low_entries = trimmed['low']
        for sub_position, sub_entry in enumerate(low_entries):
            low_entries[sub_position] = without_unused(sub_entry)
        behavior_list[position] = trimmed
    return behavior_list

In [5]:
def tokenize(behavior, delimiter, *, sob_index=3, eob_index=4, unk_index=0,
             max_gap=2):
    """Convert one behavior string into a list of integer token indices.

    The result starts with ``sob_index`` and ends with ``eob_index``. In
    between, every match of ``delimiter`` contributes the integer encoded in
    the name of the group that matched (the name minus its first character).
    When more than ``max_gap`` characters lie between the end of the previous
    match (or the start of the string) and the start of the current one,
    ``unk_index`` is emitted first to mark the unrecognised text. Unmatched
    text after the final match is not marked.

    Args:
        behavior: The string to tokenize.
        delimiter: Regex pattern (string or compiled) made of named groups
            whose names are one prefix character followed by an integer.
        sob_index: Index emitted at the start of the behavior.
        eob_index: Index emitted at the end of the behavior.
        unk_index: Index emitted for a gap of unmatched text.
        max_gap: Largest number of unmatched characters tolerated between
            matches without emitting ``unk_index``.

    Returns:
        A new list of ints.

    Raises:
        TypeError: If a match did not come from a named group
            (``match.lastgroup`` is ``None``).
        ValueError: If a group name does not end in an integer.
    """
    behavior_vector = [sob_index]
    prev_match_end = 0

    for match in re.finditer(delimiter, behavior):
        if match.start() - prev_match_end > max_gap:
            behavior_vector.append(unk_index)
        # Drop the one-character prefix of the group name to get the index.
        behavior_vector.append(int(match.lastgroup[1:]))
        prev_match_end = match.end()

    behavior_vector.append(eob_index)
    return behavior_vector

In [6]:
# Vectorize every sample of every malware class and write one .tfrecord file
# per sample under TFRecord_files/<class>/<hash>.tfrecord.
mal_classes = ['adware', 'banking', 'riskware', 'sms']
error_hashes = []  # sample folders whose JSON could not be read or parsed
parent_dir = 'X:\\MITRE\\MalDroid Data\\MalDroid_feature_engineering\\'

for mal_class in mal_classes:
    class_dir = os.path.join(parent_dir, mal_class)
    output_dir = "TFRecord_files/" + mal_class
    os.makedirs(output_dir, exist_ok=True)

    for sample_folder in os.listdir(class_dir):
        sample_file = os.path.join(
            class_dir, sample_folder, 'sample_for_analysis.apk.json'
        )
        try:
            with open(sample_file) as sample_path:
                sample_behaviors = json.load(sample_path)['behaviors']['dynamic']['host']
        except (OSError, ValueError, KeyError, TypeError):
            # Missing/unreadable file, invalid JSON, or unexpected structure:
            # remember the sample and move on instead of aborting the run.
            error_hashes.append(sample_folder)
            continue

        # sort_behaviors and strip_unused_keys both work in place and return
        # the list they were given, so these names all refer to one list.
        # Clearing any of them (e.g. `sample_behaviors *= 0`) would empty the
        # data for every later step, and no extra copy exists to be freed.
        sorted_behaviors = sort_behaviors(sample_behaviors)
        stripped_behaviors = strip_unused_keys(sorted_behaviors, [])

        # Serialize each behavior back to JSON text (double-quoted keys).
        # NOTE(review): json.dumps uses ', ' and ': ' separators by default;
        # confirm the vocabulary delimiters were built against that layout.
        string_behaviors = [json.dumps(behavior) for behavior in stripped_behaviors]

        # 1 = SOA (start of sample), 2 = EOA (end of sample). The per-behavior
        # vectors are spliced in so the sample stays one flat list of ints.
        vectorized_sample = [1]
        for behavior in string_behaviors:
            vectorized_sample.extend(tokenize(behavior, regex_pattern))
        vectorized_sample.append(2)

        tf_dataset = tf.data.Dataset.from_tensor_slices(vectorized_sample)
        tf_dataset = tf_dataset.map(tf.io.serialize_tensor)
        writer = tf.data.experimental.TFRecordWriter("TFRecord_files/" + mal_class + "/" + sample_folder + ".tfrecord")
        writer.write(tf_dataset)