# MalDroid Data Ingest
This notebook retrieves the MalDroid malware sample analysis files (sample_for_analysis.apk.json) from the MalDroid repository (http://205.174.165.80/CICDataset/MalDroid-2020/Dataset/Capturing_logs/).
## Process
1. Read the URI of each sample's .tar.gz archive from MalDroid_ref_raw.csv, download the archive, and extract it (.tar.gz to .tar, then to files) into the directory *malware class*/*hash*
2. Check that the sample_for_analysis.apk.json file can be opened and parsed without throwing an error. If it does throw an error, the directory will be deleted and the script will move on to the next sample
3. Move the sample_for_analysis.apk.json file from the *malware class*/*hash*/sample_for_analysis.apk subdirectory to under *malware class*/*hash*/ and delete the subdirectory

In [None]:
import pandas as pd 
import tarfile
import os
import urllib.request
import json
import shutil

In [None]:
# Load the MalDroid reference table from disk.
with open('MalDroid_ref_raw.csv') as ref_csv:
    ref_raw = pd.read_csv(ref_csv)

# Keep only the columns the ingest loop reads.
ref_raw = ref_raw.loc[:, ['hash', 'URI', 'malware_class']]

# Discard the final row (index label 13076); it does not contain a sample
# due to parsing issues.
ref_raw = ref_raw.drop(index=[13076])

In [None]:
# Hashes of samples whose JSON report could not be opened or parsed.
issue_hashes = []
# Hashes of samples whose JSON report is too large to host on GitHub.
large_hashes = []
# Hash of the most recent row the loop started on (kept as a resume marker).
last_hash = 'start'

# GitHub will not host files of 100 MB or more.
MAX_JSON_BYTES = 100000000

for sample in ref_raw.itertuples(index=False):
    last_hash = sample.hash
    if '.tar.gz' not in sample.URI:
        continue

    base_path = os.path.join(sample.malware_class.lower(), sample.hash)
    extracted_dir = os.path.join(base_path, 'sample_for_analysis.apk')
    extracted_json = os.path.join(extracted_dir, 'sample_for_analysis.apk.json')

    # With filename=None, urlretrieve downloads into a temporary file that is
    # not cleaned up automatically, so remove it once extraction is finished
    # (or has failed) to avoid leaking one download per sample.
    temp_file = urllib.request.urlretrieve(sample.URI, filename=None)[0]
    try:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive. Consider an extraction filter (Python 3.12+) if these
        # archives are not fully trusted.
        with tarfile.open(temp_file) as archive:
            archive.extractall(base_path)
    finally:
        os.remove(temp_file)

    # Validate that the JSON report can be opened and parsed. Only the
    # failures this step is meant to detect are caught (missing/unreadable
    # file, malformed or undecodable JSON, excessive nesting), so that
    # KeyboardInterrupt and similar still stop the run.
    try:
        with open(extracted_json) as json_file:
            json.load(json_file)
    except (OSError, ValueError, RecursionError):
        issue_hashes.append(sample.hash)
        # Remove the unusable sample; done after the file handle is closed.
        shutil.rmtree(base_path, ignore_errors=True)
        continue

    if os.stat(extracted_json).st_size >= MAX_JSON_BYTES:
        # Remove and log oversized files for GitHub compatibility.
        large_hashes.append(sample.hash)
        shutil.rmtree(base_path)
        continue

    # Move the JSON up to <malware class>/<hash>/ and drop the now-unneeded
    # extraction subdirectory.
    shutil.move(extracted_json, os.path.join(base_path, 'sample_for_analysis.apk.json'))
    shutil.rmtree(extracted_dir)

In [None]:
# Persist the hashes of samples whose JSON could not be read.
# The handles are opened with newline='' as pandas requires for file objects
# passed to to_csv; otherwise Windows output gets doubled line endings.
issue_df = pd.DataFrame(issue_hashes, columns=['hash'])
with open('issue_hashes.csv', 'w', newline='') as issue_write:
    issue_df.to_csv(issue_write)

# Persist the hashes of samples removed for exceeding the size limit.
large_df = pd.DataFrame(large_hashes, columns=['hash'])
with open('large_hashes.csv', 'w', newline='') as large_write:
    large_df.to_csv(large_write)

# Record the last hash the ingest loop started on so a later run can resume.
with open('last_hash_processed.txt', 'w') as resume_write:
    resume_write.write(last_hash)