# MalDroid Data Ingest
This notebook handles the retrieval of MalDroid malware sample analysis (sample_for_analysis.apk.json) files from the MalDroid repo (http://205.174.165.80/CICDataset/MalDroid-2020/Dataset/Capturing_logs/). 
## Process
1. Take the URIs of the .tar.gz files for each sample from MalDroid_ref_raw.csv, extract them to .tar and then to a directory *malware class*/*hash*
2. Check that the sample_for_analysis.apk.json file can be opened and parsed without raising an error. If it does raise an error, the directory is deleted and the script moves on to the next sample
3. Move the sample_for_analysis.apk.json file from the *malware class*/*hash*/sample_for_analysis.apk subdirectory to under *malware class*/*hash*/ and delete the subdirectory

In [None]:
import pandas as pd 
import tarfile
import os
import urllib.request
import json
import shutil
import numpy as np
import socket

In [None]:
# Load the sample reference table and keep only the three columns that the
# ingest step reads: the sample hash, its download URI and its malware class.
with open('MalDroid_ref_raw.csv') as ref_csv:
    ref_raw = pd.read_csv(ref_csv)
ref_raw = ref_raw.loc[:, ['hash', 'URI', 'malware_class']]
# Row label 13076 is the final row of the CSV; it does not contain a sample
# because of parsing issues, so it is dropped.
ref_raw = ref_raw.drop(index=[13076])

In [None]:
issue_hashes = ['0ac36a24aa3dd7e8bd7f009ab6115481812ed053c8b81967b0796a8d2f098b73', '0c41904e61ca30362129e4b7f460b2d8b5b4319de24f497b0dbdfbd64ec0f87a', '0cceea84886d862a703264074ace86a76ec8218c10789929b28352feab0d3267', '0ec63150ec195601eaaf3a42330fe03f9ea97b123ccbc728ef883271c41f8dc3', '00847524ec1e69b2cdd53205cd9725295e87094eeed1567b3efb12191ded24d2', '0669dccf3a9a08ba5d148aed4568c6a15f65075d913243f160b4c5c5ab31fc61', '082b102aaab4354b7b003a6fe5b21274582b0ed49441351a4a3f8a23012b55a5', '1d06256e8e1b7b90b242bebc323a5ef2bbbc71e72185216c3f39462c4bcb71c7', '1da5fadcaa815d9619353b8bf1bf4646c47eabfbc154d5dc7941adc1795b04b6', '2BB5E30C62E2049ADE758CF0375367B02732CE24B4C56BB234B4C4DBA3760089', '2c9fd7881e1822b92639ef3e011e64d87784d23ac05876d3fd1cec0238d1c46d', '2d62d2e6c3ec431d36ac449c0c062eee6d98b3cdac246316bb2d2622a464b26c', '2d351ee5139813d65b0f10c6fb40f9786e6ca91163e3dd2150ad4ded51ce4082', '3acbca1fb0e82db03483c8e0fbb3978c287444162e7278d2c65ea2633067087a', '3c50d8354bf9f52f3b14930599ed5a59152d83772a9b8e4377c8e0d1b73121f7', '3e0b445a1d2e33997ce78f94d62f01b5bccd6dd2b351336e567a6aac370a0b7f','3fcf12e1546a2f6b494ad42b6b699e27c4501ee662363d39ee41fd7559ac7b80','4a595d6ad083a662076b3907bcc2f5c508d63e2a7d3c0c12b4ccab98870557fd','4ab2d0bee49938748d4abf357ed2185b19dc2984fe679c0c162b0064ccb490f3','4b904ba542637bade7ff105c4e6e617b57cb1e5fd70f586f439794424f52ba40','4ce2c46fd9d88b2a2552eda6f4afc4c7e19043909fd92a294dcad198ff85cbc4', '5ace49f9f1a70b915a7e469289558c303f51ceeabf9ed6bdb5dc6818743a64ce', '5ad048ce9aeabd62088de2e8038f1540762049d64749e3a953928c2dc88e7922', '5b741e527f86eca88b64f9f64691e903110f15db77b3947a067ee1a2c44e6403', '5cbdc05183fb0ce11248ead0c4a5de031cc724171360b38214a4b3facf58ce7d', '6a59cdfefdc77efe385002e13b4dfb0595020c506c06fb0261e0be911479792e', '6a37609938daf3c7d139ef7f4222024054e18466ad752e771b26b01d36e6d8fb', '6b869cb1e71098dc70359ef3770dd0394979d50faf0ef7f955a9b83958d472db', '6cae32799b7ef805a72c575a2473d03488e2daaac7ea6906237bbd1ed74f2b55', 
'6dded5a4e6457fee955020998ccf176f7175d016d8f504aff32e798ec52f94f3', '6e932aecef61e61a71345a4a6bdc033aac4483d9eefb0b9f2ce7df1352ae3f5a', '7d05b21cc30b212b6e9814b3bdddf775ae8e410f31bd2d5c59355f39eb05b8c4', '7f96d56aa3bfdc3c1cefa1f024e32be3a74834bfcbac39c5fd7449bc9e8629fc', '8b9abb1eb86416332d4946fd76cdbf63b27d07f3f11fa920c2192cc16de84d36', '8b9adcd8534e45cef0f4749efc969c0988da969a2f12e60530d7403c99bf9783', '9a093cfbf7b815ae2f85796a34dfe436aeb8723027b9ba3650f0d7d5b9e5a916', '9d9647b0d4629f0755159121b2b3af5ea16236570cf0318cbe0a13d4e81b0ce8', '13b91b96b84d98452fbc1bf26554b7e5abd725bb7513f4ab59dec3245ceedfb7', '15ff51b59bf8f781b78e7f4c843b2abad6a0ab5dcd70f97315534e2b7bc0e090', '22e18b1f1e0a533cf1adbb3103b317db0d11a01f223ee1dd9bae979a2dd64151','23d9c8cca11abe2cfac4a3efd8b3883edff0eab159dfcb709d7e8f13dd3c94e1','25bb120ce291909288a13353e5e46ba9dbf4d1551346f999c3ffc7e5d5372eeb','25be589140f73949124f08759ab5bb57b126396f1401e3bfbfdc5e5c056e0d03','25c4692b0a781878087e909c34447673af41ed27fd8b768db7af84c65c5781b1','31a2566adae3689916a6405b7595f170333f6bf538167d560f3b98332df45cf2','31f6421d8e2e8d11b0e9cad08508a6354e88f5b80839da61fe75001a65ad58a8','32fe5a7358767ae5d904f551881011b7af6bc21383d58e333d397f6739603eda','38a5e1e169f3f94c675700ab8371171134f114181633532df225f99c11f7de4b','38f1d67ca112bbe4caa649e5a1d1bd21c3e04c615e1ad20cd473863301f4d4a2','45e15d928f4697147cd926bd8116538a9d29c1b47c16d39c473fdc1000738493','50a426d14ecf3aa938c637eccafbe75db04c80be43950973a79acc1bfb069d90','51ee27309fdca0451da2273390e8ea491db79f3df25bd2f563fc152b27de7689','55d417e85444f487b6844de9e1c1a0fbe2d45c2ead79b759b772e8c67362a9c7','57d5f79200f1c0f07401a8ca61a2cec66f3c6587c00b37b00e2bf221134b7d3b','59c2ba5b9b47a6c0df0e2a54b83f38d943419b7fdec6d37797fce8961a549458','69f881fee4b71287c2a285b994687d6f632589837b7c84de81a394f6d3d36756','70b8fec3d5fec7e13340350bd9209ca9b04d65841cc76b87aadd8e8b7a7bf85f','87def242def82c92704cb080fb9d32aeabfdf75290740bcd9d5db39e1d60b846','88baf90657a2fbe9abdca7840317cb8bd3b51b14ad87ed
faab1b60e2c799d6d4','88c2a9d866639e84ebe9659c09db73b56eb2dfc1144475265e6920de6bebaed8','89b3095cfeac18bada832c7c0197ad4d3a420312d64839c1e21daa092da1c1bb','97f2a0da04e08671c00e31347997e85c195a8e9e66dc8bcec43055491929579b','126b346b85a96b3fec082aba092c11d37f481ef6ee40f76de9b1e9df89700c96','242a0048497bcbdeb4d1a5a43df08e492bfd42b0b85ff63b2c2a49ad5ea50829','467fbfd9b6cb258df0abdef8fce67a41b6f6b674264dccb514c1255784985bad','523ada03e2bae6ef3c079cda4bafef6201779d2ba2c01d36525c1112a30768e0','545a6714759d4133ca2b91441823da69e763eb39d4bf2304ce89a40bade0a0ea','652a7e2e53adb6a33ea5141ffdd8409ba09761a7cc72166c906c1a22fa1fc72b','660f6b9c214c5dae903433f4bce907a15e6d6b5f9a54d15f1505d68bc1e8a07e','717aea92b781b76396f45f345752ff82c62ebfce78f18f29b6919584b1eb445a','759b0e1ff925abddc1ba22bd8e9d5a540dc0e698baeef2bc1522ced4162fc8f4']

large_hashes = ['0ccf9cfae60602b20c679ad8d56cc14f94ebf50aed389ef9db8a86e717ed76ea', '0f18d0cf80f7619d17142110eadbfe1ad01bc5936f275d666c454234241ca382', '03c78d58f22b4ebe6bb92735d19c869e57d10ed3071644c1a429170e7c9f3d51', '048449f839bfa7afc61aed035ba870aff254b7534c8444cbb701e31ec6d47f24', '09c70721776a3d0b345d2fbc647b363648c91b2f8f3dd0c25cf928aba4ade179',
'1a2a6761a1279bfe9a352381dd66614b466b19aefaf17c300536be8e4be0b0c5',
'1a2b969ba56b3d3473fe45f87a04fa9610293e2ccf687b2d1d71220b8c765077',
'1a9a75579366f1d2dbeaa6d2423819d874537973815778eae3debef47da08a1d',
'1c31d50db2aca2c03836e7e0cbef657e97ee8dae70758c58f89975a11fc0b6ca', '1e83a89395c3de4d2f0945e21d3fd6a7324538c0aa613836cfbe212c2bc1d145', '2de4216275ee614c108ab69519e97c7a9fcf50b9593557d554749165ee55a4b5', '2fbbe31be65aecfc88bf9f321f128f28280ae7dc26117a0ac2c179d9c7521d94', '5b42432aad27746f04353fdd70bd8c2d979303eaf367de28d88f6b0cb72dd9a5', '5bc23138fd301b7f293f9b4efb13d72019815d64695a6f9b694c23c33f1442b8', '6f012f1a12bec0dba7d0e23f03593d2339c1e7e8af8fcff70650c44ea72717e4', '7b5338e1e7bf8b4816b821db9ed042ed13ce4f8ebd1748ba9788b070e45bf03d', '7e5e1502c9f2db87563ac77f8e420ffe281fe0d02b6773a6f1873d88fcada4d4', '8be77cdbc0e25a1a6342544057f8fdf3839be43b313074af794df36b36f3b165','9d9cc59569aa8706d797cca4df36f689878305039cd8b41d99de3007681f814e','9f42775930835f224c2af24a924964bb7027f3f1b90548865b9801f63fa7fb2b','10f6a5912fc45c6dce219f28bd493a765081f6a01b090293b5a3b57836c762eb','27f0a1484ace19692681d3f938d161cdbb943edb2452674fb13599be71f9c665','31b2ba2f2c1fb196dd38017134b5402dde24803f764de6962f68d2c71edbabdf','34ff80271b6864e437812edeb4ff7ed465a1d359af4c2271c22e745c1c6d3a9d','49dfe143dc62ad64d62bcd33f4e771662514462788d393daeb5549fcaecdf339','52b563e039fbcbb875625f2d8bf27a491c31867c1ce37c1148a267ced9923c2a','64a77e654d5f4c183182628ef446734468068fd5f9af99be5220bb8f25036192','72e6ae9cd081f8d38488cf4077f66db0f97cef486a60eb38c593ba82db77ecd8','77c57468c9857c20c69dedec214845f6e13f53bd8ff486b58b10df7629b45a05','98bed95679d422e89d48f08b62db8103a880aa498f364b733c8a110656dc9134','213a0abc71295a8d9e2062c1758b9533717f2e6cd79c01841f768e2e8e1f6a0f','228eb3562a913b827f11b736b649bfb18ed8b6dc5422f311cfc5ea11e5dbafe0','452b25f223ac1ea7f77bc9c78fd6bf8bf444b9372a8dec111a01827ac8a0c0bd','478fda44cbe4ece2931ea0f3b244b522b9638ca55c8592756ab092e041e52e34','582de54cf4e74d2474d47514393e57f55e251586de0b5be62c5f82b6109520e0','734df9c568c7d05544ecde71aafdbcf2a882c81b23753beb6d71f2c9b2a03024','770cd9626aefe1aeffa70155068c5880963eb634895f21b5ea25ec91c494cb72']

In [None]:
# Resume marker: hash of the last sample handled, normalised to lower case.
last_hash = 'dfc08c3bac274a7d23b6ea96019dbdbef3bbec411b2a19787d56b55fe6311b4f'.lower()

# A default socket timeout of 600 s (10 min) keeps a stalled download from
# freezing the whole run.
socket.setdefaulttimeout(600)

# Skip the first 1210 rows of the reference table (presumably the rows that an
# earlier run already handled -- confirm before re-running). A lookup of the
# resume position from last_hash, i.e.
#     ref_raw['hash'].loc[lambda x: x==last_hash].index
# was left disabled in favour of this fixed offset.
ref_raw = ref_raw.iloc[1210:]

def ingest_loop(sample):
    """Download, unpack and validate the analysis JSON of a single sample.

    The archive at the sample's URI is fetched and extracted into
    ``<malware class, lower-cased>/<hash>``. The extracted
    ``sample_for_analysis.apk/sample_for_analysis.apk.json`` must parse as
    JSON and be smaller than 100 MB; it is then moved up into the sample
    directory and the now-empty ``sample_for_analysis.apk`` subdirectory is
    removed.

    Failures never abort the run: the hash is appended to the module-level
    ``issue_hashes`` list (download, extraction or JSON problems) or
    ``large_hashes`` list (oversized JSON), the sample directory is removed,
    and the function returns. Rows whose URI does not contain ``.tar.gz``
    are skipped.

    Args:
        sample: Indexable row where ``sample[0]`` is the sample hash,
            ``sample[1]`` the archive URI and ``sample[2]`` the malware
            class name.

    Returns:
        None. Results are reported through the file system, the two
        module-level lists and the module-level ``last_hash`` marker.
    """
    # Without this declaration the assignment below only bound a local name,
    # so the module-level resume marker never advanced past its initial value.
    global last_hash

    sample_hash = sample[0]
    uri = sample[1]
    last_hash = sample_hash
    if '.tar.gz' not in uri:
        return

    base_path = sample[2].lower() + '/' + sample_hash
    extracted_dir = base_path + '/sample_for_analysis.apk'
    extracted_json = extracted_dir + '/sample_for_analysis.apk.json'

    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort batch behaviour while still letting KeyboardInterrupt and
    # SystemExit stop the run.
    try:
        temp_file = urllib.request.urlretrieve(uri, filename=None)[0]
    except Exception:
        print('Error loading URL on hash: ' + sample_hash)
        issue_hashes.append(sample_hash)
        # Remove any partially downloaded temporary file.
        urllib.request.urlcleanup()
        return

    try:
        # NOTE(review): the archives come from a remote malware repository;
        # extractall() trusts member paths. Consider an extraction filter
        # (tarfile ``filter='data'``) where the running Python supports it.
        with tarfile.open(temp_file) as archive:
            archive.extractall(base_path)
    except Exception:
        print('Error extracting on hash: ' + sample_hash)
        issue_hashes.append(sample_hash)
        shutil.rmtree(base_path, ignore_errors=True)
        return
    finally:
        # urlretrieve() leaves its temporary download on disk; urlcleanup()
        # deletes only files that urlretrieve itself created, so a local
        # file:// source is never touched.
        urllib.request.urlcleanup()

    try:
        # Opening is inside the try block so that an archive that lacks the
        # expected JSON is logged like any other bad sample instead of
        # raising and aborting the whole run.
        with open(extracted_json) as json_file:
            json.load(json_file)
    except Exception:
        print('Error loading JSON on hash: ' + sample_hash)
        issue_hashes.append(sample_hash)
        # Match the other failure paths: do not keep an unusable sample.
        shutil.rmtree(base_path, ignore_errors=True)
        return

    if os.stat(extracted_json).st_size >= 100000000:
        # GitHub will not host files larger than 100 MB; such samples are
        # removed and logged for compatibility.
        print('JSON file too large on hash: ' + sample_hash)
        large_hashes.append(sample_hash)
        shutil.rmtree(base_path)
        return

    shutil.move(extracted_json, base_path + '/sample_for_analysis.apk.json')
    shutil.rmtree(extracted_dir)

# Run the ingest over every remaining reference row; each row is handed to
# ingest_loop as a NumPy array.
ref_raw.apply(lambda row: ingest_loop(row.to_numpy()), axis=1)

In [None]:
# Persist the hashes collected during the run: samples that hit an error and
# samples whose JSON was too large, each as a one-column ('hash') CSV.
issue_df = pd.DataFrame(issue_hashes, columns=['hash'])
large_df = pd.DataFrame(large_hashes, columns=['hash'])
for frame, csv_name in ((issue_df, 'issue_hashes.csv'), (large_df, 'large_hashes.csv')):
    with open(csv_name, 'w') as csv_out:
        frame.to_csv(csv_out)

# Record the resume marker so that a later run can pick up from it.
with open('last_hash_processed.txt', 'w') as resume_out:
    resume_out.write(last_hash)