In [None]:
import os
import pandas

# Both locations are short, relative paths.
# NOTE(review): the original note here asked whether this mitigates path-length
# limits -- that intent is unconfirmed.
LOG_LOCATION = "output.log"
OUTPUT_PATH = "static_dataset"

# The dataset directory must exist before any per-APK output is written.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# Begin every run with a clean log: discard the file left by a previous run.
if os.path.exists(LOG_LOCATION):
    os.remove(LOG_LOCATION)

In [None]:
# Requires androguard to be installed
# https://androguard.readthedocs.io/en/latest/intro/installation.html

import androguard.cli.entry_points as androguard_entries
import androguard.cli.main as androguard_main
from androguard import session

# Thin wrappers that expose the Androguard CLI commands as plain Python functions
def decompile(file, output, limit=None, jar=None, decompiler=None, format_=None):
    """Load one file into a fresh Androguard session and export it to ``output``.

    The raw bytes of ``file`` are registered in a new ``session.Session`` under
    the file's own path, and that session is then handed to
    ``androguard_main.export_apps_to_format``.

    Args:
        file: Path of the file to read (opened in binary mode).
        output: Destination forwarded to the exporter.
        limit: Forwarded unchanged to the exporter.
        jar: Forwarded unchanged to the exporter.
        decompiler: Forwarded unchanged to the exporter.
        format_: Forwarded unchanged to the exporter.

    Returns:
        None.
    """
    apk_session = session.Session()

    with open(file, "rb") as apk_stream:
        raw_bytes = apk_stream.read()
        apk_session.add(file, raw_bytes)

    # Arguments are passed positionally, in the exporter's own order.
    androguard_main.export_apps_to_format(
        file, apk_session, output, limit, jar, decompiler, format_
    )


def cfg(file, output, class_name=".*", method_name=".*", descriptor=".*", access_flag=".*", no_isolated=False, show=False, verbose=False):
    """Run Androguard's ``androcg_main`` entry point on one APK.

    Every argument is forwarded as a keyword, with the snake_case names used
    here mapped onto the entry point's own parameter names.

    NOTE(review): despite this wrapper's name, ``androcg`` looks like
    Androguard's call-graph tool rather than a control-flow-graph tool --
    confirm which graph downstream code expects.

    Args:
        file: Path of the APK, passed as ``APK``.
        output: Output file path, passed as ``output``.
        class_name: Passed as ``classname``; defaults to ``".*"``.
        method_name: Passed as ``methodname``; defaults to ``".*"``.
        descriptor: Passed as ``descriptor``; defaults to ``".*"``.
        access_flag: Passed as ``accessflag``; defaults to ``".*"``.
        no_isolated: Passed as ``no_isolated``.
        show: Passed as ``show``.
        verbose: Passed as ``verbose``.

    Returns:
        None.
    """
    androcg_options = {
        "verbose": verbose,
        "APK": file,
        "classname": class_name,
        "methodname": method_name,
        "descriptor": descriptor,
        "accessflag": access_flag,
        "no_isolated": no_isolated,
        "show": show,
        "output": output,
    }
    androguard_entries.androcg_main(**androcg_options)

In [None]:
# Load the dataset summary table.
# If dataset.csv is missing, run `dataset-summary-builder.ipynb` first.
df = pandas.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
import logging

# Notebook logger: records go to the console and to LOG_LOCATION, both using
# the same "time | level | message" layout.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack a second pair of handlers on it: every
# message would then be emitted twice and the previous log file handle would
# stay open. Detach and close whatever an earlier run attached.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

# Console output.
syslog = logging.StreamHandler()
syslog.setFormatter(formatter)
logger.addHandler(syslog)

# File output. The handler accepts DEBUG, but the logger itself is set to INFO
# above, so DEBUG records are dropped before they ever reach this handler.
fh = logging.FileHandler(LOG_LOCATION)
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)
logger.addHandler(fh)


In [None]:
# Decompile every APK listed in `df` and export its graph, starting at the row
# whose truncated hash equals `checkpoint`. Results are written back into `df`
# and also collected in `output_locations`.
output_locations = []

# First 10 characters of the sha256 of the APK to resume from. Set to None
# (or "") to process the whole dataset from the first row.
checkpoint = "55fc3365a1"
checkpoint_reached = not checkpoint

for index, row in df.iterrows():
    apk_location = row['file_path']
    # Output directories are keyed by the first 10 characters of the hash.
    apk_hash = row['sha256'][:10]

    # Skip every row that comes before the checkpoint row.
    if not checkpoint_reached:
        if apk_hash != checkpoint:
            continue
        checkpoint_reached = True

    apk_output = os.path.join(OUTPUT_PATH, apk_hash)
    decompiled = os.path.join(apk_output, "decompiled")
    blocks = os.path.join(apk_output, "blocks")
    cfg_xml = os.path.join(apk_output, "cfg.gml")

    os.makedirs(apk_output, exist_ok=True)
    os.makedirs(decompiled, exist_ok=True)
    os.makedirs(blocks, exist_ok=True)

    # Each step is best-effort: a failure is logged (with its traceback) and
    # recorded, and the loop moves on. SystemExit is caught as well because the
    # wrapped helpers are CLI-style entry points that may call sys.exit();
    # KeyboardInterrupt is deliberately NOT caught so the run can be stopped.
    error_decompile = False
    try:
        decompile(file=apk_location, output=decompiled)
        logger.info(f"Successfully decompiled: {apk_output}")
    except (Exception, SystemExit):
        logger.exception(f"Failed to decompile: {apk_output}")
        error_decompile = True

    error_cfg = False
    try:
        cfg(file=apk_location, output=cfg_xml)
        logger.info(f"Successfully created CFG: {apk_output}")
    except (Exception, SystemExit):
        logger.exception(f"Failed to generate CFG: {apk_output}")
        error_cfg = True

    has_error = error_decompile or error_cfg

    # Record the outcome on the row itself (iterrows yields copies, so the
    # frame has to be updated through df.at) ...
    df.at[index, 'static_output_location'] = apk_output
    df.at[index, 'error_decompile'] = error_decompile
    df.at[index, 'error_cfg'] = error_cfg
    df.at[index, 'error'] = has_error

    # ... and in a plain list of records.
    output_locations.append({
        'index': index,
        'path': apk_output,
        'error_decompile': error_decompile,
        'error_cfg': error_cfg,
        'error': has_error
    })

# A checkpoint that matches no row would otherwise skip the whole dataset
# without any indication.
if not checkpoint_reached:
    logger.warning(f"Checkpoint {checkpoint!r} was not found in the dataset; no APK was processed")
    
    