In [2]:
# Mount the user's Google Drive inside the Colab VM; the later cells read the
# APK dataset from paths under /content/mydrive.
from google.colab import drive
drive.mount("/content/mydrive")

Mounted at /content/mydrive


In [10]:
!ls /content/mydrive/MyDrive/New_Dataset/ApkDataset/Benign | head -n 100 | grep .apk > test_apk.txt
!echo "==============" >> test_apk.txt
!ls /content/mydrive/MyDrive/New_Dataset/ApkDataset/Malware | head -n 100 | grep .apk >> test_apk.txt

In [13]:
%%capture
!pip install dgl==1.1.2;
!pip install androguard==3.4.0a1;
!pip install networkit
!pip install loguru

In [14]:
import sys
import traceback
from pathlib import Path
import dgl
import joblib as J
import networkx as nx
import networkit as nk
import torch
from androguard.misc import AnalyzeAPK
from gensim.models import Word2Vec
import traceback
import numpy as np
import matplotlib.pyplot as plt
from loguru import logger

# Route every loguru record to stderr using a compact
# "[time] - [level] : message" layout.
fmt = "[{time}] - [{level}] : {message}"
config = dict(handlers=[dict(sink=sys.stderr, format=fmt)])
logger.configure(**config)

def plot(nx_original, nx_pruned):
    """Draw the original graph and the pruned graph side by side.

    A single spring layout is computed from ``nx_original`` and reused for both
    panels, so a node sits at the same position in each drawing.

    Args:
        nx_original: networkx graph before pruning (drawn on the left).
        nx_pruned: networkx graph after pruning (drawn on the right).
    """
    figure, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 6))
    layout = nx.spring_layout(nx_original)

    # (graph, axis, node colour, title); the titles are Vietnamese for
    # "initial graph" and "graph after pruning".
    panels = (
        (nx_original, left_ax, "skyblue", "Đồ thị ban đầu"),
        (nx_pruned, right_ax, "lightgreen", "Đồ thị sau khi cắt tỉa"),
    )
    for graph, axis, colour, title in panels:
        nx.draw(
            graph,
            layout,
            with_labels=False,
            ax=axis,
            node_color=colour,
            edge_color="gray",
            node_size=100,
            font_size=10,
        )
        nx.draw_networkx_edge_labels(graph, layout, ax=axis)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()

def convert_to_networkit(nx_graph):
    """Translate a networkx graph into a weighted networkit graph.

    networkx nodes are numbered 0..n-1 in iteration order; that numbering is
    used as the node id on the networkit side.

    Args:
        nx_graph: the networkx graph to convert.

    Returns:
        A ``(nk_graph, reverse_mapping)`` tuple where ``reverse_mapping`` maps
        each integer node id back to the original networkx node.
    """
    index_of = {}
    label_of = {}
    for position, label in enumerate(nx_graph.nodes()):
        index_of[label] = position
        label_of[position] = label

    nk_graph = nk.graph.Graph(weighted=True, directed=nx_graph.is_directed())
    for _ in range(len(index_of)):
        nk_graph.addNode()

    # Edges without a "weight" attribute are given weight 1.0.
    for source, target, attrs in nx_graph.edges(data=True):
        nk_graph.addEdge(index_of[source], index_of[target], attrs.get("weight", 1.0))

    nk_graph.indexEdges()
    return nk_graph, label_of

def convert_to_networkx(nk_graph, reverse_mapping):
    """Translate a networkit graph back into a networkx graph.

    Args:
        nk_graph: the networkit graph to convert.
        reverse_mapping: dict mapping each networkit node id to the node object
            that should represent it in the networkx graph.

    Returns:
        A ``nx.DiGraph`` when ``nk_graph`` is directed, otherwise a ``nx.Graph``.
        Every edge carries its networkit weight in the ``weight`` attribute.
    """
    nx_graph = nx.DiGraph() if nk_graph.isDirected() else nx.Graph()
    # Register every node up front: adding edges alone would silently drop
    # nodes that have no incident edge, so the node set would shrink on the
    # round trip.
    nx_graph.add_nodes_from(reverse_mapping[u] for u in nk_graph.iterNodes())
    for u, v, w in nk_graph.iterEdgesWeights():
        nx_graph.add_edge(reverse_mapping[u], reverse_mapping[v], weight=w)
    return nx_graph


def prune_graph_local_degree(g, target_ratio=0.2):
    """Sparsify a networkx graph with networkit's LocalDegreeSparsifier.

    The graph is converted to networkit, sparsified, and converted back. Node
    and edge counts are logged before and after pruning.

    Args:
        g: the networkx graph to prune.
        target_ratio: edge ratio requested from ``getSparsifiedGraphOfSize``;
            must lie in [0, 1]. Defaults to 0.2, the value that was previously
            hard-coded. NOTE(review): the ratio actually achieved can be higher
            than requested (the logged sample run kept 18678 of 34294 edges).

    Returns:
        The pruned graph as a networkx graph.

    Raises:
        ValueError: if ``target_ratio`` is outside [0, 1].
    """
    if not 0.0 <= target_ratio <= 1.0:
        raise ValueError(f"target_ratio must be within [0, 1], got {target_ratio!r}")

    nk_graph, reverse_mapping = convert_to_networkit(g)
    logger.info(f"Before prune: {nk_graph.numberOfNodes()} nodes - {nk_graph.numberOfEdges()} edges")
    local_degree = nk.sparsification.LocalDegreeSparsifier()
    nk_pruned = local_degree.getSparsifiedGraphOfSize(nk_graph, target_ratio)
    logger.info(f"After pruned: {nk_pruned.numberOfNodes()} nodes - {nk_pruned.numberOfEdges()} edges")
    return convert_to_networkx(nk_pruned, reverse_mapping)

def process(source_file: Path, dest_dir: Path):
    """Turn one APK into a pruned, feature-annotated DGL call graph on disk.

    Steps: analyse the APK, take its call graph, prune it with
    ``prune_graph_local_degree``, attach two node attributes, and save the
    result as ``<dest_dir>/<apk stem>.fcg``.

    Node attributes:
        features: ``[in_degree, out_degree, katz, closeness, clustering]``.
        featuresH: average of the Word2Vec vectors (size 100) of the node's
            class name and method name; the Word2Vec model is trained on the
            (class name, method name) pairs of this graph only.

    Args:
        source_file: path of the APK to analyse.
        dest_dir: directory that receives the ``.fcg`` file. A plain string is
            accepted as well; the directory is created when missing.

    Returns:
        None. Failures are reported on stdout/stderr (message plus traceback)
        instead of being raised, so a batch run can continue with the next file.
    """
    logger.info("Processing ...")
    try:
        source_file = Path(source_file)
        file_name = source_file.stem
        _, _, dx = AnalyzeAPK(source_file)
        cg = dx.get_call_graph()

        nx_original = nx.DiGraph(cg)

        # prune graph
        G = prune_graph_local_degree(nx_original)

        katz = nx.katz_centrality(G)
        closeness = nx.closeness_centrality(G)
        clustering = nx.clustering(G)

        mappings, sentences = {}, []
        for node in G.nodes():
            mappings[node] = [
                G.in_degree(node),
                G.out_degree(node),
                katz[node],
                closeness[node],
                clustering[node],
            ]
            sentences.append([node.class_name, node.method.name])

        model = Word2Vec(sentences=sentences, vector_size=100, min_count=1)

        mappingsH = {
            node: torch.tensor(
                np.average(model.wv[[node.class_name, node.method.name]], axis=0)
            )
            for node in G.nodes()
        }

        nx.set_node_attributes(G, mappings, 'features')
        nx.set_node_attributes(G, mappingsH, 'featuresH')

        G = nx.convert_node_labels_to_integers(G)
        dg = dgl.from_networkx(G, node_attrs=['features', 'featuresH'])

        # Keep the directory and the output file in separate names (the
        # parameter used to be overwritten with the file path).
        out_dir = Path(dest_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        dest_file = out_dir / f'{file_name}.fcg'
        dgl.data.utils.save_graphs(str(dest_file), [dg])
        print(f"Processed {source_file}")

    except Exception:
        # Deliberately best-effort, but only for ordinary errors: a bare
        # `except:` would also swallow KeyboardInterrupt / SystemExit and make
        # a long batch run impossible to stop.
        print(f"Error while processing {source_file}")
        traceback.print_exc()

In [15]:
# Dataset folders on the mounted Drive, one per class.
path_benign = "/content/mydrive/MyDrive/New_Dataset/ApkDataset/Benign/"
path_malware = "/content/mydrive/MyDrive/New_Dataset/ApkDataset/Malware/"

# One sample APK file name from each class.
sample_bengin = "0000049D8911607971A3336DE5CF36F4799D679D6BB9EF014CBFE73578A6E3EA.apk"
sample_malware = "0022A484CA5D219B98AA3EFBCCC9D7347E092FD847F9FBF58A93D67C124BFE4F.apk"

# Smoke test on the malware sample; the .fcg file is written to the current
# working directory. The benign sample can be processed the same way.
process(Path(path_malware, sample_malware), Path("."))

[2025-02-12T15:01:07.148017+0000] - [INFO] : Processing ...
[2025-02-12T15:01:30.686877+0000] - [INFO] : Before prune: 17992 nodes - 34294 edges
[2025-02-12T15:01:31.450009+0000] - [INFO] : After pruned: 17992 nodes - 18678 edges


Processed /content/mydrive/MyDrive/New_Dataset/ApkDataset/Malware/0022A484CA5D219B98AA3EFBCCC9D7347E092FD847F9FBF58A93D67C124BFE4F.apk


In [49]:
# Report the on-disk size of the benign sample APK (human-readable summary).
!du -sh "/content/mydrive/MyDrive/New_Dataset/ApkDataset/Benign/0000049D8911607971A3336DE5CF36F4799D679D6BB9EF014CBFE73578A6E3EA.apk"

3.1M	/content/mydrive/MyDrive/New_Dataset/ApkDataset/Benign/0000049D8911607971A3336DE5CF36F4799D679D6BB9EF014CBFE73578A6E3EA.apk


In [53]:
# Report the on-disk size of the malware sample APK (human-readable summary).
!du -sh "/content/mydrive/MyDrive/New_Dataset/ApkDataset/Malware/0022A484CA5D219B98AA3EFBCCC9D7347E092FD847F9FBF58A93D67C124BFE4F.apk"

2.4M	/content/mydrive/MyDrive/New_Dataset/ApkDataset/Malware/0022A484CA5D219B98AA3EFBCCC9D7347E092FD847F9FBF58A93D67C124BFE4F.apk
