In [1]:
import os
import time
import csv
import sys
import yaml
import numpy as np
import pandas as pd
from src.util import ExeDataset, write_pred
from src.model import MalConv
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import pickle
import time

use_gpu = True
use_cuda = torch.cuda.is_available()

malconv = torch.load('/home/user/Desktop/Retraining_Malconv/checkpoint/Retrain_all_samples_sd_850.model', map_location=torch.device('cuda') if use_gpu else 'cpu')
malconv = malconv.cuda() if use_gpu else malconv
bce_loss = nn.BCEWithLogitsLoss().cuda() if use_gpu else nn.BCEWithLogitsLoss()

print("Loading MalConv model successful")

data_path = '/home/user/Desktop/CodeCaveFinal-main/KkrunchyCodeCave/Cave12288_kkrunchy2/'
csv_path='/home/user/Desktop/CodeCaveFinal-main/KkrunchyCodeCave/Cave12288_kkrunchy2_caves.csv'

validloader = DataLoader(ExeDataset(data_path, csv_path),
                         batch_size=1, shuffle=False, num_workers=0)

  malconv = torch.load('/home/user/Desktop/Retraining_Malconv/checkpoint/Retrain_all_samples_sd_850.model', map_location=torch.device('cuda') if use_gpu else 'cpu')


Loading MalConv model successful


In [2]:

def loss_function_gradient(malconv, original_x, adv_x, use_cuda=False):
    y = malconv.embedd_and_forward(adv_x)
    y = nn.Sigmoid()(y)
    
    malware_class = torch.ones(y.shape)
    if use_cuda:
        malware_class = malware_class.cuda()
    loss = torch.nn.functional.binary_cross_entropy(y, malware_class)
    g = torch.autograd.grad(loss, adv_x)[0]
    g = torch.transpose(g, -1, -2)[0]
    return g	

def optimization_solver(gradient_f, index_to_consider, x_init):
    device = gradient_f.device  # Get the device of gradient_f (it should be on the same device as your model)

    # Create a zero tensor on the same device as gradient_f
    zero_tensor = torch.zeros(gradient_f.shape, device=device)

    # Compare if gradient_f is zero using tensors on the same device
    g = gradient_f / torch.norm(gradient_f) if not torch.equal(gradient_f, zero_tensor) else torch.zeros(gradient_f.shape, device=device)

    epsilon = 100
    gradient_result = (epsilon * g).transpose(0,1)
    x_init_updated = x_init.clone()  # Create a clone of x_init to avoid in-place operation
    x_init_updated[0, :, index_to_consider] = x_init_updated[0, :, index_to_consider] + gradient_result[:, index_to_consider]
    
    return x_init_updated


In [3]:
import torch.multiprocessing as mp
import gc
mp.set_start_method('spawn', force=True)
torch.multiprocessing.set_sharing_strategy('file_system')

In [None]:
df = pd.DataFrame(columns=['val_batch_data', 'length', 'init_prob','iteration','progress'])
counter = 0
success=0
for _, val_batch_data in enumerate(validloader):
    if counter%100==0:
        print(f"Completed: {counter}")
        print(success)
        # df.to_csv('Malconv_UPXPack_first16384Bytes.csv', mode='a', header=False, index=False) 
        # df = pd.DataFrame(columns=['val_batch_data', 'length', 'init_prob','iteration','progress'])
        # torch.cuda.empty_cache()
    exe_input = val_batch_data[0].cuda() if use_gpu else val_batch_data[0]
    data = exe_input[0].cpu().numpy()
    length = data[-3]
    cave_start = data[-2]
    cave_length = data[-1]

    if cave_length==0:
        continue
    
    data = data[:length]
    data = np.concatenate([data, np.random.randint(0, 256, 2000000 - length)])

    embed = malconv.embed
    sigmoid = nn.Sigmoid()

    x0 = torch.from_numpy(np.array([data])).long().cuda() if use_gpu else torch.from_numpy(np.array([data])).long()
    x0 = Variable(x0.long(), requires_grad=False)
    pred, x_init = malconv(x0)
    initial_prob = sigmoid(pred).cpu().data.numpy()[0][0]

    if (cave_start+12288)>2000000 or initial_prob<0.5:
        continue
    
    index_to_consider = list(range(cave_start, cave_start+12288))
    counter +=1 
    for i in range(50):
        gradient_f = loss_function_gradient(malconv, x0, x_init, use_cuda=True)
        x_init = optimization_solver(gradient_f, index_to_consider, x_init)
        progress = sigmoid(malconv.embedd_and_forward(x_init)).cpu().data.numpy()[0][0]
        if progress<0.5:
            print(progress)
            success+=1
            break
    # Data to append as a dictionary
    new_data = {'val_batch_data': val_batch_data, 'length': length, 'init_prob': initial_prob, 'iteration': i, 'progress':progress}
    new_data_df = pd.DataFrame([new_data])
    # Use pd.concat to append the new row
    df = pd.concat([df, new_data_df], ignore_index=True)

print(f"Success: {success}/{counter}")

Completed: 0
0
Completed: 0
0


  df = pd.concat([df, new_data_df], ignore_index=True)


0.008701976
0.0017551557
0.32658058
0.0032167856
0.119298734
0.015874343
0.04343372
0.015398373
0.06984804
0.16930638
0.028546214
0.38856864
0.23490058
0.37924513
0.018499786
0.00096604903
0.12549923
0.35987362
0.06953656
0.031094015
0.06772896
0.058493603
Completed: 100
22
0.26274627
0.03389103
0.030542253
0.13597514
0.16407913
0.00035944168
0.27381164
0.18637933
0.0011975808
0.15452793
0.06671814
1.1858838e-05
0.012753675
0.0074908338
0.2925698
0.15679964
0.031405404
0.011894156
0.26135993
0.00013088506
0.21263862
8.8217115e-07
0.0002649043
0.24680625
4.908064e-11
Completed: 200
47
0.002790879
0.012998398
0.1719878
0.4412909
0.16798134
0.2368184
0.13782193
0.32363725
0.4976642
0.0015222302
3.3499875e-17
0.235999
0.14903829
0.01072855
0.028648995
0.017892106
0.0050012544
0.017042667
0.07300224
0.15966989
0.26532453
0.04011331
0.008416694
0.14355448
0.049790364
0.00077245577
0.2329
0.1553534
0.14434016
0.14888622
Completed: 300
77
0.004130244
0.3672583
0.35196868
0.39874837
0.001011658

In [None]:
import concurrent.futures
from tqdm import tqdm

# Create an empty DataFrame to store results
df = pd.DataFrame(columns=['val_batch_data', 'length', 'init_prob', 'iteration', 'progress'])

# Helper functions: Assuming loss_function_gradient, optimization_solver, etc., are defined elsewhere.

def process_batch(val_batch_data, malconv, use_gpu):
    # This function processes a single batch and returns the results to append to the dataframe
    perturbation_size=4096
    exe_input = val_batch_data[0].cuda() if use_gpu else val_batch_data[0]
    data = exe_input[0].cpu().numpy()
    length = data[-3]
    cave_start = data[-2]
    cave_length = data[-1]
    if cave_length==0:
        return None
    data = data[:length]
    data = np.concatenate([data, np.random.randint(0, 256, 2000000 - length)])

    sigmoid = nn.Sigmoid()

    x0 = torch.from_numpy(np.array([data])).long().cuda() if use_gpu else torch.from_numpy(np.array([data])).long()
    x0 = Variable(x0.long(), requires_grad=False)
    try:
        pred, x_init = malconv(x0)
        initial_prob = sigmoid(pred).cpu().data.numpy()[0][0]
    except:
        return None

    if (cave_start+perturbation_size) > 2000000 or initial_prob < 0.5:
        return None  # Skip if conditions are not met

    index_to_consider = list(range(cave_start, cave_start+perturbation_size))

    for i in range(50):
        try:
            gradient_f = loss_function_gradient(malconv, x0, x_init, use_cuda=True)
            x_init = optimization_solver(gradient_f, index_to_consider, x_init)
            progress = sigmoid(malconv.embedd_and_forward(x_init)).cpu().data.numpy()[0][0]
        except:
            break
        if progress < 0.5:
            print(progress)
            break

    # Return results as a dictionary
    return {
        'val_batch_data': val_batch_data,
        'length': length,
        'init_prob': initial_prob,
        'iteration': i,
        'progress': progress
    }

def append_to_df(results):
    # Append data to DataFrame (if result is not None)
    if results:
        new_data_df = pd.DataFrame([results])
        return new_data_df
    return pd.DataFrame()

# Assuming validloader and malconv are already defined
use_gpu = torch.cuda.is_available()

# Parallelize batch processing using concurrent.futures.ThreadPoolExecutor
counter = 0
success=0
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for val_batch_data in validloader:
        futures.append(executor.submit(process_batch, val_batch_data, malconv, use_gpu))

    for future in concurrent.futures.as_completed(futures):
        counter +=1
        if counter%100 == 0:
            print(f" Completed :{counter}")
        result = future.result()
        if result and result['progress'] < 0.5:
            success += 1
            print(f" Success :{success}")
        # new_data_df = append_to_df(result)
        # df = pd.concat([df, new_data_df], ignore_index=True)

print(f"Success: {success}/{counter}")
