In [None]:
import multiprocessing as mp
import os

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

from gensim.utils import effective_n_jobs
from gensim.models import FastText

tqdm.pandas()

In [None]:
# Directory that holds the D3_*.csv inputs. Defaults to the current working
# directory: an empty string would turn every derived path into an absolute
# path at the filesystem root (e.g. "/temp", "/D3_webgraph.csv").
base_dir = "."
# Scratch directory used for the per-fold training dumps.
temp_dir = f"{base_dir}/temp"

## WebGraph

In [None]:
# Read the tab-separated WebGraph table and order the rows by hostname, then
# script URL, with a fresh 0..n-1 index so positional fold indices line up.
data = (
    pd.read_csv(f"{base_dir}/D3_webgraph.csv", sep='\t')
    .sort_values(by=['hostname', 'script_url'], ascending=[True, True])
    .reset_index(drop=True)
)

In [None]:
# Shuffled, stratified 5-fold cross-validation of a 100-tree random forest.
# Column 3 is used as the target and every column from position 4 onward as
# features (assumes column 3 is the same `label` column that drives the
# stratification -- TODO confirm against the CSV header).
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=25)
result = []  # one (precision, recall, f1, accuracy) tuple per fold
for train_index, test_index in kf.split(np.zeros(len(data['label'])), data['label']):
    train_rows = data.iloc[train_index]
    test_rows = data.iloc[test_index]

    # Fit on the training fold; the same estimator object is refit every fold.
    X = train_rows.iloc[:, 4:].values.tolist()
    y = train_rows.iloc[:, 3].values.tolist()
    model.fit(X, y)

    # Score the held-out fold.
    X_test = test_rows.iloc[:, 4:].values.tolist()
    y_test = test_rows.iloc[:, 3].values.tolist()
    y_pred = model.predict(X_test)

    per, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    fold_accuracy = accuracy_score(y_test, y_pred)
    result.append((per, rec, f1, fold_accuracy))
    

In [None]:
# Summarise the folds: column-wise mean and standard deviation of the
# (precision, recall, f1, accuracy) tuples collected in `result`.
fold_means = np.mean(result, axis=0)
fold_stds = np.std(result, axis=0)
metric_labels = ("Precision: ", "Recall:    ", "f-1:       ", "Accuracy:  ")
for metric_label, avg, spread in zip(metric_labels, fold_means, fold_stds):
    print(f"{metric_label}{avg} +/- {spread}")

## RF(FastText)

In [None]:
def file_len(fname):
    """Return the number of lines in the text file at ``fname``.

    The file is streamed line by line, so it is never loaded into memory as a
    whole. An empty file yields 0 (the previous loop-variable based count
    raised ``UnboundLocalError`` in that case). A final line without a
    trailing newline still counts as a line.
    """
    with open(fname) as f:
        return sum(1 for _ in f)

class ByteExpression:
    """Re-iterable stream of token lists read from a tab-separated file.

    Every pass re-opens the file, so the object can be iterated more than once.
    For each line the fourth tab-separated field (index 3) is taken: on the
    first line (the header) that field is printed and the line is skipped; on
    every other line the field is split on whitespace and yielded as a list of
    tokens. Progress is reported through a tqdm bar sized with the line count
    measured once at construction time.
    """

    def __init__(self, filename):
        self.filename = filename
        # Counted up front so the progress bar has a total (header included).
        self.file_len = file_len(filename)

    def __iter__(self):
        with open(self.filename, 'r') as handle:
            progress = tqdm(enumerate(handle), total=self.file_len, ncols=500)
            for row_number, raw_line in progress:
                field = raw_line.split('\t')[3]
                if row_number:
                    yield field.split()
                else:
                    # header row: show the column name, emit nothing
                    print(field)
                

In [None]:
# Read the tab-separated bytecode table and order the rows by hostname, then
# script URL, with a fresh 0..n-1 index so positional fold indices line up.
data = (
    pd.read_csv(f"{base_dir}/D3_bytecode.csv", sep='\t')
    .sort_values(by=['hostname', 'script_url'], ascending=[True, True])
    .reset_index(drop=True)
)

In [None]:

# Worker count handed to gensim; resolved once because it does not change
# between folds.
cpu = effective_n_jobs(-1)
# Module-level handle read by `mean`; rebound to a fresh model in every fold.
ft_model = None


def mean(vec):
    """Return the average FastText vector of a whitespace-separated token string.

    Looks every token of ``vec`` up in the current module-level ``ft_model``
    and returns the element-wise mean (axis 0) of those vectors.
    """
    return np.mean([ft_model.wv[token] for token in vec.split()], axis=0)


model = RandomForestClassifier(n_jobs=-1,  random_state=25,  n_estimators=200,  max_features='sqrt')
result = list()  # one (precision, recall, f1, accuracy) tuple per fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)

# The per-fold training dumps are written here; make sure the directory
# exists so a fresh run does not fail on the first to_csv call.
os.makedirs(temp_dir, exist_ok=True)

for ix, (train_index, test_index) in enumerate(kf.split(np.zeros(len(data['label'])), data['label'])):
    # Dump the training fold to disk so ByteExpression can stream it; only
    # training rows are used to fit the embedding.
    temp_file = f"{temp_dir}/{ix}"
    data.iloc[train_index].to_csv(temp_file, sep='\t', index=False)
    be = ByteExpression(temp_file)

    ft_model = FastText(vector_size=100, window=3, epochs=10, workers=cpu, sg=1)
    ft_model.build_vocab(corpus_iterable=be)
    # build_vocab only prepares the vocabulary; without an explicit train()
    # call the vectors keep their initial values and `epochs` has no effect.
    ft_model.train(corpus_iterable=be, total_examples=ft_model.corpus_count, epochs=ft_model.epochs)

    # `mean` reads the module-level `ft_model`, so each pool is created only
    # after the model for this fold is ready (assumes worker processes inherit
    # the parent's globals, i.e. a fork-style start method -- TODO confirm on
    # platforms that spawn).
    with mp.Pool(mp.cpu_count()) as pool:
        print("[INFO] Converting train bytecode to vectors ...")
        X = pool.map(mean, data.iloc[train_index]['bytecode'])
    y = data.iloc[train_index]['label'].values.tolist()
    print("[INFO] Done ... training")
    model.fit(X, y)

    with mp.Pool(mp.cpu_count()) as pool:
        print("[INFO] Converting test bytecode to vectors ...")
        X_test = pool.map(mean, data.iloc[test_index]['bytecode'])
    y_test = data.iloc[test_index]['label'].values.tolist()
    y_pred = model.predict(X_test)
    per, rec, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
    print(f"[INFO] Precision: {per}\tRecall: {rec}\tf-1: {f1}")
    result.append((per, rec, f1, accuracy_score(y_test, y_pred)))
 

In [None]:
# Summarise the folds: column-wise mean and standard deviation of the
# (precision, recall, f1, accuracy) tuples collected in `result`.
fold_means = np.mean(result, axis=0)
fold_stds = np.std(result, axis=0)
metric_labels = ("Precision: ", "Recall:    ", "f-1:       ", "Accuracy:  ")
for metric_label, avg, spread in zip(metric_labels, fold_means, fold_stds):
    print(f"{metric_label}{avg} +/- {spread}")