In [1]:
import os
import ray
import time
import joblib
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from PEParser import *

In [2]:
# Random seed constant.
# NOTE(review): no cell visible in this part of the notebook reads SEED —
# presumably kept for parity with the training notebook; confirm or remove.
SEED = 7777

def read_label_csv(path):
    """Load a ``filename,label`` CSV into a dict mapping filename -> int label.

    The first line is treated as a header and skipped. Blank lines (for
    example a trailing newline at the end of the file) are ignored, and each
    row is split on its LAST comma so that a filename which itself contains
    commas is kept intact.

    Args:
        path: Path of the CSV file; it is read as ISO-8859-1 text.

    Returns:
        dict: label (int) keyed by filename (str), in file order.

    Raises:
        ValueError: if a non-blank row has no comma or its label is not an
            integer.
    """
    label_table = dict()
    with open(path, "r", encoding='ISO-8859-1') as f:
        next(f, None)  # skip the header row (no-op on an empty file)
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank rows instead of failing on tuple unpacking.
                continue
            fname, label = line.rsplit(",", 1)
            label_table[fname] = int(label)
    return label_table

def evaluate(X_test, y_test, model):
    """Print accuracy, precision, recall and F1 of ``model`` on a test set.

    Args:
        X_test: Feature rows passed unchanged to ``model.predict``.
        y_test: Ground-truth labels aligned with ``X_test``.
        model: Any object exposing ``predict(X)``.

    Returns:
        None. The four scores are written to stdout, followed by a blank line.
    """
    y_pred = model.predict(X_test)

    # (label, metric) pairs, in the order they are reported.
    report = (
        ("Accuracy)", accuracy_score),
        ("Precision)", precision_score),
        ("Recall)", recall_score),
        ("F1-Score)", f1_score),
    )
    for label, metric in report:
        print(label, metric(y_test, y_pred))
    print()
    
def hard_voting(X_test, y_test, models):
    """Print metrics for a majority-vote ensemble of ``models``.

    The predictions of every model are summed per sample; a sample is
    labelled 1 when that sum is strictly greater than ``len(models) // 2``
    and 0 otherwise.
    NOTE(review): this assumes each model predicts 0/1 labels — confirm.

    Args:
        X_test: Feature rows passed unchanged to each ``model.predict``.
        y_test: Ground-truth labels aligned with ``X_test``.
        models: Sized iterable of objects exposing ``predict(X)``.

    Returns:
        None. Accuracy, precision, recall and F1 of the voted prediction are
        written to stdout, followed by a blank line.
    """
    votes = np.zeros(len(X_test))
    for voter in models:
        votes += np.array(voter.predict(X_test))

    majority = len(models) // 2
    y_pred = (votes > majority).astype(int)

    # (label, metric) pairs, in the order they are reported.
    report = (
        ("Accuracy)", accuracy_score),
        ("Precision)", precision_score),
        ("Recall)", recall_score),
        ("F1-Score)", f1_score),
    )
    for label, metric in report:
        print(label, metric(y_test, y_pred))
    print()

In [3]:
@ray.remote
def getFeaturesA(fname, infer):
    """Ray task: build the PEMINER + EMBER feature vector for one file.

    For each source, the JSON at ``./데이터/<source>/<infer>/<fname>.json`` is
    handed to that source's extractor and the result is appended to a single
    list, PEMINER first and EMBER second.

    Args:
        fname: File name without the ``.json`` extension.
        infer: Name of the dataset sub-directory to read from.

    Returns:
        list: the concatenated features from both extractors.
    """
    # Insertion order of this table defines the feature order.
    extractors = {
        "PEMINER": PeminerFeatureExtract,
        "EMBER": EmberFeatureExtract,
    }

    feature_vector = []
    for data, extract in extractors.items():
        path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        feature_vector.extend(extract(path))
    return feature_vector

@ray.remote
def getFeaturesB(fname, infer):
    """Ray task: build the PEMINER + EMBER + PESTUDIO feature vector for one file.

    For each source, the JSON at ``./데이터/<source>/<infer>/<fname>.json`` is
    handed to that source's extractor and the result is appended to a single
    list, in the order PEMINER, EMBER, PESTUDIO.

    Args:
        fname: File name without the ``.json`` extension.
        infer: Name of the dataset sub-directory to read from.

    Returns:
        list: the concatenated features from the three extractors.
    """
    # Insertion order of this table defines the feature order.
    extractors = {
        "PEMINER": PeminerFeatureExtract,
        "EMBER": EmberFeatureExtract,
        "PESTUDIO": PestudioFeatureExtract,
    }

    feature_vector = []
    for data, extract in extractors.items():
        path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        feature_vector.extend(extract(path))
    return feature_vector

In [4]:
# Validation labels keyed by file name.
eval_label_table = read_label_csv(os.path.realpath('데이터/검증데이터_정답.csv'))
# File names in label-table order; iterating a dict yields its keys.
eval_fnames = list(eval_label_table)

In [5]:
# Extract validation features in parallel with Ray and time the whole step.
#
# Each file is routed by whether a PESTUDIO JSON exists for it:
#   * present -> getFeaturesB (PEMINER + EMBER + PESTUDIO) -> evalXB / evalyB
#   * absent  -> getFeaturesA (PEMINER + EMBER)            -> evalXA / evalyA
# Labels are appended in the same order the tasks are submitted, so each
# label list lines up with the result list returned by ray.get.
start = time.time()

ray.init(local_mode=False, num_cpus=8)

evalyA = []
evalyB = []

taskA = []
taskB = []
try:
    for fname in eval_fnames:
        path = os.path.realpath(f"./데이터/PESTUDIO/검증데이터/{fname}.json")
        if os.path.exists(path):
            taskB.append(getFeaturesB.remote(fname, '검증데이터'))
            evalyB.append(eval_label_table[fname])
        else:
            taskA.append(getFeaturesA.remote(fname, '검증데이터'))
            evalyA.append(eval_label_table[fname])

    evalXA = ray.get(taskA)
    evalXB = ray.get(taskB)
finally:
    # Always release the Ray runtime, even when a task fails; otherwise the
    # next run of this cell would call ray.init on an already-running session.
    ray.shutdown()

print(time.time() - start, 'sec')

71.15895414352417 sec


In [6]:
# Indices of the saved classifiers that make up each ensemble. Every index maps
# to a file ./modelA/<idx>.pt or ./modelB/<idx>.pt loaded by the inference
# cells, and the first entry of each list is the model reported on its own as
# the "best model".
# NOTE(review): how these indices were chosen is not shown here — presumably
# ranked by validation score in the training notebook; confirm.
best_modelA_idx = [14, 99, 44, 24, 74, 39, 34, 49, 19]
best_modelB_idx = [87, 86, 61, 92, 62, 88, 67, 35, 91]

In [7]:
# Load the selected modelA classifiers, in the order given by best_modelA_idx.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
clfs = [joblib.load(f'./modelA/{idx}.pt') for idx in best_modelA_idx]

# Score the first classifier on its own ...
print('modelA best model inference')
evaluate(evalXA, evalyA, clfs[0])

# ... then the majority vote over all of them.
print('modelA ensemble inference')
hard_voting(evalXA, evalyA, clfs)

modelA best model inference
Accuracy) 0.9653465346534653
Precision) 0.967032967032967
Recall) 0.9943502824858758
F1-Score) 0.9805013927576602

modelA ensemble inference
Accuracy) 0.9603960396039604
Precision) 0.9720670391061452
Recall) 0.9830508474576272
F1-Score) 0.9775280898876404



In [8]:
# Load the selected modelB classifiers, in the order given by best_modelB_idx.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
clfs = [joblib.load(f'./modelB/{idx}.pt') for idx in best_modelB_idx]

# Score the first classifier on its own ...
print('modelB best model inference')
evaluate(evalXB, evalyB, clfs[0])

# ... then the majority vote over all of them.
print('modelB ensemble inference')
hard_voting(evalXB, evalyB, clfs)

modelB best model inference
Accuracy) 0.963053684425393
Precision) 0.9821708743403224
Recall) 0.96672750245683
F1-Score) 0.974388000566011

modelB ensemble inference
Accuracy) 0.9633598693610941
Precision) 0.9823160296634341
Recall) 0.9670082830268145
F1-Score) 0.9746020516448531

