In [1]:
import os
import ray
import time
import joblib
import numpy as np

from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from PEParser import *

In [2]:
# Seed constant; none of the cells visible in this notebook reads it.
# NOTE(review): presumably carried over from the training notebook - confirm or remove.
SEED = 7777

def read_label_csv(path):
    """Load a ``fname,label`` CSV into a dict mapping file name to integer label.

    The first line is treated as a header and skipped. Blank lines (for
    example a trailing empty line at the end of the file) are ignored, and
    each row is split on its *last* comma so that a file name which itself
    contains a comma is still parsed correctly.

    Args:
        path: Path of the CSV file. It is decoded as ISO-8859-1.

    Returns:
        dict: file name (str) -> label (int). Empty if the file has no data rows.

    Raises:
        ValueError: if a non-blank row contains no comma, or its label field
            cannot be converted with ``int``.
    """
    label_table = dict()
    with open(path, "r", encoding='ISO-8859-1') as f:
        # Drop the header row; the default keeps an empty file from raising.
        next(f, None)
        # Stream the file instead of materialising every line with readlines().
        for line in f:
            line = line.strip()
            if not line:
                continue
            fname, label = line.rsplit(",", 1)
            label_table[fname] = int(label)
    return label_table

def scoring(X_test, files, model, csv):
    """Predict every sample with a single model and append the rows to ``csv``.

    Args:
        X_test: Sequence of feature vectors, passed unchanged to ``model.predict``.
        files: File names, index-aligned with ``X_test``.
        model: Object exposing ``predict(X)`` whose result can be indexed per sample.
        csv: Writable text handle; receives one ``fname,label`` line per sample.
    """
    labels = model.predict(X_test)
    # The row count is driven by X_test, so files/labels are indexed, not zipped.
    for row in range(len(X_test)):
        name, label = files[row], labels[row]
        csv.write(f'{name},{label}\n')

def hard_voting(X_test, files, models, csv):
    """Combine several models by vote count and append the rows to ``csv``.

    The per-sample predictions of every model are summed. A sample is written
    with label 1 when that sum is strictly greater than ``len(models) // 2``
    and with label 0 otherwise, so an exact tie resolves to 0.
    The sum is only a vote count if each model emits 0/1 labels; that is
    assumed here and not checked.

    Args:
        X_test: Sequence of feature vectors, passed unchanged to each ``predict``.
        files: File names, index-aligned with ``X_test``.
        models: Iterable with a length, of objects exposing ``predict(X)``.
        csv: Writable text handle; receives one ``fname,label`` line per sample.
    """
    votes = np.zeros(len(X_test))
    for clf in models:
        # In-place accumulation into the float vote buffer.
        np.add(votes, np.array(clf.predict(X_test)), out=votes)

    threshold = len(models) // 2
    predict = (votes > threshold).astype(int)

    for row in range(len(X_test)):
        csv.write(f'{files[row]},{predict[row]}\n')

In [3]:
@ray.remote
def getFeaturesA(fname, infer):
    """Ray task: build one sample's feature vector from PEMINER and EMBER.

    For each source the JSON at ``./데이터/<source>/<infer>/<fname>.json`` is
    handed to the matching extractor and the results are concatenated in the
    order PEMINER, EMBER.

    Args:
        fname: Sample name without the ``.json`` suffix.
        infer: Sub-directory name placed under each source directory.

    Returns:
        list: the concatenated extractor outputs (the extractors come from
        PEParser; presumably each returns a list of numbers - confirm there).
    """
    extractors = (
        ("PEMINER", PeminerFeatureExtract),
        ("EMBER", EmberFeatureExtract),
    )

    feature_vector = []
    for data, extract in extractors:
        path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        feature_vector.extend(extract(path))

    return feature_vector

@ray.remote
def getFeaturesB(fname, infer):
    """Ray task: build one sample's feature vector from PEMINER, EMBER and PESTUDIO.

    For each source the JSON at ``./데이터/<source>/<infer>/<fname>.json`` is
    handed to the matching extractor and the results are concatenated in the
    order PEMINER, EMBER, PESTUDIO.

    Args:
        fname: Sample name without the ``.json`` suffix.
        infer: Sub-directory name placed under each source directory.

    Returns:
        list: the concatenated extractor outputs (the extractors come from
        PEParser; presumably each returns a list of numbers - confirm there).
    """
    extractors = (
        ("PEMINER", PeminerFeatureExtract),
        ("EMBER", EmberFeatureExtract),
        ("PESTUDIO", PestudioFeatureExtract),
    )

    feature_vector = []
    for data, extract in extractors:
        path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        feature_vector.extend(extract(path))

    return feature_vector

In [4]:
start = time.time()

# Start the Ray runtime that executes the remote feature-extraction tasks.
ray.init(local_mode=False, num_cpus=8)

# Sample names, index-aligned with the feature rows collected below.
fileA = []
fileB = []

# Pending Ray object references for the two feature sets.
taskA = []
taskB = []
try:
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting submission reproducible between runs.
    for entry in sorted(os.listdir(os.path.realpath("./데이터/PEMINER/테스트데이터"))):

        # Strip only the trailing extension and ignore anything that is not a
        # .json file, so stray directory entries never become tasks.
        fname, ext = os.path.splitext(entry)
        if ext != '.json':
            continue

        # Samples with a PESTUDIO report get the richer feature set B;
        # the rest fall back to feature set A.
        path = os.path.realpath(f"./데이터/PESTUDIO/테스트데이터/{fname}.json")
        if not os.path.exists(path):
            taskA.append(getFeaturesA.remote(fname, '테스트데이터'))
            fileA.append(fname)
        else:
            taskB.append(getFeaturesB.remote(fname, '테스트데이터'))
            fileB.append(fname)

    # Block until every task has finished; results keep submission order.
    testXA = ray.get(taskA)
    testXB = ray.get(taskB)
finally:
    # Always tear Ray down, even when a task fails, so the cell can be re-run
    # without hitting an "already initialised" error from ray.init.
    ray.shutdown()

print(time.time() - start, 'sec')

71.19274044036865 sec


In [5]:
# Indices of the saved models to load; each entry `idx` is read later from
# ./modelA/{idx}.pt or ./modelB/{idx}.pt respectively. How the indices were
# chosen is not shown in this notebook.
# Order matters for A: only the model of the first entry is used for scoring.
best_modelA_idx = [
    14, 99, 44,
    24, 74, 39,
    34, 49, 19,
]
best_modelB_idx = [
    87, 86, 61,
    92, 62, 88,
    67, 35, 91,
]

In [6]:
# Open the submission file for writing (an existing file is truncated) and
# emit the CSV header. `csv` is a plain file object, not the stdlib csv module.
# The handle is intentionally left open: later cells append prediction rows
# through it and the final cell closes it.
csv = open('submission.csv', 'w')
csv.write('fname,label\n')

12

In [7]:
# Load every selected "A" model from disk.
clfs = [joblib.load(f'./modelA/{idx}.pt') for idx in best_modelA_idx]
# NOTE(review): only the first loaded model scores the A group, while the B
# group is scored with hard_voting over all of its models - confirm that this
# asymmetry is intentional.
scoring(testXA, fileA, clfs[0], csv)

In [8]:
# Load every selected "B" model and write the vote-combined prediction for
# each sample of the B group to the submission file.
clfs = [joblib.load(f'./modelB/{idx}.pt') for idx in best_modelB_idx]
hard_voting(testXB, fileB, clfs, csv)

In [9]:
# Close the submission file handle that was opened in an earlier cell.
csv.close()