In [None]:
import os
import ray
import time
import joblib
import ujson as json

from lightgbm import LGBMClassifier

from PEParser import *

In [None]:
# Fixed random seed; passed as `random_state` wherever a model is constructed in this notebook.
SEED = 7777

def read_label_csv(path):
    """Load a two-column ``name,label`` CSV file into a dictionary.

    The first line is treated as a header and skipped. Blank lines (for
    example a trailing empty line at the end of the file) are ignored rather
    than raising ``ValueError`` while unpacking the row.

    Args:
        path: Path of the CSV file. The file is decoded as ISO-8859-1.

    Returns:
        dict mapping the first column (str) to the second column parsed as int.

    Raises:
        ValueError: If a non-blank row does not consist of exactly two
            comma-separated fields, or its label is not an integer.
    """
    label_table = dict()
    with open(path, "r", encoding='ISO-8859-1') as f:
        # Skip the header row; the default keeps an empty file from raising.
        next(f, None)
        # Iterate lazily instead of materialising every line with readlines().
        for line in f:
            line = line.strip()
            if not line:
                # Blank rows carry no label; skipping them avoids an unpack error.
                continue
            fname, label = line.split(",")
            label_table[fname] = int(label)
    return label_table

def train(X_train, y_train, model):
    """Build the requested model with the notebook seed and fit it.

    Args:
        X_train: Training features handed to ``fit``.
        y_train: Training labels handed to ``fit``.
        model: Model selector forwarded to ``load_model``.

    Returns:
        The object produced by ``load_model`` after ``fit`` was called on it.
    """
    # NOTE(review): `load_model` is not defined in the visible cells; presumably
    # it is provided by `from PEParser import *` -- confirm.
    estimator = load_model(random_state=SEED, model=model)
    estimator.fit(X_train, y_train)
    return estimator

In [None]:
@ray.remote
def getFeaturesA(fname, infer):
    """Ray task: build the feature vector for one sample from PEMINER + EMBER.

    Args:
        fname: Sample name; the JSON file ``{fname}.json`` is read per source.
        infer: Name of the dataset sub-directory under each source directory.

    Returns:
        A list holding the PEMINER features followed by the EMBER features.
    """
    # Source directory name paired with the extractor applied to its JSON file;
    # order matters because it fixes the feature layout.
    sources = (
        ("PEMINER", PeminerFeatureExtract),
        ("EMBER", EmberFeatureExtract),
    )

    combined = []
    for data, extract in sources:
        json_path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        combined.extend(extract(json_path))
    return combined

@ray.remote
def getFeaturesB(fname, infer):
    """Ray task: build the feature vector for one sample from all three sources.

    Args:
        fname: Sample name; the JSON file ``{fname}.json`` is read per source.
        infer: Name of the dataset sub-directory under each source directory.

    Returns:
        A list holding the PEMINER, EMBER and PESTUDIO features, in that order.
    """
    # Source directory name paired with the extractor applied to its JSON file;
    # order matters because it fixes the feature layout.
    sources = (
        ("PEMINER", PeminerFeatureExtract),
        ("EMBER", EmberFeatureExtract),
        ("PESTUDIO", PestudioFeatureExtract),
    )

    combined = []
    for data, extract in sources:
        json_path = os.path.realpath(f"./데이터/{data}/{infer}/{fname}.json")
        combined.extend(extract(json_path))
    return combined

In [None]:
# Load the training labels; the dict keys are the sample names to process.
train_label_table = read_label_csv(
    os.path.realpath('데이터/학습데이터_정답.csv')
)
train_fnames = list(train_label_table)

In [None]:
# Extract training features in parallel with Ray and report the elapsed time.
start = time.time()

ray.init(local_mode=False, num_cpus=8)

# Labels, kept in the same order as the submitted tasks of each feature set.
trainyA = []
trainyB = []

# Pending Ray object refs for feature set A (PEMINER + EMBER) and
# feature set B (PEMINER + EMBER + PESTUDIO).
taskA = []
taskB = []
try:
    for fname in train_fnames:

        taskA.append(getFeaturesA.remote(fname, '학습데이터'))
        trainyA.append(train_label_table[fname])

        # Feature set B is only built for samples that have a PESTUDIO report.
        path = os.path.realpath(f"./데이터/PESTUDIO/학습데이터/{fname}.json")
        if os.path.exists(path):
            taskB.append(getFeaturesB.remote(fname, '학습데이터'))
            trainyB.append(train_label_table[fname])

    # ray.get on a list returns results in submission order, matching the labels.
    trainXA = ray.get(taskA)
    trainXB = ray.get(taskB)
finally:
    # Always release the Ray runtime, even when a task fails; otherwise
    # re-running this cell hits an error on the second ray.init().
    ray.shutdown()

print(time.time() - start, 'sec')

In [None]:
# Hyperparameter grid for LGBMClassifier. The training cells loop over the full
# Cartesian product (4 * 5 * 5 = 100 combinations) and save one model per combination.
N_ESTIMATORS = [128, 256, 384, 512]
NUM_LEAVES = [7, 15, 31, 63, 127]
LEARNING_RATE = [0.1, 0.07, 0.05, 0.03, 0.01]

In [None]:
# Train one LGBMClassifier per grid combination on feature set A and save each
# fitted model as ./modelA/<idx>.pt, where idx counts combinations in loop order.

# Create the output directory up front so the first joblib.dump does not fail
# with FileNotFoundError after a model has already been fitted.
os.makedirs('./modelA', exist_ok=True)

idx = 0
for n_estimators in N_ESTIMATORS:
    for num_leaves in NUM_LEAVES:
        for learning_rate in LEARNING_RATE:
            clf = LGBMClassifier(n_estimators=n_estimators, num_leaves=num_leaves, learning_rate=learning_rate, random_state=SEED)
            clf.fit(trainXA, trainyA)
            joblib.dump(clf, f'./modelA/{idx}.pt')
            idx += 1

In [None]:
# Train one LGBMClassifier per grid combination on feature set B and save each
# fitted model as ./modelB/<idx>.pt, where idx counts combinations in loop order.

# Create the output directory up front so the first joblib.dump does not fail
# with FileNotFoundError after a model has already been fitted.
os.makedirs('./modelB', exist_ok=True)

idx = 0
for n_estimators in N_ESTIMATORS:
    for num_leaves in NUM_LEAVES:
        for learning_rate in LEARNING_RATE:
            clf = LGBMClassifier(n_estimators=n_estimators, num_leaves=num_leaves, learning_rate=learning_rate, random_state=SEED)
            clf.fit(trainXB, trainyB)
            joblib.dump(clf, f'./modelB/{idx}.pt')
            idx += 1