# In [2]:
import sys
import gzip
import glob
import pandas as pd
import numpy as np
from joblib import load


# Names given, in order, to the space-separated fields of every log record.
COLUMN_NAMES = [
    "job_id",
    "queue",
    "job_type",
    "hn",
    "just_started",
    "just_finished",
    "js",
    "nc",
    "hsj",
    "hsm",
    "cpt",
    "rt",
    "owner",
    "ram",
    "img",
    "ts",
    "sn",
    "disk",
]

# Columns parsed with the pandas "category" dtype.
CATEGORICAL_VARS = [
    "queue",
    "job_type",
    "just_started",
    "just_finished",
    "owner",
]

# Columns converted to the pandas "string" dtype after loading.
STRING_VARS = [
    "job_id",
    "hn",
    "sn",
]

# Per-job sample columns that are reduced to windowed means.
AGG_COLUMNS = [
    "ram",
    "img",
    "disk",
]

# A queue whose name contains any of these is labelled "lhc".
LHC_QUEUES = [
    "alice",
    "atlas",
    "cms",
    "lhcb",
]

 
class Preprocessor:
    """Turn a folder of gzipped job logs into one feature row per job."""

    @staticmethod
    def preprocess(inputf):
        """Build the per-job feature table from the logs in ``inputf``.

        Every ``*.gz`` file directly inside ``inputf`` is read as a
        space-separated table whose fields are named by ``COLUMN_NAMES``.
        Records are grouped per job (``job_id`` + "_" + ``sn``). A job is
        kept only when its first ``rt`` sample is at most 180, it has at
        least 20 samples and its largest ``js`` value equals 2. For every
        kept job, the first 20 samples of each column in ``AGG_COLUMNS``
        are reduced to four means over consecutive windows of five samples.

        Args:
            inputf: Path of the folder that holds the ``*.gz`` log files.

        Returns:
            A DataFrame with the columns ``job``, ``job_type`` and
            ``job_work_type`` followed by ``<col>_0`` .. ``<col>_3`` for
            each ``<col>`` in ``AGG_COLUMNS``.

        Raises:
            ValueError: If ``inputf`` contains no ``*.gz`` file.
        """
        n_samples = 20  # minimum samples per job; only the first 20 are used
        window = 5      # samples averaged into one feature
        windows = [(lo, lo + window) for lo in range(0, n_samples, window)]

        # glob returns files in arbitrary order; sort by name so the sample
        # order inside each job (and therefore ``rt[0]`` and the window
        # means) is reproducible from run to run.
        log_files = sorted(glob.glob(inputf + "/*.gz"))
        if not log_files:
            raise ValueError(f"No *.gz log files found in {inputf!r}")

        categorical_dtypes = {c: "category" for c in CATEGORICAL_VARS}
        data = pd.concat(
            (
                pd.read_csv(
                    f,
                    sep=" ",
                    names=COLUMN_NAMES,
                    dtype=categorical_dtypes,
                    compression="gzip",
                )
                for f in log_files
            ),
            # Each file restarts its row labels at 0; renumber so the
            # combined frame has a unique index.
            ignore_index=True,
        )

        data[STRING_VARS] = data[STRING_VARS].astype("string")
        data.insert(0, "job", data["job_id"] + "_" + data["sn"])

        # Regex alternation: matches when the queue name contains any of
        # the LHC_QUEUES entries.
        lhc_pattern = "|".join(LHC_QUEUES)
        data["job_work_type"] = (
            data["queue"]
            .str.contains(lhc_pattern)
            .map({True: "lhc", False: "non-lhc"})
            .astype("category")
        )

        agg_data = data.groupby("job").agg({
            "rt": list,
            **{col: list for col in AGG_COLUMNS},
            # The string name avoids the FutureWarning pandas emits when
            # the builtin ``max`` is passed as an aggregation callable.
            "js": "max",
            "job_type": "first",
            "job_work_type": "first",
        })

        rt_samples = agg_data["rt"]
        keep = (
            rt_samples.apply(lambda samples: samples[0] <= 180)
            & rt_samples.apply(lambda samples: len(samples) >= n_samples)
            & (agg_data["js"] == 2)
        )
        filtered = (
            agg_data[keep]
            .drop(["rt", "js"], axis=1)
            .reset_index(drop=False)
        )

        for col in AGG_COLUMNS:
            filtered[col] = filtered[col].apply(
                lambda samples: [np.mean(samples[lo:hi]) for lo, hi in windows]
            )

        # Expand each list of window means into its own ``<col>_<i>``
        # columns. The column labels are fixed explicitly so that an empty
        # result still carries the feature columns.
        feature_frames = [
            pd.DataFrame(
                filtered[col].tolist(), columns=range(len(windows))
            ).add_prefix(f"{col}_")
            for col in AGG_COLUMNS
        ]
        id_columns = filtered.columns.difference(AGG_COLUMNS)
        return pd.concat([filtered[id_columns], *feature_frames], axis=1)
    

class Classifier:
    """Thin wrapper around a model restored from a joblib dump."""

    def __init__(self, inputf):
        """Restore the model stored at ``inputf`` and keep it as ``self.model``.

        NOTE(review): ``joblib.load`` is pickle-based and can execute
        arbitrary code while loading; only pass dumps from a trusted source.
        """
        restored = load(inputf)
        self.model = restored

    def predict(self, X):
        """Return whatever the wrapped model's ``predict`` returns for ``X``."""
        predictions = self.model.predict(X)
        return predictions
        

if __name__ == '__main__':
    # Command-line entry point: preprocess the logs in <log folder>,
    # classify every job with the model in <model dump> and write one
    # "<job> <prediction>" line per job to results.txt.
    args = sys.argv[1:]
    if len(args) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1;
        # unlike the bare ``exit`` helper it does not depend on the site module.
        sys.exit("Usage: python classifier.py <log folder> <model dump>")

    # The paths come from the command line, as the usage message promises,
    # rather than being replaced by hard-coded "./test" / "./model.joblib".
    data = Preprocessor.preprocess(args[0])
    clf = Classifier(args[1])
    np.savetxt(
        'results.txt',
        np.c_[data['job'], clf.predict(data)],
        fmt=('%s', '%d'),
    )