In [305]:
import sys
import gzip
import glob
import pandas as pd
import numpy as np
from joblib import load


COLUMN_NAMES = [
    "job_id", "group", "job_type", "host_name",
    "just_started", "just_finished", "job_status",
    "cores_used", "power_HS6_core", "power_HS6_machine",
    "cpu_time", "run_time", "owner", "ram_memory",
    "total_memory", "unix_time", "fromhost", "disk"
]
CATEGORICAL_VARS = [*COLUMN_NAMES[1:3], *COLUMN_NAMES[4:6], "owner"]
STRING_VARS = ["job_id", "host_name", "fromhost"]
AGG_COLUMNS = ['run_time', 'ram_memory', 'total_memory', 'disk']
LHC_QUEUES = ['alice', 'atlas', 'cms', 'lhcb']


class classifier:
    def __init__(self, inputf):
        self.model = load(inputf)

    def predict():
        print("predicted")
        
def preprocess(inputf):
    data = pd.concat(map(lambda f: pd.read_csv(f, 
                                           sep=" ", 
                                           names=COLUMN_NAMES, 
                                           dtype={c: "category" for c in CATEGORICAL_VARS}, 
                                           compression="gzip"), 
                     glob.glob(inputf + "/*.gz")))
    
    for VAR in STRING_VARS:
        data[VAR] = data[VAR].astype("string")

    data.insert(0, 'job', data['job_id'] + "_" + data['fromhost'] + "_" + data['host_name'])
    data.drop(['job_id', 'fromhost'], axis=1, inplace=True)
    data['job_queue_type'] = data['group'].str.contains(str.join("|", LHC_QUEUES)).map({True: "lhc", False: "non-lhc"}).astype('category')
    
    agg_data = data.groupby("job").agg({
    'run_time': list,
    'ram_memory': list, 
    'total_memory':list,
    'disk': list,
    'job_status': max
    })
    
    transformed_df = pd.merge(agg_data[
        (agg_data['run_time'].apply(lambda x: x[0] <= 180)) & 
        (agg_data['run_time'].apply(lambda x: len(x) >= 20)) &
        (agg_data['job_status'] == 2)
    ].reset_index(drop=False), data[['job', 'job_type', 'job_queue_type']], on = 'job')
    
    return transformed_df


if __name__ == '__main__':
    args = sys.argv[1:]
    if len(args) == 0:
        print("Usage: python classifier.py <model dump> <log file>")
        exit(1)
                
    data = preprocess("./test")

In [306]:
data

Unnamed: 0,job,run_time,ram_memory,total_memory,disk,job_status,job_type,job_queue_type
0,5017411.0_ce05-htc_cn-316-05-11,"[42, 223, 402, 584, 762, 943, 1123, 1303, 1483...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.998612, 0.998612, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...",2,grid,lhc
1,5017411.0_ce05-htc_cn-316-05-11,"[42, 223, 402, 584, 762, 943, 1123, 1303, 1483...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.998612, 0.998612, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...",2,grid,lhc
2,5017411.0_ce05-htc_cn-316-05-11,"[42, 223, 402, 584, 762, 943, 1123, 1303, 1483...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.998612, 0.998612, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...",2,grid,lhc
3,5017411.0_ce05-htc_cn-316-05-11,"[42, 223, 402, 584, 762, 943, 1123, 1303, 1483...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.998612, 0.998612, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...",2,grid,lhc
4,5017411.0_ce05-htc_cn-316-05-11,"[42, 223, 402, 584, 762, 943, 1123, 1303, 1483...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.998612, 0.998612, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...","[2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, 2.7e-05, ...",2,grid,lhc
...,...,...,...,...,...,...,...,...
975,9116137.0_ce02-htc_wn-205-13-27-07-a,"[84, 264, 444, 625, 803, 984, 1165, 1344, 1524...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.254392, 4.254392, ...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 207.636512...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 0.06795, 0...",2,grid,lhc
976,9116137.0_ce02-htc_wn-205-13-27-07-a,"[84, 264, 444, 625, 803, 984, 1165, 1344, 1524...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.254392, 4.254392, ...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 207.636512...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 0.06795, 0...",2,grid,lhc
977,9116137.0_ce02-htc_wn-205-13-27-07-a,"[84, 264, 444, 625, 803, 984, 1165, 1344, 1524...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.254392, 4.254392, ...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 207.636512...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 0.06795, 0...",2,grid,lhc
978,9116137.0_ce02-htc_wn-205-13-27-07-a,"[84, 264, 444, 625, 803, 984, 1165, 1344, 1524...","[0.0, 0.0, 0.0, 0.0, 0.0, 4.254392, 4.254392, ...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 207.636512...","[4e-06, 4e-06, 4e-06, 4e-06, 4e-06, 0.06795, 0...",2,grid,lhc


In [219]:
for COL in AGG_COLUMNS:
    transformed_df[COL] = transformed_df[COL].apply(lambda x: [np.mean(x[i:j]) for i, j in zip([0, 5, 10, 15], [5, 10, 15, 20])])

  return _methods._mean(a, axis=axis, dtype=dtype,


TypeError: cannot perform reduce with flexible type