In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from dateutil import parser
import numpy as np
import sklearn
import scipy
import seaborn as sns
from sklearn.model_selection import train_test_split
import math
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold, GridSearchCV, ShuffleSplit, RandomizedSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier, DistanceMetric
%load_ext Cython
import cython
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
pd.set_option('mode.chained_assignment', None)
#cython: boundscheck=False, wraparound=False, nonecheck=False
from sklearn.utils.class_weight import compute_sample_weight
import jupyterthemes as jt
from jupyterthemes.stylefx import set_nb_theme
it = iter(jt.get_themes())
theme = next(it)
print("Current Theme: ", theme)
set_nb_theme(theme)
#ocean / chesterish



Current Theme:  chesterish


In [56]:
def timetoclass(row):
    dt_obj1 = parser.parse(row['rundate'])
    dt_obj2 = parser.parse(row['stopdate'])
    return (dt_obj2-dt_obj1).total_seconds()//3600

def timedif(row):
    dt_obj1 = parser.parse(row['rundate'])
    dt_obj2 = parser.parse(row['stopdate'])
    return (dt_obj2-dt_obj1).total_seconds()/60

def intime(row):
    dt_obj1 = parser.parse(row['indate'].split()[0]+' 00:00:00')
    dt_obj2 = parser.parse(row['indate'])
    return (dt_obj2-dt_obj1).total_seconds()/3600

def indatetime(row, start):
    dt_obj1 = parser.parse(start)
    dt_obj2 = parser.parse(row['indate'])
    return (dt_obj2-dt_obj1).total_seconds()/3600

def load_bases(paths, date, plotting=False, names=None):
    pd.set_option('display.max_columns', None)  # or 1000
    pd.set_option('display.max_rows', None)  # or 1000
    pd.set_option('display.max_colwidth', 100)  # or 199
    pd.options.display.expand_frame_repr = False
    dfs = [0]*len(paths)
    for i in range(len(paths)):
        dfs[i] = pd.read_csv(paths[i],error_bad_lines=False, sep=";")
        dfs[i]['stopdate'].replace('', np.nan, inplace=True)
        dfs[i].dropna(subset=['stopdate'], inplace=True)
        dfs[i]["exec"] = dfs[i].apply (lambda row: timetoclass(row), axis=1)
        dfs[i]["exec_min"] = dfs[i].apply (lambda row: timedif(row), axis=1)
        dfs[i]["intime"] = dfs[i].apply (lambda row: intime(row), axis=1)
        dfs[i]["indatetime"] = dfs[i].apply (lambda row: indatetime(row, dfs[i]['indate'].min()), axis=1)
        dfs[i] = dfs[i].loc[dfs[i]['ntime']<=1440]
        dfs[i] = dfs[i].loc[dfs[i]['nproc']<=2000]
        dfs[i] = dfs[i].loc[dfs[i]['exec_min']<=dfs[i]['ntime']]
        if plotting:
            plt.rcParams['figure.dpi'] = 200
            plt.rcParams['savefig.dpi'] = 200
            ax1 = plt.subplot(221+i)
            sns.distplot(dfs[i]['exec']).set_title("Distribution of exec in "+names[i])
            ax1.set_xlim(0, 25)
            plt.show()
    return dfs

def plot_cmdline_stats(dfs, names):    
    fig, axs = plt.subplots(len(paths), 1, figsize=(5,10))
    fig.tight_layout(pad=5.0)
    for i in range(len(paths)):
        df = dfs[i]
        cmds = {}
        for s in df['cmdline']:
            cnt = len(s.split())
            if cnt not in cmds:
                cmds[cnt] = 1
            else:
                cmds[cnt] += 1
        s_cmds = {c:cmds[c] for c in sorted(cmds)}
        axs[i].bar([f'{i}' for i in s_cmds.keys()], s_cmds.values(),linewidth=100)
        axs[i].set_title(names[i])
        axs[i].set_ylabel("кол-во задач")
        axs[i].set_xlabel("кол-во параметров")
    plt.show()
    
def cmd_keys_stats(dfs, names):
    for k in range(len(dfs)):
        df = dfs[k]
        cmds = {}
        cmd_list = df['cmdline'].tolist()
        for i in range(len(cmd_list)):
            s = cmd_list[i]
            local_list = []
            for ss in s.split():
                for sss in ss.split('/'):
                    if sss != '':
                        local_list.append(sss)
            for j in range(len(local_list)-1):
                if local_list[j] is None or len(local_list[j])==0:
                    continue
                if local_list[j][0] == '-' and local_list[j+1][0] != '-':
                    local_list[j] = local_list[j]+" "+local_list[j+1]
                    #print(local_list[j])
                    local_list[j+1] = None

            for sss in local_list:
                if sss is None:
                    continue
                if sss not in cmds:
                    cmds[sss] = 1
                else:
                    cmds[sss] += 1
        s_cmds ={k: v for k, v in sorted(cmds.items(), key=lambda item: -item[1]) if cmds[k] > 200 and not "home" in k and not "pstorage" in k and '' != k and '.'!=k} 
        print(names[k])
        print(s_cmds)
        print('\n')

        
def unique_cmd_stat(dfs, names):
    fig, axs = plt.subplots(len(paths), 1, figsize=(5,10))
    fig.tight_layout(pad=5.0)
    s_cmds_arr = []
    for i in range(len(dfs)):
        df = dfs[i]
        cmds = {}
        for index, row in df.iterrows():
                uid = row['userid']
                if uid not in cmds:
                    cmds[uid] = [row['cmdline']]
                else:
                    cmds[uid].append(row['cmdline'])
        s_cmds ={k: v for k, v in sorted(cmds.items(), key=lambda item: -len(item[1]))} 
        s_cmds_arr.append(s_cmds)
        for x in s_cmds:
            print(x, " - ", len(s_cmds[x]), " - ", len(set(s_cmds[x])))
        v1 = [ len(set(s_cmds[x])) for x in s_cmds]
        v2 = [ len(s_cmds[x]) - len(set(s_cmds[x])) for x in s_cmds]
        v1,v2 = v1[:40], v2[:40]

        axs[i].bar([i for i in range(len(v1))], v1 ,linewidth=100, label = "Уникальне команды")
        axs[i].bar([i for i in range(len(v1))], v2, bottom=v1 ,linewidth=100,  label = "Повторные команды")
        axs[i].set_title(names[i])
        axs[i].set_ylabel("кол-во пусков")
        axs[i].set_xlabel("пользователь")
        axs[i].legend()
    return s_cmds_arr

def vif_stat(dfs, names):
    for k in range(len(dfs)):
        df = dfs[k]
        df = df.drop(columns=['indate','taskname', 'exittype', 'cmdline', 'rundate', 'stopdate', 'userid', 'indatetime'])
        df = df.dropna()
        df = df._get_numeric_data()
        # VIF dataframe
        vif_data = pd.DataFrame()
        vif_data["feature"] = df.columns
        # calculating VIF for each feature
        vif_data["VIF"] = [variance_inflation_factor(df.values, i)
                                  for i in range(len(df.columns))]
        print(names[k], vif_data)

def normalize_data(dfs, names = None, plotting=True):
    new_dfs = []
    for k in range(len(dfs)):
        df = dfs[k]
        dfdel = df.loc[df['ntime'] - df['exec_min'] <= 0.05]
        df = df.loc[df['ntime'] - df['exec_min'] > 0.05]
        df = df.sort_values(by=['indate'])
        df = df.drop(columns=['indate', 'rundate', 'stopdate', 'exittype', 'exec_min', 'taskname', 'indatetime'])

        col_to_dict = ['userid', 'gid', 'orgid']
        dicts = {'userid':{}, 'gid':{}, 'orgid':{}}
        for col in col_to_dict:
            for d in df[col]:
                if d in dicts[col]:
                    dicts[col][d]+=1
                else:
                    dicts[col][d]=1
            df[col] = [list(dicts[col]).index(d) for d in df[col]]

        col_to_norm = ['nproc','ntime','userid','gid','orgid','intime']
        for col in col_to_norm:
            colmin = df[col].min()
            colmax = df[col].max()
            if col in ['ntime']:
                dfdel.loc[:, col] = (dfdel[col] - colmin) / (colmax - colmin)
            df.loc[:, col] = (df[col] - colmin) / (colmax - colmin)
        new_dfs.append(df)
        # probability plot
        if plotting:
            plt.rcParams['figure.dpi'] = 200
            plt.rcParams['savefig.dpi'] = 200
            plt.subplots_adjust(left=0.1,
                    bottom=0.1,
                    right=0.9,
                    top=0.9,
                    wspace=0.5,
                    hspace=1.5)
            if names is None:
                names = ['_']*len(dfs)
            ax1 = plt.subplot(21 + 100*len(dfs)+ 2*k)
            sns.distplot(df['exec']).set_title(f"Distribution of exec in {names[k]}")
            ax1 = plt.subplot(22 + 100*len(dfs)+ 2*k)
            res = scipy.stats.probplot(df['exec'], plot=plt)
            #ax1 = plt.subplot(43 + 100*len(dfs)+ 4*k)
            #sns.distplot(df['nproc']).set_title("Distribution of nproc")
            #ax1 = plt.subplot(44 + 100*len(dfs)+ 4*k)
            #res = scipy.stats.probplot(df['nproc'], plot=plt)
            
            """
            plt.figure()
            plt.scatter(df['exec'], df['ntime'], s=5, label='Отобранные данные')
            plt.scatter(dfdel['exec'], dfdel['ntime'], s=5, c='red', label='Неподходящие данные')
            plt.ylabel('Запрошенное время(мин)', fontsize=12)
            plt.xlabel('Время выполнения(мин)', fontsize=12)
            plt.legend(loc='lower right')
            """
            
    return new_dfs

def prepare_cmd_data(dfs):
    cmd_data = []
    global_keys = []
    global_names = []
    for df in dfs:
        X = df.drop(columns=['exec'])
        program_data = ['']*len(X['cmdline'])
        cmd_dict = {"cmd_{}".format(i+1):([-1]*len(X['cmdline'])) for i in range(20)}
        keys = set()
        program_names = set()
        key_table = [[]]*len(X['cmdline'])
        for ii, s in enumerate(X['cmdline']):
            pas = False
            k = []
            program_name = s.split(' ')[0].split('/')[-1]
            program_names.add(program_name)
            for idx, ss in enumerate(s.split()):
                if pas:
                    pas = False
                    continue
                if ss[0]=='-' and idx<len(s.split())-1 and s.split()[idx+1][0]!='-':
                    keys.add(ss+' '+s.split()[idx+1])
                    k.append(ss+' '+s.split()[idx+1])
                    pas = True
                else:
                    keys.add(ss)
                    k.append(ss)
            #print(ii)
            key_table[ii] = k
            program_data[ii] = program_name
        keys = list(keys)
        program_names = list(program_names)
        #print(program_names)
        cmd_table = np.zeros((len(X['cmdline']), len(keys)), dtype=np.int32)
        for idx, ss in enumerate(key_table):
            program_data[idx] = program_names.index(program_data[idx])
            #for s in ss:
                #cmd_table[idx][keys.index(s)] = 1
            key_list = []
            for k in range(len(ss)):
                key_list.append(keys.index(ss[k]))
            key_list = sorted(key_list, reverse=True)
            #print(key_list)
            for k in range(len(key_list)):
                cmd_dict["cmd_{}".format(k+1)][idx]=key_list[k]
            
        for i in range(20):
            s = "cmd_{}".format(i+1)
            X[s] = cmd_dict[s]
        X['program_names'] = program_data
        X = X.drop(columns=['cmdline'])
        X = X.astype(np.float64)
        cmd_data.append(X)
        global_keys.append(keys)
        global_names.append(program_names)
    return cmd_data, global_keys, global_names

def generate_train_test_data(dfs,cmd_data, test_percent = 0.05):
    X_trains, X_tests, y_trains, y_tests = [],[],[],[]
    for i in range(len(dfs)):
        df = dfs[i]
        X = cmd_data[i]
        y = df.loc[:,['userid', 'exec']]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, shuffle=True)

        userdict = {}
        for u in X_train['userid']:
            if u in userdict:
                userdict[u]+=1
            else:
                userdict[u]=1

        for u in X_test['userid']:
            if u not in userdict:
                userdict[u]=0

        X_trains.append(X_train.loc[[userdict[user] > 3 for user in X_train['userid']]])
        X_tests.append(X_test.loc[[userdict[user] > 3 for user in X_test['userid']]])
        y_train = y_train.loc[[userdict[user] > 3 for user in y_train['userid']]]
        y_test = y_test.loc[[userdict[user] > 3 for user in y_test['userid']]]
        y_tests.append(y_test.drop(columns=['userid']))
        y_trains.append(y_train.drop(columns=['userid']))
    return X_trains, X_tests, y_trains, y_tests

In [3]:
user_idx = 2
group_idx = 3
org_idx = 4
cmd_idx = 5
intime_idx = 6
i = 0
def custom_metric(X1, X2, **kwargs):
    au = kwargs["au"]
    ag = kwargs["ag"]
    ao = kwargs["ao"]
    d = 0
    if X1[user_idx]!=X2[user_idx]:
        d+=au
    if X1[group_idx]!=X2[group_idx]:
        d+=ag
    if X1[org_idx]!=X2[org_idx]:
        d+=ao
    for idx in range(2):
            dif = (X1[idx] - X2[idx])**2
            d+=dif
    d+=(X1[intime_idx] - X2[intime_idx])**2
    d += sum(abs(cmd_table[int(X1[cmd_idx])] - cmd_table[int(X2[cmd_idx])]))/len(cmd_table[0])
            
    return d

In [4]:
"""
%%cython --annotate -n custom_metric_c
from libc.math cimport abs
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free


def cython_f(double[:] X1, double[:] X2, int p = 2, double w = -1):
    cdef double d = 0
    cdef double dif = 0
    cdef root_w = 1
    if w != -1:
        root_w = sqrt(w)
    for idx in range(6):
        dif = (root_w*(X1[idx] - X2[idx]))**2
        d+=dif
    dif = 0
    for i in range(6,6+20):
        for j in range(6,6+20):
            if (X1[i] == X2[j]):
                continue;
            if X1[i] > X2[j] or j == 25:
                dif+=1
                continue;
    d+=dif/1000
    return sqrt(d)
"""

'\n%%cython --annotate -n custom_metric_c\nfrom libc.math cimport abs\nfrom libc.stdio cimport printf\nfrom libc.stdlib cimport malloc, free\n\n\ndef cython_f(double[:] X1, double[:] X2, int p = 2, double w = -1):\n    cdef double d = 0\n    cdef double dif = 0\n    cdef root_w = 1\n    if w != -1:\n        root_w = sqrt(w)\n    for idx in range(6):\n        dif = (root_w*(X1[idx] - X2[idx]))**2\n        d+=dif\n    dif = 0\n    for i in range(6,6+20):\n        for j in range(6,6+20):\n            if (X1[i] == X2[j]):\n                continue;\n            if X1[i] > X2[j] or j == 25:\n                dif+=1\n                continue;\n    d+=dif/1000\n    return sqrt(d)\n'

In [5]:
%%writefile cython_metric.pyx
from libc.math cimport sqrt, abs
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free

cdef int* cmd_table
cdef int cmd_table_len
cdef int cmd_table_h
cdef int user_idx = 2
cdef int group_idx = 3
cdef int org_idx = 4
cdef int intime_idx = 5

def cython_metric(double[:] X1, double[:] X2, int p = 2, double w = -1):
    cdef double d = 0
    cdef double dif = 0
    cdef root_w = 1
    if w != -1:
        root_w = sqrt(w)
    for idx in range(6):
        dif = (root_w*(X1[idx] - X2[idx]))**2
        d+=dif
    dif = 0
    for i in range(6,6+20):
        for j in range(6,6+20):
            if (X1[i] == X2[j]):
                break;
            if X1[i] > X2[j] or j == 25:
                dif+=1
                break;
    d+=d*(dif/100)
    return sqrt(d)


Overwriting cython_metric.pyx


In [6]:
%%writefile cython_metric_v2.pyx
from libc.math cimport sqrt, abs
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free

cdef int* cmd_table
cdef int cmd_table_len
cdef int cmd_table_h
cdef int user_idx = 2
cdef int group_idx = 3
cdef int org_idx = 4
cdef int intime_idx = 5

def cython_metric_v2(double[:] X1, double[:] X2, int p = 2, double w = -1):
    cdef double d = 0
    cdef double dif = 0
    cdef root_w = 1
    if w != -1:
        root_w = sqrt(w)
    for idx in range(6):
        dif = (root_w*(X1[idx] - X2[idx]))**2
        d+=dif
    
    if X1[user_idx]!=X2[user_idx]:
        d+=1
        
    dif = 0   
    for i in range(6,6+20):
        for j in range(6,6+20):
            if (X1[i] == X2[j]):
                break;
            if X1[i] > X2[j] or j == 25:
                dif+=1
                break;
    d+=(dif/100)
    return sqrt(d)

Overwriting cython_metric_v2.pyx


In [7]:
%%writefile minkovski_metric.pyx
from libc.math cimport sqrt, abs
from libc.stdio cimport printf
from libc.stdlib cimport malloc, free

cdef int* cmd_table
cdef int cmd_table_len
cdef int cmd_table_h
cdef int user_idx = 2
cdef int group_idx = 3
cdef int org_idx = 4
cdef int intime_idx = 5

def minkovski_metric(double[:] u, double[:] v, int p = 2, double w = -1):
    cdef double d = 0
    cdef root_w = 1
    if w != -1:
        root_w = sqrt(w)
    for idx in range(6):
        d+=(root_w*(u[idx]-v[idx]))**2
    return sqrt(d)

Overwriting minkovski_metric.pyx


In [8]:
%%writefile setup.py
from distutils.core import setup
from Cython.Build import cythonize

setup(name="cython_metric", ext_modules=cythonize('cython_metric.pyx'))
#setup(name="minkovski_metric", ext_modules=cythonize('minkovski_metric.pyx'))
setup(name="cython_metric_v2", ext_modules=cythonize('cython_metric_v2.pyx'))

Overwriting setup.py


In [9]:
!python setup.py build_ext --inplace

Compiling cython_metric.pyx because it changed.
[1/1] Cythonizing cython_metric.pyx
running build_ext
building 'cython_metric' extension
"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.30.30705\bin\HostX86\x64\cl.exe" /c /nologo /O2 /W3 /GL /DNDEBUG /MD -IC:\Users\alexm\anaconda3\include -IC:\Users\alexm\anaconda3\Include "-IC:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.30.30705\ATLMFC\include" "-IC:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.30.30705\include" "-IC:\Program Files (x86)\Windows Kits\NETFXSDK\4.8\include\um" "-IC:\Program Files (x86)\Windows Kits\10\include\10.0.19041.0\ucrt" "-IC:\Program Files (x86)\Windows Kits\10\\include\10.0.19041.0\\shared" "-IC:\Program Files (x86)\Windows Kits\10\\include\10.0.19041.0\\um" "-IC:\Program Files (x86)\Windows Kits\10\\include\10.0.19041.0\\winrt" "-IC:\Program Files (x86)\Windows Kits\10\\include\10.0.19041.0\\cppwinrt" /Tccython_metric.c /Fobuild\temp.

  tree = Parsing.p_module(s, pxd, full_module_name)
  tree = Parsing.p_module(s, pxd, full_module_name)


In [10]:
from cython_metric import cython_metric
from minkovski_metric import minkovski_metric
#train_d = X_train.drop(columns=["cmd_{}".format(i+1) for i in range(20)])
#z = train_d.values[:2]
#print(z[0], z[1])
def python_metric( X1,  X2,  p = 2,  w = -1):
    d = 0
    dif = 0
    root_w = 1
    if w != -1:
        root_w = sqrt(w)
    for idx in range(6):
        dif = (root_w*(X1[idx] - X2[idx]))**2
        d+=dif
    dif = 0
    for i in range(6,6+20):
        for j in range(6,6+20):
            if (X1[i] == X2[j]):
                break;
            if X1[i] > X2[j] or j == 25:
                dif+=1
                break;
    print(dif)
    d+=dif/1000
    return d

In [11]:
from cython_metric import cython_metric
from minkovski_metric import minkovski_metric
from cython_metric_v2 import cython_metric_v2
ridge_param = {
    'alpha': [ 0.6, 0.7, 0.75, 0.8, 0.9],
    'fit_intercept':[True, False],
    'normalize':[True, False],
    'solver':['svd'],
    'class_weight':[None, 'balanced']
}

tree_param = {
    "splitter":["best","random"],
    "max_depth" : [10, 15, 20, 25,30, 50],
    "min_samples_leaf":[2,3,4,5,6,7,8,9,10],
    "criterion": ['entropy', 'gini'],
    "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
    }

knn_param_no_cmd = {
    "algorithm":['ball_tree', 'brute'],
    "n_neighbors":[16, 20, 24,28],
    #'metric':[minkovski_metric],
    "weights":['uniform', 'distance'],
}

knn_param = {
    "algorithm":['ball_tree', 'brute'],
    "n_neighbors":[16, 20, 24,28],
    'metric':[cython_metric],
    "weights":['uniform', 'distance'],
    }
knnv2_param = {
    "algorithm":['ball_tree', 'brute'],
    "n_neighbors":[4,8, 16, 20, 24,28],
    'metric':[cython_metric_v2],
    "weights":['uniform', 'distance'],
    }

In [88]:
def prepare_program_trees(X_trains,y_trains,cmd_data, glob_keys, glob_prog_names):
    program_trees_global = []
    for i in range(len(dfs)):
        program_trees = {}
        df = X_trains[i].join(y_trains[i])
        keys = glob_keys[i]
        prog_names = glob_prog_names[i]
        for name in prog_names:
            df_prog = df.loc[df['program_names'] == prog_names.index(name)]
            if df_prog.shape[0] >= 0.03*df.shape[0]:
                local_key_d = {}
                df_prog = df_prog.reset_index() 
                for index, row in df_prog.iterrows():
                    for k in range(1,21):
                        key = row["cmd_{}".format(k)]
                        if key in local_key_d.keys():
                            local_key_d[key] += 1
                        else:
                            local_key_d[key] = 1
                keys_to_check = list(local_key_d.keys())
                for key in keys_to_check:
                    if local_key_d[key] < 0.03*df_prog.shape[0]:
                        del local_key_d[key]
                print(name, df_prog.shape[0], len(local_key_d))
                
                
        
        

In [57]:
date = "14_02_23"
paths = [".\\run_info\\broadwell_"+date+".csv", ".\\run_info\\cascade_lake_"+date+".csv", ".\\run_info\\skylake_"+date+".csv"]
names = ["broadwell", "cascade_lake", "skylake"]
dfs = load_bases(paths, date, plotting=False, names=names)
#plot_cmdline_stats(dfs, names)
#cmd_keys_stats(dfs, names)
#s_cmds_arr = unique_cmd_stat(dfs, names)
#vif_stat(dfs, names)
dfs = normalize_data(dfs, names=names, plotting=False)#, plotting=False
cmd_data, keys, prog_names = prepare_cmd_data(dfs)
print(prog_names)
X_trains, X_tests, y_trains, y_tests = generate_train_test_data(dfs, cmd_data)


[['/home2/vasp/soft_intel_21/qe-7.1/qe_inst/bin/pw.x', '/lustre/lstore/iitp1/blaster', '/home5/oivt27/yakovenko_2022/acet-air_stoich_wc5_h10mm_r1mm/./computing_module', '-i au13o1_2pbe.in', '-log /home3/oivt23/aizel/calc/znt/ti72_nb22_zr6/outfile_305.txt', '/home5/hppi1/mvm/b3c/spin/asbatch', '/home2/impbran5/korsh/f_2022_2_ksi_diff/d_diff_eta_2.4_k_4_ksi_1.5_ww_1_1_h.0015_n7001_e0.014/un1', '/home5/ibhfrnf1/oreshonkov/castep-20.11/projects/bsmobr/1_smobr', '/home1/pavlova1/mgtu4/15nm/100/bulk/./lmp_mpi', '/home2/ccas3/aristov/sims/plate/run_plate_1', '/home4/pstorage2/ibhf7/glebushek/ads/pt2s2_fs/../../vasp6', '/home4/pstorage1/oivt21/s.pavlov/free_energy/ooh/0.642/dlpoly.x', 'ts_2cc_exo-nbd_tpp_lpbevv.in', '/home4/pstorage1/oivt21/s.pavlov/free_energy/ooh/0.295/dlpoly.x', '-stdout au12ni1o1_2.out', '/home5/phys1/kat3500/koshelev/yaroslav/efield/fm/_1.0/vasp_std', '/home5/ipfnn1/asladkov/smilie/s', '/home3/ccras8/shibo/varprof220/profw2', 'cppd', 'i_mpi_adjust_bcast=3', '/home4/pstora

In [89]:
program_trees_global = prepare_program_trees(X_trains,y_trains ,cmd_data, keys, prog_names)
#X_trains[0].join(y_trains[0]).head(10)
#X_tests[1].describe()

vasp_std 1084 9
pw.x 278 14
dlpoly.x 519 3
orca-run 739 6
pic_2d_openmp 209 9
submit.sh 425 7
vasp6 222 3
lmp_intel_cpu_intelmpi 524 32
vasp_std 1005 6
pw.x 225 11
orca-run 496 9
vasp6 305 5
lmp.sh 29 20
vasp_std 120 10
pw.x 98 12
v610_std 23 14
orca-run 234 12


In [14]:
total_results = {}
outputs = {}
for i in range(len(dfs)):
    print(names[i])
    X_train, X_test, y_train, y_test = X_trains[i], X_tests[i], y_trains[i], y_tests[i]
    program_trees = program_trees_global[i]
    w = compute_sample_weight(class_weight='balanced', y=y_train)
    y_w = y_train['exec'].values.tolist()
    #print(w, y_w)
    #real_weights = [w[y_w.index(i)] for i in range(24)]
    models = {
        'knn':KNeighborsClassifier(),
        'knnv2':KNeighborsClassifier(),
        #'ridge':RidgeClassifier(),
        'tree':tree.DecisionTreeClassifier(),
        'knn without cmd':KNeighborsClassifier(),
        }

    params = {
        'knn' : knn_param,
        'knnv2' : knnv2_param,
        #'ridge': ridge_param,
        'tree': tree_param,
        'knn without cmd' : knn_param_no_cmd,
    }
    output = {}
    results = {}
    #w = compute_sample_weight(class_weight='balanced', y=y_train)
    for m in models:
        print(m)
        train_data = X_train
        test_data = X_test
        if m == 'tree':
            train_data = program_k_decision(train_data, program_trees)
        if m!='knn' and m!="knnv2":
            train_data = train_data.drop(columns=["cmd_{}".format(i+1) for i in range(20)])
            test_data = test_data.drop(columns=["cmd_{}".format(i+1) for i in range(20)])
        #print(train_data.describe())
        cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
        #search = RandomizedSearchCV(models[m], params[m], scoring='accuracy', n_jobs=-1, cv=cv, n_iter=30)
        search = GridSearchCV(models[m], params[m], scoring='f1_micro', n_jobs=-1, cv=cv)
        #if m.find('knn') < 0:
        #    result = search.fit(train_data, y_train,sample_weight=w)
        #else:
        result = search.fit(train_data, y_train)
        output[m] = result.predict(test_data)
        results[m] = result
        print('Best Score: %s' % result.best_score_)
        print('Best Hyperparameters: %s' % result.best_params_)
    total_results[names[i]] = results
    outputs[names[i]] = output

broadwell
knn
Best Score: 0.22143288776251877
Best Hyperparameters: {'algorithm': 'ball_tree', 'metric': <built-in function cython_metric>, 'n_neighbors': 16, 'weights': 'distance'}
knnv2
Best Score: 0.23607751655549825
Best Hyperparameters: {'algorithm': 'ball_tree', 'metric': <built-in function cython_metric_v2>, 'n_neighbors': 4, 'weights': 'distance'}
tree
Best Score: 0.2128324367662616
Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 30, 'max_leaf_nodes': None, 'min_samples_leaf': 5, 'splitter': 'random'}
knn without cmd
Best Score: 0.22143288776251877
Best Hyperparameters: {'algorithm': 'ball_tree', 'n_neighbors': 16, 'weights': 'distance'}
cascade_lake
knn
Best Score: 0.1640068059723854
Best Hyperparameters: {'algorithm': 'ball_tree', 'metric': <built-in function cython_metric>, 'n_neighbors': 16, 'weights': 'distance'}
knnv2
Best Score: 0.16271937605320433
Best Hyperparameters: {'algorithm': 'brute', 'metric': <built-in function cython_metric_v2>, 'n_neighbors': 4, '

In [15]:
for i in range(len(dfs)):
    print(names[i])
    y = y_tests[i].sort_values(by=['exec'])
    #plt.figure()
    x = range(len(y['exec']))
    #plt.scatter(x, y['exec'], s=5, label='real')
    out = outputs[names[i]]
    for m in models:
        #plt.scatter(x, y[m], s=5, label=m)
        print(m)
        print(classification_report(y['exec'], out[m], digits=4))
        print(m + " accuracy ", accuracy_score(y['exec'], out[m]))
        print(m + " F1 ", f1_score(y['exec'], out[m], average='micro'))

broadwell
knn
              precision    recall  f1-score   support

         0.0     0.6525    0.6937    0.6725       222
         1.0     0.1579    0.1429    0.1500        21
         2.0     0.1429    0.1429    0.1429        21
         3.0     0.0000    0.0000    0.0000        11
         4.0     0.0000    0.0000    0.0000         8
         5.0     0.1250    0.1250    0.1250         8
         6.0     0.0000    0.0000    0.0000         7
         7.0     0.0000    0.0000    0.0000         3
         8.0     0.0000    0.0000    0.0000         2
         9.0     0.0000    0.0000    0.0000         4
        10.0     1.0000    0.5000    0.6667         2
        11.0     0.0000    0.0000    0.0000         3
        12.0     0.0000    0.0000    0.0000         3
        13.0     0.0000    0.0000    0.0000         1
        14.0     0.0000    0.0000    0.0000         1
        15.0     0.0000    0.0000    0.0000         2
        18.0     0.0000    0.0000    0.0000         0
        19.0 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

In [16]:
list(map(str, set(y_train['exec'])))

['0.0',
 '1.0',
 '2.0',
 '3.0',
 '4.0',
 '5.0',
 '6.0',
 '7.0',
 '8.0',
 '9.0',
 '10.0',
 '11.0',
 '12.0',
 '13.0',
 '14.0',
 '15.0',
 '16.0',
 '17.0',
 '18.0',
 '19.0',
 '20.0',
 '21.0',
 '23.0']

In [17]:
import copy
res1 = copy.deepcopy(results)
print(res0, res1)
#best = tree.DecisionTreeRegressor(**(results['tree'].best_params_))


NameError: name 'res0' is not defined

In [None]:
[0,0,1]+[1,0,0]
