In [None]:
import argparse
import scipy
from sklearn.externals import joblib
import math

import pandas as pd

from sklearn import preprocessing
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper

from sklearn import ensemble
import sklearn
import numpy as np
from matplotlib import pyplot as plt
from preprocessing import ChooseFeatureColumns
from preprocessing import MyMapper
import ast
import os
import skgarden

import mysql.connector
from tabulate import tabulate

# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook; the fallbacks are the
# original local-development values, so existing setups keep working.
cnx = mysql.connector.connect(user=os.environ.get('DATADUMP_DB_USER', 'root'),
                              password=os.environ.get('DATADUMP_DB_PASSWORD', 'root'),
                              host=os.environ.get('DATADUMP_DB_HOST', 'localhost'),
                              database=os.environ.get('DATADUMP_DB_NAME', 'datadump'))
# Shared cursor used by TrainModel to upsert its metrics.
cursor = cnx.cursor()

class TrainModel(object):
    """Train a walltime-failure classifier for one tool and record its accuracy.

    Constructing an instance runs the whole workflow: load the tool's
    successful jobs and its walltime-failed jobs, label and merge them, prune
    unusable columns, fit a random-forest classifier, and upsert the accuracy
    figures into the ``walltime_classify`` table through the module-level
    ``cursor``/``cnx`` pair.
    """

    def __init__(self, filename):
        """Run the workflow for the tool whose CSV dumps are named ``filename``.

        If ``data/csv_dump/<filename>.csv`` does not exist, a message is
        printed and the constructor returns early; the instance is then left
        without ``regr``, ``pipe`` and ``table``.
        """
        try:
            self.df = pd.read_csv("data/csv_dump/%s.csv" % filename)
        except FileNotFoundError:
            # Deliberate best effort: tools without a dump are skipped.
            print("dont have this file yet")
            return
        df2 = pd.read_csv("failed_jobs/walltime_csv/%s.csv" % filename)
        self.df["result"], df2["result"] = "ok", "walltime error"

        # Merge both frames and order the rows by job id.
        self.df.index = self.df["id"]
        df2.index = df2["id"]
        self.df = pd.concat([self.df, df2])
        self.df = self.df.sort_values("id")
        del self.df["id"]
        self.df = self.df.reset_index()

        # Two passes on purpose: one of the pruning thresholds depends on the
        # frame's current column count, so a second pass may drop more columns.
        self.df = self.remove_bad_columns(self.df)
        self.df = self.remove_bad_columns(self.df)

        num_errors = df2.shape[0]
        # pop() removes the label column from self.df, so df_features (the
        # same object) ends up holding features only.
        df_features, df_labels = self.df, self.df.pop("result")
        num_instances = len(df_labels)
        original_num_of_cols = df_features.shape[1]

        print("setting up...")
        self.regr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=12)
        self.pipe = sklearn.pipeline.Pipeline([
            ('chooser', ChooseFeatureColumns()),
            ('scaler', MyMapper()),
        ])

        test_size = 0.2
        test_start = len(df_labels) - int(len(df_labels) * test_size)

        split_randomly = False
        # The hold-out evaluation is only attempted with more than 10 rows.
        time_split = num_instances > 10
        if split_randomly and time_split:
            tr_features, ev_features, tr_labels, ev_labels = sklearn.model_selection.train_test_split(
                df_features, df_labels, test_size=test_size)
            print("splitting randomly")
        elif time_split:
            # Rows are sorted by id, so the last 20% form the evaluation set.
            tr_features, ev_features = df_features[:test_start], df_features[test_start:]
            tr_labels, ev_labels = df_labels[:test_start], df_labels[test_start:]
            print("splitting non-randomly")
        if time_split:
            # Sanity check: the train and eval transforms must agree in
            # len(list(...)); otherwise the hold-out evaluation is skipped.
            fitted_len = len(list(self.pipe.fit_transform(tr_features)))
            transformed_len = len(list(self.pipe.transform(ev_features)))
            if fitted_len != transformed_len:
                print("!! pipe transformation broken")
                time_split = False

        df_features = self.pipe.fit_transform(df_features)
        print("%r ----> %r" % (original_num_of_cols, df_features.shape[1]))
        # Guarded so an empty feature frame cannot raise ZeroDivisionError.
        if original_num_of_cols and float(df_features.shape[1]) / float(original_num_of_cols) > 10:
            print("GREATER THAN 10")

        if time_split:
            tr_features = self.pipe.fit_transform(tr_features)
            ev_features = self.pipe.transform(ev_features)

        self.table = "walltime_classify"

        # "Whole" accuracy is measured on the same rows the model was fit on.
        accuracy = self.analysis(filename, df_features, df_labels, df_features, df_labels, timesplit=False)
        print("whole accuracy: %r" % accuracy)
        if time_split:
            accuracy2 = self.analysis(filename, tr_features, tr_labels, ev_features, ev_labels, timesplit=True)
            print("time accuracy: %r" % accuracy2)

            metrics = {"toolid": filename, "num_instances": num_instances,
                       "accuracy": accuracy, "accuracy_time": accuracy2, "num_errors": num_errors}
            print(tabulate(sorted([(k, v) for k, v in metrics.items()])))
            command = '''INSERT INTO walltime_classify (toolid, num_instances, accuracy, accuracy_time_split, num_walltime_errors) 
                            
                            VALUES (%(toolid)s, %(num_instances)s, %(accuracy)s,%(accuracy_time)s,%(num_errors)s)
                            
                            ON DUPLICATE KEY UPDATE num_instances=%(num_instances)s, accuracy=%(accuracy)s, 
                            accuracy_time_split=%(accuracy_time)s, num_walltime_errors=%(num_errors)s;
                            '''
        else:
            metrics = {"toolid": filename, "num_instances": num_instances,
                       "accuracy": accuracy, "num_errors": num_errors}
            command = '''INSERT INTO walltime_classify (toolid, num_instances, accuracy, num_walltime_errors) 
                            
                            VALUES (%(toolid)s, %(num_instances)s, %(accuracy)s, %(num_errors)s)
                            
                            ON DUPLICATE KEY UPDATE num_instances=%(num_instances)s, accuracy=%(accuracy)s, num_walltime_errors=%(num_errors)s;
                            '''
        # Values are passed as parameters, never interpolated into the SQL.
        cursor.execute(command, metrics)
        cnx.commit()

        print("done")

    @staticmethod
    def _is_list_literal(text):
        """Return True when ``text`` parses as a Python list literal."""
        try:
            return type(ast.literal_eval(text)) == list
        except (ValueError, SyntaxError):
            # Bracketed but not a literal (e.g. "[x]"): not a list.
            return False

    def remove_bad_columns(self, df):
        """Drop every column that is not a usable feature, in place.

        Kept columns are the surviving ``parameters.*`` columns, the
        ``*_filetype`` columns with their matching file columns, the hardware
        columns and ``result``. A parameter column is discarded when it

        * is one of two hard-coded identifiers,
        * has at least half as many distinct values as there are rows,
        * is of object dtype with at least 10x as many distinct values as the
          frame has columns,
        * consists only of strings that parse as Python list literals, or
        * is the ``parameters.``-prefixed companion of a file column.

        Parameter columns whose non-null values are all double-quoted strings
        are converted to floats when every value parses.

        Returns the same (mutated) ``df`` object.
        """
        columns = list(df)
        parameters = [c for c in columns
                      if c.startswith("parameters.") and not c.startswith("parameters.__job_r")]
        filetypes = [c for c in columns
                     if c.endswith("_filetype") and not c.startswith("parameters.__job_r")]
        # Strip the 9-character "_filetype" suffix to get the file column name.
        files = [c[:-9] for c in filetypes]
        bad_parameters = {"parameters.__workflow_invocation_uuid__", "parameters.chromInfo"}
        n_rows, n_cols = df.shape

        for parameter in parameters:
            series = df[parameter].dropna()
            if all(type(item) == str and item.startswith('"') for item in series):
                try:
                    df[parameter] = df[parameter].str[1:-1].astype(float)
                except (AttributeError, TypeError, ValueError):
                    # Not numeric after unquoting (or not a string column at
                    # all): leave the column as it is.
                    pass
            n_unique = len(df[parameter].unique())
            if n_unique >= 0.5 * n_rows:
                bad_parameters.add(parameter)
            if df[parameter].dtype == object and n_unique >= 10 * n_cols:
                bad_parameters.add(parameter)
            if all(type(item) == str and item.startswith("[") and item.endswith("]")
                   for item in series):
                if all(self._is_list_literal(item) for item in series):
                    bad_parameters.add(parameter)

        for file in files:
            bad_parameters.add("parameters." + file)
            bad_parameters.add("parameters." + file + ".values")
            bad_parameters.add("parameters." + file + "|__identifier__")

        parameters = [p for p in parameters if p not in bad_parameters]
        hardware = ['destination_id',
                    'galaxy_slots',
                    'handler',
                    'job_runner_name',
                    'memtotal',
                    'processor_count',
                    'result']
        keep = set(parameters + filetypes + files + hardware)
        df.drop(columns=[c for c in columns if c not in keep], inplace=True)
        return df

    def analysis(self, filename, tr_features, tr_labels, ev_features, ev_labels, timesplit=False):
        """Fit ``self.regr`` on the training data and return accuracy on the eval data.

        ``filename`` and ``timesplit`` are accepted but not used here.
        """
        self.regr.fit(tr_features, tr_labels)
        ev_pred = self.regr.predict(ev_features)
        accuracy = sklearn.metrics.accuracy_score(ev_labels, ev_pred)
        return float(accuracy)

    def get_accuracy(self, cq):
        """Return the fraction of rows whose label lies strictly within pred +/- std.

        ``cq`` must provide ``labels``, ``pred`` and ``std`` columns. An empty
        frame yields ``nan`` instead of raising ``ZeroDivisionError``.
        """
        total = cq.shape[0]
        if total == 0:
            return float("nan")
        within = (cq["labels"] < cq["pred"] + cq["std"]) & (cq["labels"] > cq["pred"] - cq["std"])
        return float(within.sum()) / total

    def plot(self, cq, filename, timesplit=False):
        """Save predicted-vs-real scatter plots, without and with error bars.

        Figures go to ``plots_dump/`` and ``plots_w_error_dump/`` (inside a
        ``timesplit/`` subfolder when ``timesplit`` is true), named
        ``<filename>.png``. The target folders must already exist.
        """
        subdir = "timesplit/" if timesplit else ""
        for folder, with_error in (("plots_dump", False), ("plots_w_error_dump", True)):
            path = "%s/%s%s.png" % (folder, subdir, filename)
            plt.figure(figsize=(10, 10))
            plt.scatter(cq["labels"], cq["pred"])
            if with_error:
                plt.errorbar(cq["labels"], cq["pred"], yerr=[cq["std"], cq["std"]], fmt='o')
            plt.xlabel("Real Runtime")
            plt.ylabel("Predicted Runtime")
            plt.title("Mean predictions")
            plt.savefig(path)
            print("saved a plot to %s" % path)
            # Close the figure so repeated calls do not accumulate memory.
            plt.close()



def getfiles(dirpath):
    """Return the names of the regular files directly inside ``dirpath``.

    Sub-directories are skipped. Names are ordered by modification time,
    most recently modified first.
    """
    def modified_at(name):
        return os.path.getmtime(os.path.join(dirpath, name))

    names = []
    for entry in os.listdir(dirpath):
        if os.path.isfile(os.path.join(dirpath, entry)):
            names.append(entry)
    return sorted(names, key=modified_at, reverse=True)


# Tool ids are the walltime-failure CSV names with the 4-character ".csv"
# suffix removed, most recently modified first.
toolids = [name[:-4] for name in getfiles("failed_jobs/walltime_csv")]
# NOTE(review): this override discards the discovered list; it looks like a
# debugging leftover - confirm before relying on the results.
toolids = ["dummy"]
print(toolids)
# toolids=["iuc_vsearch_vsearch_masking_1.9.7.0"]

try:
    for i, toolid in enumerate(toolids):
        print(i, toolid)
        TrainModel(toolid)
finally:
    # Release the database resources even when training a tool fails.
    cursor.close()
    cnx.close()


In [1]:
import argparse
import scipy
from sklearn.externals import joblib
import math

import pandas as pd

from sklearn import preprocessing
from sklearn_pandas import gen_features
from sklearn_pandas import DataFrameMapper

from sklearn import ensemble
import sklearn
import numpy as np
from matplotlib import pyplot as plt
from preprocessing import ChooseFeatureColumns
from preprocessing import MyMapper
import ast
import os
import skgarden

import mysql.connector
from tabulate import tabulate

In [59]:
def _is_list_literal(text):
    """Return True when ``text`` parses as a Python list literal."""
    try:
        return type(ast.literal_eval(text)) == list
    except (ValueError, SyntaxError):
        # Bracketed but not a literal (e.g. "[x]"): not a list.
        return False


def remove_bad_columns(df):
    """Drop every column that is not a usable feature, in place.

    Kept columns are the surviving ``parameters.*`` columns, the
    ``*_filetype`` columns with their matching file columns, and ``result``.
    A parameter column is discarded when it

    * is one of two hard-coded identifiers,
    * has at least half as many distinct values as there are rows,
    * is of object dtype with at least 10x as many distinct values as the
      frame has columns,
    * consists only of strings that parse as Python list literals, or
    * is the ``parameters.``-prefixed companion of a file column.

    Parameter columns whose non-null values are all double-quoted strings are
    converted to floats when every value parses.

    Returns the same (mutated) ``df`` object.
    """
    columns = list(df)
    parameters = [c for c in columns
                  if c.startswith("parameters.") and not c.startswith("parameters.__job_r")]
    filetypes = [c for c in columns
                 if c.endswith("_filetype") and not c.startswith("parameters.__job_r")]
    # Strip the 9-character "_filetype" suffix to get the file column name.
    files = [c[:-9] for c in filetypes]
    bad_parameters = {"parameters.__workflow_invocation_uuid__", "parameters.chromInfo"}
    n_rows, n_cols = df.shape

    for parameter in parameters:
        series = df[parameter].dropna()
        if all(type(item) == str and item.startswith('"') for item in series):
            try:
                df[parameter] = df[parameter].str[1:-1].astype(float)
            except (AttributeError, TypeError, ValueError):
                # Not numeric after unquoting (or not a string column at all):
                # leave the column as it is.
                pass
        n_unique = len(df[parameter].unique())
        if n_unique >= 0.5 * n_rows:
            bad_parameters.add(parameter)
        if df[parameter].dtype == object and n_unique >= 10 * n_cols:
            bad_parameters.add(parameter)
        if all(type(item) == str and item.startswith("[") and item.endswith("]")
               for item in series):
            if all(_is_list_literal(item) for item in series):
                bad_parameters.add(parameter)

    for file in files:
        bad_parameters.add("parameters." + file)
        bad_parameters.add("parameters." + file + ".values")
        bad_parameters.add("parameters." + file + "|__identifier__")

    parameters = [p for p in parameters if p not in bad_parameters]
    # The hardware columns ('destination_id', 'galaxy_slots', 'handler',
    # 'job_runner_name', 'memtotal', 'processor_count') are disabled in this
    # variant, so only the label column is kept alongside the parameters.
    hardware = ['result']
    keep = set(parameters + filetypes + files + hardware)
    df.drop(columns=[c for c in columns if c not in keep], inplace=True)
    return df

In [69]:
# Tool under investigation; both CSVs are looked up by this name.
filename="devteam_picard_picard_MarkDuplicates_1.126.0"
# df: rows from the general csv dump; df2: rows from the memory_csv folder.
df=pd.read_csv("../data/csv_dump/%s.csv"%filename)
df2=pd.read_csv("memory_csv/%s.csv"%filename)

In [74]:
# Number of rows in the failure frame.
number_of_errors=df2.shape[0]
# Disabled: down-sample df to the size of df2.
# if number_of_errors< df.shape[0]:
#     df=df.sample(number_of_errors)
# Raises KeyError (see the recorded output): df2 has no "memtotal" column.
df2["memtotal"].value_counts()

KeyError: 'memtotal'

In [73]:
# Distribution of the memtotal column in df.
df["memtotal"].value_counts()

132112168.0    5380
132111800.0       8
16332488.0        4
Name: memtotal, dtype: int64

In [62]:
# Label the two frames before merging them.
# NOTE(review): df2 was read from "memory_csv/" in this notebook, yet its rows
# are labelled "walltime error" - confirm the label is intended.
df["result"], df2["result"]="ok", "walltime error"
# Merge both frames and order the rows by id.
df.index=df["id"]
df2.index=df2["id"]
df=pd.concat([df,df2])
df=df.sort_values("id")
del df["id"]
# reset_index() turns the id index back into a column, which is dropped again.
df=df.reset_index()
del df["id"]
# Two passes: one pruning threshold depends on the frame's current column
# count, so the second pass may drop more columns.
df=remove_bad_columns(df)
df=remove_bad_columns(df)

In [63]:
# pop() removes "result" from df, so df_features (the same object as df) holds
# features only. Re-running this cell raises KeyError because the column is gone.
df_features, df_labels = df, df.pop("result")

In [64]:
from sklearn import dummy
# Project-local transformers imported from the `preprocessing` module.
chooser = ChooseFeatureColumns()
scaler = MyMapper()
# max_features=None: every feature is considered at each split.
regr=sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=12, max_features=None)
# Disabled baseline classifier.
# regr=sklearn.dummy.DummyClassifier()
# Preprocessing only; the classifier is fit separately on the pipeline output.
pipe = sklearn.pipeline.Pipeline([
    ('chooser',chooser),
    ('scaler', scaler),
])

In [67]:
# Number of columns in df.
len(list(df))

464

In [65]:
# Hold out the last 20% of the rows for evaluation.
test_size = 0.2
test_start = len(df_labels) - int(len(df_labels) * test_size)

split_randomly = False
time_split = True
if time_split:
    if split_randomly:
        tr_features, ev_features, tr_labels, ev_labels = sklearn.model_selection.train_test_split(
            df_features, df_labels, test_size=test_size)
        print("splitting randomly")
    else:
        tr_features, ev_features = df_features[:test_start], df_features[test_start:]
        tr_labels, ev_labels = df_labels[:test_start], df_labels[test_start:]
        print("splitting non-randomly")
    # Sanity check: both transforms must agree in len(list(...)), otherwise the
    # hold-out evaluation is switched off.
    if len(list(pipe.fit_transform(tr_features))) != len(list(pipe.transform(ev_features))):
        print("!! pipe transformation broken")
        time_split = False

splitting non-randomly


KeyboardInterrupt: 

In [None]:
# Transform the full feature frame, then refit on the training slice so the
# evaluation slice is transformed by a pipeline that never saw it.
# Overwrites its own inputs, so this cell must not be run twice.
df_features = pipe.fit_transform(df_features)
if time_split:
    tr_features = pipe.fit_transform(tr_features)
    ev_features = pipe.transform(ev_features)

In [None]:
# Fit on the training slice and score on the held-out slice.
regr.fit(tr_features,tr_labels)
ev_pred = regr.predict(ev_features)
accuracy=sklearn.metrics.accuracy_score(ev_labels,ev_pred) 
accuracy

In [41]:
# Inspect the held-out predictions.
ev_pred

array(['ok', 'ok', 'ok', ..., 'ok', 'ok', 'ok'], dtype=object)

In [52]:
# Error rates on the held-out slice. Both helpers are defined in a cell further
# down the notebook, so that cell has to be executed before this one.
print(get_false_positives(ev_labels, ev_pred))
print(get_false_negatives(ev_labels, ev_pred))

0
0
nan
1035
2
0.001932367149758454


In [34]:
# Fit and score on the same rows: this is training accuracy, not a
# generalisation estimate.
regr.fit(df_features, df_labels)
pred=regr.predict(df_features)
accuracy=sklearn.metrics.accuracy_score(df_labels,pred) 
accuracy

1.0

In [53]:
def get_false_positives(labels, pred):
    """Return the share of "walltime error" predictions whose true label is "ok".

    ``labels`` and ``pred`` are compared position by position, so any
    equal-length sequences work (lists, arrays, or pandas Series regardless
    of their index).

    Prints the number of "walltime error" predictions and then the number of
    those that were wrong, each on its own line.

    Returns ``np.nan`` when nothing was predicted as "walltime error".
    Raises ``ValueError`` when the two inputs differ in length.
    """
    # Object arrays give positional access and keep the strings untouched.
    actual_values = np.asarray(labels, dtype=object)
    predicted_values = np.asarray(pred, dtype=object)
    if len(actual_values) != len(predicted_values):
        raise ValueError("labels and pred must have the same length")

    total_positive = 0
    false_positive = 0
    for actual, predicted in zip(actual_values, predicted_values):
        if predicted == "walltime error":
            total_positive += 1
            if actual == "ok":
                false_positive += 1
    print(total_positive)
    print(false_positive)
    if total_positive == 0:
        return np.nan
    return false_positive / total_positive

def get_false_negatives(labels, pred):
    """Return the share of "ok" predictions whose true label is "walltime error".

    ``labels`` and ``pred`` are compared position by position, so any
    equal-length sequences work (lists, arrays, or pandas Series regardless
    of their index).

    Prints the number of "ok" predictions and then the number of those that
    were wrong, each on its own line.

    Returns ``np.nan`` when nothing was predicted as "ok" (instead of raising
    ``ZeroDivisionError``). Raises ``ValueError`` when the two inputs differ
    in length.
    """
    # Object arrays give positional access and keep the strings untouched.
    actual_values = np.asarray(labels, dtype=object)
    predicted_values = np.asarray(pred, dtype=object)
    if len(actual_values) != len(predicted_values):
        raise ValueError("labels and pred must have the same length")

    total_negatives = 0
    false_negatives = 0
    for actual, predicted in zip(actual_values, predicted_values):
        if predicted == "ok":
            total_negatives += 1
            if actual == "walltime error":
                false_negatives += 1
    print(total_negatives)
    print(false_negatives)
    if total_negatives == 0:
        return np.nan
    return false_negatives / total_negatives

# Error rates of the predictions made on the full data set.
print(get_false_positives(df_labels, pred))
print(get_false_negatives(df_labels, pred))

4
0
0.0
5172
0
0.0


In [49]:
# Labels with a fresh 0..n-1 index (result is displayed, not assigned).
df_labels.reset_index(drop=True)

0       ok
1       ok
2       ok
3       ok
4       ok
5       ok
6       ok
7       ok
8       ok
9       ok
10      ok
11      ok
12      ok
13      ok
14      ok
15      ok
16      ok
17      ok
18      ok
19      ok
20      ok
21      ok
22      ok
23      ok
24      ok
25      ok
26      ok
27      ok
28      ok
29      ok
        ..
5146    ok
5147    ok
5148    ok
5149    ok
5150    ok
5151    ok
5152    ok
5153    ok
5154    ok
5155    ok
5156    ok
5157    ok
5158    ok
5159    ok
5160    ok
5161    ok
5162    ok
5163    ok
5164    ok
5165    ok
5166    ok
5167    ok
5168    ok
5169    ok
5170    ok
5171    ok
5172    ok
5173    ok
5174    ok
5175    ok
Name: result, Length: 5176, dtype: object

In [147]:
# Inspect the label series.
df_labels

0        walltime error
1        walltime error
2        walltime error
3        walltime error
4        walltime error
5        walltime error
6        walltime error
7        walltime error
8        walltime error
9        walltime error
10       walltime error
11       walltime error
12       walltime error
13       walltime error
14       walltime error
15       walltime error
16       walltime error
17       walltime error
18       walltime error
19       walltime error
20       walltime error
21       walltime error
22       walltime error
23       walltime error
24       walltime error
25       walltime error
26                   ok
27                   ok
28                   ok
29                   ok
              ...      
17604                ok
17605                ok
17606                ok
17607                ok
17608                ok
17609                ok
17610                ok
17611                ok
17612                ok
17613                ok
17614           

In [148]:
from matplotlib import pyplot as plt
# Raises KeyError (see the recorded output): df_features has no
# "destination_id" column at this point.
plt.scatter(df_features["destination_id"], df_labels)

KeyError: 'destination_id'

In [106]:
# Number of features the fitted classifier reports importances for.
len(regr.feature_importances_)

215

In [149]:
# Map feature name -> importance.
# assumes df_features still carries column names in the order the classifier
# was fitted with - TODO confirm
fi=dict(zip(list(df_features),regr.feature_importances_))

In [150]:
import operator
# Features ranked by importance, highest first.
sorted(fi.items(), key=operator.itemgetter(1), reverse=True)

[('input', 0.20661730306059617),
 ('parameters.dbkey_"28247"', 0.09514637174034685),
 ('parameters.dbkey_"278"', 0.07236990405147299),
 ('parameters.outcontrol.out_seqnos_nan', 0.06261366780842692),
 ('parameters.range.seq_range_end', 0.058684567133249294),
 ('parameters.dbkey_"9618"', 0.05104735958089012),
 ('parameters.outcontrol.__current_case__', 0.04109306667663864),
 ('parameters.outcontrol.out_seqnos_OFF', 0.04106272696921007),
 ('parameters.outcontrol.outform_clustal', 0.03884097151330057),
 ('parameters.dbkey_"bradJapo"', 0.03052314425438402),
 ('parameters.dbkey_"AgamP3__New"', 0.02766929529741702),
 ('parameters.dnarna_"DNA"', 0.02506933528289552),
 ('parameters.dbkey_"hg19"', 0.021062274798562805),
 ('parameters.outcontrol.outform_fasta', 0.02092532724936896),
 ('parameters.dnarna_"PROTEIN"', 0.019899557760235012),
 ('parameters.dbkey_"81"', 0.01943821106856741),
 ('parameters.dbkey_"237"', 0.018962645548229195),
 ('parameters.dbkey_"31"', 0.016636843907358832),
 ('paramete

In [24]:
# np.nan is a plain Python float.
type(np.nan)

float