bgruening · paulzierep · May 16, 2023 · May 16, 2023 · May 16, 2023 · May 16, 2023
diff --git a/tools/sklearn/association_rules.py b/tools/sklearn/association_rules.py
@@ -7,7 +7,16 @@
 from mlxtend.preprocessing import TransactionEncoder
 
 
-def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=1.0, min_conviction=1.0, max_length=None):
+def main(
+    inputs,
+    infile,
+    outfile,
+    min_support=0.5,
+    min_confidence=0.5,
+    min_lift=1.0,
+    min_conviction=1.0,
+    max_length=None,
+):
     """
     Parameter
     ---------
@@ -36,13 +45,13 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=
         Maximum length
 
     """
-    warnings.simplefilter('ignore')
+    warnings.simplefilter("ignore")
 
-    with open(inputs, 'r') as param_handler:
+    with open(inputs, "r") as param_handler:
         params = json.load(param_handler)
 
-    input_header = params['header0']
-    header = 'infer' if input_header else None
+    input_header = params["header0"]
+    header = "infer" if input_header else None
 
     with open(infile) as fp:
         lines = fp.read().splitlines()
@@ -65,41 +74,45 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=
 
     # Extract frequent itemsets for association rule mining
     # use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
-    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)
+    frequent_itemsets = fpgrowth(
+        df, min_support=min_support, use_colnames=True, max_len=max_length
+    )
 
     # Get association rules, with confidence larger than min_confidence
-    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
+    rules = association_rules(
+        frequent_itemsets, metric="confidence", min_threshold=min_confidence
+    )
 
     # Filter association rules, keeping rules with lift and conviction larger than min_liftand and min_conviction
-    rules = rules[(rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)]
+    rules = rules[(rules["lift"] >= min_lift) & (rules["conviction"] >= min_conviction)]
 
     # Convert columns from frozenset to list (more readable)
-    rules['antecedents'] = rules['antecedents'].apply(list)
-    rules['consequents'] = rules['consequents'].apply(list)
+    rules["antecedents"] = rules["antecedents"].apply(list)
+    rules["consequents"] = rules["consequents"].apply(list)
 
     # The next 3 steps are intended to fix the order of the association
     # rules generated, so tests that rely on diff'ing a desired output
     # with an expected output can pass
 
     # 1) Sort entry in every row/column for columns 'antecedents' and 'consequents'
-    rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
-    rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))
+    rules["antecedents"] = rules["antecedents"].apply(lambda row: sorted(row))
+    rules["consequents"] = rules["consequents"].apply(lambda row: sorted(row))
 
     # 2) Create two temporary string columns to sort on
-    rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
-    rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))
+    rules["ant_str"] = rules["antecedents"].apply(lambda row: " ".join(row))
+    rules["con_str"] = rules["consequents"].apply(lambda row: " ".join(row))
 
     # 3) Sort results so they are re-producable
-    rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
-    del rules['ant_str']
-    del rules['con_str']
+    rules.sort_values(by=["ant_str", "con_str"], inplace=True)
+    del rules["ant_str"]
+    del rules["con_str"]
     rules.reset_index(drop=True, inplace=True)
 
     # Write association rules and metrics to file
     rules.to_csv(outfile, sep="\t", index=False)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     aparser = argparse.ArgumentParser()
     aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
     aparser.add_argument("-y", "--infile", dest="infile", required=True)
@@ -111,6 +124,13 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=
     aparser.add_argument("-t", "--length", dest="length", default=5)
     args = aparser.parse_args()
 
-    main(args.inputs, args.infile, args.outfile,
-         min_support=float(args.support), min_confidence=float(args.confidence),
-         min_lift=float(args.lift), min_conviction=float(args.conviction), max_length=int(args.length))
+    main(
+        args.inputs,
+        args.infile,
+        args.outfile,
+        min_support=float(args.support),
+        min_confidence=float(args.confidence),
+        min_lift=float(args.lift),
+        min_conviction=float(args.conviction),
+        max_length=int(args.length),
+    )
diff --git a/tools/sklearn/clf_metrics.xml b/tools/sklearn/clf_metrics.xml
@@ -77,7 +77,7 @@ with open("$outfile", 'w+') as out_file:
                 <option value="f1_score">F1 score (aka balanced F-score or F-measure)</option>
                 <option value="fbeta_score">F-beta score</option>
                 <option value="hamming_loss">Average Hamming loss</option>
-                <option value="jaccard_similarity_score">Jaccard similarity coefficient score</option>
+                <option value="jaccard_score">Jaccard similarity coefficient score</option>
                 <option value="precision_recall_fscore_support">Compute precision, recall, F-measure and support for each class</option>
                 <option value="precision_score">Precision</option>
                 <option value="recall_score">Recall</option>
@@ -138,10 +138,11 @@ with open("$outfile", 'w+') as out_file:
                     <!- -classes- ->
                 </section-->
             </when>
-            <when value="jaccard_similarity_score">
+            <when value="jaccard_score">
                 <expand macro="clf_inputs" />
                 <section name="options" title="Advanced Options" expanded="False">
-                    <param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Normalize" help="If false, returns the sum of the Jaccard similarity coefficient over the sample set. Otherwise, returns the average of Jaccard similarity coefficient. " />
+                    <expand macro="average">
+                    </expand>
                 </section>
             </when>
             <when value="precision_recall_fscore_support">
@@ -188,9 +189,6 @@ with open("$outfile", 'w+') as out_file:
             </when>
             <when value="auc">
                 <expand macro="clf_inputs" />
-                <section name="options" title="Advanced Options" expanded="False">
-                    <param argument="reorder" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Assume an ascending curve in the case of ties" help="If the curve is non-ascending, the result will be wrong. " />
-                </section>
             </when>
             <when value="brier_score_loss">
                 <expand macro="clf_inputs" />
@@ -302,11 +300,12 @@ with open("$outfile", 'w+') as out_file:
             <output name="outfile" file="hamming_loss.txt" />
         </test>
         <test>
-            <param name="selected_metric" value="jaccard_similarity_score" />
+            <param name="selected_metric" value="jaccard_score" />
             <param name="infile1" value="y.tabular" ftype="tabular" />
             <param name="col1" value="1" />
             <param name="infile2" value="y.tabular" ftype="tabular" />
             <param name="col2" value="2" />
+            <param name="average" value="weighted" />
             <output name="outfile" file="jaccard_similarity_score.txt" />
         </test>
         <test>
@@ -346,11 +345,10 @@ with open("$outfile", 'w+') as out_file:
         </test>
         <test>
             <param name="selected_metric" value="auc" />
-            <param name="infile1" value="y.tabular" ftype="tabular" />
+            <param name="infile1" value="y_sorted.tabular" ftype="tabular" />
             <param name="col1" value="1" />
-            <param name="infile2" value="y.tabular" ftype="tabular" />
+            <param name="infile2" value="y_sorted.tabular" ftype="tabular" />
             <param name="col2" value="2" />
-            <param name="reorder" value="true" />
             <output name="outfile" file="auc.txt" />
         </test>
         <test>

diff --git a/tools/sklearn/discriminant.xml b/tools/sklearn/discriminant.xml
@@ -1,4 +1,4 @@
-<tool id="sklearn_discriminant_classifier" name="Discriminant Analysis" version="@VERSION@" profile="20.05">
+<tool id="sklearn_discriminant_classifier" name="Discriminant Analysis" version="@VERSION@" profile="@PROFILE@">
     <description></description>
     <macros>
         <import>main_macros.xml</import>
@@ -22,7 +22,8 @@ import pickle
 import sklearn.discriminant_analysis
 import sys
 
-from galaxy_ml.utils import load_model, get_X_y
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
+from galaxy_ml.utils import clean_params, get_X_y
 
 
 input_json_path = sys.argv[1]
@@ -31,8 +32,8 @@ with open(input_json_path, "r") as param_handler:
 
 #if $selected_tasks.selected_task == "load":
 
-with open("$infile_model", 'rb') as model_handler:
-    classifier_object = load_model(model_handler)
+classifier_object = load_model_from_h5('$infile_model')
+classifier_object = clean_params(classifier_object)
 
 header = 'infer' if params["selected_tasks"]["header"] else None
 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None)
@@ -51,15 +52,14 @@ selected_algorithm = params["selected_tasks"]["selected_algorithms"]["selected_a
 my_class = getattr(sklearn.discriminant_analysis, selected_algorithm)
 classifier_object = my_class(**options)
 classifier_object.fit(X, y)
-with open("$outfile_fit", 'wb') as out_handler:
-    pickle.dump(classifier_object, out_handler, pickle.HIGHEST_PROTOCOL)
+dump_model_to_h5(classifier_object, '$outfile_fit')
 
 #end if
 ]]>
         </configfile>
     </configfiles>
     <inputs>
-        <expand macro="sl_Conditional" model="zip">
+        <expand macro="sl_Conditional" model="h5mlm">
             <param name="selected_algorithm" type="select" label="Classifier type">
                 <option value="LinearDiscriminantAnalysis" selected="true">Linear Discriminant Classifier</option>
                 <option value="QuadraticDiscriminantAnalysis">Quadratic Discriminant Classifier</option>
@@ -95,8 +95,6 @@ with open("$outfile_fit", 'wb') as out_handler:
         <test>
             <param name="infile1" value="train.tabular" ftype="tabular" />
             <param name="infile2" value="train.tabular" ftype="tabular" />
-            <param name="header1" value="True" />
-            <param name="header2" value="True" />
             <param name="col1" value="1,2,3,4" />
             <param name="col2" value="5" />
             <param name="selected_task" value="train" />
@@ -108,8 +106,6 @@ with open("$outfile_fit", 'wb') as out_handler:
         <test>
             <param name="infile1" value="train.tabular" ftype="tabular" />
             <param name="infile2" value="train.tabular" ftype="tabular" />
-            <param name="header1" value="True" />
-            <param name="header2" value="True" />
             <param name="col1" value="1,2,3,4" />
             <param name="col2" value="5" />
             <param name="selected_task" value="train" />
@@ -120,32 +116,27 @@ with open("$outfile_fit", 'wb') as out_handler:
         <test>
             <param name="infile1" value="train.tabular" ftype="tabular" />
             <param name="infile2" value="train.tabular" ftype="tabular" />
-            <param name="header1" value="True" />
-            <param name="header2" value="True" />
             <param name="col1" value="1,2,3,4" />
             <param name="col2" value="5" />
             <param name="selected_task" value="train" />
             <param name="selected_algorithm" value="QuadraticDiscriminantAnalysis" />
             <output name="outfile_fit" file="qda_model01" compare="sim_size" delta="1" />
         </test>
         <test>
-            <param name="infile_model" value="lda_model01" ftype="zip" />
+            <param name="infile_model" value="lda_model01" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
-            <param name="header" value="True" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="lda_prediction_result01.tabular" />
         </test>
         <test>
-            <param name="infile_model" value="lda_model02" ftype="zip" />
+            <param name="infile_model" value="lda_model02" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
-            <param name="header" value="True" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="lda_prediction_result02.tabular" />
         </test>
         <test>
-            <param name="infile_model" value="qda_model01" ftype="zip" />
+            <param name="infile_model" value="qda_model01" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
-            <param name="header" value="True" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="qda_prediction_result01.tabular" />
         </test>

diff --git a/tools/sklearn/ensemble.xml b/tools/sklearn/ensemble.xml
@@ -1,4 +1,4 @@
-<tool id="sklearn_ensemble" name="Ensemble methods" version="@VERSION@" profile="20.05">
+<tool id="sklearn_ensemble" name="Ensemble methods" version="@VERSION@" profile="@PROFILE@">
     <description>for classification and regression</description>
     <macros>
         <import>main_macros.xml</import>
@@ -17,12 +17,12 @@
 import json
 import numpy as np
 import pandas
-import pickle
 import sys
 
 from scipy.io import mmread
 import sklearn.ensemble
-from galaxy_ml.utils import load_model, get_X_y
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
+from galaxy_ml.utils import clean_params, get_X_y
 
 
 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
@@ -57,11 +57,6 @@ if params["selected_tasks"]["selected_task"] == "train":
             options["select_max_features"].pop("num_max_features")
         options["max_features"] = options["select_max_features"]["max_features"]
         options.pop("select_max_features")
-    if "presort" in options:
-        if options["presort"] == "true":
-            options["presort"] = True
-        if options["presort"] == "false":
-            options["presort"] = False
     if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0:
         options["min_samples_leaf"] = 1
     if "min_samples_split" in options and options["min_samples_split"] > 1.0:
@@ -72,12 +67,11 @@ if params["selected_tasks"]["selected_task"] == "train":
     my_class = getattr(sklearn.ensemble, algorithm)
     estimator = my_class(**options)
     estimator.fit(X,y)
-    with open(outfile_fit, 'wb') as out_handler:
-        pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)
+    dump_model_to_h5(estimator, outfile_fit)
 
 else:
-    with open(infile_model, 'rb') as model_handler:
-        classifier_object = load_model(model_handler)
+    classifier_object = load_model_from_h5(infile_model)
+    classifier_object = clean_params(classifier_object)
     header = 'infer' if params["selected_tasks"]["header"] else None
     data = pandas.read_csv(infile_data, sep='\t', header=header, index_col=None, parse_dates=True, encoding=None)
     prediction = classifier_object.predict(data)
@@ -89,7 +83,7 @@ else:
         </configfile>
     </configfiles>
     <inputs>
-        <expand macro="sl_Conditional" model="zip">
+        <expand macro="sl_Conditional" model="h5mlm">
             <param name="selected_algorithm" type="select" label="Select an ensemble method:">
                 <option value="RandomForestClassifier" selected="true">Random forest classifier</option>
                 <option value="AdaBoostClassifier">Ada boost classifier</option>
@@ -153,7 +147,6 @@ else:
                     <expand macro="verbose" />
                     <expand macro="warm_start" checked="false" />
                     <expand macro="random_state" />
-                    <expand macro="presort" />
                 </section>
             </when>
             <when value="RandomForestRegressor">
@@ -216,7 +209,6 @@ else:
                     <expand macro="verbose" />
                     <expand macro="warm_start" checked="false" />
                     <expand macro="random_state" />
-                    <expand macro="presort" />
                 </section>
             </when>
         </expand>
@@ -236,7 +228,7 @@ else:
             <output name="outfile_fit" file="rfc_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="rfc_model01" ftype="zip" />
+            <param name="infile_model" value="rfc_model01" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="rfc_result01" />
@@ -252,7 +244,7 @@ else:
             <output name="outfile_fit" file="rfr_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="rfr_model01" ftype="zip" />
+            <param name="infile_model" value="rfr_model01" ftype="h5mlm" />
             <param name="infile_data" value="regression_test.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="rfr_result01" />
@@ -272,7 +264,7 @@ else:
             <output name="outfile_fit" file="gbr_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="gbr_model01" ftype="zip" />
+            <param name="infile_model" value="gbr_model01" ftype="h5mlm" />
             <param name="infile_data" value="regression_test_X.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <param name="header" value="True" />
@@ -288,7 +280,7 @@ else:
             <output name="outfile_fit" file="gbc_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="gbc_model01" ftype="zip" />
+            <param name="infile_model" value="gbc_model01" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="gbc_result01" />
@@ -304,7 +296,7 @@ else:
             <output name="outfile_fit" file="abc_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="abc_model01" ftype="zip" />
+            <param name="infile_model" value="abc_model01" ftype="h5mlm" />
             <param name="infile_data" value="test.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="abc_result01" />
@@ -320,7 +312,7 @@ else:
             <output name="outfile_fit" file="abr_model01" compare="sim_size" delta="5" />
         </test>
         <test>
-            <param name="infile_model" value="abr_model01" ftype="zip" />
+            <param name="infile_model" value="abr_model01" ftype="h5mlm" />
             <param name="infile_data" value="regression_test.tabular" ftype="tabular" />
             <param name="selected_task" value="load" />
             <output name="outfile_predict" file="abr_result01" />