Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Label encoder persistent #1323

Closed
Closed
Show file tree
Hide file tree
Changes from 40 commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6243a0f
update
anuprulez May 16, 2023
e9451bd
fix model batch and config tools
anuprulez May 16, 2023
8f7587f
fix keras train and eval
anuprulez May 16, 2023
482dc7a
fix ml viz
anuprulez May 16, 2023
4ccc75b
fix feature selection
anuprulez May 16, 2023
4b330ff
fix tools
anuprulez May 16, 2023
833e574
fix stacking and searchcv
anuprulez May 16, 2023
38658bd
fix clf metrics
anuprulez May 17, 2023
435d79e
fix discriminant
anuprulez May 17, 2023
f1c24e0
fix lightGBM
anuprulez May 22, 2023
2933f9a
fix model fit
anuprulez May 22, 2023
7b23811
fix clustering and pw
anuprulez May 22, 2023
25f4b49
add tests
anuprulez May 22, 2023
da499ad
update
anuprulez May 22, 2023
07b6bbe
fix preprocess tool
anuprulez May 22, 2023
632f16d
fix
anuprulez May 22, 2023
2791fe2
update
anuprulez May 22, 2023
dbf5a21
fix eval
anuprulez May 22, 2023
7b14acb
fix train test eval
anuprulez May 22, 2023
337e9ca
fix svm
anuprulez May 25, 2023
7c80cc8
fix NN
anuprulez May 25, 2023
6b895f9
Merge branch 'bgruening:master' into fix_galaxy_ml_1.0
anuprulez May 25, 2023
9ae0a6d
fix ensemble
anuprulez May 25, 2023
b431b72
update
anuprulez May 25, 2023
491adb9
update
anuprulez May 25, 2023
500e226
fix test
anuprulez May 25, 2023
6a6bbcc
fix test
anuprulez May 25, 2023
fed2c5f
fix linting errors
anuprulez May 25, 2023
55e01c3
update
anuprulez May 26, 2023
fca81c4
Merge branch 'master' of https://github.com/anuprulez/galaxytools int…
anuprulez Aug 7, 2023
f9317be
fix sorting
anuprulez Aug 7, 2023
b8137be
fix sorting
anuprulez Aug 7, 2023
8e6887e
fix sorting
anuprulez Aug 7, 2023
839244a
fix sorting
anuprulez Aug 7, 2023
922f62a
update
anuprulez Aug 7, 2023
c0ba729
fix sorting
anuprulez Aug 7, 2023
7f4f8e4
fix large test file
anuprulez Aug 8, 2023
05f1b92
fix linting for max_value param
anuprulez Aug 8, 2023
27b7dee
pca
anuprulez Aug 8, 2023
bed3ddb
allow to export and load encoder using h5 model
paulzierep Aug 9, 2023
497c927
Merge branch 'master' of github.com:bgruening/galaxytools into label_…
paulzierep Aug 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 41 additions & 21 deletions tools/sklearn/association_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,16 @@
from mlxtend.preprocessing import TransactionEncoder


def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=1.0, min_conviction=1.0, max_length=None):
def main(
inputs,
infile,
outfile,
min_support=0.5,
min_confidence=0.5,
min_lift=1.0,
min_conviction=1.0,
max_length=None,
):
"""
Parameter
---------
Expand Down Expand Up @@ -36,13 +45,13 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=
Maximum length

"""
warnings.simplefilter('ignore')
warnings.simplefilter("ignore")

with open(inputs, 'r') as param_handler:
with open(inputs, "r") as param_handler:
params = json.load(param_handler)

input_header = params['header0']
header = 'infer' if input_header else None
input_header = params["header0"]
header = "infer" if input_header else None

with open(infile) as fp:
lines = fp.read().splitlines()
Expand All @@ -65,41 +74,45 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=

# Extract frequent itemsets for association rule mining
# use_colnames: Use DataFrames' column names in the returned DataFrame instead of column indices
frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True, max_len=max_length)
frequent_itemsets = fpgrowth(
df, min_support=min_support, use_colnames=True, max_len=max_length
)

# Get association rules, with confidence larger than min_confidence
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=min_confidence)
rules = association_rules(
frequent_itemsets, metric="confidence", min_threshold=min_confidence
)

# Filter association rules, keeping rules whose lift and conviction are at least min_lift and min_conviction
rules = rules[(rules['lift'] >= min_lift) & (rules['conviction'] >= min_conviction)]
rules = rules[(rules["lift"] >= min_lift) & (rules["conviction"] >= min_conviction)]

# Convert columns from frozenset to list (more readable)
rules['antecedents'] = rules['antecedents'].apply(list)
rules['consequents'] = rules['consequents'].apply(list)
rules["antecedents"] = rules["antecedents"].apply(list)
rules["consequents"] = rules["consequents"].apply(list)

# The next 3 steps are intended to fix the order of the association
# rules generated, so tests that rely on diff'ing the generated output
# against an expected output can pass

# 1) Sort entry in every row/column for columns 'antecedents' and 'consequents'
rules['antecedents'] = rules['antecedents'].apply(lambda row: sorted(row))
rules['consequents'] = rules['consequents'].apply(lambda row: sorted(row))
rules["antecedents"] = rules["antecedents"].apply(lambda row: sorted(row))
rules["consequents"] = rules["consequents"].apply(lambda row: sorted(row))

# 2) Create two temporary string columns to sort on
rules['ant_str'] = rules['antecedents'].apply(lambda row: " ".join(row))
rules['con_str'] = rules['consequents'].apply(lambda row: " ".join(row))
rules["ant_str"] = rules["antecedents"].apply(lambda row: " ".join(row))
rules["con_str"] = rules["consequents"].apply(lambda row: " ".join(row))

# 3) Sort results so they are reproducible
rules.sort_values(by=['ant_str', 'con_str'], inplace=True)
del rules['ant_str']
del rules['con_str']
rules.sort_values(by=["ant_str", "con_str"], inplace=True)
del rules["ant_str"]
del rules["con_str"]
rules.reset_index(drop=True, inplace=True)

# Write association rules and metrics to file
rules.to_csv(outfile, sep="\t", index=False)


if __name__ == '__main__':
if __name__ == "__main__":
aparser = argparse.ArgumentParser()
aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
aparser.add_argument("-y", "--infile", dest="infile", required=True)
Expand All @@ -111,6 +124,13 @@ def main(inputs, infile, outfile, min_support=0.5, min_confidence=0.5, min_lift=
aparser.add_argument("-t", "--length", dest="length", default=5)
args = aparser.parse_args()

main(args.inputs, args.infile, args.outfile,
min_support=float(args.support), min_confidence=float(args.confidence),
min_lift=float(args.lift), min_conviction=float(args.conviction), max_length=int(args.length))
main(
args.inputs,
args.infile,
args.outfile,
min_support=float(args.support),
min_confidence=float(args.confidence),
min_lift=float(args.lift),
min_conviction=float(args.conviction),
max_length=int(args.length),
)
18 changes: 8 additions & 10 deletions tools/sklearn/clf_metrics.xml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ with open("$outfile", 'w+') as out_file:
<option value="f1_score">F1 score (aka balanced F-score or F-measure)</option>
<option value="fbeta_score">F-beta score</option>
<option value="hamming_loss">Average Hamming loss</option>
<option value="jaccard_similarity_score">Jaccard similarity coefficient score</option>
<option value="jaccard_score">Jaccard similarity coefficient score</option>
<option value="precision_recall_fscore_support">Compute precision, recall, F-measure and support for each class</option>
<option value="precision_score">Precision</option>
<option value="recall_score">Recall</option>
Expand Down Expand Up @@ -138,10 +138,11 @@ with open("$outfile", 'w+') as out_file:
<!- -classes- ->
</section-->
</when>
<when value="jaccard_similarity_score">
<when value="jaccard_score">
<expand macro="clf_inputs" />
<section name="options" title="Advanced Options" expanded="False">
<param argument="normalize" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Normalize" help="If false, returns the sum of the Jaccard similarity coefficient over the sample set. Otherwise, returns the average of Jaccard similarity coefficient. " />
<expand macro="average">
</expand>
</section>
</when>
<when value="precision_recall_fscore_support">
Expand Down Expand Up @@ -188,9 +189,6 @@ with open("$outfile", 'w+') as out_file:
</when>
<when value="auc">
<expand macro="clf_inputs" />
<section name="options" title="Advanced Options" expanded="False">
<param argument="reorder" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Assume an ascending curve in the case of ties" help="If the curve is non-ascending, the result will be wrong. " />
</section>
</when>
<when value="brier_score_loss">
<expand macro="clf_inputs" />
Expand Down Expand Up @@ -302,11 +300,12 @@ with open("$outfile", 'w+') as out_file:
<output name="outfile" file="hamming_loss.txt" />
</test>
<test>
<param name="selected_metric" value="jaccard_similarity_score" />
<param name="selected_metric" value="jaccard_score" />
<param name="infile1" value="y.tabular" ftype="tabular" />
<param name="col1" value="1" />
<param name="infile2" value="y.tabular" ftype="tabular" />
<param name="col2" value="2" />
<param name="average" value="weighted" />
<output name="outfile" file="jaccard_similarity_score.txt" />
</test>
<test>
Expand Down Expand Up @@ -346,11 +345,10 @@ with open("$outfile", 'w+') as out_file:
</test>
<test>
<param name="selected_metric" value="auc" />
<param name="infile1" value="y.tabular" ftype="tabular" />
<param name="infile1" value="y_sorted.tabular" ftype="tabular" />
<param name="col1" value="1" />
<param name="infile2" value="y.tabular" ftype="tabular" />
<param name="infile2" value="y_sorted.tabular" ftype="tabular" />
<param name="col2" value="2" />
<param name="reorder" value="true" />
<output name="outfile" file="auc.txt" />
</test>
<test>
Expand Down
29 changes: 10 additions & 19 deletions tools/sklearn/discriminant.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="sklearn_discriminant_classifier" name="Discriminant Analysis" version="@VERSION@" profile="20.05">
<tool id="sklearn_discriminant_classifier" name="Discriminant Analysis" version="@VERSION@" profile="@PROFILE@">
<description></description>
<macros>
<import>main_macros.xml</import>
Expand All @@ -22,7 +22,8 @@ import pickle
import sklearn.discriminant_analysis
import sys

from galaxy_ml.utils import load_model, get_X_y
from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
from galaxy_ml.utils import clean_params, get_X_y


input_json_path = sys.argv[1]
Expand All @@ -31,8 +32,8 @@ with open(input_json_path, "r") as param_handler:

#if $selected_tasks.selected_task == "load":

with open("$infile_model", 'rb') as model_handler:
classifier_object = load_model(model_handler)
classifier_object = load_model_from_h5('$infile_model')
classifier_object = clean_params(classifier_object)

header = 'infer' if params["selected_tasks"]["header"] else None
data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=header, index_col=None, parse_dates=True, encoding=None)
Expand All @@ -51,15 +52,14 @@ selected_algorithm = params["selected_tasks"]["selected_algorithms"]["selected_a
my_class = getattr(sklearn.discriminant_analysis, selected_algorithm)
classifier_object = my_class(**options)
classifier_object.fit(X, y)
with open("$outfile_fit", 'wb') as out_handler:
pickle.dump(classifier_object, out_handler, pickle.HIGHEST_PROTOCOL)
dump_model_to_h5(classifier_object, '$outfile_fit')

#end if
]]>
</configfile>
</configfiles>
<inputs>
<expand macro="sl_Conditional" model="zip">
<expand macro="sl_Conditional" model="h5mlm">
<param name="selected_algorithm" type="select" label="Classifier type">
<option value="LinearDiscriminantAnalysis" selected="true">Linear Discriminant Classifier</option>
<option value="QuadraticDiscriminantAnalysis">Quadratic Discriminant Classifier</option>
Expand Down Expand Up @@ -95,8 +95,6 @@ with open("$outfile_fit", 'wb') as out_handler:
<test>
<param name="infile1" value="train.tabular" ftype="tabular" />
<param name="infile2" value="train.tabular" ftype="tabular" />
<param name="header1" value="True" />
<param name="header2" value="True" />
<param name="col1" value="1,2,3,4" />
<param name="col2" value="5" />
<param name="selected_task" value="train" />
Expand All @@ -108,8 +106,6 @@ with open("$outfile_fit", 'wb') as out_handler:
<test>
<param name="infile1" value="train.tabular" ftype="tabular" />
<param name="infile2" value="train.tabular" ftype="tabular" />
<param name="header1" value="True" />
<param name="header2" value="True" />
<param name="col1" value="1,2,3,4" />
<param name="col2" value="5" />
<param name="selected_task" value="train" />
Expand All @@ -120,32 +116,27 @@ with open("$outfile_fit", 'wb') as out_handler:
<test>
<param name="infile1" value="train.tabular" ftype="tabular" />
<param name="infile2" value="train.tabular" ftype="tabular" />
<param name="header1" value="True" />
<param name="header2" value="True" />
<param name="col1" value="1,2,3,4" />
<param name="col2" value="5" />
<param name="selected_task" value="train" />
<param name="selected_algorithm" value="QuadraticDiscriminantAnalysis" />
<output name="outfile_fit" file="qda_model01" compare="sim_size" delta="1" />
</test>
<test>
<param name="infile_model" value="lda_model01" ftype="zip" />
<param name="infile_model" value="lda_model01" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="header" value="True" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="lda_prediction_result01.tabular" />
</test>
<test>
<param name="infile_model" value="lda_model02" ftype="zip" />
<param name="infile_model" value="lda_model02" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="header" value="True" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="lda_prediction_result02.tabular" />
</test>
<test>
<param name="infile_model" value="qda_model01" ftype="zip" />
<param name="infile_model" value="qda_model01" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="header" value="True" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="qda_prediction_result01.tabular" />
</test>
Expand Down
34 changes: 13 additions & 21 deletions tools/sklearn/ensemble.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
<tool id="sklearn_ensemble" name="Ensemble methods" version="@VERSION@" profile="20.05">
<tool id="sklearn_ensemble" name="Ensemble methods" version="@VERSION@" profile="@PROFILE@">
<description>for classification and regression</description>
<macros>
<import>main_macros.xml</import>
Expand All @@ -17,12 +17,12 @@
import json
import numpy as np
import pandas
import pickle
import sys

from scipy.io import mmread
import sklearn.ensemble
from galaxy_ml.utils import load_model, get_X_y
from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
from galaxy_ml.utils import clean_params, get_X_y


N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
Expand Down Expand Up @@ -57,11 +57,6 @@ if params["selected_tasks"]["selected_task"] == "train":
options["select_max_features"].pop("num_max_features")
options["max_features"] = options["select_max_features"]["max_features"]
options.pop("select_max_features")
if "presort" in options:
if options["presort"] == "true":
options["presort"] = True
if options["presort"] == "false":
options["presort"] = False
if "min_samples_leaf" in options and options["min_samples_leaf"] == 1.0:
options["min_samples_leaf"] = 1
if "min_samples_split" in options and options["min_samples_split"] > 1.0:
Expand All @@ -72,12 +67,11 @@ if params["selected_tasks"]["selected_task"] == "train":
my_class = getattr(sklearn.ensemble, algorithm)
estimator = my_class(**options)
estimator.fit(X,y)
with open(outfile_fit, 'wb') as out_handler:
pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)
dump_model_to_h5(estimator, outfile_fit)

else:
with open(infile_model, 'rb') as model_handler:
classifier_object = load_model(model_handler)
classifier_object = load_model_from_h5(infile_model)
classifier_object = clean_params(classifier_object)
header = 'infer' if params["selected_tasks"]["header"] else None
data = pandas.read_csv(infile_data, sep='\t', header=header, index_col=None, parse_dates=True, encoding=None)
prediction = classifier_object.predict(data)
Expand All @@ -89,7 +83,7 @@ else:
</configfile>
</configfiles>
<inputs>
<expand macro="sl_Conditional" model="zip">
<expand macro="sl_Conditional" model="h5mlm">
<param name="selected_algorithm" type="select" label="Select an ensemble method:">
<option value="RandomForestClassifier" selected="true">Random forest classifier</option>
<option value="AdaBoostClassifier">Ada boost classifier</option>
Expand Down Expand Up @@ -153,7 +147,6 @@ else:
<expand macro="verbose" />
<expand macro="warm_start" checked="false" />
<expand macro="random_state" />
<expand macro="presort" />
</section>
</when>
<when value="RandomForestRegressor">
Expand Down Expand Up @@ -216,7 +209,6 @@ else:
<expand macro="verbose" />
<expand macro="warm_start" checked="false" />
<expand macro="random_state" />
<expand macro="presort" />
</section>
</when>
</expand>
Expand All @@ -236,7 +228,7 @@ else:
<output name="outfile_fit" file="rfc_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="rfc_model01" ftype="zip" />
<param name="infile_model" value="rfc_model01" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="rfc_result01" />
Expand All @@ -252,7 +244,7 @@ else:
<output name="outfile_fit" file="rfr_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="rfr_model01" ftype="zip" />
<param name="infile_model" value="rfr_model01" ftype="h5mlm" />
<param name="infile_data" value="regression_test.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="rfr_result01" />
Expand All @@ -272,7 +264,7 @@ else:
<output name="outfile_fit" file="gbr_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="gbr_model01" ftype="zip" />
<param name="infile_model" value="gbr_model01" ftype="h5mlm" />
<param name="infile_data" value="regression_test_X.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<param name="header" value="True" />
Expand All @@ -288,7 +280,7 @@ else:
<output name="outfile_fit" file="gbc_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="gbc_model01" ftype="zip" />
<param name="infile_model" value="gbc_model01" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="gbc_result01" />
Expand All @@ -304,7 +296,7 @@ else:
<output name="outfile_fit" file="abc_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="abc_model01" ftype="zip" />
<param name="infile_model" value="abc_model01" ftype="h5mlm" />
<param name="infile_data" value="test.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="abc_result01" />
Expand All @@ -320,7 +312,7 @@ else:
<output name="outfile_fit" file="abr_model01" compare="sim_size" delta="5" />
</test>
<test>
<param name="infile_model" value="abr_model01" ftype="zip" />
<param name="infile_model" value="abr_model01" ftype="h5mlm" />
<param name="infile_data" value="regression_test.tabular" ftype="tabular" />
<param name="selected_task" value="load" />
<output name="outfile_predict" file="abr_result01" />
Expand Down
Loading