From 89348202f4ff424e77e3da941c16183ae2ac3e29 Mon Sep 17 00:00:00 2001 From: Shafaq Siddiqi Date: Mon, 9 May 2022 16:29:22 +0200 Subject: [PATCH 1/3] [MINOR] Adding apply_pipeline() builtin for cleaning pipelines API --- scripts/builtin/bandit.dml | 5 +- scripts/builtin/executePipeline.dml | 16 +++-- scripts/builtin/fit_pipeline.dml | 26 ++++---- scripts/builtin/frameSort.dml | 3 +- scripts/builtin/scale.dml | 8 +-- scripts/builtin/topk_cleaning.dml | 63 +++++++++---------- .../pipelines/scripts/enumerateLogical.dml | 50 ++++++++++++--- scripts/pipelines/scripts/utils.dml | 3 +- .../org/apache/sysds/common/Builtins.java | 1 + .../cp/VariableCPInstruction.java | 7 ++- ...BuiltinTopkCleaningClassificationTest.java | 4 +- .../functions/pipelines/fit_pipelineTest.dml | 17 +++-- .../classification/applyFunc.csv | 6 +- .../intermediates/classification/bestAcc.csv | 6 +- .../classification/dirtyScore.csv | 2 +- .../intermediates/classification/hp.csv | 6 +- .../intermediates/classification/pip.csv | 6 +- .../intermediates/regression/applyFunc.csv | 10 +-- .../topkcleaningClassificationTest.dml | 5 +- .../pipelines/topkcleaningRegressionTest.dml | 2 +- 20 files changed, 147 insertions(+), 99 deletions(-) diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index 504b5bb53f4..fa1ff1137d5 100644 --- a/scripts/builtin/bandit.dml +++ b/scripts/builtin/bandit.dml @@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Matrix[Double] lpHp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3, - Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="") + Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = 
FALSE, Boolean verbose = TRUE) # return(Boolean perf) return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy, Frame[String] applyFunc) { @@ -290,7 +290,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do hp = hp[, 2:totalVals] applyFunctions = allApplyFunctions[i] no_of_res = nrow(hp) - # print("PIPELINE EXECUTION START ... "+toString(op)) + print("PIPELINE EXECUTION START ... "+toString(op)) hpForPruning = matrix(0, rows=1, cols=ncol(op)) changesByOp = matrix(0, rows=1, cols=ncol(op)) metaList2 = metaList; #ensure metaList is no result var @@ -564,6 +564,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning, allChanges = min(allChanges) changesByOp = colMaxs(cvChanges) accuracy = mean(accuracyMatrix) + print("mean: \n"+toString(accuracyMatrix)) print("cv accuracy: "+toString(accuracy)) } diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml index a606df9a465..9eae1a8a74d 100644 --- a/scripts/builtin/executePipeline.dml +++ b/scripts/builtin/executePipeline.dml @@ -57,9 +57,9 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0), Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose) return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest, - Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll) + Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll, List[Unknown] internalStates) { - + internalStates = list() mask=as.matrix(metaList['mask']) FD = as.matrix(metaList['fd']) applyFunc = as.frame(metaList['applyFunc']) @@ -76,7 +76,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, 
Mat for(i in 1:ncol(pipeline)) { op = as.scalar(pipeline[1,i]) applyOp = toString(as.scalar(applyFunc[1,i])) - + # print("op: "+op) Xclone = Xtrain XtestClone = Xtest [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op) @@ -86,11 +86,14 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat Xtrain = as.matrix(O) if(applyOp != "NA") { [Xtest, executeFlag] = applyDataFlag(Xtest, mask, dataFlag) + internalStates = append(internalStates, L) L = append(L, list(X=Xtest)); Xtest = eval(applyOp, L); - Xtest = confirmData(Xtest, XtestClone, mask, dataFlag, yFlag) + # print("L \n"+toString(L, rows=3)) + Xtest = confirmData(Xtest, XtestClone, mask, dataFlag) } - Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag, yFlag) + else internalStates = append(internalStates, as.frame("NA")) + Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag) # dataFlag 0 = only on numeric, 1 = on whole data if(yFlag) { @@ -98,6 +101,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain, Mat Ytrain = as.matrix(Y) } Xtrain = confirmMeta(Xtrain, mask) + Xtest = confirmMeta(Xtest, mask) } else { print("not applying "+op+" executeFlag = 0") @@ -225,7 +229,7 @@ return (Matrix[Double] X) } -confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag, Integer yFlag) +confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag) return (Matrix[Double] X) { diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml index dae96a2a058..d67af627392 100644 --- a/scripts/builtin/fit_pipeline.dml +++ b/scripts/builtin/fit_pipeline.dml @@ -44,7 +44,7 @@ # ---------------------------------------------------------------------------------------------------------------------- # NAME TYPE MEANING # 
---------------------------------------------------------------------------------------------------------------------- -# result Matrix[Double] --- +# scores Matrix[Double] --- # ---------------------------------------------------------------------------------------------------------------------- source("scripts/pipelines/scripts/utils.dml") as utils; @@ -54,10 +54,12 @@ source("scripts/builtin/bandit.dml") as bandit; s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE) -return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTest) +return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState) { + externalState = list() no_of_flag_vars = 5 [schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData) + pip = removeEmpty(target=pip, margin="cols") applyFunc = removeEmpty(target=applyFunc, margin="cols") metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL")) @@ -70,6 +72,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe if(maskY == 1) { [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}"); eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M); + externalState = append(externalState, M) } else { @@ -83,14 +86,11 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe [Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx) # # # if mask has 1s then there are categorical features - [eXtrain, eXtest] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode") + [eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode") + externalState = 
append(externalState, M1) # # # do the early dropping - [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE) + # [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE) metaList["applyFunc"] = applyFunc - # construct the parameter list for best hyper-parameters if the oversampling technique is part of - # pipeline then take it out because oversampling is not applied on test dataset - # this condition is unnecessary here in this case because the input dataset is balanced and - # instead of diving the dataset into train/test I am doing cross validations no_of_param = as.scalar(hp[1, 1]) + 1 hp_width= hp[1, 2:no_of_param] @@ -98,7 +98,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars) # # # now test accuracy - [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, + [eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain, Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE) if(max(eYtrain) == min(eYtrain)) @@ -110,10 +110,10 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp)) testAccuracy = as.scalar(score[1, 1]) - result = matrix(0, rows=1, cols=3) - result[1, 1] = dirtyScore - result[1, 2] = trainAccuracy - result[1, 3] = testAccuracy + scores = matrix(0, rows=1, cols=3) + scores[1, 1] = dirtyScore + scores[1, 2] = trainAccuracy + scores[1, 3] = testAccuracy cleanTrain = cbind(eXtrain, eYtrain) cleanTest = cbind(eXtest, eYtest) } diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml index 3cfeec7bcab..cf447b42820 100644 --- 
a/scripts/builtin/frameSort.dml +++ b/scripts/builtin/frameSort.dml @@ -37,10 +37,9 @@ # f_odered Frame[String] sorted dataset by column 1 in decreasing order # ---------------------------------------------------------------------------------------------------------------------- -s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE ) +s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE) return (Frame[String] f_odered) { - # idx[1,1] = 0 # to save accuracy column from encoding index = vectorToCsv(mask) # recode logical pipelines for easy handling jspecR = "{ids:true, recode:["+index+"]}"; diff --git a/scripts/builtin/scale.dml b/scripts/builtin/scale.dml index 63a5f7fd876..161b596846f 100644 --- a/scripts/builtin/scale.dml +++ b/scripts/builtin/scale.dml @@ -43,8 +43,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE) return (Matrix[Double] out, Matrix[Double] Centering, Matrix[Double] ScaleFactor) { if(center){ - # ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0)) - ColMean = colMeans(X) + ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0)) + # ColMean = colMeans(X) X = X - ColMean } else { @@ -55,8 +55,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE) if (scale) { N = nrow(X) - # ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1)) - ScaleFactor = sqrt(colSums(X^2)/(N-1)) + ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1)) + # ScaleFactor = sqrt(colSums(X^2)/(N-1)) # Replace entries in the scale factor that are 0 and NaN with 1. # To avoid division by 0 or NaN, introducing NaN to the ouput. 
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index 33ed3ddf370..b322eaf6b12 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -58,8 +58,7 @@ source("scripts/builtin/bandit.dml") as bandit; s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives, Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, - Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, - String output) + Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE) return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc) { @@ -104,7 +103,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a print("---- feature transformations to numeric matrix"); [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode") # # # do the early dropping - [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv) + # [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv) # apply sampling on training data for pipeline enumeration # TODO why recoding/sampling twice (within getDirtyScore) print("---- class-stratified sampling of feature matrix w/ f="+sample); @@ -148,7 +147,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a # stop("end of enumlp") [topKPipelines, topKHyperParams, topKScores, applyFunc] = bandit(X_train=eXtrain, 
Y_train=eYtrain, X_test=eXtest, Y_test=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore, - k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, output=output, verbose=TRUE); + k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=TRUE); t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s"); } @@ -239,7 +238,7 @@ return(Double dirtyScore, Matrix[Double] evalFunHp) } recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Matrix[Double] mask, Boolean cv, String code) -return(Matrix[Double] eXtrain, Matrix[Double] eXtest) +return(Matrix[Double] eXtrain, Matrix[Double] eXtest, Frame[Unknown] X_meta) { if(sum(mask) > 0) { @@ -257,31 +256,31 @@ return(Matrix[Double] eXtrain, Matrix[Double] eXtest) } } -featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv) -return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList) -{ - mask = as.matrix(metaList['mask']) - fdMask = as.matrix(metaList['fd']) - schema = as.frame(metaList['schema']) - # # 1. if 90% of the column is empty - # # # 2. 
if the column has only single value - # # # have all unique values - Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0) - nullMask = is.na(eXtrain) - singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp)) - allmostEmpty = colSums(nullMask) - allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9) - allSum = singleValuesCol | allmostEmptyRatio - if(sum(allSum) > 0) { - eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0)) - if(!cv) - eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0)) - mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0)) - fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0)) - schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0)) - metaList['mask'] = mask - metaList['schema'] = schema - metaList['fd'] = fdMask - } -} +# featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv) +# return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList) +# { + # mask = as.matrix(metaList['mask']) + # fdMask = as.matrix(metaList['fd']) + # schema = as.frame(metaList['schema']) + # # # 1. if 90% of the column is empty + # # # # 2. 
if the column has only single value + # # # # have all unique values + # Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0) + # nullMask = is.na(eXtrain) + # singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp)) + # allmostEmpty = colSums(nullMask) + # allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9) + # allSum = singleValuesCol | allmostEmptyRatio + # if(sum(allSum) > 0) { + # eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0)) + # if(!cv) + # eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0)) + # mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0)) + # fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0)) + # schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0)) + # metaList['mask'] = mask + # metaList['schema'] = schema + # metaList['fd'] = fdMask + # } +# } diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml index d4f9891b27e..a6d8459e063 100644 --- a/scripts/pipelines/scripts/enumerateLogical.dml +++ b/scripts/pipelines/scripts/enumerateLogical.dml @@ -87,7 +87,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do pipelines = rbind(ref, pipelines) population = pipelines populationSize = nrow(pipelines) - transitions = sample(3, (populationSize * max_iter), TRUE, seed) + transitions = sample(4, (populationSize * max_iter), TRUE, seed) opToAdd = sample(nrow(allOps), (populationSize * max_iter), TRUE, seed) # opToRemove = sample(max_iter, (populationSize * max_iter), TRUE, seed) refChangesInternal = 0 @@ -117,18 +117,39 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do finalOutput = append(finalOutput, sortedPipelines) finalOutputHp = append(finalOutputHp, sortedHp) # # # if converged then stop otherwise generate new population - children = frame(0, rows=populationSize, 
cols=ncol(sortedPipelines)) + children = frame(0, rows=populationSize, cols=ncol(sortedPipelines)+(ncol(sortedPipelines)/2)) sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)] + start = 1; + end = 0; + topk = frame(0, rows=round((populationSize/2)) * length(finalOutput) , cols=populationLength + 2) + for(i in 1:length(finalOutput)) + { + pipFrame = as.frame(finalOutput[i]) + end = end + nrow(pipFrame) + topk[start:end, 1:ncol(pipFrame)] = pipFrame + start = end + 1 + } + sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1, cols=ncol(topk) - 2)) + topk = removeEmpty(target=topk, margin="rows") + topk = frameSort(topk, sort_mask, TRUE) + topk = topk[, 3:ncol(topk)] # # randomly pick the pipelines for transitions pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed) if(!converged) { parfor(i in 1:nrow(children), check=0) { idxR = (nrow(children) * (iter - 1)) + i idx = as.scalar(pipRand[i]) - top = removeEmpty(target=sortedPipelines[idx], margin="cols") - tail = top[, ncol(top)] - if(sum(mask) > 0) + top = removeEmpty(target=topk[idx], margin="cols") + # top = removeEmpty(target=sortedPipelines[idx], margin="cols") + idx2 = min(max(pipRand), idx + 1) + top2 = removeEmpty(target=topk[idx2], margin="cols") + # top2 = removeEmpty(target=sortedPipelines[idx2], margin="cols") + if(sum(mask) > 0) { + tail = top[, ncol(top)] + tail2 = top2[, ncol(top2)] top = top[, 1:ncol(top) - 1] + top2 = top2[, 1:ncol(top2) - 1] + } random = ifelse(ncol(top) <=2, 1, as.scalar(transitions[idxR])) if(random == 1) @@ -137,6 +158,8 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do c1 = mutation(top, seed) else if(random == 3) c1 = removal(top, seed) + else if(random == 4) + c1 = crossover(top, top2, seed) if(sum(mask) > 0) c1 = cbind(c1, tail) @@ -171,11 +194,9 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do refChanges = as.double(as.scalar(outputPip[nrow(outputPip), 2])) acc = outputPip[, 
1] + print(toString(outputPip)) outputPip = outputPip[,3:ncol(outputPip)] - print(toString(outputHp)) - - } addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd) @@ -214,6 +235,19 @@ return (Frame[Unknown] child) } } +crossover = function(Frame[Unknown] p1, Frame[Unknown] p2, Integer seed) +return(Frame[Unknown] child) +{ + # # randomly select the lengths to be append + lp1 = as.scalar(sample(ncol(p1), 1, FALSE, seed)) + lp2 = as.scalar(sample(ncol(p2), 1, FALSE, seed)) + child = cbind(p1[, 1:lp1], p2[, lp2:ncol(p2)]) + print("p1 "+toString(p1)) + print("p2 "+toString(p2)) + print("child "+toString(child)) +} + + getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Integer n, Integer minValue) return (Frame[String] allOps, Frame[String] refSol) { diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml index 8f0a60dc5a1..7fb95297dfb 100644 --- a/scripts/pipelines/scripts/utils.dml +++ b/scripts/pipelines/scripts/utils.dml @@ -59,9 +59,10 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean return (Matrix[Double] sampledX, Matrix[Double] sampledY) { MIN_SAMPLE = 1000 - sampled = floor(nrow(eX) * ratio) sampledX = eX sampledY = eY + ratio = ifelse(nrow(eY) > 200000, 0.6, ratio) + sampled = floor(nrow(eX) * ratio) if(sampled > MIN_SAMPLE & ratio != 1.0) { diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index e42d7f8bad7..5e9509696b3 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -46,6 +46,7 @@ public enum Builtins { ALS_DS("alsDS", true), ALS_PREDICT("alsPredict", true), ALS_TOPK_PREDICT("alsTopkPredict", true), + APPLY_PIPELINE("apply_pipeline", true), ARIMA("arima", true), ASIN("asin", false), ATAN("atan", false), diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java 
b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java index 08e8e3f7417..eb4065ccfc4 100644 --- a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java +++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java @@ -610,9 +610,10 @@ public void processInstruction(ExecutionContext ec) { case CastAsListVariable: ListObject lobj = ec.getListObject(getInput1()); if( lobj.getLength() != 1 || !(lobj.getData(0) instanceof ListObject) ) - throw new RuntimeException("as.list() expects a list input with one nested list: " - + "length(list)="+lobj.getLength()+", dt(list[0])="+lobj.getData(0).getDataType() ); - ec.setVariable(output.getName(), lobj.getData(0)); + ec.setVariable(output.getName(), lobj); +// throw new RuntimeException("as.list() expects a list input with one nested list: " +// + "length(list)="+lobj.getLength()+", dt(list[0])="+lobj.getData(0).getDataType() ); + else ec.setVariable(output.getName(), lobj.getData(0)); break; case CastAsDoubleVariable: diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java index de880724a2d..5ef02bef1f1 100644 --- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java +++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java @@ -48,13 +48,13 @@ public void setUp() { @Ignore public void testFindBestPipelineCompany() { runtopkCleaning(DATA_DIR+ "company.csv", RESOURCE+ "meta/meta_company.csv", 1.0, 3,5, - .0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE); + 5.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE); } @Test public void testFindBestPipelineCensus() { runtopkCleaning(DATA_DIR+ "dirty.csv", RESOURCE+ "meta/meta_census.csv", 1.0, 3,5, - 27.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE); + 2.0,"FALSE", 
0,0.8, Types.ExecMode.SINGLE_NODE); } // this test is ignored due to it long running time in Git actions diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml index 889b82c6e1f..f0cb72656a9 100644 --- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml +++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml @@ -59,18 +59,25 @@ trainData = F[1:split,] testData = F[split+1:nrow(F),] -result = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE) +print("pipeline: "+toString(pip[1])) +[result, trX, tsX, exState, iState] = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE) +eXtest = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE) + -header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3) result = as.frame(result) +resultBool = as.scalar(result[1, 3] > result[1, 1]) +eXtest = replace(target=eXtest, pattern=NaN, replacement=0) +tsX = replace(target=tsX, pattern=NaN, replacement=0) + +resApply = sum(eXtest - tsX[, 1:ncol(eXtest)]) == 0 +resultBool = resultBool & resApply +write(resultBool, $6) +header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3) writeRes = rbind(header, result) print(toString(writeRes)) -result = as.scalar(result[1, 3] > result[1, 1]) -write(result, $6) - # UDF for evaluation # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally ) evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv index 
b11da3e9ee4..457f9728715 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv @@ -1,3 +1,3 @@ -scaleApply,dummycodingApply,0,0,0 -NA,scaleApply,NA,dummycodingApply,0 -winsorizeApply,NA,scaleApply,dummycodingApply,0 +NA,dummycodingApply,0 +NA,dummycodingApply,0 +NA,dummycodingApply,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index 746303da873..50c70d11528 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -93.69369369369369 -93.69369369369369 -93.69369369369369 +73.73188405797102 +70.1086956521739 +68.29710144927536 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index d70d1d19535..4e5b1a5042c 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -71.17117117117117 \ No newline at end of file +61.050724637681164 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 0f59fbc7a58..643e5d3472c 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ 
-16.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,1.0,0.2,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +14.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +14.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +14.0,1.0,0.10554249238742949,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 0080afe1c1f..86a68a13a41 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -scale,dummycoding,0,0,0 -underSampling,scale,underSampling,dummycoding,0 -winsorize,underSampling,scale,dummycoding,0 +underSampling,dummycoding,0 +underSampling,dummycoding,0 +underSampling,dummycoding,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv 
index a0ecc4ac210..3ce56a2c040 100644 --- a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv @@ -1,5 +1,5 @@ -miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0 -miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0 -miceApply,forward_fill,winsorizeApply,normalizeApply,scaleApply,0,0,0 -winsorizeApply,forward_fill,miceApply,normalizeApply,scaleApply,0,0,0 -miceApply,forward_fill,normalizeApply,winsorizeApply,scaleApply,0,0,0 +winsorizeApply,imputeByMeanApply,normalizeApply,scaleApply,0,0,0 +miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0 +miceApply,imputeByMeanApply,forward_fill,normalizeApply,scaleApply,0,0 +normalizeApply,miceApply,forward_fill,scaleApply,0,0,0 +normalizeApply,miceApply,forward_fill,scaleApply,0,0,0 diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml index 3f4b7a0dc5e..296165c0292 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml @@ -56,9 +56,10 @@ if(nrow(metaInfo) < 2) metaInfo = metaInfo[, 2:ncol(metaInfo)] # # # split in train/test 70/30 -[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), +[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, + refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, 
resource_val=resources, enablePruning=TRUE, - expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) + expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) write(topKPipelines, output+"/pip.csv", format="csv") write(topKHyperParams, output+"/hp.csv", format="csv") diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml index cdb4a155fa8..4f2dbf31064 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml @@ -50,7 +50,7 @@ else { #matrix("1 1e-6 1e-9 1000", rows=1, cols=4) [topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, primitives=primitives, parameters=param, evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN), - topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) + topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) write(topKPipelines, output+"/pip.csv", format="csv") write(topKHyperParams, output+"/hp.csv", format="csv") From 8bdb966f25f64499d9053c790cd1221c3a970647 Mon Sep 17 00:00:00 2001 From: Shafaq Siddiqi Date: Thu, 12 May 2022 12:06:35 +0200 Subject: [PATCH 2/3] fix workloadAnalysis test --- scripts/builtin/scale.dml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/builtin/scale.dml b/scripts/builtin/scale.dml index 161b596846f..63a5f7fd876 100644 --- a/scripts/builtin/scale.dml +++ b/scripts/builtin/scale.dml @@ -43,8 +43,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE) return (Matrix[Double] out, 
Matrix[Double] Centering, Matrix[Double] ScaleFactor) { if(center){ - ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0)) - # ColMean = colMeans(X) + # ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0)) + ColMean = colMeans(X) X = X - ColMean } else { @@ -55,8 +55,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE) if (scale) { N = nrow(X) - ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1)) - # ScaleFactor = sqrt(colSums(X^2)/(N-1)) + # ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1)) + ScaleFactor = sqrt(colSums(X^2)/(N-1)) # Replace entries in the scale factor that are 0 and NaN with 1. # To avoid division by 0 or NaN, introducing NaN to the ouput. From 253346194c6d456d2fb24616de028103a7f6eeea Mon Sep 17 00:00:00 2001 From: Shafaq Siddiqi Date: Thu, 12 May 2022 13:07:15 +0200 Subject: [PATCH 3/3] fix imports apply_pipeline.dml() --- scripts/builtin/apply_pipeline.dml | 213 +++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 scripts/builtin/apply_pipeline.dml diff --git a/scripts/builtin/apply_pipeline.dml b/scripts/builtin/apply_pipeline.dml new file mode 100644 index 00000000000..d7cee4da3d4 --- /dev/null +++ b/scripts/builtin/apply_pipeline.dml @@ -0,0 +1,213 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This builtin applies an already fitted cleaning pipeline to a dataset (e.g. unseen test data).
+# It splits off the label, runs topk::runStringPipeline, recodes the features with the supplied
+# recode metadata and then calls the apply function of every pipeline operator in order.
+# The transformed feature matrix is returned; no model is trained or scored here.
+
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME          TYPE              DEFAULT            MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# testData      Frame[Unknown]    ---                Data the pipeline is applied to
+# metaData      Frame[Unknown]    as.frame("NULL")   Meta information, forwarded to topk::prepareMeta
+# pip           Frame[Unknown]    ---                Pipeline operators, one per column (row 1 is read)
+# applyFunc     Frame[Unknown]    ---                Apply function of each operator (row 1); "NA" skips the operator
+# hp            Matrix[Double]    ---                Hyper-parameter row; hp[1,1] is the number of values that follow,
+#                                                    the last value of every operator is its data flag
+# isLastLabel   Boolean           TRUE               Forwarded to topk::getLabel when separating the label
+# exState       List[Unknown]     ---                exState[1]: recode meta of the label (used if the label is masked),
+#                                                    exState[2]: recode meta of the features
+# iState        List[Unknown]     ---                One entry per operator holding the extra arguments of its apply function
+# correctTypos  Boolean           FALSE              Forwarded to topk::runStringPipeline
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME          TYPE              MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# eXtest        Matrix[Double]    Recoded feature matrix after all applicable operators were applied
+# ----------------------------------------------------------------------------------------------------------------------
+
+
+source("scripts/builtin/topk_cleaning.dml") as topk;
+
+s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] pip,
+  Frame[Unknown] applyFunc, Matrix[Double] hp, Boolean isLastLabel = TRUE,List[Unknown] exState, List[Unknown] iState, Boolean correctTypos=FALSE)
+  return (Matrix[Double] eXtest)
+{
+  [schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData)
+  pip = removeEmpty(target=pip, margin="cols")
+  applyFunc = removeEmpty(target=applyFunc, margin="cols")
+  # separate the label
+  [Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
+
+  # always recode the label
+  # NOTE(review): eYtest is computed but not part of the return value
+  if(maskY == 1) {
+    M = as.frame(exState[1])
+    eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
+  }
+  else
+  {
+    eYtest = as.matrix(Ytest)
+  }
+
+  # string preprocessing; the same frame is passed as train and test input
+  ctx = list(prefix="apply Pipeline") #TODO include seed
+  [Xtest, Xt] = topk::runStringPipeline(Xtest, Xtest, schema, mask, FALSE, correctTypos, ctx)
+
+  # recode the columns flagged in mask with the metadata stored in exState[2]
+  M = as.frame(exState[2])
+  index = vectorToCsv(mask)
+  jspecR = "{ids:true, recode:["+index+"]}"
+  eXtest = transformapply(target=Xtest, spec=jspecR, meta=M);
+
+  # hp[1,1] holds the number of values that follow; reshape them to one row per operator
+  no_of_param = as.scalar(hp[1, 1]) + 1
+  hp_width= hp[1, 2:no_of_param]
+  hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip))
+
+  # NOTE(review): iState shrinks inside the loop; this relies on the loop bound
+  # being evaluated once before the first iteration - confirm
+  for(i in 1:length(iState)) {
+    op = as.scalar(pip[1,i])
+    XtestClone = eXtest
+    applyOp = toString(as.scalar(applyFunc[1,i]))
+    # the last hyper-parameter of an operator selects the columns it works on
+    dataFlag = as.scalar(hp_matrix[i, ncol(hp_matrix)])
+    [iState, L] = remove(iState, 1)
+    [eXtest, executeFlag] = getDataFromFlag(eXtest, mask, dataFlag)
+    # argument list of the apply function: data first, then the stored state
+    L2 = list(eXtest)
+    L = as.list(L)
+    for(k in 1:length(L)) {
+      L2 = append(L2, L[k])
+    }
+    if(executeFlag == 1 & applyOp != "NA") {
+      eXtest = eval(applyOp, L2);
+      eXtest = confirmDataFromMask (eXtest, XtestClone, mask, dataFlag)
+      eXtest = confirmMetaFromMask (eXtest, mask)
+    }
+    else {
+      # getDataFromFlag may have narrowed eXtest to a subset of the columns;
+      # a skipped operator must leave the data exactly as it was
+      eXtest = XtestClone
+      print("not applying "+op+" executeFlag = 0")
+    }
+  }
+
+}
+
+
+# Selects the columns an operator works on.
+# dataFlag == 0: keep only the columns with mask == 0; dataFlag == 1: keep only the columns with
+# mask == 1; any other value: keep X unchanged.
+# executeFlag is 0 when the requested kind of column does not exist (X is returned unchanged then),
+# otherwise 1.
+getDataFromFlag = function(Matrix[Double] X, Matrix[Double] mask, Integer dataFlag)
+return(Matrix[Double] X,Integer executeFlag)
+{
+  executeFlag = 1
+  if(dataFlag == 0)
+  {
+    if(sum(mask) == ncol(mask))
+      executeFlag = 0
+    else {
+      # take numerics out and remove categorical
+      X = removeEmpty(target=X, margin = "cols", select = (mask == 0))
+    }
+  }
+  else if(dataFlag == 1)
+  {
+    if(sum(mask) == 0)
+      executeFlag = 0
+    else {
+      # take categorical out and remove numerics
+      X = removeEmpty(target=X, margin = "cols", select = mask)
+    }
+  }
+  else X = X
+}
+
+# Cleans up the masked (mask == 1) columns of X after an operator ran: their values are rounded
+# and every value below 1 is replaced by 9999. NaN cells stay NaN.
+# X is returned unchanged when mask has no 1 or when ncol(X) differs from ncol(mask).
+confirmMetaFromMask = function(Matrix[Double] X, Matrix[Double] mask)
+return (Matrix[Double] X)
+{
+  if((sum(mask) > 0) & (ncol(X) == ncol(mask)))
+  {
+    # remember where the NaN cells are
+    nanMask = is.na(X)
+    # replace nan
+    X = replace(target = X, pattern = NaN, replacement = 9999)
+    # take categorical out
+    cat = removeEmpty(target=X, margin="cols", select = mask)
+    # round categorical (if there is any floating point)
+    cat = round(cat)
+    less_than_1_mask = cat < 1
+    less_than_1 = less_than_1_mask * 9999
+    cat = (cat * (less_than_1_mask == 0)) + less_than_1
+    # reconstruct original X
+    X = X * (mask == 0)
+    q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
+      select=t(mask)), ncol(cat), ncol(X))
+    X = (cat %*% q) + X
+
+    # put nan back
+    nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
+    X = X + nanMask
+  }
+}
+
+
+# Re-assembles the full matrix after an operator ran on a column subset.
+# nX holds the processed columns (mask == 0 for dataFlag == 0, mask == 1 for dataFlag == 1);
+# the remaining columns are taken from originalX and every column goes back to its original position.
+# If dataFlag is neither 0 nor 1, or the data is not a mix of both column kinds, nX is returned as is.
+confirmDataFromMask = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
+return (Matrix[Double] X)
+{
+
+  if((dataFlag == 0 | dataFlag == 1) & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
+  {
+    # columns the operator worked on vs. columns that were held back
+    if(dataFlag == 0) {
+      processedCols = (mask == 0)
+      keptCols = mask
+    }
+    else {
+      processedCols = mask
+      keptCols = (mask == 0)
+    }
+    Xkept = removeEmpty(target=originalX, margin="cols", select=keptCols)
+
+    # A NaN would spread over its whole row in the matrix products below, so NaN cells are
+    # recorded and zeroed first. Recording positions (instead of writing sentinel values such as
+    # max+1 or -1111 into the data) keeps genuine values that equal a sentinel from being lost.
+    nanProcessed = is.na(nX)
+    nanKept = is.na(Xkept)
+    nX = replace(target = nX, pattern = NaN, replacement = 0)
+    Xkept = replace(target = Xkept, pattern = NaN, replacement = 0)
+
+    # reconstruct the original matrix
+    p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
+      select=t(processedCols)), ncol(nX), ncol(originalX))
+    q = table(seq(1, ncol(Xkept)), removeEmpty(target=seq(1, ncol(mask)), margin="rows",
+      select=t(keptCols)), ncol(Xkept), ncol(originalX))
+    X = (nX %*% p) + (Xkept %*% q)
+
+    # put nan back
+    nanCells = (nanProcessed %*% p) + (nanKept %*% q)
+    nanCells = replace(target = nanCells, pattern = 1, replacement = NaN)
+    X = X + nanCells
+  }
+  else X = nX
+
+}