From 89348202f4ff424e77e3da941c16183ae2ac3e29 Mon Sep 17 00:00:00 2001
From: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>
Date: Mon, 9 May 2022 16:29:22 +0200
Subject: [PATCH 1/3] [MINOR] Adding apply_pipeline() builtin for cleaning
 pipelines API

---
 scripts/builtin/bandit.dml                    |  5 +-
 scripts/builtin/executePipeline.dml           | 16 +++--
 scripts/builtin/fit_pipeline.dml              | 26 ++++----
 scripts/builtin/frameSort.dml                 |  3 +-
 scripts/builtin/scale.dml                     |  8 +--
 scripts/builtin/topk_cleaning.dml             | 63 +++++++++----------
 .../pipelines/scripts/enumerateLogical.dml    | 50 ++++++++++++---
 scripts/pipelines/scripts/utils.dml           |  3 +-
 .../org/apache/sysds/common/Builtins.java     |  1 +
 .../cp/VariableCPInstruction.java             |  7 ++-
 ...BuiltinTopkCleaningClassificationTest.java |  4 +-
 .../functions/pipelines/fit_pipelineTest.dml  | 17 +++--
 .../classification/applyFunc.csv              |  6 +-
 .../intermediates/classification/bestAcc.csv  |  6 +-
 .../classification/dirtyScore.csv             |  2 +-
 .../intermediates/classification/hp.csv       |  6 +-
 .../intermediates/classification/pip.csv      |  6 +-
 .../intermediates/regression/applyFunc.csv    | 10 +--
 .../topkcleaningClassificationTest.dml        |  5 +-
 .../pipelines/topkcleaningRegressionTest.dml  |  2 +-
 20 files changed, 147 insertions(+), 99 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index 504b5bb53f4..fa1ff1137d5 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,7 +53,7 @@
 
 m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
   String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Matrix[Double] lpHp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3,
-  Integer R=50, Double baseLineScore, Boolean cv,  Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="")
+  Integer R=50, Double baseLineScore, Boolean cv,  Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE)
   # return(Boolean perf)
   return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy, Frame[String] applyFunc) 
 {
@@ -290,7 +290,7 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Do
     hp = hp[, 2:totalVals]
     applyFunctions = allApplyFunctions[i]
     no_of_res = nrow(hp)
-    # print("PIPELINE EXECUTION START ... "+toString(op))
+    print("PIPELINE EXECUTION START ... "+toString(op))
     hpForPruning = matrix(0, rows=1, cols=ncol(op))
     changesByOp = matrix(0, rows=1, cols=ncol(op))
     metaList2 = metaList; #ensure metaList is no result var
@@ -564,6 +564,7 @@ return (Double accuracy, Matrix[Double] evalFunHp, Matrix[Double] hpForPruning,
   allChanges = min(allChanges)
   changesByOp = colMaxs(cvChanges)
   accuracy =  mean(accuracyMatrix)
+  print("mean: \n"+toString(accuracyMatrix))
   print("cv accuracy: "+toString(accuracy))
 }
 
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index a606df9a465..9eae1a8a74d 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -57,9 +57,9 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
   Matrix[Double] Xtest,  Matrix[Double] Ytest, List[Unknown] metaList, Matrix[Double] hyperParameters, Matrix[Double] hpForPruning = as.matrix(0),
   Matrix[Double] changesByOp = as.matrix(0), Integer flagsCount, Boolean test = FALSE, Boolean verbose)
   return (Matrix[Double] Xtrain, Matrix[Double] Ytrain, Matrix[Double] Xtest, Matrix[Double] Ytest,
-    Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll)
+    Double t2, Matrix[Double] hpForPruning, Matrix[Double] changesByOp, Double changesAll, List[Unknown] internalStates)
 {
-
+  internalStates = list()
   mask=as.matrix(metaList['mask'])
   FD = as.matrix(metaList['fd'])
   applyFunc = as.frame(metaList['applyFunc'])
@@ -76,7 +76,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
   for(i in 1:ncol(pipeline)) {
     op = as.scalar(pipeline[1,i])
     applyOp = toString(as.scalar(applyFunc[1,i]))
-
+    # print("op: "+op)
     Xclone = Xtrain
     XtestClone = Xtest
     [hp, dataFlag, yFlag, executeFlag] = matrixToList(Xtrain, Ytrain, mask, FD, hyperParameters[i], flagsCount, op)
@@ -86,11 +86,14 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
       Xtrain = as.matrix(O)
       if(applyOp != "NA") {
         [Xtest, executeFlag] = applyDataFlag(Xtest, mask, dataFlag)
+        internalStates = append(internalStates, L)
         L = append(L, list(X=Xtest));
         Xtest = eval(applyOp, L);
-        Xtest = confirmData(Xtest, XtestClone, mask, dataFlag, yFlag)
+        # print("L \n"+toString(L, rows=3))
+        Xtest = confirmData(Xtest, XtestClone, mask, dataFlag)
       }
-      Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag, yFlag)
+      else internalStates = append(internalStates, as.frame("NA"))
+      Xtrain = confirmData(Xtrain, Xclone, mask, dataFlag)
 
       # dataFlag 0 = only on numeric, 1 = on whole data
       if(yFlag) {
@@ -98,6 +101,7 @@ s_executePipeline = function(Frame[String] pipeline, Matrix[Double] Xtrain,  Mat
         Ytrain = as.matrix(Y)
       }
       Xtrain = confirmMeta(Xtrain, mask)
+      Xtest = confirmMeta(Xtest, mask)
     }
     else {
       print("not applying "+op+" executeFlag = 0")
@@ -225,7 +229,7 @@ return (Matrix[Double] X)
 }
 
 
-confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag, Integer yFlag)
+confirmData = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
 return (Matrix[Double] X)
 {
 
diff --git a/scripts/builtin/fit_pipeline.dml b/scripts/builtin/fit_pipeline.dml
index dae96a2a058..d67af627392 100644
--- a/scripts/builtin/fit_pipeline.dml
+++ b/scripts/builtin/fit_pipeline.dml
@@ -44,7 +44,7 @@
 # ----------------------------------------------------------------------------------------------------------------------
 # NAME           TYPE             MEANING
 # ----------------------------------------------------------------------------------------------------------------------
-# result         Matrix[Double]   ---
+# scores         Matrix[Double]   ---
 # ----------------------------------------------------------------------------------------------------------------------
 
 source("scripts/pipelines/scripts/utils.dml") as utils;
@@ -54,10 +54,12 @@ source("scripts/builtin/bandit.dml") as bandit;
 s_fit_pipeline = function(Frame[Unknown] trainData, Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"),
   Frame[Unknown] pip, Frame[Unknown] applyFunc, Matrix[Double] hp, String evaluationFunc, Matrix[Double] evalFunHp,
   Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE)
-return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTest)
+return (Matrix[Double] scores, Matrix[Double] cleanTrain, Matrix[Double] cleanTest, List[Unknown] externalState, List[Unknown] iState)
 {
+  externalState = list()
   no_of_flag_vars = 5
   [schema, mask, fdMask, maskY] = topk::prepareMeta(trainData, metaData)
+
   pip = removeEmpty(target=pip, margin="cols")
   applyFunc = removeEmpty(target=applyFunc, margin="cols")
   metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
@@ -70,6 +72,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
   if(maskY == 1) {
     [eYtrain, M] = transformencode(target=Ytrain, spec= "{ids:true, recode:[1]}");
     eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
+    externalState = append(externalState, M)
   }
   else
   {
@@ -83,14 +86,11 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
   [Xtrain, Xtest] = topk::runStringPipeline(Xtrain, Xtest, schema, mask, FALSE, correctTypos, ctx)
   
   # # # if mask has 1s then there are categorical features
-  [eXtrain, eXtest] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
+  [eXtrain, eXtest, M1] = topk::recodeData(Xtrain, Xtest, mask, FALSE, "recode")
+  externalState = append(externalState, M1)
   # # # do the early dropping
-  [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
+  # [eXtrain, eXtest, metaList] = topk::featureDrop(eXtrain, eXtest, metaList, FALSE)
   metaList["applyFunc"] = applyFunc
-  # construct the parameter list for best hyper-parameters if the oversampling technique is part of 
-  # pipeline then take it out because oversampling is not applied on test dataset
-  # this condition is unnecessary here in this case because the input dataset is balanced and 
-  # instead of diving the dataset into train/test I am doing cross validations
 
   no_of_param = as.scalar(hp[1, 1]) + 1
   hp_width= hp[1, 2:no_of_param]
@@ -98,7 +98,7 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
   pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
 
   # # # now test accuracy
-  [eXtrain, eYtrain, eXtest, eYtest, a, b,Tr] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
+  [eXtrain, eYtrain, eXtest, eYtest, a, b, c, d, iState] = executePipeline(pipeline=pip, Xtrain=eXtrain, Ytrain=eYtrain,
     Xtest=eXtest, Ytest=eYtest, metaList=metaList, hyperParameters=hp_matrix, flagsCount=no_of_flag_vars, test=TRUE, verbose=FALSE)
   
   if(max(eYtrain) == min(eYtrain)) 
@@ -110,10 +110,10 @@ return (Matrix[Double] result, Matrix[Double] cleanTrain, Matrix[Double] cleanTe
   score = eval(evaluationFunc, list(X=eXtrain, Y=eYtrain, Xtest=eXtest, Ytest=eYtest, Xorig=as.matrix(0), evalFunHp=evalFunHp))
   testAccuracy = as.scalar(score[1, 1])
   
-  result = matrix(0, rows=1, cols=3)
-  result[1, 1] = dirtyScore
-  result[1, 2] = trainAccuracy
-  result[1, 3] = testAccuracy  
+  scores = matrix(0, rows=1, cols=3)
+  scores[1, 1] = dirtyScore
+  scores[1, 2] = trainAccuracy
+  scores[1, 3] = testAccuracy  
   cleanTrain = cbind(eXtrain, eYtrain)
   cleanTest = cbind(eXtest, eYtest)
 }
diff --git a/scripts/builtin/frameSort.dml b/scripts/builtin/frameSort.dml
index 3cfeec7bcab..cf447b42820 100644
--- a/scripts/builtin/frameSort.dml
+++ b/scripts/builtin/frameSort.dml
@@ -37,10 +37,9 @@
 # f_odered  Frame[String]                sorted dataset by column 1 in decreasing order
 # ----------------------------------------------------------------------------------------------------------------------
 
-s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE )
+s_frameSort = function(Frame[String] F, Matrix[Double] mask, Boolean orderDesc = TRUE)
 return (Frame[String] f_odered)
 {
-  # idx[1,1] = 0 # to save accuracy column from encoding 
   index = vectorToCsv(mask)
   # recode logical pipelines for easy handling
   jspecR = "{ids:true, recode:["+index+"]}";
diff --git a/scripts/builtin/scale.dml b/scripts/builtin/scale.dml
index 63a5f7fd876..161b596846f 100644
--- a/scripts/builtin/scale.dml
+++ b/scripts/builtin/scale.dml
@@ -43,8 +43,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE)
   return (Matrix[Double] out, Matrix[Double] Centering, Matrix[Double] ScaleFactor) 
 {
   if(center){
-    # ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0))
-    ColMean = colMeans(X)
+    ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0))
+    # ColMean = colMeans(X)
     X =  X - ColMean
   }
   else {
@@ -55,8 +55,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE)
 
   if (scale) {
     N = nrow(X)
-    # ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1))
-    ScaleFactor = sqrt(colSums(X^2)/(N-1))
+    ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1))
+    # ScaleFactor = sqrt(colSums(X^2)/(N-1))
 
     # Replace entries in the scale factor that are 0 and NaN with 1.
     # To avoid division by 0 or NaN, introducing NaN to the ouput.
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index 33ed3ddf370..b322eaf6b12 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -58,8 +58,7 @@ source("scripts/builtin/bandit.dml") as bandit;
 
 s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
   Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10,
-  Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE,
-  String output)
+  Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE)
   return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores,
     Double dirtyScore, Matrix[Double] evalFunHp, Frame[Unknown] applyFunc)
 {
@@ -104,7 +103,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   print("---- feature transformations to numeric matrix");
   [eXtrain, eXtest] = recodeData(Xtrain, Xtest, mask, cv, "recode")
   # # # do the early dropping
-  [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
+  # [eXtrain, eXtest, metaList] = featureDrop(eXtrain, eXtest, metaList, cv)
   # apply sampling on training data for pipeline enumeration
   # TODO why recoding/sampling twice (within getDirtyScore)
   print("---- class-stratified sampling of feature matrix w/ f="+sample);
@@ -148,7 +147,7 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   # stop("end of enumlp")
   [topKPipelines, topKHyperParams, topKScores, applyFunc] = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest,  metaList=metaList,
     evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, lpHp=bestHp, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
-    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, output=output, verbose=TRUE);  
+    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, verbose=TRUE);  
   t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s");
 }
 
@@ -239,7 +238,7 @@ return(Double dirtyScore, Matrix[Double] evalFunHp)
 }
 
 recodeData = function(Frame[Unknown] Xtrain, Frame[Unknown] Xtest, Matrix[Double] mask, Boolean cv, String code)
-return(Matrix[Double] eXtrain, Matrix[Double] eXtest)
+return(Matrix[Double] eXtrain, Matrix[Double] eXtest, Frame[Unknown] X_meta)
 {
   if(sum(mask) > 0)
   {
@@ -257,31 +256,31 @@ return(Matrix[Double] eXtrain, Matrix[Double] eXtest)
   }
 }
 
-featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv)
-return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList)
-{
-  mask = as.matrix(metaList['mask'])
-  fdMask = as.matrix(metaList['fd'])
-  schema = as.frame(metaList['schema'])
-  # # 1. if 90% of the column is empty
-  # # # 2. if the column has only single value
-  # # # have all unique values
-  Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
-  nullMask = is.na(eXtrain)
-  singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp))
-  allmostEmpty = colSums(nullMask) 
-  allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9)
-  allSum = singleValuesCol | allmostEmptyRatio
-  if(sum(allSum) > 0) {
-    eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0))
-    if(!cv)
-      eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0))
-    mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
-    fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0))
-    schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0))
-    metaList['mask'] = mask
-    metaList['schema'] = schema
-    metaList['fd'] = fdMask
-  }
-}
+# featureDrop = function(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList, Boolean cv)
+# return(Matrix[Double] eXtrain, Matrix[Double] eXtest, List[Unknown] metaList)
+# {
+  # mask = as.matrix(metaList['mask'])
+  # fdMask = as.matrix(metaList['fd'])
+  # schema = as.frame(metaList['schema'])
+  # # # 1. if 90% of the column is empty
+  # # # # 2. if the column has only single value
+  # # # # have all unique values
+  # Xtmp = replace(target = eXtrain, pattern = NaN, replacement = 0)
+  # nullMask = is.na(eXtrain)
+  # singleValuesCol = ((colMins(Xtmp) == 0) & (colMaxs(Xtmp) == 1)) | (colMaxs(Xtmp) == colMins(Xtmp))
+  # allmostEmpty = colSums(nullMask) 
+  # allmostEmptyRatio = allmostEmpty >= (nrow(Xtmp) * 0.9)
+  # allSum = singleValuesCol | allmostEmptyRatio
+  # if(sum(allSum) > 0) {
+    # eXtrain = removeEmpty(target=eXtrain, margin="cols", select = (allSum == 0))
+    # if(!cv)
+      # eXtest = removeEmpty(target=eXtest, margin="cols", select = (allSum == 0))
+    # mask = removeEmpty(target=mask, margin="cols", select = (allSum == 0))
+    # fdMask = removeEmpty(target=fdMask, margin="cols", select = (allSum == 0))
+    # schema = removeEmpty(target=schema, margin="cols", select = (allSum == 0))
+    # metaList['mask'] = mask
+    # metaList['schema'] = schema
+    # metaList['fd'] = fdMask
+  # }
+# }
 
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index d4f9891b27e..a6d8459e063 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -87,7 +87,7 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do
   pipelines = rbind(ref, pipelines)
   population = pipelines
   populationSize = nrow(pipelines)
-  transitions = sample(3, (populationSize * max_iter), TRUE, seed)
+  transitions = sample(4, (populationSize * max_iter), TRUE, seed)
   opToAdd = sample(nrow(allOps), (populationSize * max_iter), TRUE, seed)
   # opToRemove = sample(max_iter, (populationSize * max_iter), TRUE, seed)
   refChangesInternal = 0
@@ -117,18 +117,39 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do
     finalOutput = append(finalOutput, sortedPipelines)
     finalOutputHp = append(finalOutputHp, sortedHp)
     # # # if converged then stop otherwise generate new population
-    children = frame(0, rows=populationSize, cols=ncol(sortedPipelines))
+    children = frame(0, rows=populationSize, cols=ncol(sortedPipelines)+(ncol(sortedPipelines)/2))
     sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)]
+    start = 1; 
+    end = 0;
+    topk = frame(0, rows=round((populationSize/2)) * length(finalOutput) , cols=populationLength + 2) 
+    for(i in 1:length(finalOutput))
+    {
+      pipFrame = as.frame(finalOutput[i])
+      end = end + nrow(pipFrame)
+      topk[start:end, 1:ncol(pipFrame)] = pipFrame
+      start = end + 1
+    }
+    sort_mask = cbind(matrix(0, rows=1, cols=2), matrix(1, rows=1, cols=ncol(topk) - 2))
+    topk = removeEmpty(target=topk, margin="rows")
+    topk = frameSort(topk, sort_mask, TRUE)
+    topk = topk[, 3:ncol(topk)]
     # # randomly pick the pipelines for transitions
     pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed)
     if(!converged) {
       parfor(i in 1:nrow(children), check=0) {
         idxR = (nrow(children) * (iter - 1)) + i
         idx = as.scalar(pipRand[i])
-        top = removeEmpty(target=sortedPipelines[idx], margin="cols")
-        tail = top[, ncol(top)]
-        if(sum(mask) > 0)
+        top = removeEmpty(target=topk[idx], margin="cols")
+        # top = removeEmpty(target=sortedPipelines[idx], margin="cols")
+        idx2 = min(max(pipRand), idx + 1)
+        top2 = removeEmpty(target=topk[idx2], margin="cols")
+        # top2 = removeEmpty(target=sortedPipelines[idx2], margin="cols")
+        if(sum(mask) > 0) {
+          tail = top[, ncol(top)]
+          tail2 = top2[, ncol(top2)]
           top = top[, 1:ncol(top) - 1]
+          top2 = top2[, 1:ncol(top2) - 1]
+        }
           
         random = ifelse(ncol(top) <=2, 1, as.scalar(transitions[idxR]))
         if(random == 1)
@@ -137,6 +158,8 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do
           c1 = mutation(top, seed) 
         else if(random == 3)
           c1 = removal(top, seed) 
+        else if(random == 4)
+          c1 = crossover(top, top2, seed)
         
         if(sum(mask) > 0)
           c1 = cbind(c1, tail)
@@ -171,11 +194,9 @@ return (Frame[Unknown] outputPip, Matrix[Double] outputHp, boolean converged, Do
   
   refChanges = as.double(as.scalar(outputPip[nrow(outputPip), 2]))
   acc = outputPip[, 1]
+  print(toString(outputPip))
   outputPip = outputPip[,3:ncol(outputPip)]
 
-  print(toString(outputHp))
-
-
 }
 
 addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd)
@@ -214,6 +235,19 @@ return (Frame[Unknown] child)
   }
 }
 
+# Single-point crossover of two logical pipelines p1 and p2.
+# One cut point is sampled per parent (both sample() calls use the given seed);
+# the child is the prefix p1[, 1:lp1] followed by the suffix p2[, lp2:ncol(p2)].
+crossover = function(Frame[Unknown] p1, Frame[Unknown] p2, Integer seed)
+return(Frame[Unknown] child)
+{
+  lp1 = as.scalar(sample(ncol(p1), 1, FALSE, seed))
+  lp2 = as.scalar(sample(ncol(p2), 1, FALSE, seed))
+  # child width is lp1 + (ncol(p2) - lp2 + 1), i.e. between 2 and ncol(p1) + ncol(p2)
+  child = cbind(p1[, 1:lp1], p2[, lp2:ncol(p2)])
+}
+
+
 getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Integer n, Integer minValue)
  return (Frame[String] allOps, Frame[String] refSol) {
  
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 8f0a60dc5a1..7fb95297dfb 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -59,9 +59,10 @@ doSample = function(Matrix[Double] eX, Matrix[Double] eY, Double ratio, Boolean
   return (Matrix[Double] sampledX, Matrix[Double] sampledY)
 {
   MIN_SAMPLE = 1000
-  sampled = floor(nrow(eX) * ratio)
   sampledX = eX
   sampledY = eY
+  ratio = ifelse(nrow(eY) > 200000, 0.6, ratio)
+  sampled = floor(nrow(eX) * ratio)
   
   if(sampled > MIN_SAMPLE & ratio != 1.0)
   {
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index e42d7f8bad7..5e9509696b3 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -46,6 +46,7 @@ public enum Builtins {
 	ALS_DS("alsDS", true),
 	ALS_PREDICT("alsPredict", true),
 	ALS_TOPK_PREDICT("alsTopkPredict", true),
+	APPLY_PIPELINE("apply_pipeline", true),
 	ARIMA("arima", true),
 	ASIN("asin", false),
 	ATAN("atan", false),
diff --git a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
index 08e8e3f7417..eb4065ccfc4 100644
--- a/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
+++ b/src/main/java/org/apache/sysds/runtime/instructions/cp/VariableCPInstruction.java
@@ -610,9 +610,10 @@ public void processInstruction(ExecutionContext ec) {
 		case CastAsListVariable:
 			ListObject lobj = ec.getListObject(getInput1());
 			if( lobj.getLength() != 1 || !(lobj.getData(0) instanceof ListObject) )
-				throw new RuntimeException("as.list() expects a list input with one nested list: "
-					+ "length(list)="+lobj.getLength()+", dt(list[0])="+lobj.getData(0).getDataType() );
-			ec.setVariable(output.getName(), lobj.getData(0));
+				ec.setVariable(output.getName(), lobj);
+//				throw new RuntimeException("as.list() expects a list input with one nested list: "
+//					+ "length(list)="+lobj.getLength()+", dt(list[0])="+lobj.getData(0).getDataType() );
+			else ec.setVariable(output.getName(), lobj.getData(0));
 			break;
 
 		case CastAsDoubleVariable:
diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
index de880724a2d..5ef02bef1f1 100644
--- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
+++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java
@@ -48,13 +48,13 @@ public void setUp() {
 	@Ignore
 	public void testFindBestPipelineCompany() {
 		runtopkCleaning(DATA_DIR+ "company.csv", RESOURCE+ "meta/meta_company.csv", 1.0, 3,5,
-			.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
+			5.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
 	}
 
 	@Test
 	public void testFindBestPipelineCensus() {
 		runtopkCleaning(DATA_DIR+ "dirty.csv", RESOURCE+ "meta/meta_census.csv", 1.0, 3,5,
-			27.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
+			2.0,"FALSE", 0,0.8, Types.ExecMode.SINGLE_NODE);
 	}
 
 	// this test is ignored due to it long running time in Git actions
diff --git a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
index 889b82c6e1f..f0cb72656a9 100644
--- a/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
+++ b/src/test/scripts/functions/pipelines/fit_pipelineTest.dml
@@ -59,18 +59,25 @@ trainData = F[1:split,]
 testData = F[split+1:nrow(F),]
 
 
-result = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE)
+print("pipeline: "+toString(pip[1]))
+[result, trX, tsX, exState, iState]  = fit_pipeline(trainData, testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], "evalClassification", evalHp, TRUE, FALSE)
+eXtest  = apply_pipeline(testData, metaInfo, pip[1,], applyFunc[1,], hp[1,], TRUE, exState, iState, FALSE)
+
 
-header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
 result = as.frame(result)
+resultBool = as.scalar(result[1, 3] > result[1, 1])
+eXtest = replace(target=eXtest, pattern=NaN, replacement=0)
+tsX = replace(target=tsX, pattern=NaN, replacement=0)
+
 
+resApply = sum(eXtest - tsX[, 1:ncol(eXtest)]) == 0
+resultBool = resultBool & resApply
+write(resultBool, $6)
 
+header = frame(["dirty acc", "train acc", "test acc"], rows=1, cols=3)
 writeRes = rbind(header, result)
 print(toString(writeRes))
 
-result = as.scalar(result[1, 3] > result[1, 1])
-write(result, $6)
-
 # UDF for evaluation  
 # choice of parameters provided by API, X, Y, clone_X, evalFunHp (hyper-param), trainML (boolean for optimizing hp internally or passed by externally )
 evalClassification = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index b11da3e9ee4..457f9728715 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-scaleApply,dummycodingApply,0,0,0
-NA,scaleApply,NA,dummycodingApply,0
-winsorizeApply,NA,scaleApply,dummycodingApply,0
+NA,dummycodingApply,0
+NA,dummycodingApply,0
+NA,dummycodingApply,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index 746303da873..50c70d11528 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-93.69369369369369
-93.69369369369369
-93.69369369369369
+73.73188405797102
+70.1086956521739
+68.29710144927536
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
index d70d1d19535..4e5b1a5042c 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
@@ -1 +1 @@
-71.17117117117117
\ No newline at end of file
+61.050724637681164
\ No newline at end of file
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 0f59fbc7a58..643e5d3472c 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-16.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,1.0,0.2,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,2.0,0.05,0.95,0,0,0,1.0,0,1.0,0.2,0,0,0,1.0,0,2.0,2.0,1.0,0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.3140125178611014,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+14.0,1.0,0.10554249238742949,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 0080afe1c1f..86a68a13a41 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-scale,dummycoding,0,0,0
-underSampling,scale,underSampling,dummycoding,0
-winsorize,underSampling,scale,dummycoding,0
+underSampling,dummycoding,0
+underSampling,dummycoding,0
+underSampling,dummycoding,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
index a0ecc4ac210..3ce56a2c040 100644
--- a/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/regression/applyFunc.csv
@@ -1,5 +1,5 @@
-miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,winsorizeApply,normalizeApply,scaleApply,0,0,0
-winsorizeApply,forward_fill,miceApply,normalizeApply,scaleApply,0,0,0
-miceApply,forward_fill,normalizeApply,winsorizeApply,scaleApply,0,0,0
+winsorizeApply,imputeByMeanApply,normalizeApply,scaleApply,0,0,0
+miceApply,forward_fill,imputeByMeanApply,normalizeApply,scaleApply,0,0
+miceApply,imputeByMeanApply,forward_fill,normalizeApply,scaleApply,0,0
+normalizeApply,miceApply,forward_fill,scaleApply,0,0,0
+normalizeApply,miceApply,forward_fill,scaleApply,0,0,0
diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 3f4b7a0dc5e..296165c0292 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -56,9 +56,10 @@ if(nrow(metaInfo) < 2)
 metaInfo = metaInfo[, 2:ncol(metaInfo)]
 # # # split in train/test 70/30
 
-[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
+[topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, 
+  refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
   evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, enablePruning=TRUE,
-  expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) 
+  expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE) 
 
 write(topKPipelines, output+"/pip.csv", format="csv")
 write(topKHyperParams, output+"/hp.csv", format="csv")
diff --git a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
index cdb4a155fa8..4f2dbf31064 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningRegressionTest.dml
@@ -50,7 +50,7 @@ else {
 #matrix("1 1e-6 1e-9 1000", rows=1, cols=4)
 [topKPipelines, topKHyperParams, topKScores, baseLineScore, evalFunHp, applyFunc] = topk_cleaning(dataTrain=trainData, dataTest=testData, 
   primitives=primitives, parameters=param, evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),
-  topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output)
+  topK=topK, resource_val=resources, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE)
 
 write(topKPipelines, output+"/pip.csv", format="csv")
 write(topKHyperParams, output+"/hp.csv", format="csv")

From 8bdb966f25f64499d9053c790cd1221c3a970647 Mon Sep 17 00:00:00 2001
From: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>
Date: Thu, 12 May 2022 12:06:35 +0200
Subject: [PATCH 2/3] fix workloadAnalysis test

---
 scripts/builtin/scale.dml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/builtin/scale.dml b/scripts/builtin/scale.dml
index 161b596846f..63a5f7fd876 100644
--- a/scripts/builtin/scale.dml
+++ b/scripts/builtin/scale.dml
@@ -43,8 +43,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE)
   return (Matrix[Double] out, Matrix[Double] Centering, Matrix[Double] ScaleFactor) 
 {
   if(center){
-    ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0))
-    # ColMean = colMeans(X)
+    # ColMean = colMeans(replace(target=X, pattern=NaN, replacement=0))
+    ColMean = colMeans(X)
     X =  X - ColMean
   }
   else {
@@ -55,8 +55,8 @@ m_scale = function(Matrix[Double] X, Boolean center=TRUE, Boolean scale=TRUE)
 
   if (scale) {
     N = nrow(X)
-    ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1))
-    # ScaleFactor = sqrt(colSums(X^2)/(N-1))
+    # ScaleFactor = sqrt(colSums(replace(target=X, pattern=NaN, replacement=0)^2)/(N-1))
+    ScaleFactor = sqrt(colSums(X^2)/(N-1))
 
     # Replace entries in the scale factor that are 0 and NaN with 1.
     # To avoid division by 0 or NaN, introducing NaN to the ouput.

From 253346194c6d456d2fb24616de028103a7f6eeea Mon Sep 17 00:00:00 2001
From: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>
Date: Thu, 12 May 2022 13:07:15 +0200
Subject: [PATCH 3/3] fix imports apply_pipeline.dml()

---
 scripts/builtin/apply_pipeline.dml | 213 +++++++++++++++++++++++++++++
 1 file changed, 213 insertions(+)
 create mode 100644 scripts/builtin/apply_pipeline.dml

diff --git a/scripts/builtin/apply_pipeline.dml b/scripts/builtin/apply_pipeline.dml
new file mode 100644
index 00000000000..d7cee4da3d4
--- /dev/null
+++ b/scripts/builtin/apply_pipeline.dml
@@ -0,0 +1,213 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# This script applies the operators of a cleaning pipeline (pip), through the functions named in applyFunc
+# and with the hyper-parameters hp and the states exState/iState, to the features of testData
+# and returns the transformed feature matrix
+
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME              TYPE               DEFAULT            MEANING
+# ----------------------------------------------------------------------------------------------------------------------
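+# testData          Frame[Unknown]      ---                data the pipeline is applied to
+# metaData          Frame[Unknown]      as.frame("NULL")   meta information handed to prepareMeta
+# pip               Frame[Unknown]      ---                pipeline operators, one per column
+# applyFunc         Frame[Unknown]      ---                apply-function per operator ("NA" = skip)
+# hp                Matrix[Double]      ---                hyper-parameters of the pipeline (row 1 is read)
+# isLastLabel       Boolean             TRUE               handed to getLabel to separate the label
+# exState           List[Unknown]       ---                recode metadata: [1] label, [2] features
+# iState            List[Unknown]       ---                per-operator state, consumed in order
+# correctTypos      Boolean             FALSE              handed to runStringPipeline
+#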
+#
+# ----------------------------------------------------------------------------------------------------------------------
+#
+# OUTPUT:
+# ----------------------------------------------------------------------------------------------------------------------
+# NAME           TYPE             MEANING
+# ----------------------------------------------------------------------------------------------------------------------
+# eXtest         Matrix[Double]   transformed feature matrix of testData
+# ----------------------------------------------------------------------------------------------------------------------
+
+
+source("scripts/builtin/topk_cleaning.dml") as topk;
+
+s_apply_pipeline = function(Frame[Unknown] testData, Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] pip,
+ Frame[Unknown] applyFunc, Matrix[Double] hp, Boolean isLastLabel = TRUE,List[Unknown] exState, List[Unknown] iState, Boolean correctTypos=FALSE)
+  return (Matrix[Double] eXtest)
+{ # recodes the features of testData with exState[2], then runs the apply-function of every pipeline operator on them
+  no_of_flag_vars = 5 # only stored in pipList below
+  [schema, mask, fdMask, maskY] = topk::prepareMeta(testData, metaData)
+  pip = removeEmpty(target=pip, margin="cols")
+  applyFunc = removeEmpty(target=applyFunc, margin="cols")
+  metaList = list(mask=mask, schema=schema, fd=fdMask, applyFunc=as.frame("NULL"))
+  # separate the label
+  [Xtest, Ytest] = topk::getLabel(testData, isLastLabel)
+    
+  # recode the label with the metadata in exState[1] when maskY is 1, otherwise only cast it to a matrix
+  if(maskY == 1) {
+    M = as.frame(exState[1])
+    eYtest = transformapply(target=Ytest, spec= "{ids:true, recode:[1]}", meta=M);
+  }
+  else
+  {
+    eYtest = as.matrix(Ytest)
+  }
+  # NOTE(review): eYtest is not read again in this function; only the features are returned
+  ctx = list(prefix="apply Pipeline") #TODO include seed
+
+  [Xtest, Xt] = topk::runStringPipeline(Xtest, Xtest, schema, mask, FALSE, correctTypos, ctx)
+  
+  # recode the features with the metadata in exState[2]; the column list is derived from mask by vectorToCsv
+  M = as.frame(exState[2])
+  index = vectorToCsv(mask)
+  jspecR = "{ids:true, recode:["+index+"]}"
+  eXtest = transformapply(target=Xtest, spec=jspecR, meta=M);
+  metaList["applyFunc"] = applyFunc
+
+  no_of_param = as.scalar(hp[1, 1]) + 1 # hp[1,1] is the count of values that follow it in row 1
+  hp_width= hp[1, 2:no_of_param]
+  hp_matrix = matrix(hp_width, rows=ncol(pip), cols=ncol(hp_width)/ncol(pip)) # one row per operator
+  pipList = list(ph = pip, hp = hp_matrix, flags = no_of_flag_vars)
+  for(i in 1:length(iState)) { # NOTE(review): iState shrinks in the body; assumes the range is evaluated once - confirm
+    op = as.scalar(pip[1,i])
+    XtestClone = eXtest
+    applyOp = toString(as.scalar(applyFunc[1,i]))
+    dataFlag = as.scalar(hp_matrix[i, ncol(hp_matrix)]) # the last hyper-parameter column is the data flag
+    [iState, L] = remove(iState, 1) # pop the state of the current operator
+    [eXtest, executeFlag] = getDataFromFlag(eXtest, mask, dataFlag)
+    L2 = list(eXtest) # argument list for eval: data first, then the state entries
+    L = as.list(L)
+    for(k in 1:length(L)) {
+      L2 = append(L2, L[k])
+    }
+    if(executeFlag == 1 & applyOp != "NA") {
+      eXtest = eval(applyOp, L2);
+      eXtest = confirmDataFromMask (eXtest, XtestClone, mask, dataFlag)
+      eXtest = confirmMetaFromMask (eXtest, mask)
+    }
+    else {
+      eXtest = XtestClone # undo the column selection of getDataFromFlag, else a skipped op ("NA") leaves a column subset
+      print("not applying "+op+" executeFlag = 0")
+    }
+  }
+ 
+}
+
+
+getDataFromFlag = function(Matrix[Double] X, Matrix[Double] mask, Integer dataFlag)
+return(Matrix[Double] X,Integer executeFlag)
+{ # narrows X to the columns chosen by dataFlag: 0 = numeric (mask == 0), 1 = categorical (mask), other = all columns
+  executeFlag = 1 # set to 0 when X has no column of the requested kind; X is then returned unchanged
+  if(dataFlag == 0)
+  { # numeric columns requested: nothing to do if every column is categorical
+    if(sum(mask) == ncol(mask))
+      executeFlag = 0
+    else {
+      # keep the numeric columns (mask == 0) and drop the categorical ones
+      X = removeEmpty(target=X, margin = "cols", select = (mask == 0))
+    }
+  }
+  else if(dataFlag == 1)
+  { # categorical columns requested: nothing to do if there is none
+    if(sum(mask) == 0)
+      executeFlag = 0
+    else {
+      # keep the categorical columns (mask) and drop the numeric ones
+      X = removeEmpty(target=X, margin = "cols", select = mask)
+    }
+  }
+  else X = X # any other flag: all columns are kept
+}
+
+confirmMetaFromMask = function(Matrix[Double] X, Matrix[Double] mask)
+return (Matrix[Double] X)
+{ # cleans the categorical columns of X (mask): rounds them and maps values below 1 to 9999; NaN positions are kept
+  if((sum(mask) > 0) & (ncol(X) == ncol(mask))) # otherwise X is returned as is
+  {
+    # remember the NaN positions so they can be restored at the end
+    nanMask = is.na(X)
+    # temporarily replace NaN by 9999 (restored through nanMask below)
+    X = replace(target = X, pattern = NaN, replacement = 9999)
+    # take categorical out
+    cat = removeEmpty(target=X, margin="cols", select = mask)
+    # round categorical (if there is any floating  point)
+    cat = round(cat)
+    less_than_1_mask = cat < 1
+    less_than_1 = less_than_1_mask * 9999
+    cat = (cat * (less_than_1_mask == 0)) +  less_than_1 # values below 1 become 9999
+    # reconstruct original X: zero the categorical columns, then add the cleaned ones back at their positions
+    X = X * (mask == 0)
+    q = table(seq(1, ncol(cat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+      select=t(mask)), ncol(cat), ncol(X)) # q maps the k-th categorical column to its original column index
+    X = (cat %*% q) + X 
+
+    # put nan back
+    nanMask = replace(target = nanMask, pattern = 1, replacement = NaN)
+    X = X + nanMask
+  }
+}
+
+
+confirmDataFromMask  = function(Matrix[Double] nX, Matrix[Double] originalX, Matrix[Double] mask, Integer dataFlag)
+return (Matrix[Double] X)
+{
+  # Merges nX (operator output for the numeric columns if dataFlag is 0, the categorical ones if 1) with the other columns of originalX.
+  if(dataFlag == 0 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
+  {
+    # NaNs are zeroed before the products and tracked in 0/1 masks; value sentinels could collide with real data
+    nanNew = is.na(nX)
+    nX = replace(target = nX, pattern = NaN, replacement = 0)
+    # X without numerics
+    Xcat = removeEmpty(target=originalX, margin="cols", select=mask)
+    nanOld = is.na(Xcat)
+    Xcat = replace(target = Xcat, pattern = NaN, replacement = 0)
+    # reconstruct the original matrix: p places the numeric columns, q the categorical ones
+    p = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+    select=t(mask==0)), ncol(nX), ncol(originalX))
+    q = table(seq(1, ncol(Xcat)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+    select=t(mask)), ncol(Xcat), ncol(originalX))
+    X = (nX %*% p) + (Xcat %*% q)
+    # put the NaNs back at exactly the positions they came from
+    nanMask = replace(target = (nanNew %*% p) + (nanOld %*% q), pattern = 1, replacement = NaN)
+    X = X + nanMask
+  }
+  else if(dataFlag == 1 & (sum(mask) > 0) & (sum(mask) != ncol(originalX)))
+  {
+    nanNew = is.na(nX)
+    nX = replace(target = nX, pattern = NaN, replacement = 0)
+    # X without categorical
+    Xnum = removeEmpty(target=originalX, margin="cols", select=(mask==0))
+    nanOld = is.na(Xnum)
+    Xnum = replace(target = Xnum, pattern = NaN, replacement = 0)
+    # reconstruct the original matrix: p places the numeric columns, q the categorical ones
+    p = table(seq(1, ncol(Xnum)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+    select=t(mask==0)), ncol(Xnum), ncol(originalX))
+    q = table(seq(1, ncol(nX)), removeEmpty(target=seq(1, ncol(mask)), margin="rows", 
+    select=t(mask)), ncol(nX), ncol(originalX))
+    X = (nX %*% q) + (Xnum %*% p)
+    # put the NaNs back at exactly the positions they came from
+    nanMask = replace(target = (nanNew %*% q) + (nanOld %*% p), pattern = 1, replacement = NaN)
+    X = X + nanMask
+  }
+  else X = nX
+
+}