From 3a9baf48427c8ad6f51a233feeff03a407175f64 Mon Sep 17 00:00:00 2001 From: David Kerschbaumer Date: Sat, 9 Jan 2021 00:08:15 +0100 Subject: [PATCH] [SYSTEMDS-2789] Disguised Missing Values Detection Co-authored-by: Patrick Lovric Co-authored-by: Valentin Edelsbrunner DIA project WS2020/21. Closes #1144. Date: Sat Jan 9 00:05:47 2021 +0100 --- docs/site/builtins-reference.md | 38 ++ scripts/builtin/dmv.dml | 29 ++ .../org/apache/sysds/common/Builtins.java | 1 + .../sysds/runtime/matrix/data/FrameBlock.java | 69 +++- .../apache/sysds/runtime/util/DMVUtils.java | 341 ++++++++++++++++++ .../functions/builtin/BuiltinDMVTest.java | 200 ++++++++++ .../builtin/disguisedMissingValue.dml | 24 ++ 7 files changed, 692 insertions(+), 10 deletions(-) create mode 100644 scripts/builtin/dmv.dml create mode 100644 src/main/java/org/apache/sysds/runtime/util/DMVUtils.java create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java create mode 100644 src/test/scripts/functions/builtin/disguisedMissingValue.dml diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md index 22c14917f2f..d960b25eb37 100644 --- a/docs/site/builtins-reference.md +++ b/docs/site/builtins-reference.md @@ -32,6 +32,7 @@ limitations under the License. * [`DBSCAN`-Function](#DBSCAN-function) * [`discoverFD`-Function](#discoverFD-function) * [`dist`-Function](#dist-function) + * [`dmv`-Function](#dmv-function) * [`glm`-Function](#glm-function) * [`gridSearch`-Function](#gridSearch-function) * [`hyperband`-Function](#hyperband-function) @@ -299,6 +300,43 @@ X = rand (rows = 5, cols = 5) Y = dist(X) ``` + + +## `dmv`-Function + +The `dmv`-function is used to find disguised missing values utilising syntactical pattern recognition. 
+ +### Usage + +```r +dmv(X, threshold, replace) +``` + +### Arguments + +| Name | Type | Default | Description | +| :-------- | :------------ | :------- | :----------------------------------------------------------- | +| X | Frame[String] | required | Input Frame | +| threshold | Double | 0.8 | threshold value in interval [0, 1] for dominant pattern per column (e.g., 0.8 means that 80% of the entries per column must adhere to this pattern to be dominant) | +| replace | String | "NA" | The string disguised missing values are replaced with | + +### Returns + +| Type | Description | +| :------------ | :----------------------------------------------------- | +| Frame[String] | Frame `X` including detected disguised missing values | + +### Example + +```r +A = read("fileA", data_type="frame", rows=10, cols=8); +Z = dmv(X=A) +Z = dmv(X=A, threshold=0.9) +Z = dmv(X=A, threshold=0.9, replace="NaN") +``` + + + ## `glm`-Function The `glm`-function is a flexible generalization of ordinary linear regression that allows for response variables that have diff --git a/scripts/builtin/dmv.dml b/scripts/builtin/dmv.dml new file mode 100644 index 00000000000..af68f1f7403 --- /dev/null +++ b/scripts/builtin/dmv.dml @@ -0,0 +1,29 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------ + +s_dmv = function(Frame[String] X, Double threshold=0.8, String replace="NA") return (Frame[String] Y) { + + if( threshold < 0 | threshold > 1 ) + stop("Stopping due to invalid input, threshold required in interval [0, 1] found " + threshold) + + Y = map(X, "UtilFunctions.syntacticalPatternDiscovery(" + threshold + "," + replace + ")") +} + diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java index bb9610096e0..5bcc85e257e 100644 --- a/src/main/java/org/apache/sysds/common/Builtins.java +++ b/src/main/java/org/apache/sysds/common/Builtins.java @@ -98,6 +98,7 @@ public enum Builtins { DIAG("diag", false), DISCOVER_FD("discoverFD", true), DIST("dist", true), + DMV("dmv", true), DROP_INVALID_TYPE("dropInvalidType", false), DROP_INVALID_LENGTH("dropInvalidLength", false), EIGEN("eigen", false, ReturnType.MULTI_RETURN), diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java index 0605a86fdc2..d157e37a778 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java @@ -56,6 +56,7 @@ import org.apache.sysds.runtime.matrix.operators.BinaryOperator; import org.apache.sysds.runtime.transform.encode.EncoderRecode; import org.apache.sysds.runtime.util.CommonThreadPool; +import org.apache.sysds.runtime.util.DMVUtils; import org.apache.sysds.runtime.util.IndexRange; import org.apache.sysds.runtime.util.UtilFunctions; @@ -64,8 +65,8 @@ public class FrameBlock implements CacheBlock, Externalizable { private static final long serialVersionUID = -3993450030207130665L; private static final Log LOG = LogFactory.getLog(FrameBlock.class.getName()); private static final 
IDSequence CLASS_ID = new IDSequence(); - - public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, size of default matrix block + + public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, size of default matrix block //internal configuration private static final boolean REUSE_RECODE_MAPS = true; @@ -2101,13 +2102,26 @@ else if (rowTemp1[i].equals("INT32") || rowTemp2[i].equals("CHARACTER")) } public FrameBlock map(String lambdaExpr) { + if(!lambdaExpr.contains("->")) + { + //return map(getCompiledFunctionBlock(lambdaExpr)); + String args = lambdaExpr.substring(lambdaExpr.indexOf('(') + 1, lambdaExpr.indexOf(')')); + if(args.contains(",")) { + String[] arguments = args.split(","); + return DMVUtils.syntacticalPatternDiscovery(this, Double.parseDouble(arguments[0]), arguments[1]); + } + } return map(getCompiledFunction(lambdaExpr)); } + + public FrameBlock map(FrameBlockMapFunction lambdaExpression) { + return lambdaExpression.apply(); + } public FrameBlock map(FrameMapFunction lambdaExpr) { // Prepare temporary output array String[][] output = new String[getNumRows()][getNumColumns()]; - + // Execute map function on all cells for(int j=0; j"); if( parts.length != 2 ) throw new DMLRuntimeException("Unsupported lambda expression: "+lambdaExpr); - String varname = parts[0].trim(); - String expr = parts[1].trim(); - + varname = parts[0].trim(); + expr = parts[1].trim(); + // construct class code - String cname = "StringProcessing"+CLASS_ID.getNextID(); - StringBuilder sb = new StringBuilder(); sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n"); sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n"); sb.append("public class "+cname+" extends FrameMapFunction {\n"); @@ -2140,15 +2158,46 @@ public static FrameMapFunction getCompiledFunction(String lambdaExpr) { // compile class, and create FrameMapFunction object try { return (FrameMapFunction) CodegenUtils - .compileClass(cname, 
sb.toString()).newInstance(); + .compileClass(cname, sb.toString()).newInstance(); } catch(InstantiationException | IllegalAccessException e) { throw new DMLRuntimeException("Failed to compile FrameMapFunction.", e); } } + + public FrameBlockMapFunction getCompiledFunctionBlock(String lambdaExpression) { + // split lambda expression + String expr; + + String cname = "StringProcessing"+CLASS_ID.getNextID(); + StringBuilder sb = new StringBuilder(); + + expr = lambdaExpression; + + sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n"); + sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameBlockMapFunction;\n"); + sb.append("public class "+cname+" extends FrameBlockMapFunction {\n"); + sb.append("@Override\n"); + sb.append("public FrameBlock apply() {\n"); + sb.append(" return "+expr+"; }}\n"); + + try { + return (FrameBlockMapFunction) CodegenUtils + .compileClass(cname, sb.toString()).newInstance(); + } + catch(InstantiationException | IllegalAccessException e) { + throw new DMLRuntimeException("Failed to compile FrameBlockMapFunction.", e); + } + } + public static abstract class FrameMapFunction implements Serializable { private static final long serialVersionUID = -8398572153616520873L; public abstract String apply(String input); } + + public static abstract class FrameBlockMapFunction implements Serializable { + private static final long serialVersionUID = -8398573333616520876L; + public abstract FrameBlock apply(); + } } diff --git a/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java b/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java new file mode 100644 index 00000000000..e850d015d30 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/util/DMVUtils.java @@ -0,0 +1,341 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.util; + +import org.apache.commons.collections.map.HashedMap; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; + +public class DMVUtils { + public static final char DIGIT = 'd'; + public static final char LOWER = 'l'; + public static final char UPPER = 'u'; + public static final char ALPHA = 'a'; + public static final char SPACE = 's'; + public static final char DOT = 't'; + public static final char OTHER = 'y'; + public static final char ARBITRARY_LEN = '+'; + public static final char MINUS = '-'; + public static String DISGUISED_VAL = ""; + + public enum LEVEL_ENUM { LEVEL1, LEVEL2, LEVEL3, LEVEL4, LEVEL5, LEVEL6} + + public static FrameBlock syntacticalPatternDiscovery(FrameBlock frame, double threshold, String disguised_value) { + + // Preparation + DISGUISED_VAL = disguised_value; + int numCols = frame.getNumColumns(); + int numRows = frame.getNumRows(); + ArrayList> table_Hist = new ArrayList(numCols); // list of every column with values and their frequency + + int idx; + for (idx = 0; idx < numCols; idx++) { + Object c = frame.getColumnData(idx); + String[] column = (String[]) c; + String key = ""; + for (String attr : column) { + key = (attr.isEmpty()) ? 
"NULL": attr; + addDistinctValueOrIncrementCounter(table_Hist, key, idx); + } + } + + // Syntactic Pattern Discovery + idx = -1; + for (Map col_Hist : table_Hist) { + idx++; + Map dominant_patterns_ratio = new HashedMap(); + Map prev_pattern_hist = col_Hist; + for(LEVEL_ENUM level : LEVEL_ENUM.values()) { + dominant_patterns_ratio.clear(); + Map current_pattern_hist = LevelsExecutor(prev_pattern_hist, level); + dominant_patterns_ratio = calculatePatternsRatio(current_pattern_hist, numRows); + String dominant_pattern = findDominantPattern(dominant_patterns_ratio, threshold); + if(dominant_pattern != null) { //found pattern + detectDisguisedValues(dominant_pattern, frame.getColumnData(idx), idx, frame, level); + break; + } + prev_pattern_hist = current_pattern_hist; + } + } + return frame; + } + + + public static Map calculatePatternsRatio(Map patterns_hist, int nr_entries) { + Map patterns_ratio_map = new HashedMap(); + Iterator it = patterns_hist.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry pair = (Map.Entry) it.next(); + String pattern = (String) pair.getKey(); + Double nr_occurences = new Double((Integer)pair.getValue()); + + double current_ratio = nr_occurences / nr_entries; // percentage of current pattern in column + patterns_ratio_map.put(pattern, current_ratio); + } + return patterns_ratio_map; + } + + public static String findDominantPattern(Map dominant_patterns, double threshold) { + + Iterator it = dominant_patterns.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry pair = (Map.Entry) it.next(); + String pattern = (String) pair.getKey(); + Double pattern_ratio = (Double)pair.getValue(); + + if(pattern_ratio > threshold) + return pattern; + + } + return null; + } + + private static void addDistinctValueOrIncrementCounter(ArrayList> maps, String key, Integer idx) { + if (maps.size() == idx) { + HashMap m = new HashMap<>(); + m.put(key, 1); + maps.add(m); + return; + } + + if (!(maps.get(idx).containsKey(key))) { + 
maps.get(idx).put(key, 1); + } else { + maps.get(idx).compute(key, (k, v) -> v + 1); + } + } + + private static void addDistinctValueOrIncrementCounter(Map map, String encoded_value, Integer nr_occurrences) { + if (!(map.containsKey(encoded_value))) { + map.put(encoded_value, nr_occurrences); + } else { + map.compute(encoded_value, (k, v) -> v + nr_occurrences); + } + } + + public static Map LevelsExecutor(Map old_pattern_hist, LEVEL_ENUM level) { + Map new_pattern_hist = new HashedMap(); + Iterator it = old_pattern_hist.entrySet().iterator(); + while (it.hasNext()) { + Map.Entry pair = (Map.Entry) it.next(); + String pattern = (String) pair.getKey(); + Integer nr_of_occurrences = (Integer)pair.getValue(); + + String new_pattern; + switch(level) { + case LEVEL1: // default encoding + new_pattern = encodeRawString(pattern); + break; + case LEVEL2: // ignores the number of occurrences. It replaces all numbers with '+' + new_pattern = removeNumbers(pattern); + break; + case LEVEL3: // ignores upper and lowercase characters. 
It replaces all 'u' and 'l' with 'a' = Alphabet + new_pattern = removeUpperLowerCase(pattern); + break; + case LEVEL4: // changes floats to digits + new_pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT); + break; + case LEVEL5: // removes spaces between strings + new_pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE); + break; + case LEVEL6: // changes negative numbers to digits + new_pattern = acceptNegativeNumbersAsDigits(pattern); + break; + default: + new_pattern = ""; + break; + } + addDistinctValueOrIncrementCounter(new_pattern_hist, new_pattern, nr_of_occurrences); + } + + return new_pattern_hist; + } + + public static String acceptNegativeNumbersAsDigits(String pattern) { + char[] chars = pattern.toCharArray(); + StringBuilder tmp = new StringBuilder(); + boolean currently_minus_digit = false; + for (char ch : chars) { + if(ch == MINUS && !currently_minus_digit) { + currently_minus_digit = true; + } + else if(ch == DIGIT && currently_minus_digit) { + tmp.append(ch); + currently_minus_digit = false; + } + else if(currently_minus_digit) { + tmp.append(MINUS); + tmp.append(ch); + currently_minus_digit = false; + } + else { + tmp.append(ch); + } + } + return tmp.toString(); + } + + public static String removeInnerCharacterInPattern(String pattern, char outter_char, char inner_char) { + char[] chars = pattern.toCharArray(); + StringBuilder tmp = new StringBuilder(); + boolean currently_digit = false; + for (char ch : chars) { + if(ch == outter_char && !currently_digit) { + currently_digit = true; + tmp.append(ch); + } + else if(currently_digit && (ch == outter_char || ch == inner_char)) + continue; + else if(ch != inner_char && ch != ARBITRARY_LEN) { + currently_digit = false; + tmp.append(ch); + } + else { + if(tmp.length() > 0 && tmp.charAt(tmp.length() - 1) != ARBITRARY_LEN) + tmp.append(ch); + } + } + return tmp.toString(); + } + + + public static String removeUpperLowerCase(String pattern) { + char[] chars = pattern.toCharArray(); + 
StringBuilder tmp = new StringBuilder(); + boolean currently_alphabetic = false; + for (char ch : chars) { + if(ch == UPPER || ch == LOWER) { + if(!currently_alphabetic) { + currently_alphabetic = true; + tmp.append(ALPHA); + } + } + else if(ch == ARBITRARY_LEN) { + if(tmp.charAt(tmp.length() - 1) != ARBITRARY_LEN) + tmp.append(ch); + } + else { + tmp.append(ch); + currently_alphabetic = false; + } + } + return tmp.toString(); + } + + private static String removeNumbers(String pattern) { + char[] chars = pattern.toCharArray(); + StringBuilder tmp = new StringBuilder(); + for (char ch : chars) { + if(Character.isDigit(ch)) + tmp.append(ARBITRARY_LEN); + else + tmp.append(ch); + } + return tmp.toString(); + } + + public static String encodeRawString(String input) { + char[] chars = input.toCharArray(); + + StringBuilder tmp = new StringBuilder(); + for (char ch : chars) { + tmp.append(getCharClass(ch)); + } + return getFrequencyOfEachConsecutiveChar(tmp.toString()); + } + + private static char getCharClass(char c) { + if (Character.isDigit(c)) return DIGIT; + if (Character.isLowerCase(c)) return LOWER; + if (Character.isUpperCase(c)) return UPPER; + if (Character.isSpaceChar(c)) return SPACE; + if (c == '.') return DOT; + if(c == '-') return MINUS; + return OTHER; + } + + public static String getFrequencyOfEachConsecutiveChar(String s) { + StringBuilder retval = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + int count = 1; + while (i + 1 < s.length() && s.charAt(i) == s.charAt(i + 1)) { + i++; + count++; + } + retval.append(s.charAt(i)); + retval.append(count); + } + return retval.toString(); + } + + private static void detectDisguisedValues(String dom_pattern, Object col, int col_idx, + FrameBlock frameBlock, LEVEL_ENUM level) + { + int row_idx = -1; + String pattern = ""; + String[] column = (String[]) col; + for (String attr : column) { + switch (level){ + case LEVEL1: + pattern = encodeRawString(attr); + break; + case LEVEL2: + pattern = 
encodeRawString(attr); + pattern = removeNumbers(pattern); + break; + case LEVEL3: + pattern = encodeRawString(attr); + pattern = removeNumbers(pattern); + pattern = removeUpperLowerCase(pattern); + break; + case LEVEL4: + pattern = encodeRawString(attr); + pattern = removeNumbers(pattern); + pattern = removeUpperLowerCase(pattern); + pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT); + break; + case LEVEL5: + pattern = encodeRawString(attr); + pattern = removeNumbers(pattern); + pattern = removeUpperLowerCase(pattern); + pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT); + pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE); + break; + case LEVEL6: + pattern = encodeRawString(attr); + pattern = removeNumbers(pattern); + pattern = removeUpperLowerCase(pattern); + pattern = removeInnerCharacterInPattern(pattern, DIGIT, DOT); + pattern = removeInnerCharacterInPattern(pattern, ALPHA, SPACE); + pattern = acceptNegativeNumbersAsDigits(pattern); + default: + //System.out.println("Could not find suitable level"); + } + row_idx++; + if(pattern.equals(dom_pattern)) continue; +// System.out.println("[" + level +"] Disguised value: " + frameBlock.get(row_idx, col_idx) + " (c=" + col_idx + ",r=" + row_idx + ")"); + frameBlock.set(row_idx, col_idx, DISGUISED_VAL); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java new file mode 100644 index 00000000000..10f414380ef --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinDMVTest.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.builtin; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameWriterFactory; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.apache.sysds.lops.LopProperties.ExecType; +import org.apache.sysds.test.AutomatedTestBase; +import org.apache.sysds.test.TestConfiguration; +import org.apache.sysds.test.TestUtils; + +public class BuiltinDMVTest extends AutomatedTestBase { + + private final static String TEST_NAME = "disguisedMissingValue"; + private final static String TEST_DIR = "functions/builtin/"; + private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinOutlierTest.class.getSimpleName() + "/"; + + @BeforeClass + public static void init() { + TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR); + } + + @AfterClass + public static void cleanUp() { + if (TEST_CACHE_ENABLED) { + TestUtils.clearDirectory(TEST_DATA_DIR + TEST_CLASS_DIR); + } + } + + @Override + public void setUp() { + TestUtils.clearAssertionInformation(); + addTestConfiguration(TEST_NAME,new TestConfiguration(TEST_CLASS_DIR, TEST_NAME,new String[]{"B"})); + if (TEST_CACHE_ENABLED) { + setOutAndExpectedDeletionDisabled(true); + } + } + + @Test + public void NormalStringFrameTest() { + 
FrameBlock f = generateRandomFrameBlock(1000, 4,null); + String[] disguised_values = new String[]{"?", "9999", "?", "9999"}; + ArrayList> positions = getDisguisedPositions(f, 4, disguised_values); + runMissingValueTest(f, ExecType.CP, 0.8, "DMV", positions); + } + + @Test + public void PreDefinedStringsFrameTest() { + String[] testarray0 = new String[]{"77","77","55","89","43", "99", "46"}; // detect Weg + String[] testarray1 = new String[]{"8010","9999","8456","4565","89655", "86542", "45624"}; // detect ? + String[] testarray2 = new String[]{"David K","Valentin E","Patrick L","VEVE","DK", "VE", "PL"}; // detect 45 + String[] testarray3 = new String[]{"3.42","45","0.456",".45","4589.245", "97", "33"}; // detect ka + String[] testarray4 = new String[]{"99","123","158","146","158", "174", "201"}; // detect 9999 + + String[][] teststrings = new String[][]{testarray0, testarray1, testarray2, testarray3, testarray4}; + FrameBlock f = generateRandomFrameBlock(7, 5, teststrings); + String[] disguised_values = new String[]{"Patrick-Lovric-Weg-666", "?", "45", "ka", "9999"}; + ArrayList> positions = getDisguisedPositions(f, 1, disguised_values); + runMissingValueTest(f, ExecType.CP, 0.7,"NA", positions); + } + + @Test + public void PreDefinedDoubleFrame() { + Double[] test_val = new Double[10000]; + for(int i = 0; i < test_val.length; i++) { + test_val[i] = TestUtils.getPositiveRandomDouble(); + } + String[] test_string = new String[test_val.length]; + for(int j = 0; j < test_val.length; j++) { + test_string[j] = test_val[j].toString(); + } + + String[][] teststrings = new String[][]{test_string}; + FrameBlock f = generateRandomFrameBlock(test_string.length, 1, teststrings); + String[] disguised_values = new String[]{"9999999999"}; + ArrayList> positions = getDisguisedPositions(f, 10, disguised_values); + runMissingValueTest(f, ExecType.CP, 0.6, "-1", positions); + } + + private void runMissingValueTest(FrameBlock test_frame, ExecType et, Double threshold, String 
replacement, + ArrayList> positions) + { + Types.ExecMode platformOld = setExecMode(et); + + try { + getAndLoadTestConfiguration(TEST_NAME); + + String HOME = SCRIPT_DIR + TEST_DIR; + fullDMLScriptName = HOME + TEST_NAME + ".dml"; + programArgs = new String[] {"-nvargs", "F=" + input("F"), "O=" + output("O"), + "threshold=" + threshold, "replacement=" + replacement + }; + + FrameWriterFactory.createFrameWriter(Types.FileFormat.CSV). + writeFrameToHDFS(test_frame, input("F"), test_frame.getNumRows(), test_frame.getNumColumns()); + + runTest(true, false, null, -1); + + FrameBlock outputFrame = readDMLFrameFromHDFS("O", Types.FileFormat.CSV); + + for(int i = 0; i < positions.size(); i++) { + String[] output = (String[]) outputFrame.getColumnData(i); + for(int j = 0; j < positions.get(i).size(); j++) { + if(replacement.equals("NA")) { + TestUtils.compareScalars(null, output[positions.get(i).get(j)]); + } + else { + TestUtils.compareScalars(replacement, output[positions.get(i).get(j)]); + } + } + } + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + finally { + resetExecMode(platformOld); + } + } + + private FrameBlock generateRandomFrameBlock(int rows, int cols, String[][] defined_strings) + { + Types.ValueType[] schema = new Types.ValueType[cols]; + for(int i = 0; i < cols; i++) { + schema[i] = Types.ValueType.STRING; + } + + if(defined_strings != null) + { + String[] names = new String[cols]; + for(int i = 0; i < cols; i++) + names[i] = schema[i].toString(); + FrameBlock frameBlock = new FrameBlock(schema, names); + frameBlock.ensureAllocatedColumns(rows); + for(int row = 0; row < rows; row++) + for(int col = 0; col < cols; col++) + frameBlock.set(row, col, defined_strings[col][row]); + return frameBlock; + } + return TestUtils.generateRandomFrameBlock(rows, cols, schema ,TestUtils.getPositiveRandomInt()); + } + + private ArrayList> getDisguisedPositions(FrameBlock frame, int amountValues, String[] disguisedValue) + { + ArrayList> positions = new 
ArrayList<>(); + int counter; + for(int i = 0; i < frame.getNumColumns(); i++) + { + counter = 0; + List arrayToFill = new ArrayList<>(); + while(counter < frame.getNumRows() && counter < amountValues) + { + int position = TestUtils.getPositiveRandomInt() % frame.getNumRows(); + while(counter != 0 && arrayToFill.contains(position)) + { + position = (position + TestUtils.getPositiveRandomInt() + 5) % frame.getNumRows(); + } + arrayToFill.add(position); + if(disguisedValue.length > 1) + { + frame.set(position, i, disguisedValue[i]); + } + else if (disguisedValue.length == 1) + { + frame.set(position, i, disguisedValue[0]); + } + + counter++; + } + positions.add(i, arrayToFill); + } + + return positions; + } + +} diff --git a/src/test/scripts/functions/builtin/disguisedMissingValue.dml b/src/test/scripts/functions/builtin/disguisedMissingValue.dml new file mode 100644 index 00000000000..6d45fb76626 --- /dev/null +++ b/src/test/scripts/functions/builtin/disguisedMissingValue.dml @@ -0,0 +1,24 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +X = read($F, data_type="frame", format="csv", header=FALSE) +Z = dmv(X=X, threshold=$threshold, replace=$replacement) + +write(Z, $O, format = "csv")