Skip to content

Commit

Permalink
[SYSTEMDS-2789] Disguised Missing Values Detection
Browse files Browse the repository at this point in the history
Co-authored-by: Patrick Lovric <patrick.lovric@student.tugraz.at>
Co-authored-by: Valentin Edelsbrunner <v.edelsbrunner@student.tugraz.at>

DIA project WS2020/21.
Closes #1144.

Date:      Sat Jan 9 00:05:47 2021 +0100
  • Loading branch information
dkerschbaumer authored and Shafaq-Siddiqi committed Jan 9, 2021
1 parent 12cdc89 commit 3a9baf4
Show file tree
Hide file tree
Showing 7 changed files with 692 additions and 10 deletions.
38 changes: 38 additions & 0 deletions docs/site/builtins-reference.md
Expand Up @@ -32,6 +32,7 @@ limitations under the License.
* [`DBSCAN`-Function](#DBSCAN-function)
* [`discoverFD`-Function](#discoverFD-function)
* [`dist`-Function](#dist-function)
* [`dmv`-Function](#dmv-function)
* [`glm`-Function](#glm-function)
* [`gridSearch`-Function](#gridSearch-function)
* [`hyperband`-Function](#hyperband-function)
Expand Down Expand Up @@ -299,6 +300,43 @@ X = rand (rows = 5, cols = 5)
Y = dist(X)
```



## `dmv`-Function

The `dmv`-function is used to find disguised missing values utilising syntactical pattern recognition.

### Usage

```r
dmv(X, threshold, replace)
```

### Arguments

| Name | Type | Default | Description |
| :-------- | :------------ | :------- | :----------------------------------------------------------- |
| X | Frame[String] | required | Input Frame |
| threshold | Double | 0.8 | Threshold value in the interval [0, 1] for the dominant pattern per column (e.g., 0.8 means that 80% of the entries in a column must adhere to a pattern for it to be dominant) |
| replace | String | "NA" | The string disguised missing values are replaced with |

### Returns

| Type | Description |
| :------------ | :----------------------------------------------------- |
| Frame[String] | Frame `X` with the detected disguised missing values replaced by `replace` |

### Example

```r
A = read("fileA", data_type="frame", rows=10, cols=8);
Z = dmv(X=A)
Z = dmv(X=A, threshold=0.9)
Z = dmv(X=A, threshold=0.9, replace="NaN")
```



## `glm`-Function

The `glm`-function is a flexible generalization of ordinary linear regression that allows for response variables that have
Expand Down
29 changes: 29 additions & 0 deletions scripts/builtin/dmv.dml
@@ -0,0 +1,29 @@
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#------------------------------------------------------------

# Builtin for detecting disguised missing values in a frame.
#
# INPUT PARAMETERS:
#   X          Frame[String]  input frame
#   threshold  Double         default 0.8; must lie in [0, 1], otherwise the script stops
#   replace    String         default "NA"; forwarded as the replacement string
#
# OUTPUT:
#   Y          Frame[String]  result of the frame map call below
s_dmv = function(Frame[String] X, Double threshold=0.8, String replace="NA") return (Frame[String] Y) {

# reject thresholds outside the closed interval [0, 1] before doing any work
if( threshold < 0 | threshold > 1 )
stop("Stopping due to invalid input, threshold required in interval [0, 1] found " + threshold)

# The call is encoded as a plain string of the form "<name>(<threshold>,<replace>)".
# NOTE(review): replace is concatenated unquoted, so the receiving side has to
# split the argument list itself - confirm how commas/parentheses in replace are handled.
Y = map(X, "UtilFunctions.syntacticalPatternDiscovery(" + threshold + "," + replace + ")")
}

1 change: 1 addition & 0 deletions src/main/java/org/apache/sysds/common/Builtins.java
Expand Up @@ -98,6 +98,7 @@ public enum Builtins {
DIAG("diag", false),
DISCOVER_FD("discoverFD", true),
DIST("dist", true),
DMV("dmv", true),
DROP_INVALID_TYPE("dropInvalidType", false),
DROP_INVALID_LENGTH("dropInvalidLength", false),
EIGEN("eigen", false, ReturnType.MULTI_RETURN),
Expand Down
69 changes: 59 additions & 10 deletions src/main/java/org/apache/sysds/runtime/matrix/data/FrameBlock.java
Expand Up @@ -56,6 +56,7 @@
import org.apache.sysds.runtime.matrix.operators.BinaryOperator;
import org.apache.sysds.runtime.transform.encode.EncoderRecode;
import org.apache.sysds.runtime.util.CommonThreadPool;
import org.apache.sysds.runtime.util.DMVUtils;
import org.apache.sysds.runtime.util.IndexRange;
import org.apache.sysds.runtime.util.UtilFunctions;

Expand All @@ -64,8 +65,8 @@ public class FrameBlock implements CacheBlock, Externalizable {
private static final long serialVersionUID = -3993450030207130665L;
private static final Log LOG = LogFactory.getLog(FrameBlock.class.getName());
private static final IDSequence CLASS_ID = new IDSequence();
public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, size of default matrix block

public static final int BUFFER_SIZE = 1 * 1000 * 1000; //1M elements, size of default matrix block

//internal configuration
private static final boolean REUSE_RECODE_MAPS = true;
Expand Down Expand Up @@ -2101,13 +2102,26 @@ else if (rowTemp1[i].equals("INT32") || rowTemp2[i].equals("CHARACTER"))
}

/**
 * Applies a string-encoded map expression to this frame.
 *
 * <p>Two forms are distinguished:
 * <ul>
 * <li>An expression without {@code "->"} of the shape {@code name(threshold,replace)} is
 * dispatched directly to {@link DMVUtils#syntacticalPatternDiscovery}; the function name
 * itself is not inspected.</li>
 * <li>Everything else is handed to {@link #getCompiledFunction(String)} and executed
 * through {@link #map(FrameMapFunction)}.</li>
 * </ul>
 *
 * <p>NOTE(review): a replacement string that itself contains {@code "->"} is still routed
 * to the lambda path; the string encoding offers no way to tell the two apart.
 *
 * @param lambdaExpr lambda expression ({@code x -> expr}) or call expression
 *        ({@code name(threshold,replace)})
 * @return the frame produced by the selected map implementation
 * @throws NumberFormatException if the threshold argument of a call expression is not a
 *         parsable double
 */
public FrameBlock map(String lambdaExpr) {
	if(!lambdaExpr.contains("->")) {
		// The argument list runs from the first '(' to the LAST ')', so that a
		// replacement string which itself contains ')' is not cut short.
		int open = lambdaExpr.indexOf('(');
		int close = lambdaExpr.lastIndexOf(')');
		if(open >= 0 && close > open) {
			// limit=2: only the first comma separates threshold from replacement,
			// which keeps commas inside the replacement and an empty replacement intact.
			String[] arguments = lambdaExpr.substring(open + 1, close).split(",", 2);
			if(arguments.length == 2) {
				return DMVUtils.syntacticalPatternDiscovery(this,
					Double.parseDouble(arguments[0]), arguments[1]);
			}
		}
		// Malformed call expressions fall through on purpose: the lambda compiler
		// reports them instead of an index-out-of-bounds error surfacing here.
	}
	return map(getCompiledFunction(lambdaExpr));
}

/**
 * Evaluates a frame-level map function and hands back whatever frame it produces.
 *
 * @param lambdaExpression function object whose {@code apply()} yields the result frame
 * @return the value returned by {@code lambdaExpression.apply()}
 */
public FrameBlock map(FrameBlockMapFunction lambdaExpression) {
	final FrameBlock result = lambdaExpression.apply();
	return result;
}

public FrameBlock map(FrameMapFunction lambdaExpr) {
// Prepare temporary output array
String[][] output = new String[getNumRows()][getNumColumns()];

// Execute map function on all cells
for(int j=0; j<getNumColumns(); j++) {
Array input = getColumn(j);
Expand All @@ -2120,16 +2134,20 @@ public FrameBlock map(FrameMapFunction lambdaExpr) {
}

public static FrameMapFunction getCompiledFunction(String lambdaExpr) {
// split lambda expression
String varname;
String expr;

String cname = "StringProcessing"+CLASS_ID.getNextID();
StringBuilder sb = new StringBuilder();


String[] parts = lambdaExpr.split("->");
if( parts.length != 2 )
throw new DMLRuntimeException("Unsupported lambda expression: "+lambdaExpr);
String varname = parts[0].trim();
String expr = parts[1].trim();
varname = parts[0].trim();
expr = parts[1].trim();

// construct class code
String cname = "StringProcessing"+CLASS_ID.getNextID();
StringBuilder sb = new StringBuilder();
sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n");
sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameMapFunction;\n");
sb.append("public class "+cname+" extends FrameMapFunction {\n");
Expand All @@ -2140,15 +2158,46 @@ public static FrameMapFunction getCompiledFunction(String lambdaExpr) {
// compile class, and create FrameMapFunction object
try {
return (FrameMapFunction) CodegenUtils
.compileClass(cname, sb.toString()).newInstance();
.compileClass(cname, sb.toString()).newInstance();
}
catch(InstantiationException | IllegalAccessException e) {
throw new DMLRuntimeException("Failed to compile FrameMapFunction.", e);
}
}


/**
 * Generates the source of a {@link FrameBlockMapFunction} subclass whose {@code apply()}
 * returns the given Java expression, compiles it via {@code CodegenUtils.compileClass}
 * and instantiates the resulting class.
 *
 * <p>SECURITY NOTE: the expression is spliced verbatim into the generated source and
 * compiled as Java code; only pass expressions from trusted sources.
 *
 * @param lambdaExpression Java expression that evaluates to a {@code FrameBlock}
 * @return a new instance of the generated function class
 * @throws DMLRuntimeException if the compiled class cannot be instantiated
 */
public FrameBlockMapFunction getCompiledFunctionBlock(String lambdaExpression) {
	// unique class name per call, drawn from the shared ID sequence
	String cname = "StringProcessing"+CLASS_ID.getNextID();

	// construct class code
	StringBuilder sb = new StringBuilder();
	sb.append("import org.apache.sysds.runtime.util.UtilFunctions;\n");
	// apply() is declared with return type FrameBlock; importing only the nested
	// FrameBlockMapFunction does not bring the outer class name into scope
	sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock;\n");
	sb.append("import org.apache.sysds.runtime.matrix.data.FrameBlock.FrameBlockMapFunction;\n");
	sb.append("public class "+cname+" extends FrameBlockMapFunction {\n");
	sb.append("@Override\n");
	sb.append("public FrameBlock apply() {\n");
	sb.append(" return "+lambdaExpression+"; }}\n");

	// compile class, and create FrameBlockMapFunction object
	try {
		return (FrameBlockMapFunction) CodegenUtils
			.compileClass(cname, sb.toString()).newInstance();
	}
	catch(InstantiationException | IllegalAccessException e) {
		throw new DMLRuntimeException("Failed to compile FrameBlockMapFunction.", e);
	}
}

/**
 * Serializable base class for string-to-string map functions over frame values.
 * Concrete subclasses supply the transformation in {@link #apply(String)}.
 */
public abstract static class FrameMapFunction implements Serializable {
	/**
	 * Transforms a single input value.
	 *
	 * @param input the value to transform
	 * @return the transformed value
	 */
	public abstract String apply(String input);

	private static final long serialVersionUID = -8398572153616520873L;
}

/**
 * Serializable base class for argument-less functions that produce a whole
 * {@code FrameBlock}. Concrete subclasses supply the computation in {@link #apply()}.
 */
public abstract static class FrameBlockMapFunction implements Serializable {
	/**
	 * Computes the result frame.
	 *
	 * @return the frame produced by this function
	 */
	public abstract FrameBlock apply();

	private static final long serialVersionUID = -8398573333616520876L;
}
}

0 comments on commit 3a9baf4

Please sign in to comment.