From cf6ab0a95761deae03a1cbfcb9d0f47108ca0a74 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 6 Jan 2022 23:56:15 +0100 Subject: [PATCH 01/84] Start reimplement IOGEN --- .../runtime/iogen/CustomProperties2.java | 88 +++++ .../sysds/runtime/iogen/DelimiterTrie.java | 177 +++++++++ .../runtime/iogen/DelimiterTrieNode.java | 35 ++ .../apache/sysds/runtime/iogen/RawIndex.java | 353 ++++++++++++++++++ .../sysds/runtime/iogen/ReaderMapping2.java | 293 +++++++++++++++ 5 files changed, 946 insertions(+) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrieNode.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java new file mode 100644 index 00000000000..f2d9835a107 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.io.FileFormatProperties; + +import java.io.Serializable; +import java.util.HashSet; + +public class CustomProperties2 extends FileFormatProperties implements Serializable { + private static final Log LOG = LogFactory.getLog(CustomProperties2.class.getName()); + private static final long serialVersionUID = -4447926749068752721L; + + public enum IndexProperties { + IDENTIFY, PREFIX, KEY; + @Override + public String toString() { + return this.name().toLowerCase().replaceAll("_", "-"); + } + } + + private IndexProperties rowIndex; + private IndexProperties colIndex; + + // When the index is prefixes + private Integer rowIndexPrefixPosition; + private String rowIndexPrefixDelim; + private Boolean rowIndexPrefixDelimFixLength; + + public IndexProperties getRowIndex() { + return rowIndex; + } + + public void setRowIndex(IndexProperties rowIndex) { + this.rowIndex = rowIndex; + } + + public IndexProperties getColIndex() { + return colIndex; + } + + public void setColIndex(IndexProperties colIndex) { + this.colIndex = colIndex; + } + + public Integer getRowIndexPrefixPosition() { + return rowIndexPrefixPosition; + } + + public void setRowIndexPrefixPosition(Integer rowIndexPrefixPosition) { + this.rowIndexPrefixPosition = rowIndexPrefixPosition; + } + + public String getRowIndexPrefixDelim() { + return rowIndexPrefixDelim; + } + + public void setRowIndexPrefixDelim(String rowIndexPrefixDelim) { + this.rowIndexPrefixDelim = rowIndexPrefixDelim; + } + + public Boolean getRowIndexPrefixDelimFixLength() { + return rowIndexPrefixDelimFixLength; + } + + public void setRowIndexPrefixDelimFixLength(Boolean rowIndexPrefixDelimFixLength) { + this.rowIndexPrefixDelimFixLength = rowIndexPrefixDelimFixLength; + } 
+} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java new file mode 100644 index 00000000000..8b4bb303acd --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen; + +import java.util.HashSet; + +public class DelimiterTrie { + private DelimiterTrieNode root; + + public DelimiterTrie() { + root = new DelimiterTrieNode(); + } + + private String intersect(String str1, String str2) { + StringBuilder sb = new StringBuilder(); + for(int i = 0; i < Math.min(str1.length(), str2.length()); i++) { + if(str1.charAt(i) == str2.charAt(i)) + sb.append(str1.charAt(i)); + else + break; + } + if(sb.length() == 0) + return null; + else + return sb.toString(); + } + + private TrieNodeResult getSubNode(DelimiterTrieNode current, String delim) { + for(String key : current.getChildren().keySet()) { + String insec = intersect(key, delim); + if(insec != null) + return new TrieNodeResult(current.getChildren().get(key), insec, key); + } + return null; + } + + public void insert(String delim) { + DelimiterTrieNode current = root; + String remaindKeyDelim; + String currentDelim = delim; + TrieNodeResult trieNodeResult; + do { + trieNodeResult = getSubNode(current, currentDelim); + if(trieNodeResult == null) { + DelimiterTrieNode newNode = new DelimiterTrieNode(); + current.getChildren().put(currentDelim, newNode); + } + else { + currentDelim = currentDelim.substring(trieNodeResult.intersect.length()); + remaindKeyDelim = trieNodeResult.nodeKey.substring(trieNodeResult.intersect.length()); + int cwl = currentDelim.length(); + int rkwl = remaindKeyDelim.length(); + + if(cwl == 0 && rkwl > 0) { + DelimiterTrieNode newNode = new DelimiterTrieNode(); + + DelimiterTrieNode updateNode = new DelimiterTrieNode(); + updateNode.setChildren(trieNodeResult.trieNode.getChildren()); + + // Add Update Node + newNode.getChildren().put(remaindKeyDelim, updateNode); + + // Add New Node + current.getChildren().put(trieNodeResult.intersect, newNode); + + // Remove old node + current.getChildren().remove(trieNodeResult.nodeKey); + + } + else if(rkwl == 0) { + current = trieNodeResult.trieNode; + } + else { + DelimiterTrieNode 
newNode = new DelimiterTrieNode(); + + DelimiterTrieNode updateNode = new DelimiterTrieNode(); + updateNode.setChildren(trieNodeResult.trieNode.getChildren()); + + // Add Update Node + newNode.getChildren().put(remaindKeyDelim, updateNode); + + // Add New Node + current.getChildren().put(trieNodeResult.intersect, newNode); + + // Remove old node + current.getChildren().remove(trieNodeResult.nodeKey); + + // Add New Delim remaind + DelimiterTrieNode newDelimNode = new DelimiterTrieNode(); + newNode.getChildren().put(currentDelim, newDelimNode); + break; + } + } + + } + while(trieNodeResult != null && currentDelim.length() > 0); + } + + public String getShortestDelim(int minsize) { + // Check the possibility of the shortest delim + boolean flag = true; + DelimiterTrieNode current = root; + StringBuilder sb = new StringBuilder(); + do { + int currentChildCount = current.getChildren().size(); + if(currentChildCount == 0) + break; + else if(currentChildCount != 1) + flag = false; + else { + String key = current.getChildren().keySet().iterator().next(); + sb.append(key); + current = current.getChildren().get(key); + } + } + while(flag); + if(flag) { + String allDelim = sb.toString(); + int allDelimLength = allDelim.length(); + HashSet delimSet = new HashSet<>(); + for(int i=1; i<=minsize; i++){ + delimSet.clear(); + for(int j=0; j children = new HashMap<>(); + + public Map getChildren() { + return children; + } + + public void setChildren(Map children) { + this.children = children; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java new file mode 100644 index 00000000000..bec10a34120 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -0,0 +1,353 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import com.google.gson.Gson; +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.runtime.util.UtilFunctions; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; + +public class RawIndex { + private final String raw; + private final int rawLength; + private final BitSet numberBitSet; + private final BitSet dotBitSet; + private final BitSet eBitSet; + private final BitSet plusMinusBitSet; + private BitSet reservedPositions; + private BitSet backupReservedPositions; + private HashMap>> actualNumericValues; + private HashMap>> dotActualNumericValues; + private HashMap>> dotEActualNumericValues; + + + public RawIndex(String raw) { + this.raw = raw; + this.rawLength = raw.length(); + this.numberBitSet = new BitSet(rawLength); + this.dotBitSet = new BitSet(rawLength); + this.eBitSet = new BitSet(rawLength); + this.plusMinusBitSet = new BitSet(rawLength); + this.reservedPositions = new BitSet(rawLength); + this.backupReservedPositions = new BitSet(rawLength); + this.actualNumericValues = null; + this.dotActualNumericValues = null; + this.dotEActualNumericValues = new HashMap<>(); + + for(int i = 0; i < this.rawLength; i++) { + 
switch(raw.charAt(i)) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + this.numberBitSet.set(i); + break; + case '+': + case '-': + this.plusMinusBitSet.set(i); + break; + case '.': + this.dotBitSet.set(i); + break; + case 'e': + case 'E': + this.eBitSet.set(i); + break; + } + } + // Clean unnecessary sets + // Clean for "." + for(int i = dotBitSet.nextSetBit(0); i != -1; i = dotBitSet.nextSetBit(i + 1)) { + boolean flag = false; + if(i > 0) { + if(i< rawLength -2) { + flag = !numberBitSet.get(i - 1) && + !numberBitSet.get(i + 1) && + !plusMinusBitSet.get(i + 1) && + !eBitSet.get(i + 1); + } + } + else if( i== rawLength-1){ + flag = !numberBitSet.get(i - 1); + } + else if(i==0){ + if(i < rawLength-2){ + flag = !numberBitSet.get(i + 1) && + !plusMinusBitSet.get(i + 1) && + !eBitSet.get(i + 1); + } + else if( i== rawLength-1){ + flag = true; + } + } + + if(flag) + dotBitSet.set(i, false); + } + + // Clean for "+/-" + for(int i = plusMinusBitSet.nextSetBit(0); i != -1; i = plusMinusBitSet.nextSetBit(i + 1)) { + boolean flag; + if(i1 && i findValue(Object value, Types.ValueType valueType){ + if(valueType.isNumeric()) + return findValue(UtilFunctions.getDouble(value)); + else if(valueType == Types.ValueType.STRING) + return findValue(UtilFunctions.objectToString(value)); +// else if(valueType == Types.ValueType.BOOLEAN) +// return findValue(UtilFunctions.objectToString()) + else + return null; + } + + public Pair findValue(double value){ +// extractNumericActualValues(); +// if(actualNumericValues.containsKey(value)){ +// return getValuePositionAndLength(actualNumericValues.get(value)); +// } +// +// extractNumericDotActualValues(); +// if(dotActualNumericValues.containsKey(value)){ +// return getValuePositionAndLength(dotActualNumericValues.get(value)); +// } +// +// extractNumericDotEActualValues(); + if(dotEActualNumericValues.containsKey(value)){ + return 
getValuePositionAndLength(dotEActualNumericValues.get(value)); + } + return null; + } + + private Pair findValue(String value){ + int index = this.raw.indexOf(value); + if(index == -1) + return null; + else { + for(int i= index; i(index, value.length()); + } + } + + private Pair getValuePositionAndLength(ArrayList> list){ + for(Pair p: list){ + if(!reservedPositions.get(p.getKey())) { + reservedPositions.set(p.getKey(), p.getKey()+p.getValue()); + return p; + } + } + return null; + } + + private void extractNumericActualValues(){ + if(this.actualNumericValues == null) + this.actualNumericValues = new HashMap<>(); + else + return; + StringBuilder sb = new StringBuilder(); + BitSet nBitSet = (BitSet) numberBitSet.clone(); + nBitSet.or(plusMinusBitSet); + int pi = nBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = nBitSet.nextSetBit(pi+1); i != -1; i = nBitSet.nextSetBit(i + 1)) { + if(pi+sb.length() != i) { + addActualValueToList(sb.toString(), pi, actualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + sb.append(raw.charAt(i)); + } + if(sb.length()>0) + addActualValueToList(sb.toString(), pi, actualNumericValues); + } + + private void extractNumericDotActualValues(){ + if(this.dotActualNumericValues == null) + this.dotActualNumericValues = new HashMap<>(); + else + return; + + BitSet numericDotBitSet = (BitSet) numberBitSet.clone(); + numericDotBitSet.or(dotBitSet); + numericDotBitSet.or(plusMinusBitSet); + StringBuilder sb = new StringBuilder(); + int pi = numericDotBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = numericDotBitSet.nextSetBit(pi+1); i != -1; i = numericDotBitSet.nextSetBit(i + 1)) { + if(pi+sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + sb.append(raw.charAt(i)); + } + if(sb.length()>0) + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + 
} + + private void extractNumericDotEActualValues(){ +// if(this.dotEActualNumericValues == null) +// this.dotEActualNumericValues = new HashMap<>(); +// else +// return; + + BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); + numericDotEBitSet.or(dotBitSet); + numericDotEBitSet.or(eBitSet); + numericDotEBitSet.or(plusMinusBitSet); + + StringBuilder sb = new StringBuilder(); + int pi = numericDotEBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = numericDotEBitSet.nextSetBit(pi+1); i != -1; i = numericDotEBitSet.nextSetBit(i + 1)) { + if(pi+sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + sb.append(raw.charAt(i)); + } + if(sb.length()>0) + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + } + + private void addActualValueToList(String stringValue, Integer position, HashMap>> list){ + try { + double d = UtilFunctions.getDouble(stringValue); + Pair pair = new Pair(position, stringValue.length()); + if(!list.containsKey(d)) { + ArrayList> tmpList = new ArrayList<>(); + tmpList.add(pair); + list.put(d, tmpList); + } + else + list.get(d).add(pair); + } + catch(Exception e){ + + } + } + + public void printBitSets() { + // String numberBitSetStrng; + String dotBitSetString=""; + String eBitSetString=""; + String plusMinusBitSetString=""; + // String minusBitSetStrng; + // for(int i=0; i p1= ni.findValue(123); + Pair p= ni.findValue(123); + Gson gson=new Gson(); + System.out.println(gson.toJson(p)); + } + +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java new file mode 100644 index 00000000000..dfa6c71d5be --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java @@ -0,0 +1,293 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.Pair; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashSet; + +public class ReaderMapping2 { + + private int[][] mapRow; + private int[][] mapCol; + private boolean symmetric; + private boolean skewSymmetric; + private boolean isUpperTriangular; + private int skewCoefficient; + private ArrayList sampleRawIndexes; + + private boolean mapped; + private static int nrows; + private static int ncols; + private int nlines; + private int firstRowIndex; + private int firstColIndex; + + private MatrixBlock sampleMatrix; + private FrameBlock sampleFrame; + private Types.ValueType[] schema; + private final boolean isMatrix; + + public ReaderMapping2(String raw, MatrixBlock matrix) throws Exception { + this.ReadRaw(raw); + this.isMatrix = true; + this.sampleMatrix = matrix; + this.nrows = this.sampleMatrix.getNumRows(); + this.ncols = this.sampleMatrix.getNumColumns(); + this.runMapping(); + 
} + + public ReaderMapping2(String raw, FrameBlock frame) throws Exception { + this.ReadRaw(raw); + this.isMatrix = false; + this.sampleFrame = frame; + this.nrows = this.sampleFrame.getNumRows(); + this.ncols = this.sampleFrame.getNumColumns(); + this.schema = this.sampleFrame.getSchema(); + this.runMapping(); + } + + private void ReadRaw(String raw) throws Exception { + this.sampleRawIndexes = new ArrayList<>(); + InputStream is = IOUtilFunctions.toInputStream(raw); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String value; + int nlines = 0; + + while((value = br.readLine()) != null) { + this.sampleRawIndexes.add(new RawIndex(value)); + nlines++; + } + this.nlines = nlines; + this.firstColIndex = 0; + this.firstRowIndex = 0; + } + + private boolean isSchemaNumeric() { + if(isMatrix) + return true; + + boolean result = true; + for(Types.ValueType vt : schema) + result &= vt.isNumeric(); + return result; + } + + private void runMapping() throws Exception { + mapped = findMapping(); + } + + protected boolean findMapping() { + mapRow = new int[nrows][ncols]; + mapCol = new int[nrows][ncols]; + + // Set "-1" as default value for all defined matrix + for(int r = 0; r < nrows; r++) + for(int c = 0; c < ncols; c++) + mapRow[r][c] = mapCol[r][c] = -1; + + int itRow = 0; + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + if((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( + r, c) != null)) { + HashSet checkedLines = new HashSet<>(); + while(checkedLines.size() < nlines) { + RawIndex ri = sampleRawIndexes.get(itRow); + Pair pair = this.isMatrix ? 
ri.findValue( + sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); + if(pair != null) { + mapRow[r][c] = itRow; + mapCol[r][c] = pair.getKey(); + break; + } + else { + checkedLines.add(itRow); + itRow++; + if(itRow == nlines) + itRow = 0; + } + } + } + } + } + boolean flagMap = true; + for(int r = 0; r < nrows && flagMap; r++) + for(int c = 0; c < ncols && flagMap; c++) + if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, + c) != null) || (this.isMatrix && this.sampleMatrix.getValue(r, c) != 0))) { + flagMap = false; + } + return flagMap; + } + + public CustomProperties2 getFormatProperties() { + CustomProperties2 properties = new CustomProperties2(); + + // Find Row Index Properties + // 1. is row index identified? + boolean rowIndexIdentify = isRowIndexIdentify(); + if(!rowIndexIdentify) { + Pair pair = isRowIndexPrefix(); + if(pair==null){ + + } + else { + properties.setRowIndex(CustomProperties2.IndexProperties.PREFIX); + properties.setRowIndexPrefixDelim(pair.getKey()); + properties.setRowIndexPrefixDelimFixLength(pair.getValue()); + } + } + else + properties.setRowIndex(CustomProperties2.IndexProperties.IDENTIFY); + + return properties; + } + + private boolean isRowIndexIdentify() { + int l = 0; + ArrayList> mismatched = new ArrayList<>(); + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1 && l != mapRow[r][c]) { + mismatched.add(new Pair<>(r, c)); + } + } + l++; + } + // All rows of sample raw not used + if(l != nlines) { + return false; + } + else if(mismatched.size() > 0) { + return false; + } + return true; + } + + private Pair isRowIndexPrefix() { + + ArrayList> mismatched = new ArrayList<>(); + ArrayList> prefixes = new ArrayList<>(); + ArrayList> nonePrefix = new ArrayList<>(); + DelimiterTrie delimiterTrie = new DelimiterTrie(); + + int delimiterMinSize = 0; + + for(int r = 0; r < nrows; r++) { + RawIndex ri = sampleRawIndexes.get(r); + 
ri.cloneReservedPositions(); + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1) { + Pair pair = ri.findValue(r); + if(pair == null) + mismatched.add(new Pair<>(r, c)); + else { + if(pair.getKey() < mapCol[r][c]) { + String delim = ri.getSubString(pair.getKey() + pair.getValue(), mapCol[r][c]); + int delimLength = delim.length(); + if(delimiterMinSize != 0 && delimLength < delimiterMinSize) + delimiterMinSize = delimLength; + else + delimiterMinSize = delimLength; + + delimiterTrie.insert(delim); + prefixes.add(pair); + } + else + nonePrefix.add(pair); + } + } + } + //ri.restoreReservedPositions(); + } + // TODO: attend to mistakes and none-prefix row index maps + + return delimiterTrie.getShortestDelim(delimiterMinSize); + } + + class DelimiterTrie { + private final StringBuilder totalDelim; + private int totalDelimLength; + private boolean valid; + + public DelimiterTrie() { + totalDelim = new StringBuilder(); + totalDelimLength = 0; + valid = true; + } + + public boolean insert(String delim) { + if(delim.length() > totalDelimLength) { + if(delim.startsWith(totalDelim.toString())) { + totalDelim.append(delim.substring(totalDelimLength)); + totalDelimLength += delim.length() - totalDelimLength; + } + else + valid = false; + } + else if(!totalDelim.toString().startsWith(delim)) + valid = false; + return valid; + } + + public Pair getShortestDelim(int minsize) { + if(!valid) + return null; + + if(minsize == totalDelimLength) + return new Pair(totalDelim.toString(), true); + else { + HashSet delimSet = new HashSet<>(); + for(int i = 1; i <= minsize; i++) { + delimSet.clear(); + for(int j = 0; j < totalDelimLength; j += i) { + delimSet.add(totalDelim.substring(j, Math.min(j + i, totalDelimLength))); + } + if(delimSet.size() == 1) + break; + } + if(delimSet.size() == 1) { + String delim = delimSet.iterator().next(); + return new Pair(delim, delim.length() == totalDelimLength); + } + else + return null; + } + } + + public void print() { + 
System.out.println(totalDelim); + } + } + + + public boolean isMapped() { + return mapped; + } +} From 6e8b6498b90f70a7163befde1f8349aefb05c3c9 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 7 Jan 2022 14:32:30 +0100 Subject: [PATCH 02/84] Start change to new CodeGen --- pom.xml | 2 +- .../iogen/ColumnIdentifyProperties.java | 60 ++ .../sysds/runtime/iogen/CustomProperties.java | 111 +- .../runtime/iogen/CustomProperties2.java | 88 -- .../sysds/runtime/iogen/GenerateReader.java | 6 +- .../apache/sysds/runtime/iogen/RawIndex.java | 4 + .../sysds/runtime/iogen/ReaderMapping.java | 966 +++++------------- .../sysds/runtime/iogen/ReaderMapping2.java | 293 ------ .../iogen/GenerateReaderMatrixTest.java | 17 +- .../Identify/MatrixGRRowColIdentifyTest.java | 83 ++ .../iogen/MatrixGenerateReaderCSVTest.java | 4 +- .../MatrixGenerateReaderMatrixMarketTest.java | 2 +- 12 files changed, 445 insertions(+), 1191 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java diff --git a/pom.xml b/pom.xml index 51f38c2b8fd..97541ce5169 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ true ** false - -Xms3000m -Xmx3000m -Xmn300m + -Xms3000m -Xmx9000m -Xmn300m false diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java new file mode 100644 index 00000000000..396c0d9fec6 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +public class ColumnIdentifyProperties { + + private String indexPositionDelimiter; + private Integer indexPosition; + private String valueEndWithString; + + public ColumnIdentifyProperties() { + } + + public ColumnIdentifyProperties(String indexPositionDelimiter, Integer indexPosition, String valueEndWithString) { + this.indexPositionDelimiter = indexPositionDelimiter; + this.indexPosition = indexPosition; + this.valueEndWithString = valueEndWithString; + } + + public String getIndexPositionDelimiter() { + return indexPositionDelimiter; + } + + public void setIndexPositionDelimiter(String indexPositionDelimiter) { + this.indexPositionDelimiter = indexPositionDelimiter; + } + + public Integer getIndexPosition() { + return indexPosition; + } + + public void setIndexPosition(Integer indexPosition) { + this.indexPosition = indexPosition; + } + + public String getValueEndWithString() { + return valueEndWithString; + } + + public void setValueEndWithString(String valueEndWithString) { + this.valueEndWithString = valueEndWithString; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 05c92226cd6..751e7e1e1fa 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -24,112 +24,73 @@ import org.apache.sysds.runtime.io.FileFormatProperties; import java.io.Serializable; -import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { - protected static final Log LOG = LogFactory.getLog(CustomProperties.class.getName()); + private static final Log LOG = LogFactory.getLog(CustomProperties.class.getName()); private static final long serialVersionUID = -4447926749068752721L; - private String delim; - private String indexDelim; - private HashSet naStrings; - private int firstColIndex; - private int firstRowIndex; + public enum IndexProperties { + IDENTIFY, PREFIX, KEY; - protected enum GRPattern { - Regular, Irregular; - - @Override - public String toString() { - return this.name().toLowerCase(); - } - } - - protected enum GRSymmetry { - GENERAL, SYMMETRIC, SKEW_SYMMETRIC; - - @Override - public String toString() { + @Override public String toString() { return this.name().toLowerCase().replaceAll("_", "-"); } } - private GRPattern rowPattern; - private GRPattern colPattern; - private GRSymmetry grSymmetry; + private IndexProperties rowIndex; + private IndexProperties colIndex; - public CustomProperties() { - } - - // Row & Col Regular Format - public CustomProperties(GRPattern rowPattern, String delim, HashSet naStrings) { - this.delim = delim; - this.naStrings = naStrings; - this.rowPattern = rowPattern; - this.colPattern = GRPattern.Regular; - this.grSymmetry = GRSymmetry.GENERAL; - this.firstRowIndex = 0; - this.firstColIndex = 0; - } + // When the Row and Column Index are identify + private ColumnIdentifyProperties[] columnIdentifyProperties; - // Row Regular & Col Irregular Format - public CustomProperties(GRPattern rowPattern, String delim, String indexDelim, int firstColIndex) { - this.delim = delim; - this.indexDelim = indexDelim; - 
this.rowPattern = rowPattern; - this.colPattern = GRPattern.Irregular; - this.grSymmetry = GRSymmetry.GENERAL; - this.firstColIndex = firstColIndex; - this.firstRowIndex = 0; - } + // When the index is prefixes + private Integer rowIndexPrefixPosition; + private String rowIndexPrefixDelim; + private Boolean rowIndexPrefixDelimFixLength; - // Row Irregular format - public CustomProperties(GRSymmetry grSymmetry, String delim, int firstRowIndex, int firstColIndex) { - this.delim = delim; - this.grSymmetry = grSymmetry; - this.colPattern = GRPattern.Regular; - this.rowPattern = GRPattern.Irregular; - this.firstColIndex = firstColIndex; - this.firstRowIndex = firstRowIndex; + public void setRowColIdentifyProperties(ColumnIdentifyProperties[] columnIdentifyProperties) { + this.columnIdentifyProperties = columnIdentifyProperties; + this.rowIndex = IndexProperties.IDENTIFY; + this.colIndex = IndexProperties.IDENTIFY; } - public String getDelim() { - return delim; + public IndexProperties getRowIndex() { + return rowIndex; } - public String getIndexDelim() { - return indexDelim; + public void setRowIndex(IndexProperties rowIndex) { + this.rowIndex = rowIndex; } - public HashSet getNaStrings() { - return naStrings; + public IndexProperties getColIndex() { + return colIndex; } - public GRPattern getRowPattern() { - return rowPattern; + public void setColIndex(IndexProperties colIndex) { + this.colIndex = colIndex; } - public GRPattern getColPattern() { - return colPattern; + public Integer getRowIndexPrefixPosition() { + return rowIndexPrefixPosition; } - public GRSymmetry getGrSymmetry() { - return grSymmetry; + public void setRowIndexPrefixPosition(Integer rowIndexPrefixPosition) { + this.rowIndexPrefixPosition = rowIndexPrefixPosition; } - public int getFirstColIndex() { - return firstColIndex; + public String getRowIndexPrefixDelim() { + return rowIndexPrefixDelim; } - public void setFirstColIndex(int firstColIndex) { - this.firstColIndex = firstColIndex; + public void 
setRowIndexPrefixDelim(String rowIndexPrefixDelim) { + this.rowIndexPrefixDelim = rowIndexPrefixDelim; } - public int getFirstRowIndex() { - return firstRowIndex; + public Boolean getRowIndexPrefixDelimFixLength() { + return rowIndexPrefixDelimFixLength; } - public void setFirstRowIndex(int firstRowIndex) { - this.firstRowIndex = firstRowIndex; + public void setRowIndexPrefixDelimFixLength(Boolean rowIndexPrefixDelimFixLength) { + this.rowIndexPrefixDelimFixLength = rowIndexPrefixDelimFixLength; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java deleted file mode 100644 index f2d9835a107..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties2.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.sysds.runtime.io.FileFormatProperties; - -import java.io.Serializable; -import java.util.HashSet; - -public class CustomProperties2 extends FileFormatProperties implements Serializable { - private static final Log LOG = LogFactory.getLog(CustomProperties2.class.getName()); - private static final long serialVersionUID = -4447926749068752721L; - - public enum IndexProperties { - IDENTIFY, PREFIX, KEY; - @Override - public String toString() { - return this.name().toLowerCase().replaceAll("_", "-"); - } - } - - private IndexProperties rowIndex; - private IndexProperties colIndex; - - // When the index is prefixes - private Integer rowIndexPrefixPosition; - private String rowIndexPrefixDelim; - private Boolean rowIndexPrefixDelimFixLength; - - public IndexProperties getRowIndex() { - return rowIndex; - } - - public void setRowIndex(IndexProperties rowIndex) { - this.rowIndex = rowIndex; - } - - public IndexProperties getColIndex() { - return colIndex; - } - - public void setColIndex(IndexProperties colIndex) { - this.colIndex = colIndex; - } - - public Integer getRowIndexPrefixPosition() { - return rowIndexPrefixPosition; - } - - public void setRowIndexPrefixPosition(Integer rowIndexPrefixPosition) { - this.rowIndexPrefixPosition = rowIndexPrefixPosition; - } - - public String getRowIndexPrefixDelim() { - return rowIndexPrefixDelim; - } - - public void setRowIndexPrefixDelim(String rowIndexPrefixDelim) { - this.rowIndexPrefixDelim = rowIndexPrefixDelim; - } - - public Boolean getRowIndexPrefixDelimFixLength() { - return rowIndexPrefixDelimFixLength; - } - - public void setRowIndexPrefixDelimFixLength(Boolean rowIndexPrefixDelimFixLength) { - this.rowIndexPrefixDelimFixLength = rowIndexPrefixDelimFixLength; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java 
b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 0e9b1cd7b21..f97a0816f34 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -45,9 +45,9 @@ public abstract class GenerateReader { public GenerateReader(SampleProperties sampleProperties) throws Exception { - readerMapping = sampleProperties.getDataType().isMatrix() ? new ReaderMapping.MatrixReaderMapping( - sampleProperties.getSampleRaw(), sampleProperties.getSampleMatrix()) : new ReaderMapping.FrameReaderMapping( - sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); + readerMapping = sampleProperties.getDataType().isMatrix() ? new ReaderMapping(sampleProperties.getSampleRaw(), + sampleProperties.getSampleMatrix()) : new ReaderMapping(sampleProperties.getSampleRaw(), + sampleProperties.getSampleFrame()); } // Generate Reader for Matrix diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index bec10a34120..73eba1f3cb4 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -340,6 +340,10 @@ public String getSubString(int start, int end){ return raw.substring(start, end); } + public int getRawLength() { + return rawLength; + } + public static void main(String[] args) { String s = "123dddd56"; RawIndex ni = new RawIndex(s); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index c227c522ed3..b25d165f69f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -28,275 +28,113 @@ import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Serializable; import java.util.ArrayList; -import 
java.util.Collections; -import java.util.HashMap; import java.util.HashSet; -public abstract class ReaderMapping { - - protected int[][] mapRow; - protected int[][] mapCol; - protected boolean symmetric; - protected boolean skewSymmetric; - protected boolean isUpperTriangular; - protected int skewCoefficient; - protected final ArrayList sampleRawRows; - - protected boolean mapped; - protected static int nrows; - protected static int ncols; - protected final int nlines; - protected int firstRowIndex; - protected int firstColIndex; +public class ReaderMapping { + + private int[][] mapRow; + private int[][] mapCol; + private int[][] mapLen; + private boolean symmetric; + private boolean skewSymmetric; + private boolean isUpperTriangular; + private int skewCoefficient; + private ArrayList sampleRawIndexes; + + private boolean mapped; + private static int nrows; + private static int ncols; + private int nlines; + private int firstRowIndex; + private int firstColIndex; + + private MatrixBlock sampleMatrix; + private FrameBlock sampleFrame; + private Types.ValueType[] schema; + private final boolean isMatrix; + + public ReaderMapping(String raw, MatrixBlock matrix) throws Exception { + this.ReadRaw(raw); + this.isMatrix = true; + this.sampleMatrix = matrix; + this.nrows = this.sampleMatrix.getNumRows(); + this.ncols = this.sampleMatrix.getNumColumns(); + this.runMapping(); + } - protected ValueTrimFormat[][] VTF; - protected ValueTrimFormat[][] VTFClone = null; + public ReaderMapping(String raw, FrameBlock frame) throws Exception { + this.ReadRaw(raw); + this.isMatrix = false; + this.sampleFrame = frame; + this.nrows = this.sampleFrame.getNumRows(); + this.ncols = this.sampleFrame.getNumColumns(); + this.schema = this.sampleFrame.getSchema(); + this.runMapping(); + } - public ReaderMapping(String raw) throws Exception { + private void ReadRaw(String raw) throws Exception { + this.sampleRawIndexes = new ArrayList<>(); InputStream is = IOUtilFunctions.toInputStream(raw); 
BufferedReader br = new BufferedReader(new InputStreamReader(is)); String value; int nlines = 0; - sampleRawRows = new ArrayList<>(); + while((value = br.readLine()) != null) { - sampleRawRows.add(new RawRow(value)); + this.sampleRawIndexes.add(new RawIndex(value)); nlines++; } this.nlines = nlines; - firstColIndex = 0; - firstRowIndex = 0; - } - - protected abstract boolean isSchemaNumeric(); - - protected void cloneSample() { - if(VTFClone == null) { - VTFClone = new ValueTrimFormat[nrows][ncols]; - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) - VTFClone[r][c] = VTF[r][c].getACopy(); - } - } - - protected void retrieveSample() { - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) - VTF[r][c] = VTFClone[r][c].getACopy(); + this.firstColIndex = 0; + this.firstRowIndex = 0; } - protected void transferSampleTriangular(boolean isUpper) throws Exception { - if(nrows != ncols) - throw new Exception("For upper triangular both Row and Col should be same!"); - - for(int r = 0; r < nrows; r++) { - if(isUpper) { - for(int c = 0; c < r; c++) { - VTF[r][c].setNoSet(); - } - } - else { - for(int c = r + 1; c < ncols; c++) { - VTF[r][c].setNoSet(); - } - } - } - } - - protected void transferSampleSkew(int coefficient) throws Exception { - if(coefficient != 1 && coefficient != -1) - throw new Exception("The value of Coefficient have to be 1 or -1!"); - - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) { - if(!VTF[r][c].isNotSet() && VTF[r][c].getValueType().isNumeric()) - VTF[r][c] = new ValueTrimFormat(VTF[r][c].getColIndex(), VTF[r][c].getValueType(), - VTF[r][c].getDoubleActualValue() * coefficient); - } - } - - protected abstract ValueTrimFormat[][] convertSampleTOValueTrimFormat(); - - // Matrix Reader Mapping - public static class MatrixReaderMapping extends ReaderMapping { - - private MatrixBlock sampleMatrix; - - public MatrixReaderMapping(String raw, MatrixBlock matrix) throws Exception { - super(raw); - this.sampleMatrix 
= matrix; - nrows = sampleMatrix.getNumRows(); - ncols = sampleMatrix.getNumColumns(); - VTF = convertSampleTOValueTrimFormat(); - runMapping(); - } - - // Convert: convert each value of a sample matrix to NumberTrimFormat - @Override - protected ValueTrimFormat[][] convertSampleTOValueTrimFormat() { - ValueTrimFormat[][] result = new ValueTrimFormat[nrows][ncols]; - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) { - result[r][c] = new ValueTrimFormat(c, Types.ValueType.FP64, sampleMatrix.getValue(r, c)); - } - return result; - } - - @Override - protected boolean isSchemaNumeric() { + private boolean isSchemaNumeric() { + if(isMatrix) return true; - } + boolean result = true; + for(Types.ValueType vt : schema) + result &= vt.isNumeric(); + return result; } - // Frame Reader Mapping - public static class FrameReaderMapping extends ReaderMapping { - - private FrameBlock sampleFrame; - private Types.ValueType[] schema; - - public FrameReaderMapping(String raw, FrameBlock frame) throws Exception { - super(raw); - this.sampleFrame = frame; - nrows = sampleFrame.getNumRows(); - ncols = sampleFrame.getNumColumns(); - schema = sampleFrame.getSchema(); - VTF = convertSampleTOValueTrimFormat(); - //TODO: set NNZ for Frame !!?? 
- runMapping(); - } - - // Convert: convert each value of a sample Frame to ValueTrimFormat(Number, String, and Boolean) - @Override - protected ValueTrimFormat[][] convertSampleTOValueTrimFormat() { - ValueTrimFormat[][] result = new ValueTrimFormat[nrows][ncols]; - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) { - result[r][c] = new ValueTrimFormat(c, schema[c], sampleFrame.get(r, c)); - } - return result; - } - - @Override - protected boolean isSchemaNumeric() { - boolean result = true; - for(Types.ValueType vt : schema) - result &= vt.isNumeric(); - return result; - } - } - - public void runMapping() throws Exception { - + private void runMapping() throws Exception { mapped = findMapping(); - boolean schemaNumeric = isSchemaNumeric(); - if(!mapped) { - // Clone Sample Matrix/Frame - cloneSample(); - - // Symmetric and Skew-Symmetric check: - symmetric = nrows == ncols; - skewSymmetric = nrows == ncols && schemaNumeric; - - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(symmetric) - symmetric = VTF[r][c].isEqual(VTF[c][r]); - - if(skewSymmetric) { - if(r != c) - skewSymmetric = VTF[r][c].getDoubleActualValue() == VTF[c][r].getDoubleActualValue() * -1; - else - skewSymmetric = VTF[r][c].isNotSet(); - } - } - } - - boolean isRR = isRowRegular(); - if(symmetric) { - // Lower Triangular - isUpperTriangular = false; - transferSampleTriangular(isUpperTriangular); - mapped = isRR ? findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - - // Upper Triangular - if(!mapped) { - isUpperTriangular = true; - retrieveSample(); - transferSampleTriangular(isUpperTriangular); - mapped = isRR ? findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - } - } - // Skew-Symmetric check: - else if(skewSymmetric) { - // Lower Triangular - isUpperTriangular = false; - transferSampleTriangular(isUpperTriangular); - mapped = isRR ? 
findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - - // Lower Triangular Skew - if(!mapped) { - skewCoefficient = -1; - transferSampleSkew(skewCoefficient); - mapped = isRR ? findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - } - - // Upper Triangular - if(!mapped) { - isUpperTriangular = true; - skewCoefficient = 1; - retrieveSample(); - transferSampleTriangular(isUpperTriangular); - mapped = isRR ? findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - } - // Upper Triangular Skew - if(!mapped) { - skewCoefficient = -1; - transferSampleSkew(skewCoefficient); - mapped = isRR ? findMapping() : findMapping() && verifyRISymmetricMapping(isUpperTriangular); - } - } - } } protected boolean findMapping() { mapRow = new int[nrows][ncols]; mapCol = new int[nrows][ncols]; + mapLen = new int[nrows][ncols]; // Set "-1" as default value for all defined matrix for(int r = 0; r < nrows; r++) for(int c = 0; c < ncols; c++) - mapRow[r][c] = mapCol[r][c] = -1; + mapRow[r][c] = mapCol[r][c] = mapLen[r][c] = -1; - for(int i = 0; i < nlines; i++) { - sampleRawRows.get(i).resetReserved(); - } int itRow = 0; for(int r = 0; r < nrows; r++) { - ArrayList vtfRow = new ArrayList<>(); - for(int i = 0; i < ncols; i++) { - if(!VTF[r][i].isNotSet()) - vtfRow.add(VTF[r][i]); - } - Collections.sort(vtfRow); - - for(ValueTrimFormat vtf : vtfRow) { - int c = vtf.getColIndex(); - HashSet checkedLines = new HashSet<>(); - while(checkedLines.size() < nlines) { - RawRow row = sampleRawRows.get(itRow); - Pair mi = row.findValue(vtf, false); - if(mi.getKey() != -1) { - mapRow[r][c] = itRow; - mapCol[r][c] = mi.getKey(); - break; - } - else { - checkedLines.add(itRow); - itRow++; - if(itRow == nlines) - itRow = 0; + for(int c = 0; c < ncols; c++) { + if((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( + r, c) != null)) { + HashSet checkedLines = new HashSet<>(); + 
while(checkedLines.size() < nlines) { + RawIndex ri = sampleRawIndexes.get(itRow); + Pair pair = this.isMatrix ? ri.findValue( + sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); + if(pair != null) { + mapRow[r][c] = itRow; + mapCol[r][c] = pair.getKey(); + mapLen[r][c] = pair.getValue(); + break; + } + else { + checkedLines.add(itRow); + itRow++; + if(itRow == nlines) + itRow = 0; + } } } } @@ -304,554 +142,236 @@ protected boolean findMapping() { boolean flagMap = true; for(int r = 0; r < nrows && flagMap; r++) for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && !VTF[r][c].isNotSet()) { + if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, + c) != null) || (this.isMatrix && this.sampleMatrix.getValue(r, c) != 0))) { flagMap = false; } return flagMap; } - private boolean verifyRISymmetricMapping(boolean upperTriangular) { - - boolean result = false; - int[] rowIndex = {0, 1, 0, 1}; - int[] colIndex = {0, 1, 1, 0}; - for(int i = 0; i < rowIndex.length && !result; i++) { - result = verifyRISymmetricMapping(upperTriangular, rowIndex[i], colIndex[i]); - if(result) { - firstRowIndex = rowIndex[i]; - firstColIndex = colIndex[i]; - } - } - return result; - } + public CustomProperties getFormatProperties() { + CustomProperties properties = new CustomProperties(); - private boolean verifyRISymmetricMapping(boolean upperTriangular, int firstRowIndex, int firstColIndex) { + boolean rowIndexIdentify = isRowIndexIdentify(); + ColumnIdentifyProperties[] colIndexIdentify = isColumnIndexIdentify(); + if(rowIndexIdentify && colIndexIdentify!=null) + properties.setRowColIdentifyProperties(colIndexIdentify); - HashSet checkedRow = new HashSet<>(); - boolean rcvMapped = true; - int selectedIndex; + return properties; + } - for(int r = nrows - 2; r >= 0 && rcvMapped; r--) { - selectedIndex = upperTriangular ? 
Math.min(r + 1, nrows - 1) : Math.max(r - 1, 0); - if(r == selectedIndex) - break; - int lindeIndex = 0; - rcvMapped = false; - do { - if(checkedRow.contains(lindeIndex) || VTF[r][selectedIndex].isNotSet()) - continue; - RawRow row = sampleRawRows.get(lindeIndex).getResetClone(); - if(isMapRowColValue(row, r + firstRowIndex, selectedIndex + firstColIndex, VTF[r][selectedIndex])) { - checkedRow.add(lindeIndex); - rcvMapped = true; + // Row Index is Identifies, when the sample row index equal to sample Matrix/Frame row index + private boolean isRowIndexIdentify() { + int l = 0; + ArrayList> mismatched = new ArrayList<>(); + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1 && l != mapRow[r][c]) { + mismatched.add(new Pair<>(r, c)); } } - while(++lindeIndex < nlines && !rcvMapped); + l++; } - return rcvMapped; - } - - public final CustomProperties getFormatProperties() throws Exception { - CustomProperties ffp; - if(isRowRegular()) { - ffp = getFileFormatPropertiesOfRRCRMapping(); - if(ffp == null) { - ffp = getFileFormatPropertiesOfRRCIMapping(); - } + // All rows of sample raw not used + if(l != nlines) { + return false; } - else { - ffp = getFileFormatPropertiesOfRIMapping(); + else if(mismatched.size() > 0) { + return false; } - return ffp; + return true; } - public final boolean isRowRegular() { - int nrows = mapRow.length; - int ncols = mapRow[0].length; - boolean result = true; - int rValue = -1; + // Column Index is identifies, when the logical char position of the sample raw on a line + // equal to a column index in sample Matrix/Frame + private ColumnIdentifyProperties[] isColumnIndexIdentify() { + ColumnIdentifyProperties[] result = new ColumnIdentifyProperties[ncols]; for(int c = 0; c < ncols; c++) { - if(mapRow[0][c] != -1) { - rValue = mapRow[0][c]; - break; - } - } - - for(int r = 0; r < nrows && result; r++) { - for(int c = 0; c < ncols && result; c++) { - if(mapRow[r][c] != -1 && mapRow[r][c] != rValue + r) { 
- result = false; - } + Pair pair = getLogicalPositionOfAColumn(c); + if(pair == null) + return null; + else { + String endDelimiterOfAColumn = getEndDelimiterOfAColumn(c); + if(endDelimiterOfAColumn != null) + result[c] = new ColumnIdentifyProperties(pair.getKey(), pair.getValue(), endDelimiterOfAColumn); + else + return null; } } return result; } - /* Get delimiters between two indexes. - Row String: 1,2,3,4,5 - Sample Matrix: [1 2 3 4 5 ] - Map Col: [0 2 4 6 8 ] - result: ["," "," "," "," ","] - */ - public final CustomProperties getFileFormatPropertiesOfRRCRMapping() { - - ArrayList rowDelims = new ArrayList<>(); - HashSet naString = new HashSet<>(); - String stringToken = null; - - // append all delimiters as a string and then tokenize it + private Pair getLogicalPositionOfAColumn(int colIndex) { + ArrayList tokens = new ArrayList<>(); + int minPos = mapCol[0][colIndex]; + int maxPos = minPos; + int colPos; + int rowIndex; for(int r = 0; r < nrows; r++) { - RawRow rr = sampleRawRows.get(r); - Pair pair = rr.getDelims(); - rowDelims.add(pair.getValue()); - if(stringToken == null || (pair.getKey().length() > 0 && stringToken.length() > pair.getKey().length())) - stringToken = pair.getKey(); - } - if(stringToken.length() == 0) - stringToken = rowDelims.get(0); - String uniqueDelimiter = null; - StringBuilder token = new StringBuilder(); - - FastStringTokenizer fastStringTokenizer; - - for(Character ch : stringToken.toCharArray()) { - token.append(ch); - boolean flagCurrToken = true; - HashSet ns = new HashSet<>(); - fastStringTokenizer = new FastStringTokenizer(token.toString()); - for(int r = 0; r < nrows; r++) { - String row = rowDelims.get(r); - fastStringTokenizer.reset(row); - ArrayList delimsOfToken = fastStringTokenizer.getTokens(); - - // remove numeric NA Strings - // This case can appear in Frame DataType - for(String s : delimsOfToken) { - try { - Double.parseDouble(s); - } - catch(Exception ex) { - ns.add(s); - } - } - if(fastStringTokenizer._count != 
ncols - 1) { - flagCurrToken = false; - break; - } - } - if(flagCurrToken) { - uniqueDelimiter = token.toString(); - naString = ns; + colPos = mapCol[r][colIndex]; + rowIndex = mapRow[r][colIndex]; + if(colPos != -1) { + tokens.add(sampleRawIndexes.get(rowIndex).getSubString(0, colPos)); + minPos = Math.min(minPos, colPos); + maxPos = Math.max(maxPos, colPos); } } - if(uniqueDelimiter != null) { - CustomProperties ffpgr = new CustomProperties(CustomProperties.GRPattern.Regular, uniqueDelimiter, - naString); - ffpgr.setDescription("CSV Format Recognized"); - return ffpgr; + if(maxPos == 0 && minPos == 0) { + return new Pair("", 0); } - else - return null; - } - - private static class FastStringTokenizer implements Serializable { - private static final long serialVersionUID = -4698672725609750097L; - private String _string = null; - private String _del = ""; - private int _pos = -1; - private int _count = 0; - public FastStringTokenizer(String delimiter) { - _del = delimiter; - reset(null); - } - - public void reset(String string) { - _string = string; - _pos = 0; - _count = 0; - } - - private String nextToken() { - int len = _string.length(); - int start = _pos; - - //find start (skip over leading delimiters) - while(start != -1 && start < len && _del - .equals(_string.substring(start, Math.min(start + _del.length(), _string.length())))) { - start += _del.length(); - _count++; + String delimCandidate = null; + int delimCandidateCont = 0; + for(int tl = 1; tl < minPos; tl++) { + String token = tokens.get(0); + String delim = token.substring(token.length() - tl); + int xCount = getDuplicateSubstringCountString(tokens.get(0), delim); + int yCount = xCount; + for(int i = 1; i < tokens.size() && xCount == yCount; i++) { + yCount = getDuplicateSubstringCountString(tokens.get(i), delim); } - - //find end (next delimiter) and return - if(start < len && start != -1) { - _pos = _string.indexOf(_del, start); - if(start < _pos && _pos < len) { - return _string.substring(start, 
_pos); - } - else - return _string.substring(start); + if(xCount == yCount) { + delimCandidate = delim; + delimCandidateCont = xCount; } - //no next token - return null; + else + break; } + if(delimCandidate != null) + return new Pair<>(delimCandidate, delimCandidateCont); + return null; - public ArrayList getTokens() { - ArrayList tokens = new ArrayList<>(); - tokens.add(""); - String token; - do { - token = nextToken(); - if(token != null) { - tokens.add(token); - } - } - while(token != null); - return tokens; - } } - private CustomProperties getFileFormatPropertiesOfRIMapping() { - - int[] rowIndex = {0, 1, 0, 1}; - int[] colIndex = {0, 1, 1, 0}; - CustomProperties ffp = null; - for(int i = 0; i < rowIndex.length && ffp == null; i++) { - ffp = getDelimsOfMapping(rowIndex[i], colIndex[i]); - if(ffp != null) { - firstRowIndex = rowIndex[i]; - firstColIndex = colIndex[i]; + private String getEndDelimiterOfAColumn(int colIndex) { + HashSet tokens = new HashSet<>(); + int colEnd; + int colPos; + int rowIndex; + for(int r = 0; r < nrows; r++) { + rowIndex = mapRow[r][colIndex]; + colPos = mapCol[rowIndex][colIndex]; + RawIndex ri = sampleRawIndexes.get(r); + if(colPos != -1) { + colEnd = colPos + mapLen[r][colIndex]; + String endStr = ri.getSubString(colEnd, Math.min(ri.getRawLength(), colEnd + 1)); + tokens.add(endStr); } } - - if(ffp != null) { - ffp.setFirstColIndex(firstColIndex); - ffp.setFirstRowIndex(firstRowIndex); - ffp.setDescription( - "Market Matrix Format Recognized: FirstRowIndex: " + firstRowIndex + " and FirstColIndex: " + firstColIndex); - } - return ffp; + if(tokens.size() == 1) + return tokens.iterator().next(); + else + return null; } - private CustomProperties getDelimsOfMapping(int firstRowIndex, int firstColIndex) { - - //HashSet checkedRow = new HashSet<>(); - HashSet delims = new HashSet<>(); - int minDelimLength = -1; - boolean rcvMapped = false; - int selectedRowIndex = nrows - 2; - int selectedColIndex = ncols - 1; - // select maximum none 
zero col index - for(int c = ncols - 1; c >= 0; c--) { - if(!VTF[selectedRowIndex][c].isNotSet()) { - selectedColIndex = c; - break; - } - } - int lindeIndex = 0; + private int getDuplicateSubstringCountString(String source, String str) { + int count = 0; + int index = 0; do { - RawRow row = sampleRawRows.get(lindeIndex).getResetClone(); - if(isMapRowColValue(row, selectedRowIndex + firstRowIndex, selectedColIndex + firstColIndex, - VTF[selectedRowIndex][selectedColIndex])) { - rcvMapped = true; - - Pair, Integer> pair = row.getDelimsSet(); - delims.addAll(pair.getKey()); - minDelimLength = minDelimLength == -1 ? pair.getValue() : Math.min(minDelimLength, pair.getValue()); - } - } - while(++lindeIndex < nlines && !rcvMapped); - - if(!rcvMapped) { - return null; - } - else { - - String uniqueDelim = null; - for(int l = 1; l < minDelimLength + 1; l++) { - boolean flagToken = true; - HashSet token = new HashSet<>(); - for(String delim : delims) { - if(delim.length() % l != 0) { - flagToken = false; - break; - } - for(int i = 0; i <= delim.length() - l; i += l) - token.add(delim.substring(i, i + l)); - if(token.size() > 1) { - flagToken = false; - break; - } - } - if(flagToken) { - if(token.size() > 0) - uniqueDelim = token.iterator().next(); - break; - } + index = source.indexOf(str, index); + if(index != -1) { + count++; + index += str.length(); } - - if(uniqueDelim != null) { - CustomProperties.GRSymmetry symmetry; - if(symmetric) - symmetry = CustomProperties.GRSymmetry.SYMMETRIC; - else if(skewSymmetric) - symmetry = CustomProperties.GRSymmetry.SKEW_SYMMETRIC; - else - symmetry = CustomProperties.GRSymmetry.GENERAL; - - return new CustomProperties(symmetry, uniqueDelim, firstRowIndex, firstColIndex); - } - else - return null; } + while(index != -1); + return count; } - public CustomProperties getFileFormatPropertiesOfRRCIMapping() { + private Pair isRowIndexPrefix() { - CustomProperties ffplibsvm; - int firstColIndex = 0; - - // FirstColIndex = 0 - ffplibsvm = 
getDelimsOfRRCIMapping(firstColIndex); - - // FirstColIndex = 1 - if(ffplibsvm == null) { - firstColIndex = 1; - ffplibsvm = getDelimsOfRRCIMapping(firstColIndex); - } + ArrayList> mismatched = new ArrayList<>(); + ArrayList> prefixes = new ArrayList<>(); + ArrayList> nonePrefix = new ArrayList<>(); + DelimiterTrie delimiterTrie = new DelimiterTrie(); - if(ffplibsvm != null) { - ffplibsvm.setDescription("LibSVM Format Recognized: First Index Started From " + firstColIndex); - ffplibsvm.setFirstColIndex(firstColIndex); - } - return ffplibsvm; - } + int delimiterMinSize = 0; - private CustomProperties getDelimsOfRRCIMapping(int firstColIndex) { - HashMap> tokens = new HashMap<>(); - HashSet allTokens = new HashSet<>(); - int maxNNZCount = 0; - int selectedRowIndex = 0; for(int r = 0; r < nrows; r++) { - int rnnz = 0; - for(int c = 0; c < ncols; c++) - if(!VTF[r][c].isNotSet()) - rnnz++; - if(maxNNZCount < rnnz) { - maxNNZCount = rnnz; - selectedRowIndex = r; - } - } - - RawRow row = sampleRawRows.get(selectedRowIndex); - // For find index delimiter, we need to find all possible "Index Delim Value" tokens - for(int c = ncols - 1; c >= 0; c--) { - ValueTrimFormat v = VTF[selectedRowIndex][c]; - if(v.isNotSet()) - continue; - - String key = (c + firstColIndex) + "," + v.getStringOfActualValue(); - HashSet token = tokens.computeIfAbsent(key, k -> new HashSet<>()); - token.addAll(getColIndexValueMappedTokens(row, c + firstColIndex, v)); - allTokens.addAll(token); - } - - //After find all tokens the intersection of tokens is a good candidate for "Index delimiter" - // This part of code try to find the intersection of tokens - // In some cases like LobSVM label value don't have Index Delim token, - // So, we ignored this condition for some values - ArrayList missedKeys = new ArrayList<>(); - HashSet labelIndex = new HashSet<>(); - ArrayList selectedTokens = new ArrayList<>(); - - for(String key : tokens.keySet()) { - if(tokens.get(key).size() == 0) - missedKeys.add(key); - 
} - if(missedKeys.size() > 1) - return null; - else { - for(String t : allTokens) { - missedKeys.clear(); - for(String key : tokens.keySet()) { - if(!tokens.get(key).contains(t)) { - missedKeys.add(key); + RawIndex ri = sampleRawIndexes.get(r); + ri.cloneReservedPositions(); + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1) { + Pair pair = ri.findValue(r); + if(pair == null) + mismatched.add(new Pair<>(r, c)); + else { + if(pair.getKey() < mapCol[r][c]) { + String delim = ri.getSubString(pair.getKey() + pair.getValue(), mapCol[r][c]); + int delimLength = delim.length(); + if(delimiterMinSize != 0 && delimLength < delimiterMinSize) + delimiterMinSize = delimLength; + else + delimiterMinSize = delimLength; + + delimiterTrie.insert(delim); + prefixes.add(pair); + } + else + nonePrefix.add(pair); } } - if(missedKeys.size() == 1) { - int li = Integer.parseInt(missedKeys.iterator().next().split(",")[0]); - labelIndex.add(li); - selectedTokens.add(t); - } } + //ri.restoreReservedPositions(); } + // TODO: attend to mistakes and none-prefix row index maps - /* After find index delim token, the next step is find Item Separator - The algorithm for find separator, mark all Indexes, Values and Index Delim on the raw string - Finally the reminder of the text is separator. In some cases(i.e., duplicated values) - there are more than on position for value and this cause wrong matching and finally wrong value - for separator. To avoid this type of problems, first looked for biggest char base size values - (for example a= 123.45 b= 1000000 a will match first because based on VariableTrimFormat algorithm - "a" have 5 char ad the length is 5, but b have 1 char and the length is one). - */ - String separator = null; - String indexSeparator = null; - boolean isVerify = false; - - // Just one row of the sample raw is enough for finding item separator. 
"selectedRowIndex" mentioned - // first row of sample raw data - - for(int i = 0; i < selectedTokens.size() && !isVerify; i++) { - isVerify = true; - indexSeparator = selectedTokens.get(i); - - row = sampleRawRows.get(selectedRowIndex).getResetClone(); - // find all values - ArrayList vtfValueList = new ArrayList<>(); - ValueTrimFormat vtfIndexDelim = new ValueTrimFormat(indexSeparator); - for(int c = 0; c < ncols; c++) { - if(!VTF[selectedRowIndex][c].isNotSet() && !labelIndex.contains(c + firstColIndex)) { - vtfValueList.add(VTF[selectedRowIndex][c].getACopy()); - } - } - Collections.sort(vtfValueList); - - for(ValueTrimFormat vtf : vtfValueList) { - ArrayList indexDelimValue = new ArrayList<>(); - ValueTrimFormat vtfColIndex = new ValueTrimFormat(vtf.getColIndex() + firstColIndex); - indexDelimValue.add(vtfColIndex); - indexDelimValue.add(vtfIndexDelim); - indexDelimValue.add(vtf); - row.findSequenceValues(indexDelimValue, 0, true); - } - for(Integer li : labelIndex) { - row.findValue(VTF[selectedRowIndex][li - firstColIndex], false); - } - //row.print(); - separator = row.getDelims().getKey(); - if(separator == null) { - isVerify = false; - break; - } - } - if(isVerify) { - return new CustomProperties(CustomProperties.GRPattern.Regular, separator, indexSeparator, firstColIndex); - } - else - return null; + return delimiterTrie.getShortestDelim(delimiterMinSize); } - private static boolean isMapRowColValue(RawRow rawrow, int row, int col, ValueTrimFormat value) { - ValueTrimFormat vtfRow = new ValueTrimFormat(row); - ValueTrimFormat vtfCol = new ValueTrimFormat(col); - ValueTrimFormat vtfValue = value.getACopy(); - boolean mapped = true; - - byte hasZero = 0b000; - if(vtfRow.isNotSet()) - hasZero |= 0b100; - - if(vtfCol.isNotSet()) - hasZero |= 0b010; - - if(vtfValue.isNotSet()) - hasZero |= 0b001; - - ValueTrimFormat[] order = new ValueTrimFormat[3]; - /* valid formats: - Row, Col, Value - 1. 0 , 0 , Value >> 110 -> 6 - 2. 0 , col, Value >> 100 -> 4 - 3. 
row, 0 , value >> 010 -> 2 - 4. row, col, value >> 000 -> 0 - ----------------- >> otherwise the value is not set. - */ - switch(hasZero) { - case 0: - order[0] = vtfRow; - order[1] = vtfCol; - order[2] = vtfValue; - break; - - case 2: - order[0] = vtfRow; - order[1] = vtfValue; - order[2] = vtfCol; - break; + class DelimiterTrie { + private final StringBuilder totalDelim; + private int totalDelimLength; + private boolean valid; - case 4: - order[0] = vtfCol; - order[1] = vtfValue; - order[2] = vtfRow; - break; - - case 6: - order[0] = vtfValue; - order[1] = vtfRow; - order[2] = vtfCol; - break; - default: - throw new RuntimeException("Not set values can't be find on a string"); + public DelimiterTrie() { + totalDelim = new StringBuilder(); + totalDelimLength = 0; + valid = true; } - for(ValueTrimFormat vtf : order) { - if(rawrow.findValue(vtf, false).getKey() == -1) { - mapped = false; - break; + + public boolean insert(String delim) { + if(delim.length() > totalDelimLength) { + if(delim.startsWith(totalDelim.toString())) { + totalDelim.append(delim.substring(totalDelimLength)); + totalDelimLength += delim.length() - totalDelimLength; + } + else + valid = false; } + else if(!totalDelim.toString().startsWith(delim)) + valid = false; + return valid; } - return mapped; - - } - private static HashSet getColIndexValueMappedTokens(RawRow rawrow, int col, ValueTrimFormat value) { - ValueTrimFormat vtfColIndex = new ValueTrimFormat(col); - ValueTrimFormat vtfColValue = value.getACopy(); - Pair pairCol; - Pair pairValue; - HashSet tokens = new HashSet<>(); - RawRow row = rawrow.getResetClone(); - int lastIndex = 0; - int lastTokenStart = -1; - int lastTokenEnd = -1; - int lastTokenID = -1; - do { - row.resetReserved(); - row.setLastIndex(lastIndex); - pairCol = row.findValue(vtfColIndex, true); - if(pairCol.getKey() == -1) - break; - - lastIndex = row.getNumericLastIndex(); - - pairValue = row.findValue(vtfColValue, true); - if(pairValue.getKey() == -1) - break; - - int tl 
= pairValue.getKey() - pairCol.getKey() + pairCol.getValue(); - if(tl > 0) { - - if(lastTokenID == -1) - lastTokenID = pairValue.getKey(); + public Pair getShortestDelim(int minsize) { + if(!valid) + return null; - if(lastTokenID != pairValue.getKey()) { - String token = row.getRaw().substring(lastTokenStart, lastTokenEnd); - tokens.add(token); + if(minsize == totalDelimLength) + return new Pair(totalDelim.toString(), true); + else { + HashSet delimSet = new HashSet<>(); + for(int i = 1; i <= minsize; i++) { + delimSet.clear(); + for(int j = 0; j < totalDelimLength; j += i) { + delimSet.add(totalDelim.substring(j, Math.min(j + i, totalDelimLength))); + } + if(delimSet.size() == 1) + break; } - - lastTokenStart = pairCol.getKey() + pairCol.getValue(); - lastTokenEnd = pairValue.getKey(); + if(delimSet.size() == 1) { + String delim = delimSet.iterator().next(); + return new Pair(delim, delim.length() == totalDelimLength); + } + else + return null; } } - while(true); - if(lastTokenEnd - lastTokenStart > 0) { - String token = row.getRaw().substring(lastTokenStart, lastTokenEnd); - tokens.add(token); - } - return tokens; - } - public boolean isSymmetric() { - return symmetric; + public void print() { + System.out.println(totalDelim); + } } public boolean isMapped() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java deleted file mode 100644 index dfa6c71d5be..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping2.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.matrix.data.Pair; - -import java.io.BufferedReader; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashSet; - -public class ReaderMapping2 { - - private int[][] mapRow; - private int[][] mapCol; - private boolean symmetric; - private boolean skewSymmetric; - private boolean isUpperTriangular; - private int skewCoefficient; - private ArrayList sampleRawIndexes; - - private boolean mapped; - private static int nrows; - private static int ncols; - private int nlines; - private int firstRowIndex; - private int firstColIndex; - - private MatrixBlock sampleMatrix; - private FrameBlock sampleFrame; - private Types.ValueType[] schema; - private final boolean isMatrix; - - public ReaderMapping2(String raw, MatrixBlock matrix) throws Exception { - this.ReadRaw(raw); - this.isMatrix = true; - this.sampleMatrix = matrix; - this.nrows = this.sampleMatrix.getNumRows(); - this.ncols = this.sampleMatrix.getNumColumns(); - this.runMapping(); - } - - public ReaderMapping2(String raw, FrameBlock frame) throws Exception { - this.ReadRaw(raw); - this.isMatrix = false; - this.sampleFrame = frame; - this.nrows = this.sampleFrame.getNumRows(); - this.ncols = this.sampleFrame.getNumColumns(); - this.schema = 
this.sampleFrame.getSchema(); - this.runMapping(); - } - - private void ReadRaw(String raw) throws Exception { - this.sampleRawIndexes = new ArrayList<>(); - InputStream is = IOUtilFunctions.toInputStream(raw); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String value; - int nlines = 0; - - while((value = br.readLine()) != null) { - this.sampleRawIndexes.add(new RawIndex(value)); - nlines++; - } - this.nlines = nlines; - this.firstColIndex = 0; - this.firstRowIndex = 0; - } - - private boolean isSchemaNumeric() { - if(isMatrix) - return true; - - boolean result = true; - for(Types.ValueType vt : schema) - result &= vt.isNumeric(); - return result; - } - - private void runMapping() throws Exception { - mapped = findMapping(); - } - - protected boolean findMapping() { - mapRow = new int[nrows][ncols]; - mapCol = new int[nrows][ncols]; - - // Set "-1" as default value for all defined matrix - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) - mapRow[r][c] = mapCol[r][c] = -1; - - int itRow = 0; - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( - r, c) != null)) { - HashSet checkedLines = new HashSet<>(); - while(checkedLines.size() < nlines) { - RawIndex ri = sampleRawIndexes.get(itRow); - Pair pair = this.isMatrix ? 
ri.findValue( - sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); - if(pair != null) { - mapRow[r][c] = itRow; - mapCol[r][c] = pair.getKey(); - break; - } - else { - checkedLines.add(itRow); - itRow++; - if(itRow == nlines) - itRow = 0; - } - } - } - } - } - boolean flagMap = true; - for(int r = 0; r < nrows && flagMap; r++) - for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, - c) != null) || (this.isMatrix && this.sampleMatrix.getValue(r, c) != 0))) { - flagMap = false; - } - return flagMap; - } - - public CustomProperties2 getFormatProperties() { - CustomProperties2 properties = new CustomProperties2(); - - // Find Row Index Properties - // 1. is row index identified? - boolean rowIndexIdentify = isRowIndexIdentify(); - if(!rowIndexIdentify) { - Pair pair = isRowIndexPrefix(); - if(pair==null){ - - } - else { - properties.setRowIndex(CustomProperties2.IndexProperties.PREFIX); - properties.setRowIndexPrefixDelim(pair.getKey()); - properties.setRowIndexPrefixDelimFixLength(pair.getValue()); - } - } - else - properties.setRowIndex(CustomProperties2.IndexProperties.IDENTIFY); - - return properties; - } - - private boolean isRowIndexIdentify() { - int l = 0; - ArrayList> mismatched = new ArrayList<>(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(mapRow[r][c] != -1 && l != mapRow[r][c]) { - mismatched.add(new Pair<>(r, c)); - } - } - l++; - } - // All rows of sample raw not used - if(l != nlines) { - return false; - } - else if(mismatched.size() > 0) { - return false; - } - return true; - } - - private Pair isRowIndexPrefix() { - - ArrayList> mismatched = new ArrayList<>(); - ArrayList> prefixes = new ArrayList<>(); - ArrayList> nonePrefix = new ArrayList<>(); - DelimiterTrie delimiterTrie = new DelimiterTrie(); - - int delimiterMinSize = 0; - - for(int r = 0; r < nrows; r++) { - RawIndex ri = sampleRawIndexes.get(r); - 
ri.cloneReservedPositions(); - for(int c = 0; c < ncols; c++) { - if(mapRow[r][c] != -1) { - Pair pair = ri.findValue(r); - if(pair == null) - mismatched.add(new Pair<>(r, c)); - else { - if(pair.getKey() < mapCol[r][c]) { - String delim = ri.getSubString(pair.getKey() + pair.getValue(), mapCol[r][c]); - int delimLength = delim.length(); - if(delimiterMinSize != 0 && delimLength < delimiterMinSize) - delimiterMinSize = delimLength; - else - delimiterMinSize = delimLength; - - delimiterTrie.insert(delim); - prefixes.add(pair); - } - else - nonePrefix.add(pair); - } - } - } - //ri.restoreReservedPositions(); - } - // TODO: attend to mistakes and none-prefix row index maps - - return delimiterTrie.getShortestDelim(delimiterMinSize); - } - - class DelimiterTrie { - private final StringBuilder totalDelim; - private int totalDelimLength; - private boolean valid; - - public DelimiterTrie() { - totalDelim = new StringBuilder(); - totalDelimLength = 0; - valid = true; - } - - public boolean insert(String delim) { - if(delim.length() > totalDelimLength) { - if(delim.startsWith(totalDelim.toString())) { - totalDelim.append(delim.substring(totalDelimLength)); - totalDelimLength += delim.length() - totalDelimLength; - } - else - valid = false; - } - else if(!totalDelim.toString().startsWith(delim)) - valid = false; - return valid; - } - - public Pair getShortestDelim(int minsize) { - if(!valid) - return null; - - if(minsize == totalDelimLength) - return new Pair(totalDelim.toString(), true); - else { - HashSet delimSet = new HashSet<>(); - for(int i = 1; i <= minsize; i++) { - delimSet.clear(); - for(int j = 0; j < totalDelimLength; j += i) { - delimSet.add(totalDelim.substring(j, Math.min(j + i, totalDelimLength))); - } - if(delimSet.size() == 1) - break; - } - if(delimSet.size() == 1) { - String delim = delimSet.iterator().next(); - return new Pair(delim, delim.length() == totalDelimLength); - } - else - return null; - } - } - - public void print() { - 
System.out.println(totalDelim); - } - } - - - public boolean isMapped() { - return mapped; - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index 7541b7182e2..c1e24e950cf 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -19,11 +19,11 @@ package org.apache.sysds.test.functions.iogen; +import com.google.gson.Gson; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types; import org.apache.sysds.conf.CompilerConfig; -import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.iogen.ReaderMapping; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; @@ -89,10 +89,17 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + ReaderMapping r2 = new ReaderMapping(sampleRaw, sampleMB); + //System.out.println(r2.isMapped()); - MatrixReader mr= gr.getReader(); - MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, -1, clen, -1, -1); + Gson gson=new Gson(); + System.out.println(gson.toJson(r2.getFormatProperties())); + + +// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); +// +// MatrixReader mr= gr.getReader(); +// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, -1, clen, -1, -1); } catch(Exception exception) { exception.printStackTrace(); diff --git 
a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java new file mode 100644 index 00000000000..6ae3f3157ed --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.iogen.Identify; + +import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; +import org.junit.Test; + +import java.util.Random; + +public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { + + private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + private void generateRandomCSV(int nrows, int ncols, double min, double max, double sparsity, String separator, + String[] naString) { + + sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); + StringBuilder sb = new StringBuilder(); + + for(int r = 0; r < nrows; r++) { + StringBuilder row = new StringBuilder(); + for(int c = 0; c < ncols; c++) { + if(sampleMatrix[r][c] != 0) { + row.append(sampleMatrix[r][c]).append(separator); + } + else { + Random rn = new Random(); + int rni = rn.nextInt(naString.length); + row.append(naString[rni]).append(separator); + } + } + + sb.append(row.substring(0, row.length() - separator.length())); + if(r != nrows - 1) + sb.append("\n"); + } + sampleRaw = sb.toString(); + } + + @Test + public void test1() { + sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; + sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; + runGenerateReaderTest(); + } + + @Test + public void test2() { + sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; + sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; + runGenerateReaderTest(); + } + @Test + public void test3() { + sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; + sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; + runGenerateReaderTest(); + } + + +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java index ff32ada6b14..7492467f5f6 100644 --- 
a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java @@ -68,7 +68,7 @@ public void test1() { @Test public void test2() { String[] naString = {"NaN"}; - generateRandomCSV(5, 5, -10, 10, 1, ",", naString); + generateRandomCSV(1000, 10000, -10, 10, 1, ",", naString); runGenerateReaderTest(); } @@ -82,7 +82,7 @@ public void test3() { @Test public void test4() { String[] naString = {"Nan", "NAN", "", "inf", "null", "NULL"}; - generateRandomCSV(50, 50, -10, 10, 0.5, ",,", naString); + generateRandomCSV(5, 5, -10, 10, 0.5, ",,", naString); runGenerateReaderTest(); } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java index b25b2d15d7a..074555c3590 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java @@ -139,7 +139,7 @@ public void test1() { @Test public void test1_2() { - generateRandomMM(1, 5, 100, -100, 100, 1, ",,,,,"); + generateRandomMM(1, 500, 1000, -100, 100, 1, ",,,,,"); runGenerateReaderTest(); } From bdc9ce8141feab9fc8730a12ce47ba7e43a3f268 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 16 Jan 2022 17:55:09 +0100 Subject: [PATCH 03/84] Update CodeGen --- .../iogen/ColumnIdentifyProperties.java | 60 -- .../sysds/runtime/iogen/CustomProperties.java | 57 +- .../sysds/runtime/iogen/DelimiterTrie.java | 177 ------ .../runtime/iogen/DelimiterTrieNode.java | 35 -- .../runtime/iogen/FastStringTokenizer.java | 95 --- .../runtime/iogen/FormatIdentifying.java | 181 ++++++ .../runtime/iogen/FrameGenerateReader.java | 292 --------- .../sysds/runtime/iogen/GenerateReader.java | 57 +- .../sysds/runtime/iogen/MappingTrie.java | 316 
++++++++++ .../sysds/runtime/iogen/MappingTrieNode.java | 63 ++ .../runtime/iogen/MatrixGenerateReader.java | 1 + .../apache/sysds/runtime/iogen/RawIndex.java | 12 +- .../apache/sysds/runtime/iogen/RawRow.java | 558 ------------------ .../sysds/runtime/iogen/ReaderMapping.java | 264 ++------- .../sysds/runtime/iogen/ValueTrimFormat.java | 228 ------- .../runtime/iogen/codegen/CodeGenBase.java | 18 + .../runtime/iogen/codegen/CodeGenTrie.java | 104 ++++ .../iogen/codegen/CodeGenTrieNode.java | 167 ++++++ .../runtime/iogen/codegen/RowColIdentify.java | 65 ++ .../iogen/template/GIOMatrixReader.java | 94 +++ .../iogen/template/MatrixGenerateReader.java | 171 ++++++ .../iogen/template/TemplateCodeGenBase.java | 26 + .../iogen/template/TemplateCodeGenFrame.java | 18 + .../iogen/template/TemplateCodeGenMatrix.java | 60 ++ 24 files changed, 1362 insertions(+), 1757 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrieNode.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/FastStringTokenizer.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/FrameGenerateReader.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/RawRow.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ValueTrimFormat.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java create mode 100644 
src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java deleted file mode 100644 index 396c0d9fec6..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColumnIdentifyProperties.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -public class ColumnIdentifyProperties { - - private String indexPositionDelimiter; - private Integer indexPosition; - private String valueEndWithString; - - public ColumnIdentifyProperties() { - } - - public ColumnIdentifyProperties(String indexPositionDelimiter, Integer indexPosition, String valueEndWithString) { - this.indexPositionDelimiter = indexPositionDelimiter; - this.indexPosition = indexPosition; - this.valueEndWithString = valueEndWithString; - } - - public String getIndexPositionDelimiter() { - return indexPositionDelimiter; - } - - public void setIndexPositionDelimiter(String indexPositionDelimiter) { - this.indexPositionDelimiter = indexPositionDelimiter; - } - - public Integer getIndexPosition() { - return indexPosition; - } - - public void setIndexPosition(Integer indexPosition) { - this.indexPosition = indexPosition; - } - - public String getValueEndWithString() { - return valueEndWithString; - } - - public void setValueEndWithString(String valueEndWithString) { - this.valueEndWithString = valueEndWithString; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 751e7e1e1fa..1c076796dbf 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -24,6 +24,8 @@ import org.apache.sysds.runtime.io.FileFormatProperties; import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { private static final Log LOG = LogFactory.getLog(CustomProperties.class.getName()); @@ -33,25 +35,34 @@ public enum IndexProperties { IDENTIFY, PREFIX, KEY; @Override public String toString() { - return this.name().toLowerCase().replaceAll("_", "-"); + return this.name().toUpperCase(); } } + private ArrayList[] 
colKeyPattern; + private HashSet[] endWithValueString; private IndexProperties rowIndex; private IndexProperties colIndex; - // When the Row and Column Index are identify - private ColumnIdentifyProperties[] columnIdentifyProperties; + public CustomProperties(ArrayList[] colKeyPattern, HashSet[] endWithValueString) { + this.colKeyPattern = colKeyPattern; + this.endWithValueString = endWithValueString; + } - // When the index is prefixes - private Integer rowIndexPrefixPosition; - private String rowIndexPrefixDelim; - private Boolean rowIndexPrefixDelimFixLength; + public ArrayList[] getColKeyPattern() { + return colKeyPattern; + } - public void setRowColIdentifyProperties(ColumnIdentifyProperties[] columnIdentifyProperties) { - this.columnIdentifyProperties = columnIdentifyProperties; - this.rowIndex = IndexProperties.IDENTIFY; - this.colIndex = IndexProperties.IDENTIFY; + public void setColKeyPattern(ArrayList[] colKeyPattern) { + this.colKeyPattern = colKeyPattern; + } + + public HashSet[] getEndWithValueString() { + return endWithValueString; + } + + public void setEndWithValueString(HashSet[] endWithValueString) { + this.endWithValueString = endWithValueString; } public IndexProperties getRowIndex() { @@ -69,28 +80,4 @@ public IndexProperties getColIndex() { public void setColIndex(IndexProperties colIndex) { this.colIndex = colIndex; } - - public Integer getRowIndexPrefixPosition() { - return rowIndexPrefixPosition; - } - - public void setRowIndexPrefixPosition(Integer rowIndexPrefixPosition) { - this.rowIndexPrefixPosition = rowIndexPrefixPosition; - } - - public String getRowIndexPrefixDelim() { - return rowIndexPrefixDelim; - } - - public void setRowIndexPrefixDelim(String rowIndexPrefixDelim) { - this.rowIndexPrefixDelim = rowIndexPrefixDelim; - } - - public Boolean getRowIndexPrefixDelimFixLength() { - return rowIndexPrefixDelimFixLength; - } - - public void setRowIndexPrefixDelimFixLength(Boolean rowIndexPrefixDelimFixLength) { - 
this.rowIndexPrefixDelimFixLength = rowIndexPrefixDelimFixLength; - } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java deleted file mode 100644 index 8b4bb303acd..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/DelimiterTrie.java +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -import java.util.HashSet; - -public class DelimiterTrie { - private DelimiterTrieNode root; - - public DelimiterTrie() { - root = new DelimiterTrieNode(); - } - - private String intersect(String str1, String str2) { - StringBuilder sb = new StringBuilder(); - for(int i = 0; i < Math.min(str1.length(), str2.length()); i++) { - if(str1.charAt(i) == str2.charAt(i)) - sb.append(str1.charAt(i)); - else - break; - } - if(sb.length() == 0) - return null; - else - return sb.toString(); - } - - private TrieNodeResult getSubNode(DelimiterTrieNode current, String delim) { - for(String key : current.getChildren().keySet()) { - String insec = intersect(key, delim); - if(insec != null) - return new TrieNodeResult(current.getChildren().get(key), insec, key); - } - return null; - } - - public void insert(String delim) { - DelimiterTrieNode current = root; - String remaindKeyDelim; - String currentDelim = delim; - TrieNodeResult trieNodeResult; - do { - trieNodeResult = getSubNode(current, currentDelim); - if(trieNodeResult == null) { - DelimiterTrieNode newNode = new DelimiterTrieNode(); - current.getChildren().put(currentDelim, newNode); - } - else { - currentDelim = currentDelim.substring(trieNodeResult.intersect.length()); - remaindKeyDelim = trieNodeResult.nodeKey.substring(trieNodeResult.intersect.length()); - int cwl = currentDelim.length(); - int rkwl = remaindKeyDelim.length(); - - if(cwl == 0 && rkwl > 0) { - DelimiterTrieNode newNode = new DelimiterTrieNode(); - - DelimiterTrieNode updateNode = new DelimiterTrieNode(); - updateNode.setChildren(trieNodeResult.trieNode.getChildren()); - - // Add Update Node - newNode.getChildren().put(remaindKeyDelim, updateNode); - - // Add New Node - current.getChildren().put(trieNodeResult.intersect, newNode); - - // Remove old node - current.getChildren().remove(trieNodeResult.nodeKey); - - } - else if(rkwl == 0) { - current = trieNodeResult.trieNode; - } - else { - DelimiterTrieNode 
newNode = new DelimiterTrieNode(); - - DelimiterTrieNode updateNode = new DelimiterTrieNode(); - updateNode.setChildren(trieNodeResult.trieNode.getChildren()); - - // Add Update Node - newNode.getChildren().put(remaindKeyDelim, updateNode); - - // Add New Node - current.getChildren().put(trieNodeResult.intersect, newNode); - - // Remove old node - current.getChildren().remove(trieNodeResult.nodeKey); - - // Add New Delim remaind - DelimiterTrieNode newDelimNode = new DelimiterTrieNode(); - newNode.getChildren().put(currentDelim, newDelimNode); - break; - } - } - - } - while(trieNodeResult != null && currentDelim.length() > 0); - } - - public String getShortestDelim(int minsize) { - // Check the possibility of the shortest delim - boolean flag = true; - DelimiterTrieNode current = root; - StringBuilder sb = new StringBuilder(); - do { - int currentChildCount = current.getChildren().size(); - if(currentChildCount == 0) - break; - else if(currentChildCount != 1) - flag = false; - else { - String key = current.getChildren().keySet().iterator().next(); - sb.append(key); - current = current.getChildren().get(key); - } - } - while(flag); - if(flag) { - String allDelim = sb.toString(); - int allDelimLength = allDelim.length(); - HashSet delimSet = new HashSet<>(); - for(int i=1; i<=minsize; i++){ - delimSet.clear(); - for(int j=0; j children = new HashMap<>(); - - public Map getChildren() { - return children; - } - - public void setChildren(Map children) { - this.children = children; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FastStringTokenizer.java b/src/main/java/org/apache/sysds/runtime/iogen/FastStringTokenizer.java deleted file mode 100644 index 42bfbe4dcf6..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/FastStringTokenizer.java +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.iogen; - -import java.io.Serializable; -import java.util.HashSet; - -public class FastStringTokenizer implements Serializable { - - private static final long serialVersionUID = -4698672725609750097L; - private String _string = null; - private String _del = ""; - private int _pos = -1; - private int _index = 0; - private HashSet naStrings = null; - - public FastStringTokenizer(String delimiter) { - _del = delimiter; - reset(null); - } - - public void reset(String string) { - _string = string; - _pos = 0; - _index = 0; - } - - public String nextToken() { - int len = _string.length(); - int start = _pos; - - if(_pos == -1) { - _index = -1; - return "0"; - } - //find start (skip over leading delimiters) - while(start < len && _del.equals(_string.substring(start, Math.min(start + _del.length(), _string.length())))) { - start += _del.length(); - _index++; - } - - //find end (next delimiter) and return - if(start < len) { - _pos = _string.indexOf(_del, start); - if(start < _pos && _pos < len) - return _string.substring(start, _pos); - else - return _string.substring(start); - } - //no next token - _index = -1; - return null; - } - - public int nextInt() { - return Integer.parseInt(nextToken()); - } - - public long nextLong() { - return 
Long.parseLong(nextToken()); - } - - public double nextDouble() { - String nt = nextToken(); - if((naStrings != null && naStrings.contains(nt)) || nt == null) - return 0; - else - return Double.parseDouble(nt); - } - - public int getIndex() { - return _index; - } - - public void setNaStrings(HashSet naStrings) { - this.naStrings = naStrings; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java new file mode 100644 index 00000000000..5da4f2ef8f2 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen; + +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.Pair; + +import java.util.ArrayList; +import java.util.HashSet; + +public class FormatIdentifying { + + private int[][] mapRow; + private int[][] mapCol; + private int[][] mapLen; + private ArrayList sampleRawIndexes; + + private static int nrows; + private static int ncols; + private int nlines; + private final boolean isMatrix; + private int colIndexBeginFrom; + private int rowIndexBeginFrom; + + private ReaderMapping mappingValues; + private CustomProperties properties; + + public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { + this.mappingValues = new ReaderMapping(raw, matrix); + this.isMatrix = true; + this.runIdentification(); + } + + public FormatIdentifying(String raw, FrameBlock frame) throws Exception { + this.mappingValues = new ReaderMapping(raw, frame); + this.isMatrix = false; + this.runIdentification(); + } + + private void runIdentification() { + + mapRow = mappingValues.getMapRow(); + mapCol = mappingValues.getMapCol(); + mapLen = mappingValues.getMapLen(); + sampleRawIndexes = mappingValues.getSampleRawIndexes(); + + nrows = mappingValues.getNrows(); + ncols = mappingValues.getNcols(); + nlines = mappingValues.getNlines(); + + Pair[], HashSet[]> patternPair = buildKeyPattern(); + properties = new CustomProperties(patternPair.getKey(), patternPair.getValue()); + properties.setRowIndex(CustomProperties.IndexProperties.IDENTIFY); + } + + + public CustomProperties getFormatProperties() { + return properties; + } + + private Pair[], HashSet[]> buildKeyPattern() { + + ArrayList[] colKeys = new ArrayList[ncols]; + HashSet[] colKeyEndWithValueStrings = new HashSet[ncols]; + for(int c = 0; c < ncols; c++) { + Pair, HashSet> pair = buildKeyPatternForAColumn(c); + if(pair != null) { + colKeys[c] = pair.getKey(); + 
colKeyEndWithValueStrings[c] = pair.getValue(); + } + else { + return null; + } + } + return new Pair<>(colKeys, colKeyEndWithValueStrings); + } + + private Pair, HashSet> buildKeyPatternForAColumn(int colIndex) { + ArrayList> prefixStringAndLineNumber = getAllPrefixStringsOfAColumn(colIndex); + MappingTrie trie = new MappingTrie(); + for(Pair p : prefixStringAndLineNumber) { + trie.reverseInsert(p.getKey(), p.getValue()); + } + ArrayList> keys = trie.getAllSequentialKeys(); + HashSet endWithValueString = null; + boolean flagReconstruct; + int selectedIndex = -1; + + do { + int index = 0; + for(ArrayList key : keys) { + endWithValueString = verifyColKeyInALine(colIndex, key); + if(endWithValueString != null) { + selectedIndex = index; + break; + } + index++; + } + if(endWithValueString == null) { + flagReconstruct = trie.reConstruct(); + + if(flagReconstruct) + keys = trie.getAllSequentialKeys(); + else + break; + } + else + break; + + } + while(true); + + if(selectedIndex != -1) + return new Pair<>(keys.get(selectedIndex), endWithValueString); + else + return null; + } + + // Get all prefix strings of a column + private ArrayList> getAllPrefixStringsOfAColumn(int colIndex) { + ArrayList> prefixStringAndLineNumber = new ArrayList<>(); + int rowIndex; + for(int r = 0; r < nrows; r++) { + rowIndex = mapRow[r][colIndex]; + if(rowIndex != -1) { + prefixStringAndLineNumber.add(new Pair<>( + sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]), rowIndex)); + } + } + return prefixStringAndLineNumber; + } + + // Validate a key in a row of sample raw data + private HashSet verifyColKeyInALine(int colIndex, ArrayList key) { + + boolean flag = true; + HashSet endWithValueString = new HashSet<>(); + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][colIndex]; + if(rowIndex != -1) { + RawIndex ri = sampleRawIndexes.get(rowIndex); + int currPos = 0; + for(String k : key) { + int index = ri.getRaw().indexOf(k, currPos); + if(index != -1) + currPos = index 
+ k.length(); + else { + flag = false; + break; + } + } + int endDelimPos = mapCol[r][colIndex] + mapLen[r][colIndex]; + endWithValueString.add(ri.getSubString(endDelimPos, Math.min(endDelimPos + 1, ri.getRawLength()))); + if(!flag || currPos != mapCol[r][colIndex]) { + return null; + } + } + } + if(endWithValueString.size() == 0) + return null; + return endWithValueString; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/FrameGenerateReader.java deleted file mode 100644 index 5820e08972c..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/FrameGenerateReader.java +++ /dev/null @@ -1,292 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.*; -import org.apache.sysds.common.Types; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.InputStreamInputFormat; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -public abstract class FrameGenerateReader extends FrameReader { - - protected CustomProperties _props; - protected final FastStringTokenizer fastStringTokenizerDelim; - - public FrameGenerateReader(CustomProperties _props) { - this._props = _props; - fastStringTokenizerDelim = new FastStringTokenizer(_props.getDelim()); - } - - private int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { - int rows = 0; - String value; - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); - try { - // Row Regular - if(_props.getRowPattern().equals(CustomProperties.GRPattern.Regular)) { - // TODO: check the file has header? 
- while(br.readLine() != null) - rows++; - } - // Row Irregular - else { - FastStringTokenizer st = new FastStringTokenizer(_props.getDelim()); - while((value = br.readLine()) != null) { - st.reset(value); - int row = st.nextInt(); - rows = Math.max(rows, row); - } - rows++; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - } - return rows; - } - - @Override - public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, - long clen) throws IOException, DMLRuntimeException { - - // prepare file access - JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fs = IOUtilFunctions.getFileSystem(path, job); - FileInputFormat.addInputPath(job, path); - - // check existence and non-empty file - checkValidInputFile(fs, path); - - // compute size if necessary - if(rlen <= 0) { - ArrayList paths = new ArrayList<>(); - paths.add(path); - rlen = getNumRows(paths, fs); - } - - // allocate output frame block - Types.ValueType[] lschema = createOutputSchema(schema, clen); - String[] lnames = createOutputNames(names, clen); - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - // core read (sequential/parallel) - readFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen); - - return ret; - - } - - @Override - public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, - long rlen, long clen) throws IOException, DMLRuntimeException { - - // allocate output frame block - Types.ValueType[] lschema = createOutputSchema(schema, clen); - String[] lnames = createOutputNames(names, clen); - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - // core read (sequential/parallel) - InputStreamInputFormat informat = new InputStreamInputFormat(is); - InputSplit split = informat.getSplits(null, 1)[0]; - readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); - - return ret; 
- } - - protected void readFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, Types.ValueType[] schema, - String[] names, long rlen, long clen) throws IOException { - - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - for(int i = 0, rpos = 0; i < splits.length; i++) - rpos = readFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0); - } - - protected abstract int readFrameFromInputSplit(InputSplit split, InputFormat informat, - JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, - boolean first) throws IOException; - - public static class FrameReaderRowRegularColRegular extends FrameGenerateReader { - - public FrameReaderRowRegularColRegular(CustomProperties _props) { - super(_props); - } - - @Override - protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, - JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, - boolean first) throws IOException { - - String cellValue; - fastStringTokenizerDelim.setNaStrings(_props.getNaStrings()); - - // create record reader - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row = rl; - int col = 0; - Set naValues = _props.getNaStrings(); - - // Read the data - try { - while(reader.next(key, value)) // foreach line - { - String cellStr = value.toString(); - fastStringTokenizerDelim.reset(cellStr); - while(col != -1) { - cellValue = fastStringTokenizerDelim.nextToken(); - col = fastStringTokenizerDelim.getIndex(); - if(col != -1 && cellValue != null && (naValues == null || !naValues.contains(cellValue))) { - dest.set(row, col, UtilFunctions.stringToObject(schema[col], cellValue)); - } - } - row++; - col = 0; - } - } - 
finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - } - } - - public static class FrameReaderRowRegularColIrregular extends FrameGenerateReader { - - public FrameReaderRowRegularColIrregular(CustomProperties _props) { - super(_props); - } - - @Override - protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, - JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, - boolean first) throws IOException { - - String cellValue; - FastStringTokenizer fastStringTokenizerIndexDelim = new FastStringTokenizer(_props.getIndexDelim()); - - // create record reader - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row = rl; - int col = 0; - - // Read the data - try { - while(reader.next(key, value)) // foreach line - { - String cellStr = value.toString(); - fastStringTokenizerDelim.reset(cellStr); - String cellValueString = fastStringTokenizerDelim.nextToken(); - dest.set(row, (int) clen - 1 - _props.getFirstColIndex(), - UtilFunctions.stringToObject(schema[(int) clen - 1 - _props.getFirstColIndex()], cellValueString)); - - while(col != -1) { - String nt = fastStringTokenizerDelim.nextToken(); - if(fastStringTokenizerDelim.getIndex() == -1) - break; - fastStringTokenizerIndexDelim.reset(nt); - col = fastStringTokenizerIndexDelim.nextInt(); - cellValue = fastStringTokenizerIndexDelim.nextToken(); - if(col != -1 && cellValue != null) { - dest.set(row, col - _props.getFirstColIndex(), - UtilFunctions.stringToObject(schema[col - _props.getFirstColIndex()], cellValue)); - } - } - row++; - col = 0; - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - } - } - - public static class FrameReaderRowIrregular extends FrameGenerateReader { - - public FrameReaderRowIrregular(CustomProperties _props) { - super(_props); - } - - @Override - protected int 
readFrameFromInputSplit(InputSplit split, InputFormat informat, - JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, - boolean first) throws IOException { - - String cellValue; - fastStringTokenizerDelim.setNaStrings(_props.getNaStrings()); - - // create record reader - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row = rl; - int col = 0; - - // Read the data - try { - while(reader.next(key, value)) // foreach line - { - String cellStr = value.toString(); - fastStringTokenizerDelim.reset(cellStr); - int ri = fastStringTokenizerDelim.nextInt(); - col = fastStringTokenizerDelim.nextInt(); - cellValue = fastStringTokenizerDelim.nextToken(); - - if(col != -1 && cellValue != null) { - dest.set(ri-_props.getFirstRowIndex(), col - _props.getFirstColIndex(), - UtilFunctions.stringToObject(schema[col - _props.getFirstColIndex()], cellValue)); - } - row = Math.max(row, ri); - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - } - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index f97a0816f34..b687598606e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -23,6 +23,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.template.GIOMatrixReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; @@ -41,12 +42,12 @@ public abstract class GenerateReader { protected static final Log LOG = LogFactory.getLog(GenerateReader.class.getName()); - protected static ReaderMapping readerMapping; + protected static 
FormatIdentifying formatIdentifying; public GenerateReader(SampleProperties sampleProperties) throws Exception { - readerMapping = sampleProperties.getDataType().isMatrix() ? new ReaderMapping(sampleProperties.getSampleRaw(), - sampleProperties.getSampleMatrix()) : new ReaderMapping(sampleProperties.getSampleRaw(), + formatIdentifying = sampleProperties.getDataType().isMatrix() ? new FormatIdentifying(sampleProperties.getSampleRaw(), + sampleProperties.getSampleMatrix()) : new FormatIdentifying(sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); } @@ -65,29 +66,25 @@ public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix) throws E public MatrixReader getReader() throws Exception { - boolean isMapped = readerMapping != null && readerMapping.isMapped(); - if(!isMapped) { - throw new Exception("Sample raw data and sample matrix don't match !!"); - } - CustomProperties ffp = readerMapping.getFormatProperties(); + CustomProperties ffp = formatIdentifying.getFormatProperties(); if(ffp == null) { throw new Exception("The file format couldn't recognize!!"); } - // 2. 
Generate a Matrix Reader: - if(ffp.getRowPattern().equals(CustomProperties.GRPattern.Regular)) { - if(ffp.getColPattern().equals(CustomProperties.GRPattern.Regular)) { - matrixReader = new MatrixGenerateReader.MatrixReaderRowRegularColRegular(ffp); - } - else { - matrixReader = new MatrixGenerateReader.MatrixReaderRowRegularColIrregular(ffp); - } - } - else { - matrixReader = new MatrixGenerateReader.MatrixReaderRowIrregular(ffp); - } + + //String className = "GIOMatrixReader2"; + //TemplateCodeGenMatrix src = new TemplateCodeGenMatrix(ffp, className); + + // constructor with arguments as CustomProperties + //Class[] cArg = new Class[1]; + //cArg[0] = CustomProperties.class; + + //String co = src.generateCodeJava(); + + //System.out.println(src.generateCodeJava()); + //matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor().newInstance(); + matrixReader = new GIOMatrixReader(ffp); return matrixReader; } - } // Generate Reader for Frame @@ -105,26 +102,10 @@ public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame) throws Exce public FrameReader getReader() throws Exception { - boolean isMapped = readerMapping != null && readerMapping.isMapped(); - if(!isMapped) { - throw new Exception("Sample raw data and sample frame don't match !!"); - } - CustomProperties ffp = readerMapping.getFormatProperties(); + CustomProperties ffp = formatIdentifying.getFormatProperties(); if(ffp == null) { throw new Exception("The file format couldn't recognize!!"); } - // 2. 
Generate a Frame Reader: - if(ffp.getRowPattern().equals(CustomProperties.GRPattern.Regular)) { - if(ffp.getColPattern().equals(CustomProperties.GRPattern.Regular)) { - frameReader = new FrameGenerateReader.FrameReaderRowRegularColRegular(ffp); - } - else { - frameReader = new FrameGenerateReader.FrameReaderRowRegularColIrregular(ffp); - } - } - else { - frameReader = new FrameGenerateReader.FrameReaderRowIrregular(ffp); - } return frameReader; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java new file mode 100644 index 00000000000..308fd2e4234 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -0,0 +1,316 @@ +package org.apache.sysds.runtime.iogen; + +import org.apache.sysds.runtime.matrix.data.Pair; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class MappingTrie { + + private MappingTrieNode root; + + private int keyLevel; + + public MappingTrie() { + root = new MappingTrieNode(MappingTrieNode.Type.INNER); + keyLevel = 0; + } + + public void insert(String word, int rowIndex) { + ArrayList tmpList = new ArrayList<>(); + tmpList.add(rowIndex); + this.insert(word, tmpList); + } + + public void reverseInsert(String word, int rowIndex) { + ArrayList tmpList = new ArrayList<>(); + tmpList.add(rowIndex); + this.insert(new StringBuilder(word).reverse().toString(), tmpList); + } + public void insert(String word, ArrayList rowIndexes) { + MappingTrieNode newNode; + if(root.getChildren().containsKey(word)) + newNode = root.getChildren().get(word); + else + newNode = new MappingTrieNode(); + newNode.addRowIndex(rowIndexes); + root.getChildren().put(word, newNode); + } + + public MappingTrieNode getFistMultiChildNode(MappingTrieNode node) { + if(node.getChildren().size() == 1 && 
node.getNodeType() != MappingTrieNode.Type.END) + return getFistMultiChildNode(node.getChildren().get(node.getChildren().keySet().iterator().next())); + else + return node; + } + + public Set getAllSubStringsOfStringContainIntersect(String str, BitSet bitSet) { + HashSet result = new HashSet<>(); + StringBuilder sb = new StringBuilder(); + for(int i = 0; i < bitSet.size(); i++) { + if(bitSet.get(i)) + sb.append(str.charAt(i)); + else if(sb.length() > 0) { + if(sb.length() == 1) + result.add(sb.toString()); + else { + for(int j = 1; j <= sb.length(); j++) { + for(int k = 0; k <= sb.length() - j; k++) { + result.add(sb.substring(k, k + j)); + } + } + } + sb = new StringBuilder(); + } + } + + return result; + } + + public String getIntersectOfChildren(MappingTrieNode node) { + if(node.getNodeType() == MappingTrieNode.Type.END || node.getChildren().size() == 0) + return null; + else { + Set keys = node.getChildren().keySet(); + if(keys.size() == 1) + return String.valueOf(keys.iterator().next().charAt(0)); + + boolean flag = false; + int maxKeyLength = 0; + Set intersections = null; + for(String k : keys) { + if(flag) { + intersections.retainAll(k.chars().mapToObj(c -> (char) c).collect(Collectors.toSet())); + } + else { + intersections = k.chars().mapToObj(c -> (char) c).collect(Collectors.toSet()); + flag = true; + } + + // set max length of key + maxKeyLength = Math.max(maxKeyLength, k.length()); + } + if(intersections == null || intersections.size() == 0) + return null; + else { + Set subStringIntersection = new HashSet<>(); + boolean subStringIntersectionFlag = false; + for(String k : keys) { + BitSet bitSets = new BitSet(maxKeyLength); + int i = 0; + for(Character character : k.toCharArray()) { + if(intersections.contains(character)) + bitSets.set(i); + i++; + } + if(subStringIntersectionFlag) { + subStringIntersection.retainAll(getAllSubStringsOfStringContainIntersect(k, bitSets)); + } + else { + subStringIntersection = 
getAllSubStringsOfStringContainIntersect(k, bitSets); + subStringIntersectionFlag = true; + } + } + if(subStringIntersection.size() == 1) { + return subStringIntersection.iterator().next(); + } + else { + ArrayList sortedList = (ArrayList) subStringIntersection.stream() + .sorted((o1, o2) -> o2.length() - o1.length()).collect(Collectors.toList()); + + for(String ssi : sortedList) { + if(keyLevel == 0) { + boolean flagBest = true; + for(String k : keys) { + if(!k.startsWith(ssi)) { + flagBest = false; + break; + } + } + if(flagBest) + return ssi; + } + else { + int lastCount = 0; + for(String k : keys) { + int beginPos = 0; + int count = 0; + do { + int index = k.indexOf(ssi, beginPos); + if(index != -1) { + count++; + beginPos = index + ssi.length(); + } + else + break; + } + while(true); + if(lastCount != 0 && lastCount != count) { + lastCount = 0; + break; + } + else if(lastCount == 0) + lastCount = count; + } + if(lastCount != 0) + return ssi; + } + } + return null; + } + } + } + } + + public MappingTrieNode getRoot() { + return root; + } + + public boolean reConstruct() { + MappingTrieNode node = getFistMultiChildNode(root); + String intersect = getIntersectOfChildren(node); + + // prune nodes if they don't have any intersect char + if(intersect == null) { + node.getChildren().clear(); + node.setNodeType(MappingTrieNode.Type.END); + return false; + } + else { + MappingTrieNode.Type intersectNodeType = MappingTrieNode.Type.INNER; + MappingTrie intersectTrie = new MappingTrie(); + ArrayList intersectRowIndexes = new ArrayList<>(); + + for(String k : node.getChildren().keySet()) { + String key = k.substring(k.indexOf(intersect) + intersect.length()); + if(key.length() > 0) { + intersectTrie.insert(key, node.getChildren().get(k).getRowIndexes()); + intersectRowIndexes.addAll(node.getChildren().get(k).getRowIndexes()); + } + else + intersectNodeType = MappingTrieNode.Type.END; + } + + // clear the node children + node.getChildren().clear(); + + // create an IGNORE 
node type and add it to the tree + MappingTrieNode ignoreNode = new MappingTrieNode(MappingTrieNode.Type.IGNORE); + + node.getChildren().put(null, ignoreNode); + + // create and add intersection node + MappingTrieNode intersectionNode = new MappingTrieNode(intersectNodeType); + intersectionNode.setChildren(intersectTrie.root.getChildren()); + intersectionNode.setRowIndexes(intersectRowIndexes); + ignoreNode.getChildren().put(intersect, intersectionNode); + + keyLevel++; + + return true; + } + } + + public ArrayList> getAllSequentialKeys() { + ArrayList>>> result = new ArrayList<>(); + getAllSequentialKeys(root, result, new ArrayList<>()); + + // orders + ArrayList> indexOrder = new ArrayList<>(); + int index = 0; + for(ArrayList>> k : result) { + int level = 0; + for(Pair> n : k) { + if(n.getKey() != null) { + + if(level == keyLevel - 1 || keyLevel == 0) { + indexOrder.add(new Pair<>(index, n.getValue().size())); + break; + } + level++; + } + } + index++; + } + + List> sortedList = indexOrder.stream() + .sorted((o1, o2) -> o2.getValue().compareTo(o1.getValue())).collect(Collectors.toList()); + + ArrayList> keys = new ArrayList<>(); + for(Pair p : sortedList) { + ArrayList>> k = result.get(p.getKey()); + ArrayList kl = new ArrayList<>(); + int level = 0; + for(Pair> n : k) + if(n.getKey() != null) { + if(level < keyLevel || keyLevel == 0) { + kl.add(n.getKey()); + level++; + } + else + break; + } + + keys.add(kl); + } + ArrayList> distinctKeys = new ArrayList<>(); + HashSet markedIndexes = new HashSet<>(); + ArrayList selected; + for(int i = 0; i < keys.size(); i++) { + if(markedIndexes.contains(i)) + continue; + else { + selected = keys.get(i); + markedIndexes.add(i); + distinctKeys.add(selected); + } + for(int j = i + 1; j < keys.size(); j++) { + if(!markedIndexes.contains(j)) { + boolean flag = true; + for(int k = 0; k < selected.size(); k++) { + if(!selected.get(k).equals(keys.get(j).get(k))) { + flag = false; + break; + } + } + if(flag) { + 
markedIndexes.add(j); + } + } + } + } + + // revert list avd values of list + for(ArrayList l: distinctKeys){ + Collections.reverse(l); + for(int i=0; i>>> result, + ArrayList>> nodeKeys) { + + if(node.getNodeType() == MappingTrieNode.Type.END) { + result.add(nodeKeys); + nodeKeys = new ArrayList<>(); + } + else { + for(String k : node.getChildren().keySet()) { + MappingTrieNode child = node.getChildren().get(k); + ArrayList>> tmpKeys = new ArrayList<>(); + tmpKeys.addAll(nodeKeys); + tmpKeys.add(new Pair<>(k, child.getRowIndexes())); + getAllSequentialKeys(child, result, tmpKeys); + } + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java new file mode 100644 index 00000000000..5c3325d2603 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java @@ -0,0 +1,63 @@ +package org.apache.sysds.runtime.iogen; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +public class MappingTrieNode { + + public enum Type { + INNER, END, IGNORE; + @Override public String toString() { + return this.name().toUpperCase(); + } + } + + private Map children; + private Type nodeType; + private ArrayList rowIndexes; + + public MappingTrieNode(Type nodeType) { + this.nodeType = nodeType; + children = new HashMap<>(); + rowIndexes = new ArrayList<>(); + } + + public MappingTrieNode() { + this.nodeType = Type.END; + children = new HashMap<>(); + rowIndexes = new ArrayList<>(); + } + + public Map getChildren() { + return children; + } + + public void setChildren(Map children) { + this.children = children; + } + + public Type getNodeType() { + return nodeType; + } + + public void setNodeType(Type nodeType) { + this.nodeType = nodeType; + } + + public void addRowIndex(int rowIndex) { + rowIndexes.add(rowIndex); + } + + public void addRowIndex(ArrayList rowIndexes) { + this.rowIndexes.addAll(rowIndexes); + } + + public void 
setRowIndexes(ArrayList rowIndexes) { + this.rowIndexes = rowIndexes; + } + + public ArrayList getRowIndexes() { + return rowIndexes; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java index c377dbc294c..e628ca1cedb 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java @@ -115,6 +115,7 @@ public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long cle return ret; } + @SuppressWarnings("unchecked") private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int blen) throws IOException, DMLRuntimeException { //prepare file paths in alphanumeric order diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 73eba1f3cb4..5df7b72b17e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -19,7 +19,6 @@ package org.apache.sysds.runtime.iogen; -import com.google.gson.Gson; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.matrix.data.Pair; import org.apache.sysds.runtime.util.UtilFunctions; @@ -344,14 +343,7 @@ public int getRawLength() { return rawLength; } - public static void main(String[] args) { - String s = "123dddd56"; - RawIndex ni = new RawIndex(s); - //ni.printBitSets(); - Pair p1= ni.findValue(123); - Pair p= ni.findValue(123); - Gson gson=new Gson(); - System.out.println(gson.toJson(p)); + public String getRaw() { + return raw; } - } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawRow.java b/src/main/java/org/apache/sysds/runtime/iogen/RawRow.java deleted file mode 100644 index 6ef8222e3f0..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawRow.java +++ 
/dev/null @@ -1,558 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.matrix.data.Pair; - -import java.util.ArrayList; -import java.util.BitSet; -import java.util.HashSet; - -public class RawRow { - private final String raw; - private ArrayList numericPositions = new ArrayList<>(); - private final BitSet numericReserved; - private final String numericRaw; - private final BitSet reserved; - private int numericLastIndex; - private int rawLastIndex; - - private Pair resultNumeric; - - public RawRow(String raw, ArrayList numericPositions, String numericRaw) { - this.raw = raw; - this.numericReserved = new BitSet(numericRaw.length()); - this.numericRaw = numericRaw; - this.reserved = new BitSet(numericRaw.length()); - this.numericPositions = numericPositions; - - } - - public RawRow(String raw) { - this.raw = raw; - char[] rawChars = raw.toCharArray(); - StringBuilder sbNumericRaw = new StringBuilder(); - for(int i = 0; i < rawChars.length; i++) { - char ch = rawChars[i]; - if(Character.isDigit(ch)) { - sbNumericRaw.append(ch); - numericPositions.add(i); - } - } - numericReserved = new 
BitSet(numericPositions.size()); - numericRaw = sbNumericRaw.toString(); - reserved = new BitSet(raw.length()); - numericLastIndex = 0; - rawLastIndex = 0; - } - - public Pair findValue(ValueTrimFormat vtf, boolean forward, boolean update) { - Types.ValueType vt = vtf.getValueType(); - if(vt.isNumeric()) - return findNumericValue(vtf, forward, update); - - else if(vt == Types.ValueType.STRING) - return findStringValue(vtf, forward, update); - else if(vt == Types.ValueType.BOOLEAN) { - ValueTrimFormat vtfb = new ValueTrimFormat(vtf.getStringOfActualValue()); - return findStringValue(vtfb, forward, update); - } - return null; - } - - public Pair findValue(ValueTrimFormat vtf, boolean forward) { - return findValue(vtf, forward, true); - } - - public Pair findSequenceValues(ArrayList vtfs, int startIndex, boolean update) { - int currentNumericLastIndex = numericLastIndex; - int currentRawLastIndex = rawLastIndex; - Pair spair = null; - Pair epair = null; - ValueTrimFormat snode = vtfs.get(0); - rawLastIndex = 0; - numericLastIndex = 0; - - do { - spair = findValue(snode, true, false); - if(spair.getKey() != -1) { - for(int i = 1; i < vtfs.size(); i++) { - epair = findAtValue(vtfs.get(i), rawLastIndex, numericLastIndex, false); - if(epair.getKey() == -1) - break; - } - if(epair != null && epair.getKey() != -1) - break; - } - else - break; - } - while(true); - if(update && epair != null && epair.getKey() != -1) { - reserved.set(spair.getKey(), epair.getKey() + epair.getValue(), true); - } - else { - numericLastIndex = currentNumericLastIndex; - rawLastIndex = currentRawLastIndex; - } - - if(epair != null && epair.getKey() != -1) { - spair.set(spair.getKey(), epair.getKey() + epair.getValue()); - - } - else - spair.set(-1, 0); - - return spair; - } - - public Pair findAtValue(ValueTrimFormat vtf, int rawIndex, int numericIndex, boolean update) { - if(vtf.getValueType() == Types.ValueType.STRING) - return findAtStringValue(vtf, rawIndex, update); - else 
if(vtf.getValueType().isNumeric()) - return findAtNumericValue(vtf, rawIndex, numericIndex, update); - else if(vtf.getValueType() == Types.ValueType.BOOLEAN) { - ValueTrimFormat vtfb = new ValueTrimFormat(vtf.getStringOfActualValue()); - return findAtStringValue(vtfb, rawIndex, update); - } - else - throw new RuntimeException("FindAt just work for fixed length of values!"); - } - - public Pair findAtValue(ValueTrimFormat vtf, int rawIndex, int numericIndex) { - return findAtValue(vtf, rawIndex, numericIndex, true); - } - - private Pair findAtStringValue(ValueTrimFormat stf, int index, boolean update) { - Pair result = new Pair<>(-1, 0); - int length = stf.getStringOfActualValue().length(); - if(index + length > raw.length() || index <= 0) - return result; - - if(reserved.get(index, index + length).isEmpty()) { - if(raw.substring(index, index + length).equalsIgnoreCase(stf.getStringOfActualValue())) { - result.set(index, length); - rawLastIndex = result.getKey() + result.getValue(); - } - } - if(result.getKey() != -1 && update) { - reserved.set(result.getKey(), result.getKey() + result.getValue(), true); - } - return result; - } - - private Pair findAtNumericValue(ValueTrimFormat ntf, int rawStart, int numericStart, - boolean update) { - Pair result = new Pair<>(-1, 0); - int end = rawStart; - - for(int i = rawStart; i < raw.length(); i++) { - if(!reserved.get(i)) - end++; - else - break; - } - boolean flagD = false; - StringBuilder sb = new StringBuilder(); - for(int i = rawStart; i < end; i++) { - char ch = raw.charAt(i); - if(ch == 'E' || ch == 'e' || ch == '+' || ch == '-') { - sb.append(ch); - } - else if(!flagD && ch == '.') { - sb.append(ch); - flagD = true; - } - else if(Character.isDigit(ch)) - sb.append(ch); - else - break; - } - Double value = tryParse(sb.toString()); - if(value != null) { - if(value == ntf.getDoubleActualValue()) { - result.setKey(rawStart); - result.setValue(sb.length()); - } - } - - if(result.getKey() != -1) { - if(update) { - for(int 
i = resultNumeric.getKey() - 1; i >= 0; i--) { - if(numericPositions.get(i) >= result.getKey()) - numericReserved.set(i); - else - break; - } - - for(int i = resultNumeric.getKey() + 1; i < numericPositions.size(); i++) { - if(numericPositions.get(i) <= result.getKey() + result.getValue()) { - numericReserved.set(i); - numericLastIndex = i; - } - else - break; - } - numericReserved.set(resultNumeric.getKey(), resultNumeric.getKey() + resultNumeric.getValue(), true); - reserved.set(result.getKey(), result.getKey() + result.getValue(), true); - } - else { - for(int i = resultNumeric.getKey() + 1; i < numericPositions.size(); i++) { - if(numericPositions.get(i) <= result.getKey() + result.getValue()) { - numericLastIndex = i; - } - else - break; - } - } - numericLastIndex = Math.max(numericLastIndex, resultNumeric.getKey() + resultNumeric.getValue()); - rawLastIndex = result.getKey() + result.getValue(); - } - return result; - } - - private Pair findStringValue(ValueTrimFormat stf, boolean forward, boolean update) { - ArrayList> unreserved = getRawUnreservedPositions(forward); - Pair result = new Pair<>(-1, 0); - for(Pair p : unreserved) { - int start = p.getKey(); - int end = p.getValue(); - String ntfString = stf.getStringOfActualValue(); - int length = ntfString.length(); - int index = raw.indexOf(ntfString, start); - if(index != -1 && (index <= end - length + 1)) { - result.setKey(index); - result.setValue(length); - rawLastIndex = index + length; - if(update) - reserved.set(result.getKey(), result.getKey() + result.getValue(), true); - break; - } - } - return result; - } - - private Pair findNumericValue(ValueTrimFormat ntf, boolean forward, boolean update) { - ArrayList> unreserved = getUnreservedPositions(forward); - Pair result = new Pair<>(-1, 0); - resultNumeric = new Pair<>(-1, 0); - for(Pair p : unreserved) { - int start = p.getKey(); - int end = p.getValue(); - for(int s = start; s <= end && result.getKey() == -1; ) { - String ntfString = 
ntf.getNString(); - int length = ntfString.length(); - int index = numericRaw.indexOf(ntfString, s); - if(index == -1 || index > end - length + 1) - break; - s = index + 1; - - resultNumeric.setValue(length); - resultNumeric.setKey(index); - int startPos = numericPositions.get(index); - int endPos = numericPositions.get(index + length - 1); - ntfString = raw.substring(startPos, endPos + 1); - Double value = tryParse(ntfString); - if(value == null) - continue; - - // Choose range of string - boolean flagD = false; - - // 1. the range contain '.' - // 2. the range contain none numeric chars. In this condition we should terminate checking - int d = endPos - startPos - length + 1; - if(d == 1) { - for(int i = startPos; i <= endPos; i++) { - if(raw.charAt(i) == '.') { - flagD = true; - break; - } - } - if(!flagD) - continue; - // Check mapping - ntfString = raw.substring(startPos, endPos + 1); - } - else if(d > 1) - continue; - - StringBuilder sb = new StringBuilder(); - - /* 3. add extra chars if the value at the middle of another value - Example: target value= 123 - source text: 1.123E12,123 - second "123" value should report - */ - - boolean flagPrefix = true; - for(int i = startPos - 1; i >= 0 && flagPrefix; i--) { - char ch = raw.charAt(i); - if(Character.isDigit(ch) && ch != '0') - flagPrefix = false; - else if(ch == '0') - sb.append('0'); - else if(!flagD && ch == '.') { - sb.append(ch); - flagD = true; - } - else if(ch == '+' || ch == '-') { - sb.append(ch); - break; - } - else { - break; - } - } - if(!flagPrefix) - continue; - - sb = sb.reverse(); - startPos -= sb.length(); - sb.append(ntfString); - - for(int i = endPos + 1; i < raw.length(); i++) { - char ch = raw.charAt(i); - if(ch == 'E' || ch == 'e' || ch == '+' || ch == '-') { - sb.append(ch); - } - else if(!flagD && ch == '.') { - sb.append(ch); - flagD = true; - } - else if(Character.isDigit(ch)) - sb.append(ch); - else - break; - } - value = tryParse(sb.toString()); - if(value != null) { - if(value == 
ntf.getDoubleActualValue()) { - result.setKey(startPos); - result.setValue(sb.length()); - } - } - } - if(result.getKey() != -1) { - break; - } - } - if(result.getKey() != -1) { - if(update) { - for(int i = resultNumeric.getKey() - 1; i >= 0; i--) { - if(numericPositions.get(i) >= result.getKey()) - numericReserved.set(i); - else - break; - } - - for(int i = resultNumeric.getKey() + 1; i < numericPositions.size(); i++) { - if(numericPositions.get(i) <= result.getKey() + result.getValue()) { - numericReserved.set(i); - numericLastIndex = i; - } - else - break; - } - numericReserved.set(resultNumeric.getKey(), resultNumeric.getKey() + resultNumeric.getValue(), true); - reserved.set(result.getKey(), result.getKey() + result.getValue(), true); - } - else { - for(int i = resultNumeric.getKey() + 1; i < numericPositions.size(); i++) { - if(numericPositions.get(i) <= result.getKey() + result.getValue()) { - numericLastIndex = i; - } - else - break; - } - } - numericLastIndex = Math.max(numericLastIndex, resultNumeric.getKey() + resultNumeric.getValue()); - rawLastIndex = result.getKey() + result.getValue(); - } - return result; - } - - private ArrayList> getUnreservedPositions(boolean forward) { - ArrayList> result = new ArrayList<>(); - int sIndex, eIndex; - int size = numericPositions.size(); - int[] start = {numericLastIndex, 0}; - int[] end = {size, numericLastIndex}; - int psize = (forward || rawLastIndex == 0) ? 
1 : 2; - - for(int p = 0; p < psize; p++) { - for(int i = start[p]; i < end[p]; ) { - // skip all reserved indexes - for(int j = i; j < end[p]; j++) { - if(numericReserved.get(j)) - i++; - else - break; - } - sIndex = i; - // Extract unreserved position - for(int j = i; j < end[p]; j++) { - if(!numericReserved.get(j)) - i++; - else - break; - } - eIndex = i; - if(sIndex < eIndex) - result.add(new Pair<>(sIndex, eIndex - 1)); - } - } - return result; - } - - private ArrayList> getRawUnreservedPositions(boolean forward) { - ArrayList> result = new ArrayList<>(); - int sIndex, eIndex; - int size = raw.length(); - int[] start = {rawLastIndex, 0}; - int[] end = {size, rawLastIndex}; - - int psize = (forward || rawLastIndex == 0) ? 1 : 2; - for(int p = 0; p < psize; p++) { - - for(int i = start[p]; i < end[p]; ) { - // skip all reserved indexes - for(int j = i; j < end[p]; j++) { - if(reserved.get(j)) - i++; - else - break; - } - sIndex = i; - // Extract unreserved position - for(int j = i; j < end[p]; j++) { - if(!reserved.get(j)) - i++; - else - break; - } - eIndex = i; - if(sIndex < eIndex) - result.add(new Pair<>(sIndex, eIndex - 1)); - } - } - return result; - } - - private static Double tryParse(String input) { - try { - return Double.parseDouble(input); - } - catch(Exception ex) { - return null; - } - } - - public Pair getDelims() { - Pair result = new Pair<>("", ""); - - StringBuilder sbAll = new StringBuilder(); - StringBuilder sbPart = new StringBuilder(); - String minToken = ""; - for(int i = 0; i < raw.length(); i++) { - if(!reserved.get(i)) { - char ch = raw.charAt(i); - sbAll.append(ch); - sbPart.append(ch); - } - else { - if(sbPart.length() == 0) - continue; - - if(minToken.length() == 0 || minToken.length() > sbPart.length()) - minToken = sbPart.toString(); - - sbPart = new StringBuilder(); - } - } - result.set(minToken, sbAll.toString()); - return result; - } - - public void resetReserved() { - reserved.set(0, raw.length(), false); - 
numericReserved.set(0, numericPositions.size(), false); - numericLastIndex = 0; - rawLastIndex = 0; - } - - public Pair, Integer> getDelimsSet() { - Pair, Integer> result = new Pair<>(); - StringBuilder sb = new StringBuilder(); - int minSize = -1; - HashSet set = new HashSet<>(); - for(int i = 0; i < raw.length(); i++) { - if(!reserved.get(i)) { - char ch = raw.charAt(i); - sb.append(ch); - } - else { - if(sb.length() > 0) { - set.add(sb.toString()); - minSize = minSize == -1 ? sb.length() : Math.min(minSize, sb.length()); - } - sb = new StringBuilder(); - } - } - result.set(set, minSize); - return result; - } - - public String getRaw() { - return raw; - } - - public void setNumericLastIndex(int numericLastIndex) { - this.numericLastIndex = numericLastIndex; - } - - public void setRawLastIndex(int rawLastIndex) { - this.rawLastIndex = rawLastIndex; - } - - public RawRow getResetClone() { - RawRow clone = new RawRow(raw, numericPositions, numericRaw); - clone.setRawLastIndex(0); - clone.setNumericLastIndex(0); - return clone; - } - - public void setLastIndex(int lastIndex) { - this.numericLastIndex = lastIndex; - } - - public int getNumericLastIndex() { - return numericLastIndex; - } - - public int getRawLastIndex() { - return rawLastIndex; - } - - public boolean isMarked() { - return !reserved.isEmpty(); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index b25d165f69f..736cbdd4f66 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -36,31 +36,34 @@ public class ReaderMapping { private int[][] mapRow; private int[][] mapCol; private int[][] mapLen; - private boolean symmetric; - private boolean skewSymmetric; - private boolean isUpperTriangular; - private int skewCoefficient; - private ArrayList sampleRawIndexes; - private boolean mapped; - private static int nrows; - 
private static int ncols; + private int nrows; + private int ncols; private int nlines; - private int firstRowIndex; - private int firstColIndex; - + private ArrayList sampleRawIndexes; private MatrixBlock sampleMatrix; private FrameBlock sampleFrame; private Types.ValueType[] schema; private final boolean isMatrix; + public ReaderMapping(int nlines, int nrows, int ncols, ArrayList sampleRawIndexes, MatrixBlock matrix) + throws Exception { + this.nlines = nlines; + this.nrows = nrows; + this.ncols = ncols; + this.sampleRawIndexes = sampleRawIndexes; + this.sampleMatrix = matrix; + this.isMatrix = true; + this.runMapping(true); + } + public ReaderMapping(String raw, MatrixBlock matrix) throws Exception { this.ReadRaw(raw); this.isMatrix = true; this.sampleMatrix = matrix; this.nrows = this.sampleMatrix.getNumRows(); this.ncols = this.sampleMatrix.getNumColumns(); - this.runMapping(); + this.runMapping(false); } public ReaderMapping(String raw, FrameBlock frame) throws Exception { @@ -70,7 +73,7 @@ public ReaderMapping(String raw, FrameBlock frame) throws Exception { this.nrows = this.sampleFrame.getNumRows(); this.ncols = this.sampleFrame.getNumColumns(); this.schema = this.sampleFrame.getSchema(); - this.runMapping(); + this.runMapping(false); } private void ReadRaw(String raw) throws Exception { @@ -85,8 +88,6 @@ private void ReadRaw(String raw) throws Exception { nlines++; } this.nlines = nlines; - this.firstColIndex = 0; - this.firstRowIndex = 0; } private boolean isSchemaNumeric() { @@ -99,11 +100,11 @@ private boolean isSchemaNumeric() { return result; } - private void runMapping() throws Exception { - mapped = findMapping(); + private void runMapping(boolean isIndexMapping) throws Exception { + mapped = findMapping(isIndexMapping); } - protected boolean findMapping() { + protected boolean findMapping(boolean isIndexMapping) { mapRow = new int[nrows][ncols]; mapCol = new int[nrows][ncols]; mapLen = new int[nrows][ncols]; @@ -116,8 +117,8 @@ protected boolean 
findMapping() { int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { - if((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( - r, c) != null)) { + if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( + r, c) != null))) { HashSet checkedLines = new HashSet<>(); while(checkedLines.size() < nlines) { RawIndex ri = sampleRawIndexes.get(itRow); @@ -149,229 +150,34 @@ protected boolean findMapping() { return flagMap; } - public CustomProperties getFormatProperties() { - CustomProperties properties = new CustomProperties(); - boolean rowIndexIdentify = isRowIndexIdentify(); - ColumnIdentifyProperties[] colIndexIdentify = isColumnIndexIdentify(); - if(rowIndexIdentify && colIndexIdentify!=null) - properties.setRowColIdentifyProperties(colIndexIdentify); - return properties; + public int[][] getMapRow() { + return mapRow; } - // Row Index is Identifies, when the sample row index equal to sample Matrix/Frame row index - private boolean isRowIndexIdentify() { - int l = 0; - ArrayList> mismatched = new ArrayList<>(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(mapRow[r][c] != -1 && l != mapRow[r][c]) { - mismatched.add(new Pair<>(r, c)); - } - } - l++; - } - // All rows of sample raw not used - if(l != nlines) { - return false; - } - else if(mismatched.size() > 0) { - return false; - } - return true; + public int[][] getMapCol() { + return mapCol; } - // Column Index is identifies, when the logical char position of the sample raw on a line - // equal to a column index in sample Matrix/Frame - private ColumnIdentifyProperties[] isColumnIndexIdentify() { - ColumnIdentifyProperties[] result = new ColumnIdentifyProperties[ncols]; - for(int c = 0; c < ncols; c++) { - Pair pair = getLogicalPositionOfAColumn(c); - if(pair == null) - return null; - else { - String endDelimiterOfAColumn = 
getEndDelimiterOfAColumn(c); - if(endDelimiterOfAColumn != null) - result[c] = new ColumnIdentifyProperties(pair.getKey(), pair.getValue(), endDelimiterOfAColumn); - else - return null; - } - } - return result; + public int[][] getMapLen() { + return mapLen; } - private Pair getLogicalPositionOfAColumn(int colIndex) { - ArrayList tokens = new ArrayList<>(); - int minPos = mapCol[0][colIndex]; - int maxPos = minPos; - int colPos; - int rowIndex; - for(int r = 0; r < nrows; r++) { - colPos = mapCol[r][colIndex]; - rowIndex = mapRow[r][colIndex]; - if(colPos != -1) { - tokens.add(sampleRawIndexes.get(rowIndex).getSubString(0, colPos)); - minPos = Math.min(minPos, colPos); - maxPos = Math.max(maxPos, colPos); - } - } - if(maxPos == 0 && minPos == 0) { - return new Pair("", 0); - } - - String delimCandidate = null; - int delimCandidateCont = 0; - for(int tl = 1; tl < minPos; tl++) { - String token = tokens.get(0); - String delim = token.substring(token.length() - tl); - int xCount = getDuplicateSubstringCountString(tokens.get(0), delim); - int yCount = xCount; - for(int i = 1; i < tokens.size() && xCount == yCount; i++) { - yCount = getDuplicateSubstringCountString(tokens.get(i), delim); - } - if(xCount == yCount) { - delimCandidate = delim; - delimCandidateCont = xCount; - } - else - break; - } - if(delimCandidate != null) - return new Pair<>(delimCandidate, delimCandidateCont); - return null; - - } - - private String getEndDelimiterOfAColumn(int colIndex) { - HashSet tokens = new HashSet<>(); - int colEnd; - int colPos; - int rowIndex; - for(int r = 0; r < nrows; r++) { - rowIndex = mapRow[r][colIndex]; - colPos = mapCol[rowIndex][colIndex]; - RawIndex ri = sampleRawIndexes.get(r); - if(colPos != -1) { - colEnd = colPos + mapLen[r][colIndex]; - String endStr = ri.getSubString(colEnd, Math.min(ri.getRawLength(), colEnd + 1)); - tokens.add(endStr); - } - } - if(tokens.size() == 1) - return tokens.iterator().next(); - else - return null; + public ArrayList 
getSampleRawIndexes() { + return sampleRawIndexes; } - private int getDuplicateSubstringCountString(String source, String str) { - int count = 0; - int index = 0; - do { - index = source.indexOf(str, index); - if(index != -1) { - count++; - index += str.length(); - } - } - while(index != -1); - return count; + public int getNrows() { + return nrows; } - private Pair isRowIndexPrefix() { - - ArrayList> mismatched = new ArrayList<>(); - ArrayList> prefixes = new ArrayList<>(); - ArrayList> nonePrefix = new ArrayList<>(); - DelimiterTrie delimiterTrie = new DelimiterTrie(); - - int delimiterMinSize = 0; - - for(int r = 0; r < nrows; r++) { - RawIndex ri = sampleRawIndexes.get(r); - ri.cloneReservedPositions(); - for(int c = 0; c < ncols; c++) { - if(mapRow[r][c] != -1) { - Pair pair = ri.findValue(r); - if(pair == null) - mismatched.add(new Pair<>(r, c)); - else { - if(pair.getKey() < mapCol[r][c]) { - String delim = ri.getSubString(pair.getKey() + pair.getValue(), mapCol[r][c]); - int delimLength = delim.length(); - if(delimiterMinSize != 0 && delimLength < delimiterMinSize) - delimiterMinSize = delimLength; - else - delimiterMinSize = delimLength; - - delimiterTrie.insert(delim); - prefixes.add(pair); - } - else - nonePrefix.add(pair); - } - } - } - //ri.restoreReservedPositions(); - } - // TODO: attend to mistakes and none-prefix row index maps - - return delimiterTrie.getShortestDelim(delimiterMinSize); + public int getNcols() { + return ncols; } - class DelimiterTrie { - private final StringBuilder totalDelim; - private int totalDelimLength; - private boolean valid; - - public DelimiterTrie() { - totalDelim = new StringBuilder(); - totalDelimLength = 0; - valid = true; - } - - public boolean insert(String delim) { - if(delim.length() > totalDelimLength) { - if(delim.startsWith(totalDelim.toString())) { - totalDelim.append(delim.substring(totalDelimLength)); - totalDelimLength += delim.length() - totalDelimLength; - } - else - valid = false; - } - else 
if(!totalDelim.toString().startsWith(delim)) - valid = false; - return valid; - } - - public Pair getShortestDelim(int minsize) { - if(!valid) - return null; - - if(minsize == totalDelimLength) - return new Pair(totalDelim.toString(), true); - else { - HashSet delimSet = new HashSet<>(); - for(int i = 1; i <= minsize; i++) { - delimSet.clear(); - for(int j = 0; j < totalDelimLength; j += i) { - delimSet.add(totalDelim.substring(j, Math.min(j + i, totalDelimLength))); - } - if(delimSet.size() == 1) - break; - } - if(delimSet.size() == 1) { - String delim = delimSet.iterator().next(); - return new Pair(delim, delim.length() == totalDelimLength); - } - else - return null; - } - } - - public void print() { - System.out.println(totalDelim); - } + public int getNlines() { + return nlines; } public boolean isMapped() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ValueTrimFormat.java b/src/main/java/org/apache/sysds/runtime/iogen/ValueTrimFormat.java deleted file mode 100644 index 5f5f20a6823..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/ValueTrimFormat.java +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.text.DecimalFormat; - -public class ValueTrimFormat implements Comparable { - - // save the col index of the value on the Matrix. - // We need this value when we want to reorder matrix cols - private final int colIndex; - private Object actualValue; - - // Convert all numeric values(i.e., double, float, int, long, ...) to number trim format - public char S; // signe of value "+" or "-" - private char[] N; // array of none zero chars. Example: value = 0.00012345, N = [1,2,3,4,5] - private String NString; - private Types.ValueType valueType; - - public ValueTrimFormat(int actualValue) { - this(-1, Types.ValueType.INT32, actualValue); - } - - public ValueTrimFormat(String actualValue) { - this.valueType = Types.ValueType.STRING; - this.actualValue = actualValue; - this.colIndex = -1; - } - - public ValueTrimFormat(int colIndex, Types.ValueType vt, Object o) { - this.valueType = vt; - this.colIndex = colIndex; - this.actualValue = o; - if(vt.isNumeric()) { - double value = UtilFunctions.getDouble(o); - - // remove scientific format - DecimalFormat decimalFormat = new DecimalFormat("0.000000000000000000000000000000"); - String stringValue = decimalFormat.format(value); - if(value == 0) { - S = '+'; - N = new char[] {'0'}; - } - else { - S = (value < 0) ? 
'-' : '+'; - if((o instanceof Long || o instanceof Integer) && stringValue.contains(".")) { - stringValue = stringValue.substring(0, stringValue.indexOf(".")); - } - numberTrimFormat(stringValue); - } - StringBuilder s = new StringBuilder(); - for(Character c : N) - s.append(c); - NString = s.toString(); - } - else if(vt != Types.ValueType.STRING && vt != Types.ValueType.BOOLEAN) { - throw new RuntimeException("Don't support value type format!"); - } - } - - private void numberTrimFormat(String stringValue) { - if(stringValue.charAt(0) == '+' || stringValue.charAt(0) == '-') - stringValue = stringValue.substring(1); - - int length = stringValue.length(); - int firstNZ = -1; - int lastNZ = -1; - for(int i = 0; i < length; i++) { - char fChar = stringValue.charAt(i); - char lChar = stringValue.charAt(length - i - 1); - if(Character.isDigit(fChar) && fChar != '0' && firstNZ == -1) - firstNZ = i; - - if(Character.isDigit(lChar) && lChar != '0' && lastNZ == -1) - lastNZ = length - i; - - if(firstNZ > 0 && lastNZ > 0) - break; - } - String subValue = stringValue.substring(firstNZ, lastNZ); - int dotLength = subValue.contains(".") ? 
1 : 0; - N = new char[lastNZ - firstNZ - dotLength]; - int index = 0; - for(Character c : subValue.toCharArray()) { - if(c != '.') - N[index++] = c; - } - } - - public double getDoubleActualValue() { - return UtilFunctions.getDouble(actualValue); - } - - // Get a copy of value - public ValueTrimFormat getACopy() { - ValueTrimFormat copy = null; - if(valueType.isNumeric()) { - copy = new ValueTrimFormat(colIndex, valueType, getDoubleActualValue()); - copy.S = S; - copy.N = N; - } - else { - copy = new ValueTrimFormat(colIndex, valueType, actualValue); - } - return copy; - } - - // Check the value is a not set value - public boolean isNotSet() { - - if(this.valueType == Types.ValueType.STRING) - return actualValue == null || ((String) actualValue).length() == 0; - else if(this.valueType.isNumeric()) - return getDoubleActualValue() == 0; - else if(this.valueType == Types.ValueType.BOOLEAN) - return actualValue == null || !((Boolean) actualValue); - return true; - } - - // Set as NoSet - public void setNoSet() { - if(this.valueType == Types.ValueType.STRING) - actualValue = ""; - else if(this.valueType.isNumeric()) { - actualValue = (double) 0; - S = '+'; - N = new char[] {'0'}; - NString = null; - } - else if(this.valueType == Types.ValueType.BOOLEAN) - actualValue = null; - } - - // Get String of actual value - public String getStringOfActualValue() { - return UtilFunctions.objectToString(actualValue); - } - - public boolean isEqual(ValueTrimFormat vtf) { - if(vtf.getValueType() != this.getValueType()) - return false; - else if(vtf.getValueType() == Types.ValueType.FP32) - return ((Float) this.actualValue).compareTo((Float) vtf.actualValue) == 0; - return UtilFunctions.compareTo(valueType, this.actualValue, vtf.actualValue) == 0; - } - - public int getColIndex() { - return colIndex; - } - - private static int getLength(ValueTrimFormat vtf) { - Types.ValueType vt = vtf.valueType; - int len = -1; - if(vt == Types.ValueType.STRING ) - len = 
vtf.getStringOfActualValue().length(); - else if(vt == Types.ValueType.BOOLEAN) - len = 1; - return len; - } - - @Override - public int compareTo(ValueTrimFormat vtf) { - Types.ValueType vt = vtf.valueType; - if(vt.isNumeric() && this.valueType.isNumeric()) { - return compareNumericVTF(vtf, this); - } - else if(vt.isNumeric() && this.valueType == Types.ValueType.STRING) { - return -1; - } - else if(vt == Types.ValueType.STRING && this.valueType.isNumeric()) { - try { - Double d = Double.parseDouble(vtf.getStringOfActualValue()); - ValueTrimFormat vtfs = new ValueTrimFormat(-1,Types.ValueType.FP64,d); - return compareNumericVTF(vtfs, this); - } - catch(Exception exception){ - return 1; - } - } - else - return Integer.compare(getLength(vtf), getLength(this)); - } - - private static int compareNumericVTF(ValueTrimFormat vtf1, ValueTrimFormat vtf2){ - double dv1 = vtf1.getDoubleActualValue(); - double dv2 = vtf2.getDoubleActualValue(); - int vc = Double.compare(dv1, dv2); - - if(vc == 0) - return 0; - - int s1 = dv1 >= 0 ? 0 : 1; - int s2 = dv2 >= 0 ? 
0 : 1; - int nc = Integer.compare(vtf1.N.length + s1, vtf2.N.length + s2); - if(nc == 0) - return Double.compare(Math.abs(dv1), Math.abs(dv2)); - else - return nc; - } - - public String getNString() { - return NString; - } - - public Types.ValueType getValueType() { - return valueType; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java new file mode 100644 index 00000000000..2d8d1d1ade8 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java @@ -0,0 +1,18 @@ +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.runtime.iogen.CustomProperties; + +public abstract class CodeGenBase { + + protected CustomProperties properties; + protected String className; + + public CodeGenBase(CustomProperties properties, String className) { + this.properties = properties; + this.className = className; + } + + public abstract String generateCodeJava(); + + public abstract String generateCodeCPP(); +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java new file mode 100644 index 00000000000..2bb3a7caea3 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.MappingTrieNode; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Random; + +public class CodeGenTrie { + private CodeGenTrieNode root; + + public CodeGenTrie() { + root = new CodeGenTrieNode(); + } + + public void insert(int colIndex, Types.ValueType valueType, ArrayList keys, HashSet endWithValueString) { + + CodeGenTrieNode currentNode = root; + int index = 0; + for(String key : keys) { + if(currentNode.getChildren().containsKey(key)) { + currentNode = currentNode.getChildren().get(key); + index++; + } + else + break; + } + + CodeGenTrieNode newNode; + for(int i = index; i < keys.size(); i++) { + newNode = new CodeGenTrieNode(i == keys.size() - 1, colIndex, valueType, keys.get(i), endWithValueString, new HashSet<>()); + currentNode.getChildren().put(keys.get(i), newNode); + currentNode = newNode; + } + } + public String getJavaCode(){ + StringBuilder src = new StringBuilder(); + getJavaCode(root, src, "dest.appendValue", "0"); + return src.toString(); + } + + private String getRandomName(String base) { + Random r = new Random(); + int low = 0; + int high = 100000000; + int result = r.nextInt(high - low) + low; + + return base + "_" + result; + } + private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destination, String currPos){ + String currPosVariable = getRandomName("curPos"); + if(node.getChildren().size() ==0){ 
+ String key = node.getKey(); + if(key.length() > 0){ + src.append("index = str.indexOf(\""+node.getKey()+"\", "+currPos+"); \n"); + src.append("if(index != -1) { \n"); + src.append("int "+currPosVariable + " = index + "+ key.length()+"; \n"); + src.append(node.geValueCode(destination, currPosVariable)); + src.append("}\n"); + } + else { + src.append(node.geValueCode(destination, "0")); + } + } + else { + if(node.getKey()!=null) { + src.append("index = str.indexOf(\"" + node.getKey() + "\", "+currPos+"); \n"); + src.append("if(index != -1) { \n"); + src.append("int "+currPosVariable + " = index + "+ node.getKey().length()+"; \n"); + currPos = currPosVariable; + } + + for(String key: node.getChildren().keySet()){ + CodeGenTrieNode child = node.getChildren().get(key); + getJavaCode(child, src, destination, currPos); + } + if(node.getKey()!=null){ + src.append("}\n"); + } + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java new file mode 100644 index 00000000000..35de47a8b02 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.MappingTrieNode; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Random; +import java.util.Set; + +public class CodeGenTrieNode { + + private final Map children = new HashMap<>(); + private boolean endOfCondition; + private int colIndex; + private Types.ValueType valueType; + private HashSet endWithValueString; + private String key; + private HashSet naStrings; + + public CodeGenTrieNode() { + this.endOfCondition = false; + } + + public CodeGenTrieNode(int colIndex, String key) { + this.colIndex = colIndex; + this.key = key; + } + + public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet endWithValueString, HashSet naStrings) { + this.endOfCondition = endOfCondition; + this.colIndex = colIndex; + this.valueType = valueType; + this.key = key; + if(endOfCondition){ + this.endWithValueString = endWithValueString; + this.naStrings = naStrings; + } + + } + + public String geValueCode(String destination, String currPos){ + + StringBuilder src = new StringBuilder(); + String subStr; + + if(this.endWithValueString.size() == 1) { + String delim = this.endWithValueString.iterator().next(); + if(delim.length() > 0) + subStr = "str.substring("+currPos+", str.indexOf(\""+delim+"\", "+currPos+"))"; + else + subStr = "str.substring("+currPos+")"; + } + else { + int i = 0; + for(String d: this.endWithValueString){ + if(i == 0) { + if(d.length() == 0) + src.append("endPos = strLen; \n"); + else + src.append("endPos = str.indexOf(\"" + d + "\", "+currPos+"); \n"); + } + else { + if(d.length() == 0) + src.append("endPos = Math.min(strLen, endPos); \n"); + else + src.append("endPos = Math.min(endPos, str.indexOf(\"" + d + "\", "+currPos+")); \n"); 
+ } + i++; + } + subStr = "str.substring(currPos, endPos)"; + } + if(valueType.isNumeric()) { + src.append(getParsCode(subStr)); + src.append("if(cellValue"+colIndex+" != 0) { \n"); + src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); + src.append("lnnz++;\n"); + src.append("}\n"); + } + else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOLEAN){ + if(naStrings.size() > 0) { + StringBuilder sb = new StringBuilder(); + sb.append("if("); + for(String na : naStrings) { + src.append("naStrings.contains(\"" + na + "\")").append("|| \n"); + } + sb.delete(sb.length()-2, sb.length()); + sb.append("){ \n"); + sb.append("cellValue+"+colIndex+" = null;"); + sb.append("}\n"); + } + else + src.append(getParsCode(subStr)); + src.append(destination).append("(row, " + colIndex + ", cellValue+"+colIndex+"); \n"); + } + return src.toString(); + } + + private String getParsCode(String subStr) { + switch(valueType ) { + case STRING: return "String cellValue"+colIndex+" = "+subStr+"; \n"; + case BOOLEAN: return "Boolean cellValue"+colIndex+" = Boolean.parseBoolean("+subStr+"); \n"; + case INT32: return "Integer cellValue"+colIndex+" = Integer.parseInt("+subStr+"); \n"; + case INT64: return "Long cellValue"+colIndex+" = Long.parseLong("+subStr+"); \n"; + case FP64: return "Float cellValue"+colIndex+" = Double.parseDouble("+subStr+"); \n"; + case FP32: return "Double cellValue"+colIndex+" = Float.parseFloat("+subStr+"); \n"; + default: throw new RuntimeException("Unsupported value type: "+valueType); + } + } + + + public Map getChildren() { + return children; + } + + public boolean isEndOfCondition() { + return endOfCondition; + } + + public void setEndOfCondition(boolean endOfCondition) { + this.endOfCondition = endOfCondition; + } + + public int getColIndex() { + return colIndex; + } + + public void setColIndex(int colIndex) { + this.colIndex = colIndex; + } + + public Types.ValueType getValueType() { + return valueType; + } 
+ + public void setValueType(Types.ValueType valueType) { + this.valueType = valueType; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java new file mode 100644 index 00000000000..a2004d86cf8 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java @@ -0,0 +1,65 @@ +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.runtime.iogen.CustomProperties; + +public class RowColIdentify extends CodeGenBase { + + public RowColIdentify(CustomProperties properties, String className) { + super(properties, className); + } + + @Override public String generateCodeJava() { + String code = "String str; \n"+ + "int row = rowPos.intValue(); \n"+ + "double cellValue; \n"+ + "ColumnIdentifyProperties[] colsPro = _props.getColumnIdentifyProperties(); \n"+ + "int col = colsPro.length; \n"+ + "int start; \n"+ + "long lnnz = 0; \n"+ + + "BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"+ + "Arrays.sort(colsPro, Comparator.comparing(ColumnIdentifyProperties::getIndexPosition)); \n"+ + + "int lastIndex = 0; \n"+ + "for(ColumnIdentifyProperties cip : _props.getColumnIdentifyProperties()) { \n"+ + " cip.setIndexPosition(cip.getIndexPosition() - lastIndex); \n"+ + " lastIndex += cip.getIndexPosition(); \n"+ + "}\n"+ +// +// "// Read the data\n"+ +// "try {\n"+ +// " while((str = br.readLine()) != null) //foreach line\n"+ +// " {\n"+ +// "start = 0; \n"+ +// "for(int c = 0; + c < col; c++) {\n"+ +// " Pair pair = _props.getValue(str, start, colsPro[c].getIndexPositionDelimiter(),\n"+ +// "colsPro[c].getIndexPosition(), colsPro[c].getValueEndWithString()); \n"+ +// +// " if(pair!=null) {\n"+ +// "cellValue = UtilFunctions.getDouble(pair.getKey()); \n"+ +// "if(cellValue != 0) {\n"+ +// " 
dest.appendValue(row, col, cellValue); \n"+ +// " lnnz++; \n"+ +// " start += pair.getValue(); \n"+ +// "}\n"+ +// " }\n"+ +// " else\n"+ +// "break; \n"+ +// "}\n"+ +// "row++; \n"+ +// " }\n"+ +// "}\n"+ +// "finally {\n"+ +// " IOUtilFunctions.closeSilently(br); \n"+ +// "}\n"+ +// "rowPos.setValue(row); \n"+ +// "return lnnz; "; + "return 0; "; + + return code; + } + + @Override public String generateCodeCPP() { + return null; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java new file mode 100644 index 00000000000..a2219a058c7 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java @@ -0,0 +1,94 @@ +package org.apache.sysds.runtime.iogen.template; + +import org.apache.commons.lang.mutable.MutableInt; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashSet; + +public class GIOMatrixReader extends MatrixGenerateReader { + + public GIOMatrixReader(CustomProperties _props) { + super(_props); + } + + @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, + MutableInt rowPos, long rlen, long clen, int blen) throws IOException { + + String str; + int row = rowPos.intValue(); + double cellValue; + long lnnz = 0; + + ArrayList[] colKeyPattern = _props.getColKeyPattern(); + HashSet[] endWithValueString = _props.getEndWithValueString(); + int col = endWithValueString.length; + int index; + + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + try { + while((str = br.readLine()) != null) //foreach line + { +// for(int c = 0; c < col; c++) { +// cellValue = 
getCellValue(str, colKeyPattern[c], endWithValueString[c]); +// if(cellValue != 0) { +// dest.appendValue(row, col, cellValue); +// lnnz++; +// } +// } + + + row++; + } + } + finally { + IOUtilFunctions.closeSilently(br); + } + // + // + // //------------------------------------------------------- + // Arrays.sort(colsPro, Comparator.comparing(ColumnIdentifyProperties::getIndexPosition)); + // + // int lastIndex = 0; + // for(ColumnIdentifyProperties cip : _props.getColumnIdentifyProperties()) { + // cip.setIndexPosition(cip.getIndexPosition() - lastIndex); + // lastIndex += cip.getIndexPosition(); + // } + // + // // Read the data + // try { + // while((str = br.readLine()) != null) //foreach line + // { + // start = 0; + // for(int c = 0; c < col; c++) { + // Pair pair = _props.getValue(str, start, colsPro[c].getIndexPositionDelimiter(), + // colsPro[c].getIndexPosition(), colsPro[c].getValueEndWithString()); + // + // if(pair!=null) { + // cellValue = UtilFunctions.getDouble(pair.getKey()); + // if(cellValue != 0) { + // dest.appendValue(row, col, cellValue); + // lnnz++; + // start += pair.getValue(); + // } + // } + // else + // break; + // } + // row++; + // } + // } + // finally { + // IOUtilFunctions.closeSilently(br); + // } + + rowPos.setValue(row); + return lnnz; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java new file mode 100644 index 00000000000..080d80066ed --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.template; + +import org.apache.commons.lang.mutable.MutableInt; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.mapred.JobConf; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.util.UtilFunctions; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public abstract class MatrixGenerateReader extends MatrixReader { + + protected static CustomProperties _props; + + public MatrixGenerateReader(CustomProperties _props) { + MatrixGenerateReader._props = _props; + } + + protected MatrixBlock computeSize(List files, FileSystem fs, long rlen, long clen) + throws IOException, DMLRuntimeException { + // allocate target matrix block based on given size; + return new 
MatrixBlock(getNumRows(files, fs), (int) clen, rlen * clen); + } + + private static int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { + int rows = 0; + for(int fileNo = 0; fileNo < files.size(); fileNo++) { + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); + try { + // Row Identify + if(_props.getRowIndex().equals(CustomProperties.IndexProperties.IDENTIFY)) { + while(br.readLine() != null) + rows++; + } + } + finally { + IOUtilFunctions.closeSilently(br); + } + } + return rows; + } + + @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) + throws IOException, DMLRuntimeException { + + MatrixBlock ret = null; + if(rlen >= 0 && clen >= 0) //otherwise allocated on read + ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); + + //prepare file access + JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + + //core read + ret = readMatrixFromHDFS(path, job, fs, ret, rlen, clen, blen); + + return ret; + } + + @Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) + throws IOException, DMLRuntimeException { + + MatrixBlock ret = null; + if(rlen >= 0 && clen >= 0) //otherwise allocated on read + ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); + + return ret; + } + + @SuppressWarnings("unchecked") private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, + MatrixBlock dest, long rlen, long clen, int blen) throws IOException, DMLRuntimeException { + + //prepare file paths in alphanumeric order + ArrayList files = new ArrayList<>(); + if(fs.getFileStatus(path).isDirectory()) { + for(FileStatus stat : fs.listStatus(path, IOUtilFunctions.hiddenFileFilter)) + files.add(stat.getPath()); + Collections.sort(files); + } 
+ else + files.add(path); + + //determine matrix size via additional pass if required + if(dest == null) { + dest = computeSize(files, fs, rlen, clen); + rlen = dest.getNumRows(); + //clen = _props.getColumnIdentifyProperties().length; + } + + //actual read of individual files + long lnnz = 0; + MutableInt row = new MutableInt(0); + for(int fileNo = 0; fileNo < files.size(); fileNo++) { + lnnz += readMatrixFromInputStream(fs.open(files.get(fileNo)), path.toString(), dest, row, rlen, clen, blen); + } + + //post processing + dest.setNonZeros(lnnz); + + return dest; + } + + protected abstract long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, + MutableInt rowPos, long rlen, long clen, int blen) throws IOException; + + protected void saveCode(String fileName, String code) { + try(Writer writer = new BufferedWriter( + new OutputStreamWriter(new FileOutputStream(fileName, false), "utf-8"))) { + writer.write(code); + } + catch(Exception ex) { + } + } + +// protected double getCellValue(String str, ArrayList keys, HashSet endDelim) { +// int currPos = 0; +// for(String k : keys) { +// int index = str.indexOf(k, currPos); +// if(index != -1) +// currPos = index + k.length(); +// else +// return 0; +// } +// int endPos = -1; +// for(String d : endDelim) { +// endPos = d.length()> 0 ? 
str.indexOf(d, currPos): str.length(); +// if(endPos != -1) +// break; +// } +// //------------------------ +// if(endDelim.contains("")){} +// return UtilFunctions.getDouble(str.substring(currPos, endPos)); +// } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java new file mode 100644 index 00000000000..12b71f50408 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java @@ -0,0 +1,26 @@ +package org.apache.sysds.runtime.iogen.template; + +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.codegen.CodeGenBase; + +public abstract class TemplateCodeGenBase { + + protected String code = "%code%"; + protected String prop = "%prop%"; + protected CustomProperties properties; + protected String className; + protected String javaTemplate; + protected String cppSourceTemplate; + protected String cppHeaderTemplate; + + protected CodeGenBase codeGenClass; + + public TemplateCodeGenBase(CustomProperties properties, String className) { + this.properties = properties; + this.className = className; + } + + public abstract String generateCodeJava(); + + public abstract String generateCodeCPP(); +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java new file mode 100644 index 00000000000..0eb4ffa25d9 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java @@ -0,0 +1,18 @@ +package org.apache.sysds.runtime.iogen.template; + +import org.apache.sysds.runtime.iogen.CustomProperties; + +public class TemplateCodeGenFrame extends TemplateCodeGenBase { + + public TemplateCodeGenFrame(CustomProperties properties, String className) { + super(properties, className); + } + + @Override public String generateCodeJava() { + 
return null; + } + + @Override public String generateCodeCPP() { + return null; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java new file mode 100644 index 00000000000..424e98371e2 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java @@ -0,0 +1,60 @@ +package org.apache.sysds.runtime.iogen.template; + +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.codegen.RowColIdentify; + +public class TemplateCodeGenMatrix extends TemplateCodeGenBase { + + private String type; + + public TemplateCodeGenMatrix(CustomProperties properties, String className) { + super(properties, className); + + // 1. set java code template + // 2. set cpp code template + javaTemplate =//"package org.apache.sysds.runtime.iogen; \n"+ + "import org.apache.commons.lang.mutable.MutableInt;\n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import org.apache.sysds.runtime.iogen.ColumnIdentifyProperties;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import org.apache.sysds.runtime.matrix.data.Pair;\n" + + "import org.apache.sysds.runtime.util.UtilFunctions;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n"+ + "import java.io.BufferedReader;\n" + + "import java.io.IOException;\n" + + "import java.io.InputStream;\n" + + "import java.io.InputStreamReader;\n" + + "import java.util.Arrays;\n" + + "import java.util.Comparator;\n" + + + "public class "+className+" extends MatrixGenerateReader {\n"+ + + " public "+className+"(CustomProperties _props) {\n"+ + " super(_props);\n"+ + " }\n"+ + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ + " MutableInt rowPos, long rlen, long clen, int 
blen) throws IOException {\n"+ + code+ + "}}\n"; + + type = properties.getRowIndex().toString() + properties.getColIndex().toString(); + switch(type){ + case "IDENTIFYIDENTIFY": + codeGenClass = new RowColIdentify(properties, className); + break; + default: + throw new RuntimeException("The properties of row and column index are not defined!!"); + } + } + + @Override + public String generateCodeJava() { + return javaTemplate.replace(code, codeGenClass.generateCodeJava()); + } + + @Override public String generateCodeCPP() { + return codeGenClass.generateCodeCPP(); + } +} From 7a1cf7230f1438c02e32383ec1ba4e9754264539 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 16 Jan 2022 22:47:44 +0100 Subject: [PATCH 04/84] Update CodeGen --- .../sysds/runtime/iogen/GenerateReader.java | 24 +++-- .../sysds/runtime/iogen/codegen/CodeGen.java | 52 ++++++++++ .../runtime/iogen/codegen/CodeGenBase.java | 18 ---- .../runtime/iogen/codegen/CodeGenTrie.java | 14 ++- .../iogen/codegen/CodeGenTrieNode.java | 44 ++------- .../runtime/iogen/codegen/RowColIdentify.java | 65 ------------- .../iogen/template/GIOMatrixReader.java | 94 ------------------- .../iogen/template/MatrixGenerateReader.java | 30 ++---- .../iogen/template/TemplateCodeGenBase.java | 4 +- .../iogen/template/TemplateCodeGenMatrix.java | 65 +++++-------- 10 files changed, 119 insertions(+), 291 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index b687598606e..63c4f130856 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -21,12 +21,14 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.codegen.CodegenUtils; import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.template.GIOMatrixReader; +import org.apache.sysds.runtime.iogen.template.TemplateCodeGenMatrix; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; + /* Generate Reader has two steps: 1. Identify file format and extract the properties of it based on the Sample Matrix. @@ -42,7 +44,7 @@ public abstract class GenerateReader { protected static final Log LOG = LogFactory.getLog(GenerateReader.class.getName()); - protected static FormatIdentifying formatIdentifying; + public FormatIdentifying formatIdentifying; public GenerateReader(SampleProperties sampleProperties) throws Exception { @@ -51,6 +53,10 @@ public GenerateReader(SampleProperties sampleProperties) throws Exception { sampleProperties.getSampleFrame()); } + public FormatIdentifying getFormatIdentifying() { + return formatIdentifying; + } + // Generate Reader for Matrix public static class GenerateReaderMatrix extends GenerateReader { @@ -71,18 +77,16 @@ public MatrixReader getReader() throws Exception { throw new Exception("The file format couldn't recognize!!"); } - //String className = "GIOMatrixReader2"; - //TemplateCodeGenMatrix src = new TemplateCodeGenMatrix(ffp, className); + String className = "GIOMatrixReader"; + TemplateCodeGenMatrix src = new TemplateCodeGenMatrix(ffp, className); // constructor with arguments as CustomProperties - //Class[] cArg = new Class[1]; - //cArg[0] = CustomProperties.class; + Class[] cArg = new Class[1]; + cArg[0] = CustomProperties.class; - //String co = src.generateCodeJava(); + 
String jc = src.generateCodeJava(); - //System.out.println(src.generateCodeJava()); - //matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor().newInstance(); - matrixReader = new GIOMatrixReader(ffp); + matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(ffp); return matrixReader; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java new file mode 100644 index 00000000000..7fa4d6eb5d8 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java @@ -0,0 +1,52 @@ +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.CustomProperties; + +import java.util.ArrayList; + +public class CodeGen { + + protected CustomProperties properties; + protected String className; + + public CodeGen(CustomProperties properties, String className) { + this.properties = properties; + this.className = className; + } + + public String generateCodeJava(){ + StringBuilder src = new StringBuilder(); + src.append("String str; \n"); + src.append("int row = rowPos.intValue(); \n"); + src.append("long lnnz = 0; \n"); + src.append("int index, endPos, strLen; \n"); + src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); + src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + src.append("try { \n"); + src.append("while((str = br.readLine()) != null){ \n"); + src.append("strLen = str.length(); \n"); + + ArrayList[] colKeyPattern = properties.getColKeyPattern(); + CodeGenTrie trie= new CodeGenTrie(); + for(int c=0; c< colKeyPattern.length; c++){ + trie.insert(c, Types.ValueType.FP64, colKeyPattern[c]); + } + src.append(trie.getJavaCode()); + + src.append("row++; \n"); + src.append("} \n"); + src.append("} \n"); + 
src.append("finally { \n"); + src.append("IOUtilFunctions.closeSilently(br); \n"); + src.append("}"); + src.append("rowPos.setValue(row); \n"); + src.append("return lnnz; \n"); + + return src.toString(); + } + + public String generateCodeCPP(){ + return null; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java deleted file mode 100644 index 2d8d1d1ade8..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenBase.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.apache.sysds.runtime.iogen.codegen; - -import org.apache.sysds.runtime.iogen.CustomProperties; - -public abstract class CodeGenBase { - - protected CustomProperties properties; - protected String className; - - public CodeGenBase(CustomProperties properties, String className) { - this.properties = properties; - this.className = className; - } - - public abstract String generateCodeJava(); - - public abstract String generateCodeCPP(); -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 2bb3a7caea3..742fcb4d35b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -20,12 +20,9 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.iogen.MappingTrieNode; import java.util.ArrayList; -import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.Random; public class CodeGenTrie { @@ -35,7 +32,7 @@ public CodeGenTrie() { root = new CodeGenTrieNode(); } - public void insert(int colIndex, Types.ValueType valueType, ArrayList keys, HashSet endWithValueString) { + public void insert(int colIndex, Types.ValueType valueType, ArrayList keys) { CodeGenTrieNode currentNode = root; 
int index = 0; @@ -50,7 +47,7 @@ public void insert(int colIndex, Types.ValueType valueType, ArrayList ke CodeGenTrieNode newNode; for(int i = index; i < keys.size(); i++) { - newNode = new CodeGenTrieNode(i == keys.size() - 1, colIndex, valueType, keys.get(i), endWithValueString, new HashSet<>()); + newNode = new CodeGenTrieNode(i == keys.size() - 1, colIndex, valueType, keys.get(i), new HashSet<>()); currentNode.getChildren().put(keys.get(i), newNode); currentNode = newNode; } @@ -61,7 +58,7 @@ public String getJavaCode(){ return src.toString(); } - private String getRandomName(String base) { + public String getRandomName(String base) { Random r = new Random(); int low = 0; int high = 100000000; @@ -69,12 +66,13 @@ private String getRandomName(String base) { return base + "_" + result; } + private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destination, String currPos){ String currPosVariable = getRandomName("curPos"); if(node.getChildren().size() ==0){ String key = node.getKey(); if(key.length() > 0){ - src.append("index = str.indexOf(\""+node.getKey()+"\", "+currPos+"); \n"); + src.append("index = str.indexOf(\""+node.getKey().replace("\"", "\\\"")+"\", "+currPos+"); \n"); src.append("if(index != -1) { \n"); src.append("int "+currPosVariable + " = index + "+ key.length()+"; \n"); src.append(node.geValueCode(destination, currPosVariable)); @@ -86,7 +84,7 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destina } else { if(node.getKey()!=null) { - src.append("index = str.indexOf(\"" + node.getKey() + "\", "+currPos+"); \n"); + src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", "+currPos+"); \n"); src.append("if(index != -1) { \n"); src.append("int "+currPosVariable + " = index + "+ node.getKey().length()+"; \n"); currPos = currPosVariable; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 35de47a8b02..0d391247277 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -20,13 +20,10 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.iogen.MappingTrieNode; import java.util.HashMap; import java.util.HashSet; import java.util.Map; -import java.util.Random; -import java.util.Set; public class CodeGenTrieNode { @@ -34,7 +31,7 @@ public class CodeGenTrieNode { private boolean endOfCondition; private int colIndex; private Types.ValueType valueType; - private HashSet endWithValueString; + //private HashSet endWithValueString; private String key; private HashSet naStrings; @@ -47,13 +44,13 @@ public CodeGenTrieNode(int colIndex, String key) { this.key = key; } - public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet endWithValueString, HashSet naStrings) { + public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet naStrings) { this.endOfCondition = endOfCondition; this.colIndex = colIndex; this.valueType = valueType; this.key = key; if(endOfCondition){ - this.endWithValueString = endWithValueString; + //this.endWithValueString = endWithValueString; this.naStrings = naStrings; } @@ -64,32 +61,9 @@ public String geValueCode(String destination, String currPos){ StringBuilder src = new StringBuilder(); String subStr; - if(this.endWithValueString.size() == 1) { - String delim = this.endWithValueString.iterator().next(); - if(delim.length() > 0) - subStr = "str.substring("+currPos+", str.indexOf(\""+delim+"\", "+currPos+"))"; - else - subStr = "str.substring("+currPos+")"; - } - else { - int i = 0; - for(String d: this.endWithValueString){ - if(i == 0) { - if(d.length() == 0) - src.append("endPos = 
strLen; \n"); - else - src.append("endPos = str.indexOf(\"" + d + "\", "+currPos+"); \n"); - } - else { - if(d.length() == 0) - src.append("endPos = Math.min(strLen, endPos); \n"); - else - src.append("endPos = Math.min(endPos, str.indexOf(\"" + d + "\", "+currPos+")); \n"); - } - i++; - } - subStr = "str.substring(currPos, endPos)"; - } + src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueString["+colIndex+"]); \n"); + subStr = "str.substring("+currPos+",endPos)"; + if(valueType.isNumeric()) { src.append(getParsCode(subStr)); src.append("if(cellValue"+colIndex+" != 0) { \n"); @@ -98,7 +72,7 @@ public String geValueCode(String destination, String currPos){ src.append("}\n"); } else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOLEAN){ - if(naStrings.size() > 0) { + if(naStrings !=null && naStrings.size() > 0) { StringBuilder sb = new StringBuilder(); sb.append("if("); for(String na : naStrings) { @@ -122,8 +96,8 @@ private String getParsCode(String subStr) { case BOOLEAN: return "Boolean cellValue"+colIndex+" = Boolean.parseBoolean("+subStr+"); \n"; case INT32: return "Integer cellValue"+colIndex+" = Integer.parseInt("+subStr+"); \n"; case INT64: return "Long cellValue"+colIndex+" = Long.parseLong("+subStr+"); \n"; - case FP64: return "Float cellValue"+colIndex+" = Double.parseDouble("+subStr+"); \n"; - case FP32: return "Double cellValue"+colIndex+" = Float.parseFloat("+subStr+"); \n"; + case FP64: return "Double cellValue"+colIndex+" = Double.parseDouble("+subStr+"); \n"; + case FP32: return "Float cellValue"+colIndex+" = Float.parseFloat("+subStr+"); \n"; default: throw new RuntimeException("Unsupported value type: "+valueType); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java deleted file mode 100644 index a2004d86cf8..00000000000 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/codegen/RowColIdentify.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.apache.sysds.runtime.iogen.codegen; - -import org.apache.sysds.runtime.iogen.CustomProperties; - -public class RowColIdentify extends CodeGenBase { - - public RowColIdentify(CustomProperties properties, String className) { - super(properties, className); - } - - @Override public String generateCodeJava() { - String code = "String str; \n"+ - "int row = rowPos.intValue(); \n"+ - "double cellValue; \n"+ - "ColumnIdentifyProperties[] colsPro = _props.getColumnIdentifyProperties(); \n"+ - "int col = colsPro.length; \n"+ - "int start; \n"+ - "long lnnz = 0; \n"+ - - "BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"+ - "Arrays.sort(colsPro, Comparator.comparing(ColumnIdentifyProperties::getIndexPosition)); \n"+ - - "int lastIndex = 0; \n"+ - "for(ColumnIdentifyProperties cip : _props.getColumnIdentifyProperties()) { \n"+ - " cip.setIndexPosition(cip.getIndexPosition() - lastIndex); \n"+ - " lastIndex += cip.getIndexPosition(); \n"+ - "}\n"+ -// -// "// Read the data\n"+ -// "try {\n"+ -// " while((str = br.readLine()) != null) //foreach line\n"+ -// " {\n"+ -// "start = 0; \n"+ -// "for(int c = 0; + c < col; c++) {\n"+ -// " Pair pair = _props.getValue(str, start, colsPro[c].getIndexPositionDelimiter(),\n"+ -// "colsPro[c].getIndexPosition(), colsPro[c].getValueEndWithString()); \n"+ -// -// " if(pair!=null) {\n"+ -// "cellValue = UtilFunctions.getDouble(pair.getKey()); \n"+ -// "if(cellValue != 0) {\n"+ -// " dest.appendValue(row, col, cellValue); \n"+ -// " lnnz++; \n"+ -// " start += pair.getValue(); \n"+ -// "}\n"+ -// " }\n"+ -// " else\n"+ -// "break; \n"+ -// "}\n"+ -// "row++; \n"+ -// " }\n"+ -// "}\n"+ -// "finally {\n"+ -// " IOUtilFunctions.closeSilently(br); \n"+ -// "}\n"+ -// "rowPos.setValue(row); \n"+ -// "return lnnz; "; - "return 0; "; - - return code; - } - - @Override public String generateCodeCPP() { - return 
null; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java deleted file mode 100644 index a2219a058c7..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/GIOMatrixReader.java +++ /dev/null @@ -1,94 +0,0 @@ -package org.apache.sysds.runtime.iogen.template; - -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashSet; - -public class GIOMatrixReader extends MatrixGenerateReader { - - public GIOMatrixReader(CustomProperties _props) { - super(_props); - } - - @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException { - - String str; - int row = rowPos.intValue(); - double cellValue; - long lnnz = 0; - - ArrayList[] colKeyPattern = _props.getColKeyPattern(); - HashSet[] endWithValueString = _props.getEndWithValueString(); - int col = endWithValueString.length; - int index; - - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - try { - while((str = br.readLine()) != null) //foreach line - { -// for(int c = 0; c < col; c++) { -// cellValue = getCellValue(str, colKeyPattern[c], endWithValueString[c]); -// if(cellValue != 0) { -// dest.appendValue(row, col, cellValue); -// lnnz++; -// } -// } - - - row++; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - // - // - // //------------------------------------------------------- - // Arrays.sort(colsPro, Comparator.comparing(ColumnIdentifyProperties::getIndexPosition)); - // - // int lastIndex = 0; - // 
for(ColumnIdentifyProperties cip : _props.getColumnIdentifyProperties()) { - // cip.setIndexPosition(cip.getIndexPosition() - lastIndex); - // lastIndex += cip.getIndexPosition(); - // } - // - // // Read the data - // try { - // while((str = br.readLine()) != null) //foreach line - // { - // start = 0; - // for(int c = 0; c < col; c++) { - // Pair pair = _props.getValue(str, start, colsPro[c].getIndexPositionDelimiter(), - // colsPro[c].getIndexPosition(), colsPro[c].getValueEndWithString()); - // - // if(pair!=null) { - // cellValue = UtilFunctions.getDouble(pair.getKey()); - // if(cellValue != 0) { - // dest.appendValue(row, col, cellValue); - // lnnz++; - // start += pair.getValue(); - // } - // } - // else - // break; - // } - // row++; - // } - // } - // finally { - // IOUtilFunctions.closeSilently(br); - // } - - rowPos.setValue(row); - return lnnz; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index 080d80066ed..7cbba447452 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -30,7 +30,6 @@ import org.apache.hadoop.fs.Path; import org.apache.sysds.runtime.io.IOUtilFunctions; import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.util.UtilFunctions; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -42,6 +41,7 @@ import java.io.Writer; import java.util.ArrayList; import java.util.Collections; +import java.util.HashSet; import java.util.List; public abstract class MatrixGenerateReader extends MatrixReader { @@ -149,23 +149,13 @@ protected void saveCode(String fileName, String code) { } } -// protected double getCellValue(String str, ArrayList keys, HashSet endDelim) { -// int currPos = 0; -// for(String k : keys) { -// int index = str.indexOf(k, currPos); 
-// if(index != -1) -// currPos = index + k.length(); -// else -// return 0; -// } -// int endPos = -1; -// for(String d : endDelim) { -// endPos = d.length()> 0 ? str.indexOf(d, currPos): str.length(); -// if(endPos != -1) -// break; -// } -// //------------------------ -// if(endDelim.contains("")){} -// return UtilFunctions.getDouble(str.substring(currPos, endPos)); -// } + protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { + int endPos = strLen; + for(String d : endWithValueString) { + int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; + if(pos != -1) + endPos = Math.min(endPos, pos); + } + return endPos; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java index 12b71f50408..d7a3894694d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java @@ -1,7 +1,7 @@ package org.apache.sysds.runtime.iogen.template; import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.codegen.CodeGenBase; +import org.apache.sysds.runtime.iogen.codegen.CodeGen; public abstract class TemplateCodeGenBase { @@ -13,7 +13,7 @@ public abstract class TemplateCodeGenBase { protected String cppSourceTemplate; protected String cppHeaderTemplate; - protected CodeGenBase codeGenClass; + protected CodeGen codeGen; public TemplateCodeGenBase(CustomProperties properties, String className) { this.properties = properties; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java index 424e98371e2..044bef22a86 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java @@ -1,60 +1,47 @@ package org.apache.sysds.runtime.iogen.template; import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.codegen.RowColIdentify; +import org.apache.sysds.runtime.iogen.codegen.CodeGen; public class TemplateCodeGenMatrix extends TemplateCodeGenBase { - private String type; - public TemplateCodeGenMatrix(CustomProperties properties, String className) { super(properties, className); // 1. set java code template // 2. set cpp code template javaTemplate =//"package org.apache.sysds.runtime.iogen; \n"+ - "import org.apache.commons.lang.mutable.MutableInt;\n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + - "import org.apache.sysds.runtime.iogen.ColumnIdentifyProperties;\n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import org.apache.sysds.runtime.matrix.data.Pair;\n" + - "import org.apache.sysds.runtime.util.UtilFunctions;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n"+ - "import java.io.BufferedReader;\n" + - "import java.io.IOException;\n" + - "import java.io.InputStream;\n" + - "import java.io.InputStreamReader;\n" + - "import java.util.Arrays;\n" + - "import java.util.Comparator;\n" + - - "public class "+className+" extends MatrixGenerateReader {\n"+ - - " public "+className+"(CustomProperties _props) {\n"+ - " super(_props);\n"+ - " }\n"+ - - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ - " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n"+ - code+ - "}}\n"; - - type = properties.getRowIndex().toString() + properties.getColIndex().toString(); - switch(type){ - case "IDENTIFYIDENTIFY": - codeGenClass = new RowColIdentify(properties, className); - break; - default: - throw new RuntimeException("The 
properties of row and column index are not defined!!"); - } + "import org.apache.commons.lang.mutable.MutableInt;\n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + + "import java.io.BufferedReader;\n" + + "import java.io.IOException;\n" + + "import java.io.InputStream;\n" + + "import java.io.InputStreamReader;\n" + + "import java.util.HashSet; \n" + + + "public class "+className+" extends MatrixGenerateReader {\n"+ + + " public "+className+"(CustomProperties _props) {\n"+ + " super(_props);\n"+ + " }\n"+ + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n"+ + code+ + "}}\n"; + + codeGen = new CodeGen(properties, className); } @Override public String generateCodeJava() { - return javaTemplate.replace(code, codeGenClass.generateCodeJava()); + return javaTemplate.replace(code, codeGen.generateCodeJava()); } @Override public String generateCodeCPP() { - return codeGenClass.generateCodeCPP(); + return codeGen.generateCodeCPP(); } } From 360f570831d3a2d33fb1e26bef13226895152be9 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 16 Jan 2022 23:50:56 +0100 Subject: [PATCH 05/84] Fix MappingTrie Bug --- .../runtime/iogen/FormatIdentifying.java | 1 - .../sysds/runtime/iogen/MappingTrie.java | 25 ++++++++++++------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 5da4f2ef8f2..162646da6eb 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -103,7 
+103,6 @@ private Pair, HashSet> buildKeyPatternForAColumn(int c HashSet endWithValueString = null; boolean flagReconstruct; int selectedIndex = -1; - do { int index = 0; for(ArrayList key : keys) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 308fd2e4234..5e51d5000ef 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -56,22 +56,29 @@ public Set getAllSubStringsOfStringContainIntersect(String str, BitSet b if(bitSet.get(i)) sb.append(str.charAt(i)); else if(sb.length() > 0) { - if(sb.length() == 1) - result.add(sb.toString()); - else { - for(int j = 1; j <= sb.length(); j++) { - for(int k = 0; k <= sb.length() - j; k++) { - result.add(sb.substring(k, k + j)); - } - } - } + getAllSubStrings(result, sb); sb = new StringBuilder(); } } + if(sb.length() > 0){ + getAllSubStrings(result, sb); + } return result; } + private void getAllSubStrings(HashSet result, StringBuilder sb) { + if(sb.length() == 1) + result.add(sb.toString()); + else { + for(int j = 1; j <= sb.length(); j++) { + for(int k = 0; k <= sb.length() - j; k++) { + result.add(sb.substring(k, k + j)); + } + } + } + } + public String getIntersectOfChildren(MappingTrieNode node) { if(node.getNodeType() == MappingTrieNode.Type.END || node.getChildren().size() == 0) return null; From 8dedf850ad03370ecdf94dcc1437a61c39455670 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 17 Jan 2022 02:17:21 +0100 Subject: [PATCH 06/84] Add CodeGen for Frame --- .../sysds/runtime/iogen/CustomProperties.java | 15 +- .../sysds/runtime/iogen/GenerateReader.java | 42 ++--- .../sysds/runtime/iogen/MappingTrie.java | 19 +++ .../sysds/runtime/iogen/MappingTrieNode.java | 19 +++ .../sysds/runtime/iogen/codegen/CodeGen.java | 52 ------- .../runtime/iogen/codegen/CodeGenTrie.java | 35 +++-- .../iogen/codegen/CodeGenTrieNode.java | 3 
+- .../runtime/iogen/codegen/FrameCodeGen.java | 94 ++++++++++++ .../runtime/iogen/codegen/MatrixCodeGen.java | 93 +++++++++++ .../iogen/template/FrameGenerateReader.java | 144 ++++++++++++++++++ .../iogen/template/TemplateCodeGenBase.java | 22 ++- .../iogen/template/TemplateCodeGenFrame.java | 18 --- .../iogen/template/TemplateCodeGenMatrix.java | 47 ------ 13 files changed, 444 insertions(+), 159 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 1c076796dbf..2d834fda565 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -19,17 +19,13 @@ package org.apache.sysds.runtime.iogen; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatProperties; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { - private static final Log LOG = LogFactory.getLog(CustomProperties.class.getName()); - private static final long serialVersionUID = -4447926749068752721L; public enum IndexProperties { IDENTIFY, PREFIX, KEY; @@ -41,6 +37,7 @@ public enum IndexProperties { private 
ArrayList[] colKeyPattern; private HashSet[] endWithValueString; + private Types.ValueType[] schema; private IndexProperties rowIndex; private IndexProperties colIndex; @@ -80,4 +77,12 @@ public IndexProperties getColIndex() { public void setColIndex(IndexProperties colIndex) { this.colIndex = colIndex; } + + public Types.ValueType[] getSchema() { + return schema; + } + + public void setSchema(Types.ValueType[] schema) { + this.schema = schema; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 63c4f130856..41ebf2e75e6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -24,7 +24,8 @@ import org.apache.sysds.runtime.codegen.CodegenUtils; import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.template.TemplateCodeGenMatrix; +import org.apache.sysds.runtime.iogen.codegen.FrameCodeGen; +import org.apache.sysds.runtime.iogen.codegen.MatrixCodeGen; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; @@ -44,19 +45,24 @@ public abstract class GenerateReader { protected static final Log LOG = LogFactory.getLog(GenerateReader.class.getName()); - public FormatIdentifying formatIdentifying; + protected CustomProperties properties; public GenerateReader(SampleProperties sampleProperties) throws Exception { - formatIdentifying = sampleProperties.getDataType().isMatrix() ? new FormatIdentifying(sampleProperties.getSampleRaw(), + FormatIdentifying formatIdentifying = sampleProperties.getDataType().isMatrix() ? 
new FormatIdentifying(sampleProperties.getSampleRaw(), sampleProperties.getSampleMatrix()) : new FormatIdentifying(sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); - } - public FormatIdentifying getFormatIdentifying() { - return formatIdentifying; + properties = formatIdentifying.getFormatProperties(); + if(properties == null) { + throw new Exception("The file format couldn't recognize!!"); + } + if(sampleProperties.getDataType().isFrame()){ + properties.setSchema(sampleProperties.getSampleFrame().getSchema()); + } } + // Generate Reader for Matrix public static class GenerateReaderMatrix extends GenerateReader { @@ -71,14 +77,8 @@ public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix) throws E } public MatrixReader getReader() throws Exception { - - CustomProperties ffp = formatIdentifying.getFormatProperties(); - if(ffp == null) { - throw new Exception("The file format couldn't recognize!!"); - } - String className = "GIOMatrixReader"; - TemplateCodeGenMatrix src = new TemplateCodeGenMatrix(ffp, className); + MatrixCodeGen src = new MatrixCodeGen(properties, className); // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; @@ -86,7 +86,7 @@ public MatrixReader getReader() throws Exception { String jc = src.generateCodeJava(); - matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(ffp); + matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } } @@ -105,11 +105,17 @@ public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame) throws Exce } public FrameReader getReader() throws Exception { + String className = "GIOFrameReader"; + FrameCodeGen src = new FrameCodeGen(properties, className); + + // constructor with arguments as CustomProperties + Class[] cArg = new Class[1]; + cArg[0] = CustomProperties.class; 
+ + String jc = src.generateCodeJava(); + + frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); - CustomProperties ffp = formatIdentifying.getFormatProperties(); - if(ffp == null) { - throw new Exception("The file format couldn't recognize!!"); - } return frameReader; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 5e51d5000ef..3f8e97c2a71 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.sysds.runtime.iogen; import org.apache.sysds.runtime.matrix.data.Pair; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java index 5c3325d2603..defab3cab37 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.sysds.runtime.iogen; import java.util.ArrayList; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java deleted file mode 100644 index 7fa4d6eb5d8..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGen.java +++ /dev/null @@ -1,52 +0,0 @@ -package org.apache.sysds.runtime.iogen.codegen; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.iogen.CustomProperties; - -import java.util.ArrayList; - -public class CodeGen { - - protected CustomProperties properties; - protected String className; - - public CodeGen(CustomProperties properties, String className) { - this.properties = properties; - this.className = className; - } - - public String generateCodeJava(){ - StringBuilder src = new StringBuilder(); - src.append("String str; \n"); - src.append("int row = rowPos.intValue(); \n"); - src.append("long lnnz = 0; \n"); - src.append("int index, endPos, strLen; \n"); - src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); - src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); - src.append("try { \n"); - src.append("while((str = br.readLine()) != null){ \n"); - src.append("strLen = str.length(); \n"); - - ArrayList[] colKeyPattern = properties.getColKeyPattern(); - CodeGenTrie trie= new CodeGenTrie(); - for(int c=0; c< colKeyPattern.length; c++){ - trie.insert(c, Types.ValueType.FP64, colKeyPattern[c]); - } - src.append(trie.getJavaCode()); - - src.append("row++; \n"); - src.append("} \n"); - src.append("} \n"); - src.append("finally { \n"); - src.append("IOUtilFunctions.closeSilently(br); \n"); - src.append("}"); - src.append("rowPos.setValue(row); \n"); - src.append("return lnnz; \n"); - - return src.toString(); - } - - public String generateCodeCPP(){ - return null; - } -} diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 742fcb4d35b..f593dc7325c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -20,7 +20,6 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; - import java.util.ArrayList; import java.util.HashSet; import java.util.Random; @@ -33,7 +32,7 @@ public CodeGenTrie() { } public void insert(int colIndex, Types.ValueType valueType, ArrayList keys) { - + CodeGenTrieNode currentNode = root; int index = 0; for(String key : keys) { @@ -52,9 +51,9 @@ public void insert(int colIndex, Types.ValueType valueType, ArrayList ke currentNode = newNode; } } - public String getJavaCode(){ + public String getJavaCode(String destination){ StringBuilder src = new StringBuilder(); - getJavaCode(root, src, "dest.appendValue", "0"); + getJavaCode(root, src, destination, "0"); return src.toString(); } @@ -69,34 +68,42 @@ public String getRandomName(String base) { private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destination, String currPos){ String currPosVariable = getRandomName("curPos"); - if(node.getChildren().size() ==0){ + if(node.getChildren().size() ==0 || node.isEndOfCondition()){ String key = node.getKey(); if(key.length() > 0){ src.append("index = str.indexOf(\""+node.getKey().replace("\"", "\\\"")+"\", "+currPos+"); \n"); src.append("if(index != -1) { \n"); src.append("int "+currPosVariable + " = index + "+ key.length()+"; \n"); src.append(node.geValueCode(destination, currPosVariable)); - src.append("}\n"); + currPos = currPosVariable; } - else { + else src.append(node.geValueCode(destination, "0")); - } } - else { - if(node.getKey()!=null) { - src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", "+currPos+"); \n"); + + 
if(node.getChildren().size() > 0) { + if(node.getKey() != null) { + currPosVariable = getRandomName("curPos"); + src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); - src.append("int "+currPosVariable + " = index + "+ node.getKey().length()+"; \n"); + src.append("int " + currPosVariable + " = index + " + node.getKey().length() + "; \n"); currPos = currPosVariable; } - for(String key: node.getChildren().keySet()){ + for(String key : node.getChildren().keySet()) { CodeGenTrieNode child = node.getChildren().get(key); getJavaCode(child, src, destination, currPos); } - if(node.getKey()!=null){ + if(node.getKey() != null) { src.append("}\n"); } } + + if(node.getChildren().size() ==0 || node.isEndOfCondition()){ + String key = node.getKey(); + if(key.length() > 0) + src.append("} \n"); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 0d391247277..790efea25ed 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -20,7 +20,6 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; - import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -85,7 +84,7 @@ else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOL } else src.append(getParsCode(subStr)); - src.append(destination).append("(row, " + colIndex + ", cellValue+"+colIndex+"); \n"); + src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); } return src.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java new file mode 100644 index 00000000000..e9a987d8146 --- /dev/null +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; +import java.util.ArrayList; + +public class FrameCodeGen extends TemplateCodeGenBase { + + public FrameCodeGen(CustomProperties properties, String className) { + super(properties, className); + + // 1. 
set java code template + javaTemplate = "import org.apache.hadoop.io.LongWritable; \n" + + "import org.apache.hadoop.io.Text; \n" + + "import org.apache.hadoop.mapred.InputFormat; \n" + + "import org.apache.hadoop.mapred.InputSplit; \n" + + "import org.apache.hadoop.mapred.JobConf; \n" + + "import org.apache.hadoop.mapred.RecordReader; \n" + + "import org.apache.hadoop.mapred.Reporter; \n" + + "import org.apache.sysds.common.Types; \n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions; \n" + + "import org.apache.sysds.runtime.iogen.CustomProperties; \n" + + "import org.apache.sysds.runtime.matrix.data.FrameBlock; \n" + + "import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; \n" + + "import java.io.IOException; \n" + + "import java.util.HashSet; \n" + + "public class "+className+" extends FrameGenerateReader{ \n" + + "public "+className+"(CustomProperties _props) { \n" + + " super(_props); \n" + + " } \n" + + + "@Override protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, \n" + + " JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, \n" + + " boolean first) throws IOException { \n" + + code+ + "}} \n"; + + } + + @Override public String generateCodeJava() { + + StringBuilder src = new StringBuilder(); + src.append("RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); \n"); + src.append("LongWritable key = new LongWritable(); \n"); + src.append("Text value = new Text(); \n"); + src.append("int row = rl; \n"); + src.append("long lnnz = 0; \n"); + src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); + src.append("int index, endPos, strLen; \n"); + src.append("try { \n"); + src.append("while(reader.next(key, value)){ \n"); + src.append("String str = value.toString(); \n"); + src.append("strLen = str.length(); \n"); + + ArrayList[] colKeyPattern = properties.getColKeyPattern(); + CodeGenTrie trie = new CodeGenTrie(); + for(int 
c = 0; c < colKeyPattern.length; c++) { + trie.insert(c, properties.getSchema()[c], colKeyPattern[c]); + } + src.append(trie.getJavaCode("dest.set")); + + src.append("row++; \n"); + src.append("}} \n"); + src.append("finally { \n"); + src.append("IOUtilFunctions.closeSilently(reader); \n"); + src.append("} \n"); + src.append("return row; \n"); + + return javaTemplate.replace(code, src.toString()); + } + + @Override public String generateCodeCPP() { + return null; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java new file mode 100644 index 00000000000..7e4beb9333c --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; + +import java.util.ArrayList; + +public class MatrixCodeGen extends TemplateCodeGenBase { + + public MatrixCodeGen(CustomProperties properties, String className) { + super(properties, className); + + // 1. set java code template + javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + + "import java.io.BufferedReader;\n" + + "import java.io.IOException;\n" + + "import java.io.InputStream;\n" + + "import java.io.InputStreamReader;\n" + + "import java.util.HashSet; \n" + + + "public class "+className+" extends MatrixGenerateReader {\n"+ + + " public "+className+"(CustomProperties _props) {\n"+ + " super(_props);\n"+ + " }\n"+ + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n"+ + code+ + "}}\n"; + // 2. 
set cpp code template + } + + @Override + public String generateCodeJava() { + StringBuilder src = new StringBuilder(); + src.append("String str; \n"); + src.append("int row = rowPos.intValue(); \n"); + src.append("long lnnz = 0; \n"); + src.append("int index, endPos, strLen; \n"); + src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); + src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + src.append("try { \n"); + src.append("while((str = br.readLine()) != null){ \n"); + src.append("strLen = str.length(); \n"); + + ArrayList[] colKeyPattern = properties.getColKeyPattern(); + CodeGenTrie trie= new CodeGenTrie(); + for(int c=0; c< colKeyPattern.length; c++){ + trie.insert(c, Types.ValueType.FP64, colKeyPattern[c]); + } + src.append(trie.getJavaCode("dest.appendValue")); + + src.append("row++; \n"); + src.append("} \n"); + src.append("} \n"); + src.append("finally { \n"); + src.append("IOUtilFunctions.closeSilently(br); \n"); + src.append("}"); + src.append("rowPos.setValue(row); \n"); + src.append("return lnnz; \n"); + + return javaTemplate.replace(code, src.toString()); + } + + @Override public String generateCodeCPP() { + return null; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java new file mode 100644 index 00000000000..50ab279fbf6 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.template; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.*; +import org.apache.sysds.common.Types; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.InputStreamInputFormat; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; + +public abstract class FrameGenerateReader extends FrameReader { + + protected CustomProperties _props; + + public FrameGenerateReader(CustomProperties _props) { + this._props = _props; + } + + private int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { + int rows = 0; + for(int fileNo = 0; fileNo < files.size(); fileNo++) { + BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); + try { + // Row Identify + if(_props.getRowIndex().equals(CustomProperties.IndexProperties.IDENTIFY)) { + while(br.readLine() != null) + rows++; + } + } + finally { + IOUtilFunctions.closeSilently(br); + } + } + return rows; + } + + 
@Override + public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, + long clen) throws IOException, DMLRuntimeException { + + // prepare file access + JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + FileInputFormat.addInputPath(job, path); + + // check existence and non-empty file + checkValidInputFile(fs, path); + + // compute size if necessary + if(rlen <= 0) { + ArrayList paths = new ArrayList<>(); + paths.add(path); + rlen = getNumRows(paths, fs); + } + + // allocate output frame block + Types.ValueType[] lschema = createOutputSchema(schema, clen); + String[] lnames = createOutputNames(names, clen); + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + + // core read (sequential/parallel) + readFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen); + + return ret; + + } + + @Override + public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, + long rlen, long clen) throws IOException, DMLRuntimeException { + + // allocate output frame block + Types.ValueType[] lschema = createOutputSchema(schema, clen); + String[] lnames = createOutputNames(names, clen); + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + + // core read (sequential/parallel) + InputStreamInputFormat informat = new InputStreamInputFormat(is); + InputSplit split = informat.getSplits(null, 1)[0]; + readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); + + return ret; + } + + protected void readFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, Types.ValueType[] schema, + String[] names, long rlen, long clen) throws IOException { + + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = 
IOUtilFunctions.sortInputSplits(splits); + for(int i = 0, rpos = 0; i < splits.length; i++) + rpos = readFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0); + } + + protected abstract int readFrameFromInputSplit(InputSplit split, InputFormat informat, + JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, + boolean first) throws IOException; + + protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { + int endPos = strLen; + for(String d : endWithValueString) { + int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; + if(pos != -1) + endPos = Math.min(endPos, pos); + } + return endPos; + } + +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java index d7a3894694d..fd813fe96fe 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java @@ -1,7 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + package org.apache.sysds.runtime.iogen.template; import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.codegen.CodeGen; public abstract class TemplateCodeGenBase { @@ -13,8 +31,6 @@ public abstract class TemplateCodeGenBase { protected String cppSourceTemplate; protected String cppHeaderTemplate; - protected CodeGen codeGen; - public TemplateCodeGenBase(CustomProperties properties, String className) { this.properties = properties; this.className = className; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java deleted file mode 100644 index 0eb4ffa25d9..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenFrame.java +++ /dev/null @@ -1,18 +0,0 @@ -package org.apache.sysds.runtime.iogen.template; - -import org.apache.sysds.runtime.iogen.CustomProperties; - -public class TemplateCodeGenFrame extends TemplateCodeGenBase { - - public TemplateCodeGenFrame(CustomProperties properties, String className) { - super(properties, className); - } - - @Override public String generateCodeJava() { - return null; - } - - @Override public String generateCodeCPP() { - return null; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java deleted file mode 100644 index 044bef22a86..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenMatrix.java +++ /dev/null @@ -1,47 +0,0 @@ -package org.apache.sysds.runtime.iogen.template; - -import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.codegen.CodeGen; - -public class TemplateCodeGenMatrix extends TemplateCodeGenBase { - - public TemplateCodeGenMatrix(CustomProperties properties, String className) { - super(properties, className); - - // 1. 
set java code template - // 2. set cpp code template - javaTemplate =//"package org.apache.sysds.runtime.iogen; \n"+ - "import org.apache.commons.lang.mutable.MutableInt;\n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + - "import java.io.BufferedReader;\n" + - "import java.io.IOException;\n" + - "import java.io.InputStream;\n" + - "import java.io.InputStreamReader;\n" + - "import java.util.HashSet; \n" + - - "public class "+className+" extends MatrixGenerateReader {\n"+ - - " public "+className+"(CustomProperties _props) {\n"+ - " super(_props);\n"+ - " }\n"+ - - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ - " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n"+ - code+ - "}}\n"; - - codeGen = new CodeGen(properties, className); - } - - @Override - public String generateCodeJava() { - return javaTemplate.replace(code, codeGen.generateCodeJava()); - } - - @Override public String generateCodeCPP() { - return codeGen.generateCodeCPP(); - } -} From 6ab292153d7d4b0f3348960667f2f5bb1f161070 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 17 Jan 2022 15:42:07 +0100 Subject: [PATCH 07/84] Add row scattered column support --- .../runtime/iogen/FormatIdentifying.java | 78 ++++++++++- .../iogen/codegen/CodeGenTrieNode.java | 2 - .../iogen/GenerateReaderFrameTest.java | 5 +- .../iogen/GenerateReaderMatrixTest.java | 24 ++-- .../Identify/FrameGenerateReaderCSVTest.java | 121 ++++++++++++++++++ .../Identify/MatrixGRRowColIdentifyTest.java | 50 ++++++++ 6 files changed, 258 insertions(+), 22 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 162646da6eb..bfa77734361 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -19,11 +19,13 @@ package org.apache.sysds.runtime.iogen; +import com.google.gson.Gson; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.Pair; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; public class FormatIdentifying { @@ -66,12 +68,16 @@ private void runIdentification() { ncols = mappingValues.getNcols(); nlines = mappingValues.getNlines(); + // Build a Key-Pattern foreach column Pair[], HashSet[]> patternPair = buildKeyPattern(); properties = new CustomProperties(patternPair.getKey(), patternPair.getValue()); + + // Check the row index format + //verifyColsAtAllLines(patternPair.getKey()); + properties.setRowIndex(CustomProperties.IndexProperties.IDENTIFY); } - public CustomProperties getFormatProperties() { return properties; } @@ -140,8 +146,8 @@ private ArrayList> getAllPrefixStringsOfAColumn(int colInd for(int r = 0; r < nrows; r++) { rowIndex = mapRow[r][colIndex]; if(rowIndex != -1) { - prefixStringAndLineNumber.add(new Pair<>( - sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]), rowIndex)); + prefixStringAndLineNumber.add( + new Pair<>(sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]), rowIndex)); } } return prefixStringAndLineNumber; @@ -177,4 +183,70 @@ private HashSet verifyColKeyInALine(int colIndex, ArrayList key) return null; return endWithValueString; } + + // Check the row index pattern of columns + private void verifyColsAtAllLines(ArrayList[] colKeyPattern){ + Pair, HashSet>[] colsRowsLineNumbers = new Pair[ncols]; + + boolean isRowIdentified; + 
+ HashSet allCoveredLines = new HashSet<>(); + for(int c = 0; c < ncols; c++){ + Pair, HashSet> colRowLineNumbers = verifyColKeyAtAllLines(c, colKeyPattern[c]); + colsRowsLineNumbers[c] = colRowLineNumbers; + allCoveredLines.addAll(colRowLineNumbers.getKey()); + } + + isRowIdentified = allCoveredLines.size() == nrows; + } + + // Check the sequential list of keys are on a string + private Integer getIndexOfKeysOnString(String str, ArrayList key, int beginPos) { + int currPos = beginPos; + boolean flag = true; + for(String k : key) { + int index = str.indexOf(k, currPos); + if(index != -1) + currPos = index + k.length(); + else { + flag = false; + break; + } + } + if(flag) + return currPos; + else + return -1; + } + + // Check the row index pattern of a column + private Pair, HashSet> verifyColKeyAtAllLines(int colIndex, ArrayList key) { + + HashSet lineNumbers = new HashSet<>(); + HashSet rowNumbers = new HashSet<>(); + HashMap mapColRow = new HashMap<>(); + for(int r = 0; r < nrows; r++) { + int lineNumber = mapRow[r][colIndex]; + if(lineNumber != -1) { + mapColRow.put(lineNumber, mapCol[r][colIndex]); + rowNumbers.add(r); + } + } + + for(int l = 0; l < nlines; l++) { + boolean flag = true; + RawIndex ri = sampleRawIndexes.get(l); + int index = getIndexOfKeysOnString(ri.getRaw(), key, 0); + if(index != -1) { + if(mapColRow.containsKey(l) && mapColRow.get(l) == index) + lineNumbers.add(l); + else + flag = false; + } + if(!flag) + return null; + } + return new Pair<>(lineNumbers, rowNumbers); + } + } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 790efea25ed..4c6e7a1ccfe 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -30,7 +30,6 @@ public class CodeGenTrieNode { private boolean endOfCondition; private int colIndex; private 
Types.ValueType valueType; - //private HashSet endWithValueString; private String key; private HashSet naStrings; @@ -49,7 +48,6 @@ public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType val this.valueType = valueType; this.key = key; if(endOfCondition){ - //this.endWithValueString = endWithValueString; this.naStrings = naStrings; } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index 6d8f20628b7..d9a4228c137 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -49,8 +49,9 @@ public abstract class GenerateReaderFrameTest extends AutomatedTestBase { Types.ValueType.INT32, Types.ValueType.INT64, Types.ValueType.FP32, - Types.ValueType.FP64, - Types.ValueType.BOOLEAN}; + Types.ValueType.FP64//, + // Types.ValueType.BOOLEAN + }; protected Types.ValueType[] types1= { Types.ValueType.BOOLEAN}; diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index c1e24e950cf..063ea29b7d0 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -19,11 +19,11 @@ package org.apache.sysds.test.functions.iogen; -import com.google.gson.Gson; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types; import org.apache.sysds.conf.CompilerConfig; -import org.apache.sysds.runtime.iogen.ReaderMapping; +import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.DataConverter; import 
org.apache.sysds.test.AutomatedTestBase; @@ -44,8 +44,7 @@ public abstract class GenerateReaderMatrixTest extends AutomatedTestBase { protected abstract String getTestName(); - @Override - public void setUp() { + @Override public void setUp() { TestUtils.clearAssertionInformation(); addTestConfiguration(getTestName(), new TestConfiguration(TEST_DIR, getTestName(), new String[] {"Y"})); } @@ -64,8 +63,7 @@ protected void generateRandomSymmetric(int size, double min, double max, double } } - @SuppressWarnings("unused") - protected void runGenerateReaderTest() { + @SuppressWarnings("unused") protected void runGenerateReaderTest() { Types.ExecMode oldPlatform = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; @@ -83,23 +81,19 @@ protected void runGenerateReaderTest() { String HOME = SCRIPT_DIR + TEST_DIR; File directory = new File(HOME); - if (! directory.exists()){ + if(!directory.exists()) { directory.mkdir(); } String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - ReaderMapping r2 = new ReaderMapping(sampleRaw, sampleMB); - //System.out.println(r2.isMapped()); - Gson gson=new Gson(); - System.out.println(gson.toJson(r2.getFormatProperties())); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader mr = gr.getReader(); + MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, -1, clen, -1, -1); -// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); -// -// MatrixReader mr= gr.getReader(); -// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, -1, clen, -1, -1); + int a = 100; } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java new file mode 100644 index 
00000000000..3bc13086f66 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.iogen.Identify; + +import org.apache.sysds.test.functions.iogen.GenerateReaderFrameTest; +import org.junit.Test; + +public class FrameGenerateReaderCSVTest extends GenerateReaderFrameTest { + + private final static String TEST_NAME = "FrameGenerateReaderCSVTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + private void extractSampleRawCSV(String separator) { + int nrows = data.length; + int ncols = data[0].length; + StringBuilder sb = new StringBuilder(); + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + sb.append(data[r][c]); + if(c != ncols - 1) + sb.append(separator); + } + if(r != nrows - 1) + sb.append("\n"); + } + sampleRaw = sb.toString(); + } + + @Test + public void test1() { + String[] naStrings = {}; + String separator = ","; + generateRandomData(10, 5, 1, 100, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test2() { + String[] naStrings = 
{"NULL", "inf", "NaN"}; + String separator = ","; + generateRandomData(10, 10, -10, 10, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test3() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "****"; + generateRandomData(100, 500, -10, 10, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test4() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = ","; + generateRandomData(10, 10, -10, 10, 0.7, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test5() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = ",,,,"; + generateRandomData(10, 10, -10, 10, 0.5, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test6() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + generateRandomData(1000, 100, -10, 10, 0.4, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test7() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + generateRandomData(1000, 100, -10, 10, 0.8, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test8() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + generateRandomData(10000, 100, -10, 10, 0.5, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 6ae3f3157ed..9a7ece6a4fa 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -79,5 
+79,55 @@ public void test3() { runGenerateReaderTest(); } + @Test + public void test4() { + String[] naString = {"NaN"}; + generateRandomCSV(20, 20, -10, 10, 1, ",", naString); + runGenerateReaderTest(); + } + + @Test + public void test5() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"name\":3,\"password\":4}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + @Test + public void test6() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + + @Test + public void test7() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + + @Test + public void test8() { + sampleRaw = "1,1,10\n" + + "1,2,20\n" + + "1,3,30\n" + + "2,2,40\n" + + "3,2,50\n"; + + sampleMatrix = new 
double[][] {{10,20,30}, {0,40,0}, {0,50,0}}; + runGenerateReaderTest(); + } } From c7daac65b0af7d97853d81efced9a01a229690f9 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 22 Jan 2022 02:07:34 +0100 Subject: [PATCH 08/84] Add Multi Line Identification --- .../sysds/runtime/iogen/ColKeyTrie.java | 171 ++++++ .../sysds/runtime/iogen/ColKeyTrieNode.java | 76 +++ .../sysds/runtime/iogen/ColumnProperties.java | 35 ++ .../sysds/runtime/iogen/CustomProperties.java | 37 +- .../runtime/iogen/FormatIdentifying.java | 543 +++++++++++++++--- .../sysds/runtime/iogen/MappingTrie.java | 13 +- .../apache/sysds/runtime/iogen/RawIndex.java | 3 +- .../runtime/iogen/codegen/FrameCodeGen.java | 2 +- .../runtime/iogen/codegen/MatrixCodeGen.java | 2 +- 9 files changed, 765 insertions(+), 117 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java new file mode 100644 index 00000000000..3b13e0cf9c4 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java @@ -0,0 +1,171 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.sysds.runtime.iogen; + +import java.util.ArrayList; +import java.util.HashSet; + +public class ColKeyTrie { + + private ColKeyTrieNode rootPrefixKeys; + private ColKeyTrieNode rootSuffixKeys; + private ArrayList> prefixKeyPattern; + + public ColKeyTrie(ArrayList> prefixKeyPattern) { + this.rootPrefixKeys = new ColKeyTrieNode("RootPrefixKeys"); + this.rootSuffixKeys = new ColKeyTrieNode("RootSuffixKeys"); + this.prefixKeyPattern = prefixKeyPattern; + } + + public ColKeyTrie(String colDelim) { + this.rootPrefixKeys = new ColKeyTrieNode("RootPrefixKeys"); + this.rootSuffixKeys = new ColKeyTrieNode("RootSuffixKeys"); + this.prefixKeyPattern = null; + + ColKeyTrieNode newNode; + newNode = new ColKeyTrieNode(colDelim); + this.rootPrefixKeys.getChildren().put(colDelim, newNode); + } + + public void insertPrefixKeys(ArrayList keys) { + this.insertKeys(keys, rootPrefixKeys); + } + + public void insertSuffixKeys(ArrayList keys) { + this.insertKeys(keys, rootSuffixKeys); + } + + public void insertSuffixKeys(char[] keys) { + ArrayList charList = new ArrayList<>(); + for(Character ch : keys) + charList.add(ch.toString()); + this.insertKeys(charList, rootSuffixKeys); + } + + public void setAPrefixPath(ArrayList keys) { + ColKeyTrieNode currentNode = rootPrefixKeys; + for(String key : keys) { + if(currentNode.getChildren().containsKey(key)) { + currentNode = currentNode.getChildren().get(key); + currentNode.setCheck(true); + } + } + } + + private void insertKeys(ArrayList keys, ColKeyTrieNode root) { + ColKeyTrieNode currentNode = 
root; + int index = 0; + for(String key : keys) { + if(currentNode.getChildren().containsKey(key)) { + currentNode.countPP(); + currentNode = currentNode.getChildren().get(key); + currentNode.countPP(); + index++; + } + else + break; + } + + ColKeyTrieNode newNode; + for(int i = index; i < keys.size(); i++) { + newNode = new ColKeyTrieNode(keys.get(i)); + currentNode.getChildren().put(keys.get(i), newNode); + currentNode = newNode; + } + } + + public ArrayList> getPrefixKeyPatterns() { + if(this.prefixKeyPattern!=null) + return prefixKeyPattern; + else + return getKeyPatterns(rootPrefixKeys); + } + + public ArrayList> getSuffixKeyPatterns() { + ArrayList> result = new ArrayList<>(); + for(String k : rootSuffixKeys.getChildren().keySet()) { + ColKeyTrieNode node = rootSuffixKeys.getChildren().get(k); + ArrayList nk = new ArrayList<>(); + nk.add(k); + int maxCount = node.getCount(); + getKeyPatterns2(node, result, nk, maxCount); + } + return result; + } + + private ArrayList> getKeyPatterns(ColKeyTrieNode root) { + ArrayList> result = new ArrayList<>(); + getKeyPatterns(root, result, new ArrayList<>()); + return result; + } + + private void getKeyPatterns(ColKeyTrieNode node, ArrayList> result, ArrayList nodeKeys) { + + if(node.getChildren().size() == 0) { + result.add(nodeKeys); + nodeKeys = new ArrayList<>(); + } + else { + for(String k : node.getChildren().keySet()) { + ColKeyTrieNode child = node.getChildren().get(k); + ArrayList tmpKeys = new ArrayList<>(); + tmpKeys.addAll(nodeKeys); + tmpKeys.add(k); + getKeyPatterns(child, result, tmpKeys); + } + } + } + + private void getKeyPatterns2(ColKeyTrieNode node, ArrayList> result, ArrayList nodeKeys, + int maxCount) { + + if(node.getChildren().size() == 1 && node.getCount() == maxCount) { + String k = node.getChildren().keySet().iterator().next(); + ColKeyTrieNode child = node.getChildren().get(k); + ArrayList tmpKeys = new ArrayList<>(); + tmpKeys.addAll(nodeKeys); + tmpKeys.add(k); + getKeyPatterns2(child, result, 
tmpKeys, maxCount); + } + else + result.add(nodeKeys); + + } + + public void insertPrefixKeysConcurrent(HashSet keys) { + insertPrefixKeysConcurrent(rootPrefixKeys, keys); + } + + private void insertPrefixKeysConcurrent(ColKeyTrieNode node, HashSet keys) { + if(node.getChildren().size() == 0) { + for(String k : keys) { + ColKeyTrieNode newNode = new ColKeyTrieNode(k); + node.getChildren().put(k, newNode); + } + } + else { + for(String childKey : node.getChildren().keySet()) { + ColKeyTrieNode child = node.getChildren().get(childKey); + insertPrefixKeysConcurrent(child, keys); + } + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java new file mode 100644 index 00000000000..ce08825681e --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +package org.apache.sysds.runtime.iogen; + +import java.util.HashMap; +import java.util.Map; + +public class ColKeyTrieNode { + private final Map children; + private String key; + private boolean check; + private int count; + + public ColKeyTrieNode() { + this.children = new HashMap<>(); + this.check = false; + this.count = 1; + } + + public ColKeyTrieNode(String key) { + this.children = new HashMap<>(); + this.key = key; + this.check = false; + this.count = 1; + } + + public void countPP() { + this.count++; + } + + public Map getChildren() { + return children; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public boolean isCheck() { + return check; + } + + public void setCheck(boolean check) { + this.check = check; + } + + public int getCount() { + return count; + } + + public void setCount(int count) { + this.count = count; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java new file mode 100644 index 00000000000..75785a057ce --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java @@ -0,0 +1,35 @@ +package org.apache.sysds.runtime.iogen; + +import java.util.ArrayList; +import java.util.HashSet; + +public class ColumnProperties { + + private ArrayList> keyPatterns; + private HashSet endWithValueString; + private ArrayList> nextToPatterns; + + public ArrayList> getKeyPatterns() { + return keyPatterns; + } + + public void setKeyPatterns(ArrayList> keyPatterns) { + this.keyPatterns = keyPatterns; + } + + public HashSet getEndWithValueString() { + return endWithValueString; + } + + public void setEndWithValueString(HashSet endWithValueString) { + this.endWithValueString = endWithValueString; + } + + public ArrayList> getNextToPatterns() { + return nextToPatterns; + } + + public void setNextToPatterns(ArrayList> nextToPatterns) { + this.nextToPatterns = 
nextToPatterns; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 2d834fda565..f614c02d7b7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -29,37 +29,34 @@ public class CustomProperties extends FileFormatProperties implements Serializab public enum IndexProperties { IDENTIFY, PREFIX, KEY; - @Override public String toString() { return this.name().toUpperCase(); } } - private ArrayList[] colKeyPattern; - private HashSet[] endWithValueString; + private ColKeyTrie[] colKeyPattern; private Types.ValueType[] schema; private IndexProperties rowIndex; - private IndexProperties colIndex; - public CustomProperties(ArrayList[] colKeyPattern, HashSet[] endWithValueString) { + public CustomProperties(ColKeyTrie[] colKeyPattern, IndexProperties rowIndex) { this.colKeyPattern = colKeyPattern; - this.endWithValueString = endWithValueString; + this.rowIndex = rowIndex; } - public ArrayList[] getColKeyPattern() { + public ColKeyTrie[] getColKeyPattern() { return colKeyPattern; } - public void setColKeyPattern(ArrayList[] colKeyPattern) { + public void setColKeyPattern(ColKeyTrie[] colKeyPattern) { this.colKeyPattern = colKeyPattern; } - public HashSet[] getEndWithValueString() { - return endWithValueString; + public Types.ValueType[] getSchema() { + return schema; } - public void setEndWithValueString(HashSet[] endWithValueString) { - this.endWithValueString = endWithValueString; + public void setSchema(Types.ValueType[] schema) { + this.schema = schema; } public IndexProperties getRowIndex() { @@ -69,20 +66,4 @@ public IndexProperties getRowIndex() { public void setRowIndex(IndexProperties rowIndex) { this.rowIndex = rowIndex; } - - public IndexProperties getColIndex() { - return colIndex; - } - - public void setColIndex(IndexProperties colIndex) { - this.colIndex = 
colIndex; - } - - public Types.ValueType[] getSchema() { - return schema; - } - - public void setSchema(Types.ValueType[] schema) { - this.schema = schema; - } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index bfa77734361..05330a53eac 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -19,18 +19,19 @@ package org.apache.sysds.runtime.iogen; -import com.google.gson.Gson; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.Pair; import java.util.ArrayList; +import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; public class FormatIdentifying { private int[][] mapRow; + private int[] mapRowPrevious; private int[][] mapCol; private int[][] mapLen; private ArrayList sampleRawIndexes; @@ -38,13 +39,15 @@ public class FormatIdentifying { private static int nrows; private static int ncols; private int nlines; + private int windowSize = 20; + private int suffixStringLength = 100; private final boolean isMatrix; private int colIndexBeginFrom; private int rowIndexBeginFrom; - private ReaderMapping mappingValues; private CustomProperties properties; + public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { this.mappingValues = new ReaderMapping(raw, matrix); this.isMatrix = true; @@ -63,108 +66,173 @@ private void runIdentification() { mapCol = mappingValues.getMapCol(); mapLen = mappingValues.getMapLen(); sampleRawIndexes = mappingValues.getSampleRawIndexes(); + mapRowPrevious = new int[ncols]; + + for(int c=0; c< ncols; c++) + mapRowPrevious[c] = 0; nrows = mappingValues.getNrows(); ncols = mappingValues.getNcols(); nlines = mappingValues.getNlines(); - // Build a Key-Pattern foreach column - Pair[], HashSet[]> 
patternPair = buildKeyPattern(); - properties = new CustomProperties(patternPair.getKey(), patternPair.getValue()); + // Check the map row: + // If all cells of a row mapped to a single line of sample raw, it is a single row mapping + // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping + + boolean isSingleRow = true; + for(int r=0; r[], HashSet[]> patternPair = buildKeyPattern(); +// properties = new CustomProperties(patternPair.getKey(), patternPair.getValue()); +// +// // Check the row index format +// //verifyColsAtAllLines(patternPair.getKey()); +// +// properties.setRowIndex(CustomProperties.IndexProperties.IDENTIFY); } public CustomProperties getFormatProperties() { return properties; } - private Pair[], HashSet[]> buildKeyPattern() { + private ColKeyTrie[] buildColsKeyPatternSingleRow() { + Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); + ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; + + for(int c=0; c> keyPatterns; + do { + keyPatterns = trie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + } - ArrayList[] colKeys = new ArrayList[ncols]; - HashSet[] colKeyEndWithValueStrings = new HashSet[ncols]; - for(int c = 0; c < ncols; c++) { - Pair, HashSet> pair = buildKeyPatternForAColumn(c); - if(pair != null) { - colKeys[c] = pair.getKey(); - colKeyEndWithValueStrings[c] = pair.getValue(); - } - else { - return null; + if(!check){ + flagReconstruct = trie.reConstruct(); + if(!flagReconstruct) + break; + } + }while(!check); + + if(check){ + colKeyPattens[c] = new ColKeyTrie(keyPatterns); + for(String suffix: suffixStrings[c]) { + colKeyPattens[c].insertSuffixKeys(suffix.substring(0,Math.min(suffixStringLength, suffix.length())).toCharArray()); + } } } - return new 
Pair<>(colKeys, colKeyEndWithValueStrings); + return colKeyPattens; +// +// /////////////////////////////////////////////////////// +// +// Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfCols(true); +// ArrayList[] suffixStrings = extractAllSuffixStringsOfCols(); +// +// ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; +// +// ColKeyTrie[] colKeys = new ArrayList[ncols]; +// HashSet[] colKeyEndWithValueStrings = new HashSet[ncols]; +// +// // -------------------------------------- +// for(int c = 0; c < ncols; c++) { +// Pair, HashSet> pair = buildKeyPatternForAColumn(c, mapRowPrevious[c]); +// if(pair != null) { +// colKeys[c] = pair.getKey(); +// colKeyEndWithValueStrings[c] = pair.getValue(); +// } +// else { +// return null; +// } +// } +// return new Pair<>(colKeys, colKeyEndWithValueStrings); } - private Pair, HashSet> buildKeyPatternForAColumn(int colIndex) { - ArrayList> prefixStringAndLineNumber = getAllPrefixStringsOfAColumn(colIndex); - MappingTrie trie = new MappingTrie(); - for(Pair p : prefixStringAndLineNumber) { - trie.reverseInsert(p.getKey(), p.getValue()); - } - ArrayList> keys = trie.getAllSequentialKeys(); - HashSet endWithValueString = null; - boolean flagReconstruct; - int selectedIndex = -1; - do { - int index = 0; - for(ArrayList key : keys) { - endWithValueString = verifyColKeyInALine(colIndex, key); - if(endWithValueString != null) { - selectedIndex = index; - break; + // Get all prefix strings of a column + public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { + ArrayList[] prefixStrings = new ArrayList[ncols]; + ArrayList[] rowIndexes = new ArrayList[ncols]; + for(int c=0; c< ncols; c++){ + ArrayList ri = new ArrayList<>(); + ArrayList prefixString = new ArrayList<>(); + for(int r=0; r(keys.get(selectedIndex), endWithValueString); - else - return null; + return new Pair<>(prefixStrings, rowIndexes); } - // Get all prefix strings of a column - private ArrayList> 
getAllPrefixStringsOfAColumn(int colIndex) { - ArrayList> prefixStringAndLineNumber = new ArrayList<>(); - int rowIndex; - for(int r = 0; r < nrows; r++) { - rowIndex = mapRow[r][colIndex]; - if(rowIndex != -1) { - prefixStringAndLineNumber.add( - new Pair<>(sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]), rowIndex)); + private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { + ArrayList[] result = new ArrayList[ncols]; + for(int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][c]; + if(rowIndex == -1) + continue; + String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); + result[c].add(str); } } - return prefixStringAndLineNumber; + return result; } // Validate a key in a row of sample raw data - private HashSet verifyColKeyInALine(int colIndex, ArrayList key) { - + private HashSet verifyColKeyInALine(int colIndex, int pLines, ArrayList key) { boolean flag = true; HashSet endWithValueString = new HashSet<>(); for(int r = 0; r < nrows; r++) { int rowIndex = mapRow[r][colIndex]; if(rowIndex != -1) { - RawIndex ri = sampleRawIndexes.get(rowIndex); + Pair pair = getPreviousLines(rowIndex - pLines, rowIndex); int currPos = 0; for(String k : key) { - int index = ri.getRaw().indexOf(k, currPos); + int index = pair.getKey().indexOf(k, currPos); if(index != -1) currPos = index + k.length(); else { @@ -172,9 +240,9 @@ private HashSet verifyColKeyInALine(int colIndex, ArrayList key) break; } } - int endDelimPos = mapCol[r][colIndex] + mapLen[r][colIndex]; - endWithValueString.add(ri.getSubString(endDelimPos, Math.min(endDelimPos + 1, ri.getRawLength()))); - if(!flag || currPos != mapCol[r][colIndex]) { + int endDelimPos = pair.getValue() + mapCol[r][colIndex] + mapLen[r][colIndex]; + endWithValueString.add(pair.getKey().substring(endDelimPos, Math.min(endDelimPos + 1, pair.getKey().length()))); + if(!flag || currPos != pair.getValue() + 
mapCol[r][colIndex]) { return null; } } @@ -188,16 +256,38 @@ private HashSet verifyColKeyInALine(int colIndex, ArrayList key) private void verifyColsAtAllLines(ArrayList[] colKeyPattern){ Pair, HashSet>[] colsRowsLineNumbers = new Pair[ncols]; - boolean isRowIdentified; + boolean isRowIdentified = true; HashSet allCoveredLines = new HashSet<>(); + ArrayList colCandidatesForRowKey = new ArrayList<>(); + for(int c = 0; c < ncols; c++){ - Pair, HashSet> colRowLineNumbers = verifyColKeyAtAllLines(c, colKeyPattern[c]); - colsRowsLineNumbers[c] = colRowLineNumbers; - allCoveredLines.addAll(colRowLineNumbers.getKey()); + Pair, HashSet> colRowLineNumbers = verifyColKeyAtAllLines(c, colKeyPattern[c], mapRowPrevious[c]); + if(colRowLineNumbers != null) { + colsRowsLineNumbers[c] = colRowLineNumbers; + allCoveredLines.addAll(colRowLineNumbers.getKey()); + if(colRowLineNumbers.getValue().size() == nrows) + colCandidatesForRowKey.add(c); + } + else { + // We have to find new key for column + isRowIdentified = false; + // ***************************************************************************************** + //buildKeyPatternForAColumn() + } + } + isRowIdentified = allCoveredLines.size() == nrows | isRowIdentified; + + // 1. find an interested col as a key. 
The prerequisite is all values have to be not null + if(colCandidatesForRowKey.size() > 0){ + } + else { + // Here we have to find a hidden key + } + + - isRowIdentified = allCoveredLines.size() == nrows; } // Check the sequential list of keys are on a string @@ -219,9 +309,16 @@ private Integer getIndexOfKeysOnString(String str, ArrayList key, int be return -1; } - // Check the row index pattern of a column - private Pair, HashSet> verifyColKeyAtAllLines(int colIndex, ArrayList key) { + private Pair getPreviousLines(int beginLine, int endLine){ + StringBuilder sb = new StringBuilder(); + for(int i= Math.max(0, beginLine); i <= endLine; i++) + sb.append(sampleRawIndexes.get(i).getRaw()); + String str = sb.toString(); + return new Pair<>(str, str.length() - sampleRawIndexes.get(endLine).getRawLength()); + } + // Check the row index pattern of a column + private Pair, HashSet> verifyColKeyAtAllLines(int colIndex, ArrayList key, int pLines) { HashSet lineNumbers = new HashSet<>(); HashSet rowNumbers = new HashSet<>(); HashMap mapColRow = new HashMap<>(); @@ -235,10 +332,10 @@ private Pair, HashSet> verifyColKeyAtAllLines(int colI for(int l = 0; l < nlines; l++) { boolean flag = true; - RawIndex ri = sampleRawIndexes.get(l); - int index = getIndexOfKeysOnString(ri.getRaw(), key, 0); + Pair pair = getPreviousLines(l - pLines, l); + int index = getIndexOfKeysOnString(pair.getKey(), key, 0); if(index != -1) { - if(mapColRow.containsKey(l) && mapColRow.get(l) == index) + if(mapColRow.containsKey(l) && mapColRow.get(l) == index - pair.getValue()) lineNumbers.add(l); else flag = false; @@ -249,4 +346,286 @@ private Pair, HashSet> verifyColKeyAtAllLines(int colI return new Pair<>(lineNumbers, rowNumbers); } + + ///////////////////////////////////////////////////////////////////////////// + // Methods For Multi Lines Mapping // + //////////////////////////////////////////////////////////////////////////// + // This implementation is for nested datasets are scattered on 
multiple lines + // The following steps are required: + // 1. Extract all prefix strings per column + // 2. Build key pattern tree for each column + // 3. Build key pattern for end of values + + // Build key pattern tree for each column + private ColKeyTrie[] buildColsKeyPatternMultiRow(){ + Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); + + ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; + for(int c=0; c intersect = new HashSet<>(); + intersect.add(colDelim); + + ColKeyTrie trie = new ColKeyTrie(colDelim); + ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); + boolean check; + do { + ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); + check = false; + for(ArrayList keyPattern: keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if(newCheck){ + trie.setAPrefixPath(keyPattern); + } + } + + if(!check){ + remainedPrefixes.clear(); + boolean flag = true; + for(ArrayList keyPattern: keyPatterns){ + ArrayList remainedPrefix = new ArrayList<>(); + for(String ps : prefixStrings.getKey()[c]) + remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); + + intersect = findStartWithIntersectOfStrings(remainedPrefix); + if(intersect != null) + trie.insertPrefixKeysConcurrent(intersect); + else { + remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); + flag = false; + break; + } + } + if(!flag) + break; + } + } + while(!check); + + // Suffix pattern is based on char, so we need to extract all chars of a string + for(String suffix: suffixStrings[c]) { + trie.insertSuffixKeys(suffix.toCharArray()); + } + colKeyPattens[c] = trie; + } + return colKeyPattens; + } + + // Extract prefix strings: + private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse){ + + ArrayList[] result = new ArrayList[ncols]; + Pair[] minmax = new Pair[ncols]; + BitSet[] tmpUsedLines = new 
BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + for(int r=0; r(); + int min = 0; + int max = 0; + for(int r=0; r=0; i--) + if(usedLines[r].get(i)) { + lastLine = i; + break; + } + for(int i= lastLine; i 0 ) + sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); + } + String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][c]); + if(str.length() > 0 && !str.equals("\n")) + sb.append(str); + else if(lastLine < rowIndex) + sb.deleteCharAt(sb.length()-1); + + + if(reverse) + result[c].add(sb.reverse().toString()); + else + result[c].add(sb.toString()); + max = Math.max(max, sb.length()); + if(sb.length()< min || min == 0) + min = sb.length(); + minmax[c] = new Pair<>(min, max); + } + } + return new Pair<>(result, minmax); + } + + private String findStartWithIntersectOfStrings(ArrayList strList, int minLength){ + StringBuilder sb = new StringBuilder(); + int i = 0; + boolean flag = true; + do { + char ch = strList.get(0).charAt(i); + for(int j=1; j findStartWithIntersectOfStrings(ArrayList strList){ + // 1. Extract all substrings + // 2. 
Find intersection of substrings + + HashSet[] substrings = new HashSet[strList.size()]; + for(int i=0; i< strList.size(); i++) + substrings[i] = new HashSet<>(); + + for(int w = windowSize; w > 2; w--) { + for(int i=0; i totalIntersect = new HashSet<>(substrings[0]); + for(int r=1; r 0) + return totalIntersect; + + } + return null; + } + + private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys){ + for(String ps: prefixStrings){ + int currentPos = 0; + int patternCount = 0; + do { + currentPos = getIndexOfKeyPatternOnString(ps, keys, currentPos).getKey(); + if(currentPos == -1) + break; + else { + patternCount++; + currentPos++; + } + }while(true); + if(patternCount!=1) + return false; + } + return true; + } + + // Check the sequential list of keys are on a string + private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { + int currPos = beginPos; + boolean flag = true; + int startPos = -1; + for(String k : key) { + int index = str.indexOf(k, currPos); + if(index != -1) + currPos = index + k.length(); + else { + flag = false; + break; + } + if(startPos==-1) + startPos = currPos; + } + if(flag) + return new Pair<>(startPos, currPos+key.get(key.size()-1).length()); + else + return new Pair<>(-1,-1); + } + + private ArrayList getAllSubstringsOfAString(String str,int size){ + ArrayList result = new ArrayList<>(); + if(str == null) + return result; + for(int i = 0; i <= str.length() - size; i++){ + String s = str.substring(i, i + size); + if(!s.contains("\n")) + result.add(s); + } + return result; + } + + private String getRemainedSubstring(String str, ArrayList keys){ + boolean flag = true; + int currPos = 0; + for(String k : keys) { + int index = str.indexOf(k, currPos); + if(index != -1) + currPos = index + k.length(); + else { + flag = false; + break; + } + } + if(flag) + return str.substring(currPos); + else + return null; + } + + private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { + ArrayList[] result 
= new ArrayList[ncols]; + for(int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][c]; + if(rowIndex == -1) + continue; + StringBuilder sb = new StringBuilder(); + String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); + boolean enter = false; + if(str.length() > 0) { + sb.append(str); + enter = true; + } + + for(int i = rowIndex + 1; i < nlines; i++) { + str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); + if(str.length() > 0 && !enter) { + sb.append(str); + break; + } + } + if(sb.length() > 0) + sb.deleteCharAt(sb.length() - 1); + result[c].add(sb.toString()); + } + } + return result; + } + } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 3f8e97c2a71..0326d7371e1 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -32,12 +32,17 @@ public class MappingTrie { private MappingTrieNode root; - private int keyLevel; + private boolean inALine; public MappingTrie() { - root = new MappingTrieNode(MappingTrieNode.Type.INNER); - keyLevel = 0; + this.root = new MappingTrieNode(MappingTrieNode.Type.INNER); + this.keyLevel = 0; + this.inALine = true; + } + + public void setInALine(boolean inALine) { + this.inALine = inALine; } public void insert(String word, int rowIndex) { @@ -150,7 +155,7 @@ public String getIntersectOfChildren(MappingTrieNode node) { .sorted((o1, o2) -> o2.length() - o1.length()).collect(Collectors.toList()); for(String ssi : sortedList) { - if(keyLevel == 0) { + if(keyLevel == 0 && inALine) { boolean flagBest = true; for(String k : keys) { if(!k.startsWith(ssi)) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java 
b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 5df7b72b17e..c1d537b8d42 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -143,7 +143,8 @@ else if(i==rawLength-2){ if(!flag) eBitSet.set(i, false); } - extractNumericDotEActualValues(); + if(numberBitSet.cardinality() > 0) + extractNumericDotEActualValues(); } public Pair findValue(Object value, Types.ValueType valueType){ diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index e9a987d8146..b2b9f8b33be 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -71,7 +71,7 @@ public FrameCodeGen(CustomProperties properties, String className) { src.append("String str = value.toString(); \n"); src.append("strLen = str.length(); \n"); - ArrayList[] colKeyPattern = properties.getColKeyPattern(); + ArrayList[] colKeyPattern = null;//properties.getColKeyPattern(); CodeGenTrie trie = new CodeGenTrie(); for(int c = 0; c < colKeyPattern.length; c++) { trie.insert(c, properties.getSchema()[c], colKeyPattern[c]); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 7e4beb9333c..72d8635232e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -68,7 +68,7 @@ public String generateCodeJava() { src.append("while((str = br.readLine()) != null){ \n"); src.append("strLen = str.length(); \n"); - ArrayList[] colKeyPattern = properties.getColKeyPattern(); + ArrayList[] colKeyPattern = null; //properties.getColKeyPattern(); CodeGenTrie trie= new CodeGenTrie(); for(int c=0; c< colKeyPattern.length; c++){ 
trie.insert(c, Types.ValueType.FP64, colKeyPattern[c]); From 0ab6ddcf4e8439c9b71df81180c0ff7741f066c5 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 23 Jan 2022 00:54:33 +0100 Subject: [PATCH 09/84] Add Row Prefix Identification and Key Pattern Build for Row Index --- .../sysds/runtime/iogen/ColumnProperties.java | 35 -- .../sysds/runtime/iogen/CustomProperties.java | 17 +- .../runtime/iogen/FormatIdentifying.java | 308 +++++++++--------- .../iogen/{ColKeyTrie.java => KeyTrie.java} | 60 ++-- .../{ColKeyTrieNode.java => KeyTrieNode.java} | 10 +- .../sysds/runtime/iogen/MappingTrie.java | 3 +- 6 files changed, 199 insertions(+), 234 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java rename src/main/java/org/apache/sysds/runtime/iogen/{ColKeyTrie.java => KeyTrie.java} (69%) rename src/main/java/org/apache/sysds/runtime/iogen/{ColKeyTrieNode.java => KeyTrieNode.java} (88%) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java deleted file mode 100644 index 75785a057ce..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColumnProperties.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.apache.sysds.runtime.iogen; - -import java.util.ArrayList; -import java.util.HashSet; - -public class ColumnProperties { - - private ArrayList> keyPatterns; - private HashSet endWithValueString; - private ArrayList> nextToPatterns; - - public ArrayList> getKeyPatterns() { - return keyPatterns; - } - - public void setKeyPatterns(ArrayList> keyPatterns) { - this.keyPatterns = keyPatterns; - } - - public HashSet getEndWithValueString() { - return endWithValueString; - } - - public void setEndWithValueString(HashSet endWithValueString) { - this.endWithValueString = endWithValueString; - } - - public ArrayList> getNextToPatterns() { - return nextToPatterns; - } - - public void setNextToPatterns(ArrayList> nextToPatterns) { - 
this.nextToPatterns = nextToPatterns; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index f614c02d7b7..bb1f74e3ff4 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -22,8 +22,6 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatProperties; import java.io.Serializable; -import java.util.ArrayList; -import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { @@ -34,20 +32,27 @@ public enum IndexProperties { } } - private ColKeyTrie[] colKeyPattern; + private KeyTrie[] colKeyPattern; private Types.ValueType[] schema; private IndexProperties rowIndex; + private KeyTrie rowKeyPattern; - public CustomProperties(ColKeyTrie[] colKeyPattern, IndexProperties rowIndex) { + public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex) { this.colKeyPattern = colKeyPattern; this.rowIndex = rowIndex; } - public ColKeyTrie[] getColKeyPattern() { + public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex, KeyTrie rowKeyPattern) { + this.colKeyPattern = colKeyPattern; + this.rowIndex = rowIndex; + this.rowKeyPattern = rowKeyPattern; + } + + public KeyTrie[] getColKeyPattern() { return colKeyPattern; } - public void setColKeyPattern(ColKeyTrie[] colKeyPattern) { + public void setColKeyPattern(KeyTrie[] colKeyPattern) { this.colKeyPattern = colKeyPattern; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 05330a53eac..ed91f8b69f8 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -25,13 +25,12 @@ import java.util.ArrayList; import java.util.BitSet; 
-import java.util.HashMap; import java.util.HashSet; public class FormatIdentifying { private int[][] mapRow; - private int[] mapRowPrevious; + private int[] mapRowPrevious; private int[][] mapCol; private int[][] mapLen; private ArrayList sampleRawIndexes; @@ -41,22 +40,17 @@ public class FormatIdentifying { private int nlines; private int windowSize = 20; private int suffixStringLength = 100; - private final boolean isMatrix; - private int colIndexBeginFrom; - private int rowIndexBeginFrom; private ReaderMapping mappingValues; private CustomProperties properties; public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { this.mappingValues = new ReaderMapping(raw, matrix); - this.isMatrix = true; this.runIdentification(); } public FormatIdentifying(String raw, FrameBlock frame) throws Exception { this.mappingValues = new ReaderMapping(raw, frame); - this.isMatrix = false; this.runIdentification(); } @@ -96,37 +90,143 @@ private void runIdentification() { } } - ColKeyTrie[] colKeyPattern; + KeyTrie[] colKeyPattern; if(isSingleRow){ colKeyPattern = buildColsKeyPatternSingleRow(); properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); }else { - colKeyPattern = buildColsKeyPatternMultiRow(); - // TODO: distinguish row index is prefix or key + + // Check the row index is a prefix string in sample raw + // if the row indexes are in the prefix of values, so we need to build a key pattern + // to extract row indexes + // to understanding row indexes are in sample raw we check just 3 column of data + // for build a key pattern ro row indexes we just selected a row + boolean flag; + int numberOfSelectedCols = 3; + int begin = 0; + boolean check, flagReconstruct; + int selectedRowIndex = 1; + HashSet beginPos = new HashSet<>(); + KeyTrie rowKeyPattern = null; + + for(int c=0; c< Math.min(numberOfSelectedCols, ncols); c++){ + Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + 
ArrayList prefixStrings = colPrefixString.getKey(); + ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + ArrayList prefixRawIndex = new ArrayList<>(); + + MappingTrie trie = new MappingTrie(); + int ri = 0; + for(String ps: prefixStrings ) + trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + + + do { + flag = trie.reConstruct(); + }while(flag); + + ArrayList> keyPatterns = trie.getAllSequentialKeys(); + for(ArrayList kp: keyPatterns){ + for(String ps: prefixStrings){ + StringBuilder sb = new StringBuilder(); + int currPos = 0; + for(String k: kp){ + sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + currPos += sb.length() + k.length(); + } + prefixRawIndex.add(new RawIndex(sb.toString())); + } + } + + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + if(!flag) { + begin = 1; + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + } + if(!flag) { + beginPos.clear(); + break; + } + else + beginPos.add(begin); + if(c== numberOfSelectedCols -1){ + ArrayList rowPrefixStrings = new ArrayList<>(); + MappingTrie rowTrie = new MappingTrie(); + rowKeyPattern = new KeyTrie(); + for(int ci = 0; c < ncols; c++) { + int cri = mapRow[selectedRowIndex][c]; + if(cri != -1) { + String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[selectedRowIndex][ci]); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(selectedRowIndex+begin); + if(pair!=null) { + String pstr = str.substring(0, pair.getKey()); + if(pstr.length() > 0) { + rowPrefixStrings.add(pstr); + rowTrie.insert(pstr, 1); + } + rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey()+pair.getValue()).toCharArray()); + } + } + } + + do { + keyPatterns = rowTrie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); + check |= newCheck; + } + if(!check){ + flagReconstruct = trie.reConstruct(); + if(!flagReconstruct) + break; + } + }while(!check); + + 
rowKeyPattern.setPrefixKeyPattern(keyPatterns); + } + } + + if(beginPos.size() == 1){ + colKeyPattern = buildColsKeyPatternSingleRow(); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); + } + else { + colKeyPattern = buildColsKeyPatternMultiRow(); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.KEY); + } } - int a = 100; - - // Build a Key-Pattern foreach column -// Pair[], HashSet[]> patternPair = buildKeyPattern(); -// properties = new CustomProperties(patternPair.getKey(), patternPair.getValue()); -// -// // Check the row index format -// //verifyColsAtAllLines(patternPair.getKey()); -// -// properties.setRowIndex(CustomProperties.IndexProperties.IDENTIFY); + } + + private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex){ + for(int r=0;r[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); - ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; for(int c=0; c[], Pair[]> prefixStrings = extractAllPrefixStringsOfCols(true); -// ArrayList[] suffixStrings = extractAllSuffixStringsOfCols(); -// -// ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; -// -// ColKeyTrie[] colKeys = new ArrayList[ncols]; -// HashSet[] colKeyEndWithValueStrings = new HashSet[ncols]; -// -// // -------------------------------------- -// for(int c = 0; c < ncols; c++) { -// Pair, HashSet> pair = buildKeyPatternForAColumn(c, mapRowPrevious[c]); -// if(pair != null) { -// colKeys[c] = pair.getKey(); -// colKeyEndWithValueStrings[c] = pair.getValue(); -// } -// else { -// return null; -// } -// } -// return new Pair<>(colKeys, colKeyEndWithValueStrings); } // Get all prefix strings of a column @@ -188,21 +265,26 @@ public Pair[], ArrayList[]> extractAllPrefixStringsOf ArrayList[] prefixStrings = new ArrayList[ncols]; 
ArrayList[] rowIndexes = new ArrayList[ncols]; for(int c=0; c< ncols; c++){ - ArrayList ri = new ArrayList<>(); - ArrayList prefixString = new ArrayList<>(); - for(int r=0; r, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); + prefixStrings[c] = pair.getKey(); + rowIndexes[c] = pair.getValue(); + } + return new Pair<>(prefixStrings, rowIndexes); + } + + public Pair, ArrayList> extractAllPrefixStringsOfAColSingleLine(int colIndex, boolean reverse) { + ArrayList prefixStrings = new ArrayList(); + ArrayList rowIndexes = new ArrayList(); + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][colIndex]; + if(rowIndex != -1) { + rowIndexes.add(rowIndex); + String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); + if(reverse) + prefixStrings.add(new StringBuilder(str).reverse().toString()); + else + prefixStrings.add(str); } - prefixStrings[c] = prefixString; - rowIndexes[c] = ri; } return new Pair<>(prefixStrings, rowIndexes); } @@ -222,74 +304,6 @@ private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { return result; } - // Validate a key in a row of sample raw data - private HashSet verifyColKeyInALine(int colIndex, int pLines, ArrayList key) { - boolean flag = true; - HashSet endWithValueString = new HashSet<>(); - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][colIndex]; - if(rowIndex != -1) { - Pair pair = getPreviousLines(rowIndex - pLines, rowIndex); - int currPos = 0; - for(String k : key) { - int index = pair.getKey().indexOf(k, currPos); - if(index != -1) - currPos = index + k.length(); - else { - flag = false; - break; - } - } - int endDelimPos = pair.getValue() + mapCol[r][colIndex] + mapLen[r][colIndex]; - endWithValueString.add(pair.getKey().substring(endDelimPos, Math.min(endDelimPos + 1, pair.getKey().length()))); - if(!flag || currPos != pair.getValue() + mapCol[r][colIndex]) { - return null; - } - } - } - if(endWithValueString.size() == 0) - return null; - return 
endWithValueString; - } - - // Check the row index pattern of columns - private void verifyColsAtAllLines(ArrayList[] colKeyPattern){ - Pair, HashSet>[] colsRowsLineNumbers = new Pair[ncols]; - - boolean isRowIdentified = true; - - HashSet allCoveredLines = new HashSet<>(); - ArrayList colCandidatesForRowKey = new ArrayList<>(); - - for(int c = 0; c < ncols; c++){ - Pair, HashSet> colRowLineNumbers = verifyColKeyAtAllLines(c, colKeyPattern[c], mapRowPrevious[c]); - if(colRowLineNumbers != null) { - colsRowsLineNumbers[c] = colRowLineNumbers; - allCoveredLines.addAll(colRowLineNumbers.getKey()); - if(colRowLineNumbers.getValue().size() == nrows) - colCandidatesForRowKey.add(c); - } - else { - // We have to find new key for column - isRowIdentified = false; - // ***************************************************************************************** - //buildKeyPatternForAColumn() - } - } - isRowIdentified = allCoveredLines.size() == nrows | isRowIdentified; - - // 1. find an interested col as a key. 
The prerequisite is all values have to be not null - if(colCandidatesForRowKey.size() > 0){ - - } - else { - // Here we have to find a hidden key - } - - - - } - // Check the sequential list of keys are on a string private Integer getIndexOfKeysOnString(String str, ArrayList key, int beginPos) { int currPos = beginPos; @@ -317,36 +331,6 @@ private Pair getPreviousLines(int beginLine, int endLine){ return new Pair<>(str, str.length() - sampleRawIndexes.get(endLine).getRawLength()); } - // Check the row index pattern of a column - private Pair, HashSet> verifyColKeyAtAllLines(int colIndex, ArrayList key, int pLines) { - HashSet lineNumbers = new HashSet<>(); - HashSet rowNumbers = new HashSet<>(); - HashMap mapColRow = new HashMap<>(); - for(int r = 0; r < nrows; r++) { - int lineNumber = mapRow[r][colIndex]; - if(lineNumber != -1) { - mapColRow.put(lineNumber, mapCol[r][colIndex]); - rowNumbers.add(r); - } - } - - for(int l = 0; l < nlines; l++) { - boolean flag = true; - Pair pair = getPreviousLines(l - pLines, l); - int index = getIndexOfKeysOnString(pair.getKey(), key, 0); - if(index != -1) { - if(mapColRow.containsKey(l) && mapColRow.get(l) == index - pair.getValue()) - lineNumbers.add(l); - else - flag = false; - } - if(!flag) - return null; - } - return new Pair<>(lineNumbers, rowNumbers); - } - - ///////////////////////////////////////////////////////////////////////////// // Methods For Multi Lines Mapping // //////////////////////////////////////////////////////////////////////////// @@ -357,11 +341,11 @@ private Pair, HashSet> verifyColKeyAtAllLines(int colI // 3. 
Build key pattern for end of values // Build key pattern tree for each column - private ColKeyTrie[] buildColsKeyPatternMultiRow(){ + private KeyTrie[] buildColsKeyPatternMultiRow(){ Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); - ColKeyTrie[] colKeyPattens = new ColKeyTrie[ncols]; + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; for(int c=0; c intersect = new HashSet<>(); intersect.add(colDelim); - ColKeyTrie trie = new ColKeyTrie(colDelim); + KeyTrie trie = new KeyTrie(colDelim); ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); boolean check; do { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java similarity index 69% rename from src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java rename to src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java index 3b13e0cf9c4..8867c607231 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java @@ -23,25 +23,31 @@ import java.util.ArrayList; import java.util.HashSet; -public class ColKeyTrie { +public class KeyTrie { - private ColKeyTrieNode rootPrefixKeys; - private ColKeyTrieNode rootSuffixKeys; + private KeyTrieNode rootPrefixKeys; + private KeyTrieNode rootSuffixKeys; private ArrayList> prefixKeyPattern; - public ColKeyTrie(ArrayList> prefixKeyPattern) { - this.rootPrefixKeys = new ColKeyTrieNode("RootPrefixKeys"); - this.rootSuffixKeys = new ColKeyTrieNode("RootSuffixKeys"); + public KeyTrie() { + this.rootPrefixKeys = new KeyTrieNode("RootPrefixKeys"); + this.rootSuffixKeys = new KeyTrieNode("RootSuffixKeys"); + this.prefixKeyPattern = null; + } + + public KeyTrie(ArrayList> prefixKeyPattern) { + this.rootPrefixKeys = new KeyTrieNode("RootPrefixKeys"); + this.rootSuffixKeys = new KeyTrieNode("RootSuffixKeys"); this.prefixKeyPattern = 
prefixKeyPattern; } - public ColKeyTrie(String colDelim) { - this.rootPrefixKeys = new ColKeyTrieNode("RootPrefixKeys"); - this.rootSuffixKeys = new ColKeyTrieNode("RootSuffixKeys"); + public KeyTrie(String colDelim) { + this.rootPrefixKeys = new KeyTrieNode("RootPrefixKeys"); + this.rootSuffixKeys = new KeyTrieNode("RootSuffixKeys"); this.prefixKeyPattern = null; - ColKeyTrieNode newNode; - newNode = new ColKeyTrieNode(colDelim); + KeyTrieNode newNode; + newNode = new KeyTrieNode(colDelim); this.rootPrefixKeys.getChildren().put(colDelim, newNode); } @@ -61,7 +67,7 @@ public void insertSuffixKeys(char[] keys) { } public void setAPrefixPath(ArrayList keys) { - ColKeyTrieNode currentNode = rootPrefixKeys; + KeyTrieNode currentNode = rootPrefixKeys; for(String key : keys) { if(currentNode.getChildren().containsKey(key)) { currentNode = currentNode.getChildren().get(key); @@ -70,8 +76,8 @@ public void setAPrefixPath(ArrayList keys) { } } - private void insertKeys(ArrayList keys, ColKeyTrieNode root) { - ColKeyTrieNode currentNode = root; + private void insertKeys(ArrayList keys, KeyTrieNode root) { + KeyTrieNode currentNode = root; int index = 0; for(String key : keys) { if(currentNode.getChildren().containsKey(key)) { @@ -84,9 +90,9 @@ private void insertKeys(ArrayList keys, ColKeyTrieNode root) { break; } - ColKeyTrieNode newNode; + KeyTrieNode newNode; for(int i = index; i < keys.size(); i++) { - newNode = new ColKeyTrieNode(keys.get(i)); + newNode = new KeyTrieNode(keys.get(i)); currentNode.getChildren().put(keys.get(i), newNode); currentNode = newNode; } @@ -102,7 +108,7 @@ public ArrayList> getPrefixKeyPatterns() { public ArrayList> getSuffixKeyPatterns() { ArrayList> result = new ArrayList<>(); for(String k : rootSuffixKeys.getChildren().keySet()) { - ColKeyTrieNode node = rootSuffixKeys.getChildren().get(k); + KeyTrieNode node = rootSuffixKeys.getChildren().get(k); ArrayList nk = new ArrayList<>(); nk.add(k); int maxCount = node.getCount(); @@ -111,13 +117,13 
@@ public ArrayList> getSuffixKeyPatterns() { return result; } - private ArrayList> getKeyPatterns(ColKeyTrieNode root) { + private ArrayList> getKeyPatterns(KeyTrieNode root) { ArrayList> result = new ArrayList<>(); getKeyPatterns(root, result, new ArrayList<>()); return result; } - private void getKeyPatterns(ColKeyTrieNode node, ArrayList> result, ArrayList nodeKeys) { + private void getKeyPatterns(KeyTrieNode node, ArrayList> result, ArrayList nodeKeys) { if(node.getChildren().size() == 0) { result.add(nodeKeys); @@ -125,7 +131,7 @@ private void getKeyPatterns(ColKeyTrieNode node, ArrayList> re } else { for(String k : node.getChildren().keySet()) { - ColKeyTrieNode child = node.getChildren().get(k); + KeyTrieNode child = node.getChildren().get(k); ArrayList tmpKeys = new ArrayList<>(); tmpKeys.addAll(nodeKeys); tmpKeys.add(k); @@ -134,12 +140,12 @@ private void getKeyPatterns(ColKeyTrieNode node, ArrayList> re } } - private void getKeyPatterns2(ColKeyTrieNode node, ArrayList> result, ArrayList nodeKeys, + private void getKeyPatterns2(KeyTrieNode node, ArrayList> result, ArrayList nodeKeys, int maxCount) { if(node.getChildren().size() == 1 && node.getCount() == maxCount) { String k = node.getChildren().keySet().iterator().next(); - ColKeyTrieNode child = node.getChildren().get(k); + KeyTrieNode child = node.getChildren().get(k); ArrayList tmpKeys = new ArrayList<>(); tmpKeys.addAll(nodeKeys); tmpKeys.add(k); @@ -154,18 +160,22 @@ public void insertPrefixKeysConcurrent(HashSet keys) { insertPrefixKeysConcurrent(rootPrefixKeys, keys); } - private void insertPrefixKeysConcurrent(ColKeyTrieNode node, HashSet keys) { + private void insertPrefixKeysConcurrent(KeyTrieNode node, HashSet keys) { if(node.getChildren().size() == 0) { for(String k : keys) { - ColKeyTrieNode newNode = new ColKeyTrieNode(k); + KeyTrieNode newNode = new KeyTrieNode(k); node.getChildren().put(k, newNode); } } else { for(String childKey : node.getChildren().keySet()) { - ColKeyTrieNode child = 
node.getChildren().get(childKey); + KeyTrieNode child = node.getChildren().get(childKey); insertPrefixKeysConcurrent(child, keys); } } } + + public void setPrefixKeyPattern(ArrayList> prefixKeyPattern) { + this.prefixKeyPattern = prefixKeyPattern; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrieNode.java similarity index 88% rename from src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java rename to src/main/java/org/apache/sysds/runtime/iogen/KeyTrieNode.java index ce08825681e..0697ba01b12 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColKeyTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrieNode.java @@ -23,19 +23,19 @@ import java.util.HashMap; import java.util.Map; -public class ColKeyTrieNode { - private final Map children; +public class KeyTrieNode { + private final Map children; private String key; private boolean check; private int count; - public ColKeyTrieNode() { + public KeyTrieNode() { this.children = new HashMap<>(); this.check = false; this.count = 1; } - public ColKeyTrieNode(String key) { + public KeyTrieNode(String key) { this.children = new HashMap<>(); this.key = key; this.check = false; @@ -46,7 +46,7 @@ public void countPP() { this.count++; } - public Map getChildren() { + public Map getChildren() { return children; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 0326d7371e1..58c3857f79d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -34,6 +34,7 @@ public class MappingTrie { private MappingTrieNode root; private int keyLevel; private boolean inALine; + private int windowSize = 50; public MappingTrie() { this.root = new MappingTrieNode(MappingTrieNode.Type.INNER); @@ -95,7 +96,7 @@ private void getAllSubStrings(HashSet 
result, StringBuilder sb) { if(sb.length() == 1) result.add(sb.toString()); else { - for(int j = 1; j <= sb.length(); j++) { + for(int j = 1; j <= Math.min(sb.length(), windowSize); j++) { for(int k = 0; k <= sb.length() - j; k++) { result.add(sb.substring(k, k + j)); } From 239e7816638c9baf665bda0e6995045d4e659a33 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 23 Jan 2022 02:09:00 +0100 Subject: [PATCH 10/84] Update CodeGen --- .../sysds/runtime/iogen/CustomProperties.java | 8 +++ .../sysds/runtime/iogen/ReaderMapping.java | 16 +---- .../runtime/iogen/codegen/CodeGenTrie.java | 62 ++++++++++++++----- .../iogen/codegen/CodeGenTrieNode.java | 21 +++++-- .../runtime/iogen/codegen/FrameCodeGen.java | 11 +--- .../runtime/iogen/codegen/MatrixCodeGen.java | 10 +-- 6 files changed, 82 insertions(+), 46 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index bb1f74e3ff4..373a4fb3eb0 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -71,4 +71,12 @@ public IndexProperties getRowIndex() { public void setRowIndex(IndexProperties rowIndex) { this.rowIndex = rowIndex; } + + public KeyTrie getRowKeyPattern() { + return rowKeyPattern; + } + + public void setRowKeyPattern(KeyTrie rowKeyPattern) { + this.rowKeyPattern = rowKeyPattern; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index 736cbdd4f66..b95afd7c0b5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -37,8 +37,8 @@ public class ReaderMapping { private int[][] mapCol; private int[][] mapLen; private boolean mapped; - private int nrows; - private int ncols; + private final int nrows; + private 
final int ncols; private int nlines; private ArrayList sampleRawIndexes; private MatrixBlock sampleMatrix; @@ -90,17 +90,7 @@ private void ReadRaw(String raw) throws Exception { this.nlines = nlines; } - private boolean isSchemaNumeric() { - if(isMatrix) - return true; - - boolean result = true; - for(Types.ValueType vt : schema) - result &= vt.isNumeric(); - return result; - } - - private void runMapping(boolean isIndexMapping) throws Exception { + private void runMapping(boolean isIndexMapping) { mapped = findMapping(isIndexMapping); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index f593dc7325c..809918c475d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -20,43 +20,78 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.KeyTrie; + import java.util.ArrayList; import java.util.HashSet; import java.util.Random; public class CodeGenTrie { - private CodeGenTrieNode root; + private final CodeGenTrieNode rootCol; + private final CodeGenTrieNode rootRow; + private final CustomProperties properties; + private final String destination; - public CodeGenTrie() { - root = new CodeGenTrieNode(); + public CodeGenTrie(CustomProperties properties, String destination){ + this.rootCol = new CodeGenTrieNode(CodeGenTrieNode.NodeType.COL); + this.rootRow = new CodeGenTrieNode(CodeGenTrieNode.NodeType.ROW); + this.properties = properties; + this.destination = destination; + buildPrefixTree(); } - public void insert(int colIndex, Types.ValueType valueType, ArrayList keys) { + // Build Trie for Col and Row Key Patterns + private void buildPrefixTree(){ + for(int c=0; c< properties.getColKeyPattern().length; c++){ + KeyTrie keyTrie = 
properties.getColKeyPattern()[c]; + Types.ValueType vt = properties.getSchema() == null? Types.ValueType.FP64 : properties.getSchema()[c]; + for(ArrayList keys : keyTrie.getPrefixKeyPatterns()) + this.insert(rootCol, c, vt, keys); + } + if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX){ + KeyTrie keyTrie = properties.getRowKeyPattern(); + Types.ValueType vt = Types.ValueType.FP32; + for(ArrayList keys : keyTrie.getPrefixKeyPatterns()) + this.insert(rootCol, -1, vt, keys); + } + } + + private void insert(CodeGenTrieNode root ,int index, Types.ValueType valueType, ArrayList keys) { CodeGenTrieNode currentNode = root; - int index = 0; + int rci = 0; for(String key : keys) { if(currentNode.getChildren().containsKey(key)) { currentNode = currentNode.getChildren().get(key); - index++; + rci++; } else break; } - CodeGenTrieNode newNode; - for(int i = index; i < keys.size(); i++) { - newNode = new CodeGenTrieNode(i == keys.size() - 1, colIndex, valueType, keys.get(i), new HashSet<>()); + for(int i = rci; i < keys.size(); i++) { + newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), root.getType()); currentNode.getChildren().put(keys.get(i), newNode); currentNode = newNode; } } - public String getJavaCode(String destination){ + + public String getJavaCode(){ StringBuilder src = new StringBuilder(); - getJavaCode(root, src, destination, "0"); + switch(properties.getRowIndex()){ + case IDENTIFY: + getJavaRowIdentifyCode(rootCol, src, "0"); + break; + case PREFIX: + break; + case KEY: + break; + } return src.toString(); } + public String getRandomName(String base) { Random r = new Random(); int low = 0; @@ -66,7 +101,7 @@ public String getRandomName(String base) { return base + "_" + result; } - private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destination, String currPos){ + private void getJavaRowIdentifyCode(CodeGenTrieNode node, StringBuilder src, String currPos){ String 
currPosVariable = getRandomName("curPos"); if(node.getChildren().size() ==0 || node.isEndOfCondition()){ String key = node.getKey(); @@ -92,7 +127,7 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destina for(String key : node.getChildren().keySet()) { CodeGenTrieNode child = node.getChildren().get(key); - getJavaCode(child, src, destination, currPos); + getJavaRowIdentifyCode(child, src, currPos); } if(node.getKey() != null) { src.append("}\n"); @@ -104,6 +139,5 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String destina if(key.length() > 0) src.append("} \n"); } - } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 4c6e7a1ccfe..cccd018475b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -26,23 +26,32 @@ public class CodeGenTrieNode { + public enum NodeType { + ROW, COL; + @Override public String toString() { + return this.name().toUpperCase(); + } + } private final Map children = new HashMap<>(); private boolean endOfCondition; private int colIndex; private Types.ValueType valueType; private String key; private HashSet naStrings; + private final NodeType type; - public CodeGenTrieNode() { + public CodeGenTrieNode(NodeType type) { this.endOfCondition = false; + this.type = type; } - public CodeGenTrieNode(int colIndex, String key) { + public CodeGenTrieNode(int colIndex, String key, NodeType type) { this.colIndex = colIndex; this.key = key; + this.type = type; } - public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet naStrings) { + public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet naStrings, NodeType type) { this.endOfCondition = endOfCondition; this.colIndex = 
colIndex; this.valueType = valueType; @@ -50,7 +59,7 @@ public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType val if(endOfCondition){ this.naStrings = naStrings; } - + this.type = type; } public String geValueCode(String destination, String currPos){ @@ -135,4 +144,8 @@ public String getKey() { public void setKey(String key) { this.key = key; } + + public NodeType getType() { + return type; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index b2b9f8b33be..baee6973ea7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -21,7 +21,6 @@ import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; -import java.util.ArrayList; public class FrameCodeGen extends TemplateCodeGenBase { @@ -71,14 +70,10 @@ public FrameCodeGen(CustomProperties properties, String className) { src.append("String str = value.toString(); \n"); src.append("strLen = str.length(); \n"); - ArrayList[] colKeyPattern = null;//properties.getColKeyPattern(); - CodeGenTrie trie = new CodeGenTrie(); - for(int c = 0; c < colKeyPattern.length; c++) { - trie.insert(c, properties.getSchema()[c], colKeyPattern[c]); - } - src.append(trie.getJavaCode("dest.set")); - + CodeGenTrie trie = new CodeGenTrie(properties, "dest.set"); + src.append(trie.getJavaCode()); src.append("row++; \n"); + src.append("}} \n"); src.append("finally { \n"); src.append("IOUtilFunctions.closeSilently(reader); \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 72d8635232e..e037b600cd6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -68,14 +68,10 @@ public String generateCodeJava() { src.append("while((str = br.readLine()) != null){ \n"); src.append("strLen = str.length(); \n"); - ArrayList[] colKeyPattern = null; //properties.getColKeyPattern(); - CodeGenTrie trie= new CodeGenTrie(); - for(int c=0; c< colKeyPattern.length; c++){ - trie.insert(c, Types.ValueType.FP64, colKeyPattern[c]); - } - src.append(trie.getJavaCode("dest.appendValue")); - + CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); + src.append(trie.getJavaCode()); src.append("row++; \n"); + src.append("} \n"); src.append("} \n"); src.append("finally { \n"); From 2f40135b2fa8dfd8507eb3208941e8180fc5f47e Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 24 Jan 2022 11:35:44 +0100 Subject: [PATCH 11/84] Update JAVA CodeGen --- .../sysds/runtime/iogen/CustomProperties.java | 8 ++ .../runtime/iogen/FormatIdentifying.java | 98 ++++++++++++++++++- .../apache/sysds/runtime/iogen/KeyTrie.java | 32 +++++- .../sysds/runtime/iogen/MappingTrie.java | 8 +- .../runtime/iogen/codegen/CodeGenTrie.java | 21 ++-- .../iogen/codegen/CodeGenTrieNode.java | 18 ++++ .../runtime/iogen/codegen/FrameCodeGen.java | 3 +- .../runtime/iogen/codegen/MatrixCodeGen.java | 6 +- 8 files changed, 176 insertions(+), 18 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 373a4fb3eb0..c2a4544113c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -22,6 +22,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatProperties; import java.io.Serializable; +import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { @@ -52,6 +53,13 @@ public KeyTrie[] getColKeyPattern() 
{ return colKeyPattern; } + public HashSet[] endWithValueStrings(){ + HashSet[] endWithValueString = new HashSet[colKeyPattern.length]; + for(int i=0; i< colKeyPattern.length; i++) + endWithValueString[i] = colKeyPattern[i].getFirstSuffixKeyPatterns(); + return endWithValueString; + } + public void setColKeyPattern(KeyTrie[] colKeyPattern) { this.colKeyPattern = colKeyPattern; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index ed91f8b69f8..fe42f0ce8c4 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -184,6 +184,14 @@ private void runIdentification() { } }while(!check); + if(keyPatterns.size() == 0){ + ArrayList> kpl = new ArrayList<>(); + ArrayList kpli = new ArrayList<>(); + kpli.add(""); + kpl.add(kpli); + keyPatterns = kpl; + } + rowKeyPattern.setPrefixKeyPattern(keyPatterns); } } @@ -193,6 +201,7 @@ private void runIdentification() { properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); } else { + //buildRowKeyPatternMultiRow(); colKeyPattern = buildColsKeyPatternMultiRow(); properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.KEY); } @@ -340,6 +349,92 @@ private Pair getPreviousLines(int beginLine, int endLine){ // 2. Build key pattern tree for each column // 3. 
Build key pattern for end of values +// private KeyTrie[] buildRowKeyPatternMultiRow(){ +// Pair, Pair> prefixStrings = extractAllPrefixStringsOfRows(); +// +// for(String s: prefixStrings.getKey()) +// System.out.println(s.replace("\n", "")); +// +// MappingTrie mappingTrie = new MappingTrie(); +// mappingTrie.setInALine(false); +// for(String s: prefixStrings.getKey()) +// mappingTrie.reverseInsert(s, 0); +// +// boolean flag; +// do { +// flag = mappingTrie.reConstruct(); +// }while(flag); +// //mappingTrie.reConstruct(); +// ArrayList> aa = mappingTrie.getAllSequentialKeys(); +// +// for(ArrayList l: aa) { +// for(String s : l) +// System.out.print(s + " - "); +// System.out.println(); +// } +// //System.out.println(">>> "); +// return null; +// } + + private Pair, Pair> extractAllPrefixStringsOfRows(){ + + ArrayList result = new ArrayList(); + Pair minmax = new Pair(); + BitSet[] tmpUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + int[] minIndexRow = new int[nrows]; + for(int r=0; r= 0; i--) + if(usedLines[r].get(i)) { + lastLine = i; + break; + } + if(lastLine ==0) + continue; + for(int i = lastLine; i <= rowIndex; i++) { + if(sampleRawIndexes.get(i).getRawLength() > 0) + sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); + } + if(lastLine < rowIndex) + sb.deleteCharAt(sb.length() - 1); + + result.add(sb.toString()); + max = Math.max(max, sb.length()); + if(sb.length() < min || min == 0) + min = sb.length(); + minmax = new Pair<>(min, max); + } + return new Pair<>(result, minmax); + } + // Build key pattern tree for each column private KeyTrie[] buildColsKeyPatternMultiRow(){ Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); @@ -376,8 +471,9 @@ private KeyTrie[] buildColsKeyPatternMultiRow(){ remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); intersect = findStartWithIntersectOfStrings(remainedPrefix); - if(intersect != null) + if(intersect != null) { 
trie.insertPrefixKeysConcurrent(intersect); + } else { remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); flag = false; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java index 8867c607231..d8148bacce8 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java @@ -21,6 +21,7 @@ package org.apache.sysds.runtime.iogen; import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; public class KeyTrie { @@ -105,6 +106,21 @@ public ArrayList> getPrefixKeyPatterns() { return getKeyPatterns(rootPrefixKeys); } + public ArrayList> getReversePrefixKeyPatterns() { + if(this.prefixKeyPattern!=null) + return prefixKeyPattern; + else { + ArrayList> kps = getKeyPatterns(rootPrefixKeys); + for(ArrayList l : kps) { + Collections.reverse(l); + for(int i = 0; i < l.size(); i++) { + l.set(i, new StringBuilder(l.get(i)).reverse().toString()); + } + } + return kps; + } + } + public ArrayList> getSuffixKeyPatterns() { ArrayList> result = new ArrayList<>(); for(String k : rootSuffixKeys.getChildren().keySet()) { @@ -112,11 +128,20 @@ public ArrayList> getSuffixKeyPatterns() { ArrayList nk = new ArrayList<>(); nk.add(k); int maxCount = node.getCount(); - getKeyPatterns2(node, result, nk, maxCount); + getSuffixKeyPatterns(node, result, nk, maxCount); } return result; } + public HashSet getFirstSuffixKeyPatterns(){ + ArrayList> suffixKeyPattern = getSuffixKeyPatterns(); + HashSet suffixString = new HashSet<>(); + for(ArrayList kp: suffixKeyPattern){ + suffixString.add(kp.get(0)); + } + return suffixString; + } + private ArrayList> getKeyPatterns(KeyTrieNode root) { ArrayList> result = new ArrayList<>(); getKeyPatterns(root, result, new ArrayList<>()); @@ -140,7 +165,7 @@ private void getKeyPatterns(KeyTrieNode node, ArrayList> resul } } - private void getKeyPatterns2(KeyTrieNode node, ArrayList> result, 
ArrayList nodeKeys, + private void getSuffixKeyPatterns(KeyTrieNode node, ArrayList> result, ArrayList nodeKeys, int maxCount) { if(node.getChildren().size() == 1 && node.getCount() == maxCount) { @@ -149,7 +174,7 @@ private void getKeyPatterns2(KeyTrieNode node, ArrayList> resu ArrayList tmpKeys = new ArrayList<>(); tmpKeys.addAll(nodeKeys); tmpKeys.add(k); - getKeyPatterns2(child, result, tmpKeys, maxCount); + getSuffixKeyPatterns(child, result, tmpKeys, maxCount); } else result.add(nodeKeys); @@ -158,6 +183,7 @@ private void getKeyPatterns2(KeyTrieNode node, ArrayList> resu public void insertPrefixKeysConcurrent(HashSet keys) { insertPrefixKeysConcurrent(rootPrefixKeys, keys); + ArrayList> ss =getPrefixKeyPatterns(); } private void insertPrefixKeysConcurrent(KeyTrieNode node, HashSet keys) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 58c3857f79d..433da115b7f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -182,6 +182,12 @@ public String getIntersectOfChildren(MappingTrieNode node) { break; } while(true); +// if(count==1) +// lastCount = 1; +// else { +// lastCount = 0; +// break; +// } if(lastCount != 0 && lastCount != count) { lastCount = 0; break; @@ -317,7 +323,7 @@ public ArrayList> getAllSequentialKeys() { } } - // revert list avd values of list + // revert list and values of list for(ArrayList l: distinctKeys){ Collections.reverse(l); for(int i=0; i keys : keyTrie.getPrefixKeyPatterns()) + //keyTrie.getReversePrefixKeyPatterns() + ArrayList> ss = keyTrie.getReversePrefixKeyPatterns(); + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX){ KeyTrie keyTrie = properties.getRowKeyPattern(); Types.ValueType vt = Types.ValueType.FP32; - for(ArrayList 
keys : keyTrie.getPrefixKeyPatterns()) - this.insert(rootCol, -1, vt, keys); + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + this.insert(rootRow, -1, vt, keys); } } @@ -81,13 +83,20 @@ public String getJavaCode(){ StringBuilder src = new StringBuilder(); switch(properties.getRowIndex()){ case IDENTIFY: - getJavaRowIdentifyCode(rootCol, src, "0"); + getJavaCode(rootCol, src, "0"); + src.append("row++; \n"); break; case PREFIX: + getJavaCode(rootRow, src, "0"); + getJavaCode(rootCol, src, "0"); break; case KEY: + // TODO: Generate code for split stream as records + // and then increase the row number + getJavaCode(rootCol, src, "0"); break; } + return src.toString(); } @@ -101,7 +110,7 @@ public String getRandomName(String base) { return base + "_" + result; } - private void getJavaRowIdentifyCode(CodeGenTrieNode node, StringBuilder src, String currPos){ + private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos){ String currPosVariable = getRandomName("curPos"); if(node.getChildren().size() ==0 || node.isEndOfCondition()){ String key = node.getKey(); @@ -127,7 +136,7 @@ private void getJavaRowIdentifyCode(CodeGenTrieNode node, StringBuilder src, Str for(String key : node.getChildren().keySet()) { CodeGenTrieNode child = node.getChildren().get(key); - getJavaRowIdentifyCode(child, src, currPos); + getJavaCode(child, src, currPos); } if(node.getKey() != null) { src.append("}\n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index cccd018475b..e4e135418ef 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -63,6 +63,24 @@ public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType val } public String geValueCode(String destination, String currPos){ + if(this.type == NodeType.ROW) + 
return this.getRowPrefixValueCode(currPos); + else + return this.getColValueCode(destination, currPos); + } + + private String getRowPrefixValueCode(String currPos){ + StringBuilder src = new StringBuilder(); + String subStr; + + src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueStringRow); \n"); + subStr = "str.substring("+currPos+",endPos)"; + + src.append("row = ").append("Integer.parseInt("+subStr+"); \n"); + return src.toString(); + } + + private String getColValueCode(String destination, String currPos){ StringBuilder src = new StringBuilder(); String subStr; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index baee6973ea7..cc8f34c9e9c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -63,7 +63,7 @@ public FrameCodeGen(CustomProperties properties, String className) { src.append("Text value = new Text(); \n"); src.append("int row = rl; \n"); src.append("long lnnz = 0; \n"); - src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); + src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("int index, endPos, strLen; \n"); src.append("try { \n"); src.append("while(reader.next(key, value)){ \n"); @@ -72,7 +72,6 @@ public FrameCodeGen(CustomProperties properties, String className) { CodeGenTrie trie = new CodeGenTrie(properties, "dest.set"); src.append(trie.getJavaCode()); - src.append("row++; \n"); src.append("}} \n"); src.append("finally { \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index e037b600cd6..c6216866a82 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -19,12 +19,9 @@ package org.apache.sysds.runtime.iogen.codegen; -import org.apache.sysds.common.Types; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; -import java.util.ArrayList; - public class MatrixCodeGen extends TemplateCodeGenBase { public MatrixCodeGen(CustomProperties properties, String className) { @@ -62,7 +59,7 @@ public String generateCodeJava() { src.append("int row = rowPos.intValue(); \n"); src.append("long lnnz = 0; \n"); src.append("int index, endPos, strLen; \n"); - src.append("HashSet[] endWithValueString = _props.getEndWithValueString(); \n"); + src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); src.append("try { \n"); src.append("while((str = br.readLine()) != null){ \n"); @@ -70,7 +67,6 @@ public String generateCodeJava() { CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); src.append(trie.getJavaCode()); - src.append("row++; \n"); src.append("} \n"); src.append("} \n"); From 00f6d65a0c6efda5c644f3a0f9da4c079f097e8b Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 03:43:58 +0100 Subject: [PATCH 12/84] Fixed some bugs in Identification section Update CodeGen part Add some tests for Matrix --- .../sysds/runtime/iogen/CustomProperties.java | 13 + .../runtime/iogen/FormatIdentifying.java | 193 +++++--------- .../sysds/runtime/iogen/GenerateReader.java | 6 - .../apache/sysds/runtime/iogen/KeyTrie.java | 1 - .../runtime/iogen/codegen/CodeGenTrie.java | 73 +++--- .../iogen/codegen/CodeGenTrieNode.java | 38 ++- .../runtime/iogen/codegen/FrameCodeGen.java | 1 + .../runtime/iogen/codegen/MatrixCodeGen.java | 2 + .../iogen/FrameGenerateReaderCSVTest.java | 120 --------- .../iogen/FrameGenerateReaderLibSVMTest.java | 142 ----------- 
.../FrameGenerateReaderMatrixMarketTest.java | 100 -------- .../iogen/GenerateReaderFrameTest.java | 3 +- .../iogen/GenerateReaderMatrixTest.java | 6 +- .../Identify/FrameGenerateReaderCSVTest.java | 121 --------- .../Identify/MatrixGRRowColIdentifyTest.java | 133 ---------- .../iogen/MatrixGenerateReaderCSVTest.java | 151 ----------- .../iogen/MatrixGenerateReaderLibSVMTest.java | 169 ------------- .../MatrixGenerateReaderMatrixMarketTest.java | 238 ------------------ .../iogen/MatrixSingleRowFlatTest.java | 163 ++++++++++++ .../iogen/MatrixSingleRowNestedTest.java | 116 +++++++++ 20 files changed, 418 insertions(+), 1371 deletions(-) delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderCSVTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderLibSVMTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderMatrixMarketTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderLibSVMTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index c2a4544113c..a6933f70864 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -37,6 +37,7 @@ public enum IndexProperties { private Types.ValueType[] schema; private IndexProperties rowIndex; private KeyTrie rowKeyPattern; + private String rowIndexBegin; public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex) { this.colKeyPattern = colKeyPattern; @@ -60,6 +61,10 @@ public HashSet[] endWithValueStrings(){ return endWithValueString; } + public HashSet endWithValueStringsRow(){ + return rowKeyPattern.getFirstSuffixKeyPatterns(); + } + public void setColKeyPattern(KeyTrie[] colKeyPattern) { this.colKeyPattern = colKeyPattern; } @@ -87,4 +92,12 @@ public KeyTrie getRowKeyPattern() { public void setRowKeyPattern(KeyTrie rowKeyPattern) { this.rowKeyPattern = rowKeyPattern; } + + public String getRowIndexBegin() { + return rowIndexBegin; + } + + public void setRowIndexBegin(String rowIndexBegin) { + this.rowIndexBegin = rowIndexBegin; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index fe42f0ce8c4..b73a6bf9a6c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -39,7 +39,7 @@ public class FormatIdentifying { private static int ncols; private int nlines; private int windowSize = 20; - private int suffixStringLength = 100; + private int suffixStringLength = 50; private ReaderMapping mappingValues; private CustomProperties properties; @@ -99,16 +99,29 @@ private void runIdentification() { // Check the row index is a prefix string in sample raw // if the row indexes are in the prefix of values, so we need to build a key pattern // to extract row indexes - // to understanding row indexes are in sample raw we check just 3 column of data - // for build a key pattern ro row indexes we just selected a row + // for understanding row indexes are in sample raw 
we check just 3 column of data + // for build a key pattern related to row indexes we just selected a row boolean flag; int numberOfSelectedCols = 3; int begin = 0; boolean check, flagReconstruct; - int selectedRowIndex = 1; + int[] selectedRowIndex = new int[2]; HashSet beginPos = new HashSet<>(); KeyTrie rowKeyPattern = null; + // Select two none zero row as a row index candidate + + int index = 0; + for(int r=1; r1) + break; + } + for(int c=0; c< Math.min(numberOfSelectedCols, ncols); c++){ Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); @@ -120,7 +133,6 @@ private void runIdentification() { for(String ps: prefixStrings ) trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); - do { flag = trie.reConstruct(); }while(flag); @@ -153,32 +165,39 @@ private void runIdentification() { ArrayList rowPrefixStrings = new ArrayList<>(); MappingTrie rowTrie = new MappingTrie(); rowKeyPattern = new KeyTrie(); - for(int ci = 0; c < ncols; c++) { - int cri = mapRow[selectedRowIndex][c]; - if(cri != -1) { - String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[selectedRowIndex][ci]); - RawIndex rawIndex = new RawIndex(str); - Pair pair = rawIndex.findValue(selectedRowIndex+begin); - if(pair!=null) { - String pstr = str.substring(0, pair.getKey()); - if(pstr.length() > 0) { - rowPrefixStrings.add(pstr); - rowTrie.insert(pstr, 1); + for(int si: selectedRowIndex) { + for(int ci = 0; ci < ncols; ci++) { + int cri = mapRow[si][ci]; + if(cri != -1) { + String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(si + begin); + if(pair != null) { + String pstr = str.substring(0, pair.getKey()); + if(pstr.length() > 0) { + rowPrefixStrings.add(pstr); + rowTrie.insert(pstr, 1); + } + rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); } - 
rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey()+pair.getValue()).toCharArray()); } } } do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); keyPatterns = rowTrie.getAllSequentialKeys(); check = false; for(ArrayList keyPattern : keyPatterns) { boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); } - if(!check){ - flagReconstruct = trie.reConstruct(); + if(check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = rowTrie.reConstruct(); if(!flagReconstruct) break; } @@ -191,7 +210,6 @@ private void runIdentification() { kpl.add(kpli); keyPatterns = kpl; } - rowKeyPattern.setPrefixKeyPattern(keyPatterns); } } @@ -199,9 +217,13 @@ private void runIdentification() { if(beginPos.size() == 1){ colKeyPattern = buildColsKeyPatternSingleRow(); properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); + Integer bpos = beginPos.iterator().next(); + if(bpos>0) + properties.setRowIndexBegin("-"+bpos); + else + properties.setRowIndexBegin(""); } else { - //buildRowKeyPatternMultiRow(); colKeyPattern = buildColsKeyPatternMultiRow(); properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.KEY); } @@ -244,15 +266,20 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { boolean check; boolean flagReconstruct; ArrayList> keyPatterns; + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); keyPatterns = trie.getAllSequentialKeys(); check = false; for(ArrayList keyPattern : keyPatterns) { boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); } - - if(!check){ + if(check) + keyPatterns = selectedKeyPatterns; + else { flagReconstruct = trie.reConstruct(); if(!flagReconstruct) break; @@ -313,33 +340,6 @@ private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { return result; } - // 
Check the sequential list of keys are on a string - private Integer getIndexOfKeysOnString(String str, ArrayList key, int beginPos) { - int currPos = beginPos; - boolean flag = true; - for(String k : key) { - int index = str.indexOf(k, currPos); - if(index != -1) - currPos = index + k.length(); - else { - flag = false; - break; - } - } - if(flag) - return currPos; - else - return -1; - } - - private Pair getPreviousLines(int beginLine, int endLine){ - StringBuilder sb = new StringBuilder(); - for(int i= Math.max(0, beginLine); i <= endLine; i++) - sb.append(sampleRawIndexes.get(i).getRaw()); - String str = sb.toString(); - return new Pair<>(str, str.length() - sampleRawIndexes.get(endLine).getRawLength()); - } - ///////////////////////////////////////////////////////////////////////////// // Methods For Multi Lines Mapping // //////////////////////////////////////////////////////////////////////////// @@ -349,92 +349,6 @@ private Pair getPreviousLines(int beginLine, int endLine){ // 2. Build key pattern tree for each column // 3. 
Build key pattern for end of values -// private KeyTrie[] buildRowKeyPatternMultiRow(){ -// Pair, Pair> prefixStrings = extractAllPrefixStringsOfRows(); -// -// for(String s: prefixStrings.getKey()) -// System.out.println(s.replace("\n", "")); -// -// MappingTrie mappingTrie = new MappingTrie(); -// mappingTrie.setInALine(false); -// for(String s: prefixStrings.getKey()) -// mappingTrie.reverseInsert(s, 0); -// -// boolean flag; -// do { -// flag = mappingTrie.reConstruct(); -// }while(flag); -// //mappingTrie.reConstruct(); -// ArrayList> aa = mappingTrie.getAllSequentialKeys(); -// -// for(ArrayList l: aa) { -// for(String s : l) -// System.out.print(s + " - "); -// System.out.println(); -// } -// //System.out.println(">>> "); -// return null; -// } - - private Pair, Pair> extractAllPrefixStringsOfRows(){ - - ArrayList result = new ArrayList(); - Pair minmax = new Pair(); - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - int[] minIndexRow = new int[nrows]; - for(int r=0; r= 0; i--) - if(usedLines[r].get(i)) { - lastLine = i; - break; - } - if(lastLine ==0) - continue; - for(int i = lastLine; i <= rowIndex; i++) { - if(sampleRawIndexes.get(i).getRawLength() > 0) - sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); - } - if(lastLine < rowIndex) - sb.deleteCharAt(sb.length() - 1); - - result.add(sb.toString()); - max = Math.max(max, sb.length()); - if(sb.length() < min || min == 0) - min = sb.length(); - minmax = new Pair<>(min, max); - } - return new Pair<>(result, minmax); - } - // Build key pattern tree for each column private KeyTrie[] buildColsKeyPatternMultiRow(){ Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); @@ -606,6 +520,12 @@ private HashSet findStartWithIntersectOfStrings(ArrayList strLis } private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys){ + if(keys.size() == 1){ + String k = keys.get(0); + if (k.length() == 0) + return true; + } + for(String 
ps: prefixStrings){ int currentPos = 0; int patternCount = 0; @@ -626,6 +546,7 @@ private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayLi // Check the sequential list of keys are on a string private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { + int currPos = beginPos; boolean flag = true; int startPos = -1; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 41ebf2e75e6..71e1da73041 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -83,9 +83,6 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - - String jc = src.generateCodeJava(); - matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } @@ -111,9 +108,6 @@ public FrameReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - - String jc = src.generateCodeJava(); - frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java index d8148bacce8..59c090527db 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java @@ -17,7 +17,6 @@ * under the License. 
*/ - package org.apache.sysds.runtime.iogen; import java.util.ArrayList; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 5fbe03f376b..6fd1aa4fd8a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -45,9 +45,7 @@ public CodeGenTrie(CustomProperties properties, String destination){ private void buildPrefixTree(){ for(int c=0; c< properties.getColKeyPattern().length; c++){ KeyTrie keyTrie = properties.getColKeyPattern()[c]; - Types.ValueType vt = properties.getSchema() == null? Types.ValueType.FP64 : properties.getSchema()[c]; - //keyTrie.getReversePrefixKeyPatterns() - ArrayList> ss = keyTrie.getReversePrefixKeyPatterns(); + Types.ValueType vt = properties.getSchema() == null ? Types.ValueType.FP64 : properties.getSchema()[c]; for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } @@ -74,6 +72,7 @@ private void insert(CodeGenTrieNode root ,int index, Types.ValueType valueType, CodeGenTrieNode newNode; for(int i = rci; i < keys.size(); i++) { newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), root.getType()); + newNode.setRowIndexBeginPos(properties.getRowIndexBegin()); currentNode.getChildren().put(keys.get(i), newNode); currentNode = newNode; } @@ -111,42 +110,42 @@ public String getRandomName(String base) { } private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos){ - String currPosVariable = getRandomName("curPos"); - if(node.getChildren().size() ==0 || node.isEndOfCondition()){ - String key = node.getKey(); - if(key.length() > 0){ - src.append("index = str.indexOf(\""+node.getKey().replace("\"", "\\\"")+"\", "+currPos+"); \n"); - src.append("if(index != -1) { \n"); - src.append("int "+currPosVariable + " = index + "+ 
key.length()+"; \n"); - src.append(node.geValueCode(destination, currPosVariable)); - currPos = currPosVariable; - } - else - src.append(node.geValueCode(destination, "0")); - } + if(node.isEndOfCondition()) + src.append(node.geValueCode(destination, currPos)); if(node.getChildren().size() > 0) { - if(node.getKey() != null) { - currPosVariable = getRandomName("curPos"); - src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", " + currPos + "); \n"); - src.append("if(index != -1) { \n"); - src.append("int " + currPosVariable + " = index + " + node.getKey().length() + "; \n"); - currPos = currPosVariable; - } - - for(String key : node.getChildren().keySet()) { - CodeGenTrieNode child = node.getChildren().get(key); - getJavaCode(child, src, currPos); - } - if(node.getKey() != null) { - src.append("}\n"); - } - } - - if(node.getChildren().size() ==0 || node.isEndOfCondition()){ - String key = node.getKey(); - if(key.length() > 0) - src.append("} \n"); + String currPosVariable; +// if(node.getKey() != null) { +// currPosVariable = getRandomName("curPos"); +// src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", " + currPos + "); \n"); +// src.append("if(index != -1) { \n"); +// src.append("int " + currPosVariable + " = index + " + node.getKey().length() + "; \n"); +// currPos = currPosVariable; +// +// for(String key : node.getChildren().keySet()) { +// CodeGenTrieNode child = node.getChildren().get(key); +// getJavaCode(child, src, currPos); +// } +// src.append("} \n"); +// } +// else { + for(String key : node.getChildren().keySet()) { + if(key.length()>0){ + currPosVariable = getRandomName("curPos"); + if(node.getKey() == null) + src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\"); \n"); + else + src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\", " + currPos + "); \n"); + src.append("if(index != -1) { \n"); + src.append("int " + currPosVariable + " = index + " + key.length() 
+ "; \n"); + currPos = currPosVariable; + } + CodeGenTrieNode child = node.getChildren().get(key); + getJavaCode(child, src, currPos); + if(key.length()>0) + src.append("} \n"); + } + //} } } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index e4e135418ef..1250d9758a9 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -39,6 +39,7 @@ public enum NodeType { private String key; private HashSet naStrings; private final NodeType type; + private String rowIndexBeginPos; public CodeGenTrieNode(NodeType type) { this.endOfCondition = false; @@ -72,28 +73,30 @@ public String geValueCode(String destination, String currPos){ private String getRowPrefixValueCode(String currPos){ StringBuilder src = new StringBuilder(); String subStr; - src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueStringRow); \n"); subStr = "str.substring("+currPos+",endPos)"; - - src.append("row = ").append("Integer.parseInt("+subStr+"); \n"); + if(rowIndexBeginPos.length() > 0) + src.append("row = ").append("Integer.parseInt("+subStr+") "+rowIndexBeginPos+"; \n"); + else + src.append("row = ").append("Integer.parseInt("+subStr+"); \n"); return src.toString(); } private String getColValueCode(String destination, String currPos){ StringBuilder src = new StringBuilder(); - String subStr; src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueString["+colIndex+"]); \n"); - subStr = "str.substring("+currPos+",endPos)"; + src.append("String cellStr"+colIndex+" = str.substring("+currPos+",endPos); \n"); if(valueType.isNumeric()) { - src.append(getParsCode(subStr)); + src.append("if ( cellStr"+colIndex+".length() > 0 ){\n"); + src.append(getParsCode("cellStr"+colIndex)); src.append("if(cellValue"+colIndex+" != 0) { \n"); 
src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); src.append("lnnz++;\n"); src.append("}\n"); + src.append("}\n"); } else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOLEAN){ if(naStrings !=null && naStrings.size() > 0) { @@ -108,20 +111,21 @@ else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOL sb.append("}\n"); } else - src.append(getParsCode(subStr)); + src.append(getParsCode("cellStr"+colIndex)); src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); } return src.toString(); } private String getParsCode(String subStr) { + String cellValue = "cellValue"+colIndex; switch(valueType ) { - case STRING: return "String cellValue"+colIndex+" = "+subStr+"; \n"; - case BOOLEAN: return "Boolean cellValue"+colIndex+" = Boolean.parseBoolean("+subStr+"); \n"; - case INT32: return "Integer cellValue"+colIndex+" = Integer.parseInt("+subStr+"); \n"; - case INT64: return "Long cellValue"+colIndex+" = Long.parseLong("+subStr+"); \n"; - case FP64: return "Double cellValue"+colIndex+" = Double.parseDouble("+subStr+"); \n"; - case FP32: return "Float cellValue"+colIndex+" = Float.parseFloat("+subStr+"); \n"; + case STRING: return "String "+cellValue+" = "+subStr+"; \n"; + case BOOLEAN: return "Boolean "+cellValue+"; \n try{ "+cellValue+"= Boolean.parseBoolean("+subStr+");} catch(Exception e){"+cellValue+"=false;} \n"; + case INT32: return "Integer "+cellValue+"; \n try{ "+cellValue+"= Integer.parseInt("+subStr+");} catch(Exception e){"+cellValue+" = 0;} \n"; + case INT64: return "Long "+cellValue+"; \n try{"+cellValue+"= Long.parseLong("+subStr+"); } catch(Exception e){"+cellValue+" = 0l;} \n"; + case FP64: return "Double "+cellValue+"; \n try{ "+cellValue+"= Double.parseDouble("+subStr+"); } catch(Exception e){"+cellValue+" = 0d;}\n"; + case FP32: return "Float "+cellValue+"; \n try{ "+cellValue+"= Float.parseFloat("+subStr+");} catch(Exception e){"+cellValue+" 
= 0f;} \n"; default: throw new RuntimeException("Unsupported value type: "+valueType); } } @@ -166,4 +170,12 @@ public void setKey(String key) { public NodeType getType() { return type; } + + public String getRowIndexBeginPos() { + return rowIndexBeginPos; + } + + public void setRowIndexBeginPos(String rowIndexBeginPos) { + this.rowIndexBeginPos = rowIndexBeginPos; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index cc8f34c9e9c..a77325da851 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -64,6 +64,7 @@ public FrameCodeGen(CustomProperties properties, String className) { src.append("int row = rl; \n"); src.append("long lnnz = 0; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); + src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); src.append("int index, endPos, strLen; \n"); src.append("try { \n"); src.append("while(reader.next(key, value)){ \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index c6216866a82..1ff4a017fbf 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -61,6 +61,8 @@ public String generateCodeJava() { src.append("int index, endPos, strLen; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) + src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); src.append("try { \n"); src.append("while((str = br.readLine()) != 
null){ \n"); src.append("strLen = str.length(); \n"); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderCSVTest.java deleted file mode 100644 index 04ab05895b7..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderCSVTest.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class FrameGenerateReaderCSVTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameGenerateReaderCSVTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void extractSampleRawCSV(String separator) { - int nrows = data.length; - int ncols = data[0].length; - StringBuilder sb = new StringBuilder(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - sb.append(data[r][c]); - if(c != ncols - 1) - sb.append(separator); - } - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -100, 100, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test2() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test3() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "****"; - generateRandomData(100, 500, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 0.7, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test5() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,"; - generateRandomData(10, 10, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test6() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(1000, 100, -10, 10, 0.4, naStrings); - 
extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test7() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(1000, 100, -10, 10, 0.8, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test8() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(10000, 100, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderLibSVMTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderLibSVMTest.java deleted file mode 100644 index 722498f7b95..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderLibSVMTest.java +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class FrameGenerateReaderLibSVMTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameGenerateReaderLibSVMTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void extractSampleRawLibSVM(int firstIndex, String separator, String indexSeparator) { - - int nrows = data.length; - int ncols = data[0].length; - int mid = ncols/2; - String[][] dataLibSVM = new String[2 * nrows][ncols]; - StringBuilder sb = new StringBuilder(); - int indexRow = 0; - for(int r = 0; r < nrows; r++) { - StringBuilder row1 = new StringBuilder(); - StringBuilder row2 = new StringBuilder(); - row1.append("+1"); - for(int c = 0; c < ncols - 1; c++) { - if(mid > c) { - if(data[r][c] != null) { - dataLibSVM[indexRow][c] = data[r][c]; - row1.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else - dataLibSVM[indexRow][c] = defaultValue(schema[c]); - } - else - dataLibSVM[indexRow][c] = defaultValue(schema[c]); - - } - dataLibSVM[indexRow++][ncols-1] = "+1"; - - row2.append("-1"); - for(int c = 0; c < ncols - 1; c++) { - if(mid <= c) { - if(data[r][c] != null) { - dataLibSVM[indexRow][c] = data[r][c]; - row2.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else - dataLibSVM[indexRow][c] = defaultValue(schema[c]); - } - else - dataLibSVM[indexRow][c] = defaultValue(schema[c]); - } - dataLibSVM[indexRow++][ncols-1] = "-1"; - sb.append(row1).append("\n"); - sb.append(row2); - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - data = dataLibSVM; - } - - @Test - public void test1() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(10, 10, -100, 100, 1, naStrings); - extractSampleRawLibSVM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public 
void test2() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(100, 200, -100, 100, 1, naStrings); - extractSampleRawLibSVM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test3() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(1000, 200, -100, 100, 1, naStrings); - extractSampleRawLibSVM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,,,"; - String indexSeparator = ":"; - generateRandomData(20, 20, -100, 100, 0.6, naStrings); - extractSampleRawLibSVM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test5() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,,"; - String indexSeparator = ":"; - generateRandomData(100, 50, -100, 100, 0.5, naStrings); - extractSampleRawLibSVM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test6() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,,"; - String indexSeparator = ":"; - generateRandomData(10, 1000, -100, 100, 0.7, naStrings); - extractSampleRawLibSVM(1,separator, indexSeparator); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderMatrixMarketTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderMatrixMarketTest.java deleted file mode 100644 index d9e4241263e..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameGenerateReaderMatrixMarketTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class FrameGenerateReaderMatrixMarketTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameGenerateReaderMatrixMarketTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void extractSampleRawMM(int firstIndex, String separator, String indexSeparator) { - - int nrows = data.length; - int ncols = data[0].length; - - StringBuilder sb = new StringBuilder(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(data[r][c] != null && !data[r][c].equals("0")) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + data[r][c]; - sb.append(rs); - if(r != nrows - 1 || c != ncols - 1) - sb.append("\n"); - } - } - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(5, 10, -100, 100, 1, naStrings); - extractSampleRawMM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test2() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(50, 100, 
-100, 100, 1, naStrings); - extractSampleRawMM(0,separator, indexSeparator); - runGenerateReaderTest(); - } - @Test - public void test3() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(50, 100, -100, 100, 0.5, naStrings); - extractSampleRawMM(1,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(50, 100, -100, 100, 0.2, naStrings); - extractSampleRawMM(1,separator, indexSeparator); - runGenerateReaderTest(); - } - - @Test - public void test5() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - String indexSeparator = ":"; - generateRandomData(50, 100, -100, 100, 0.8, naStrings); - extractSampleRawMM(0,separator, indexSeparator); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index d9a4228c137..72becdf14a6 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -181,7 +181,8 @@ protected void runGenerateReaderTest() { GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); FrameReader fr= gr.getReader(); - FrameBlock grFrame = fr.readFrameFromHDFS(dataPath,schema,names,data.length, clen); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath,schema,names,data.length, clen); + } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index 063ea29b7d0..94cc2ea7044 100644 --- 
a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -89,11 +89,11 @@ protected void generateRandomSymmetric(int size, double min, double max, double writeRawString(sampleRaw, dataPath); GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); - MatrixReader mr = gr.getReader(); - MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, -1, clen, -1, -1); + MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); + + TestUtils.compareMatrices(sampleMB, matrixBlock, 0); - int a = 100; } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java deleted file mode 100644 index 3bc13086f66..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen.Identify; - -import org.apache.sysds.test.functions.iogen.GenerateReaderFrameTest; -import org.junit.Test; - -public class FrameGenerateReaderCSVTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameGenerateReaderCSVTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void extractSampleRawCSV(String separator) { - int nrows = data.length; - int ncols = data[0].length; - StringBuilder sb = new StringBuilder(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - sb.append(data[r][c]); - if(c != ncols - 1) - sb.append(separator); - } - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - String[] naStrings = {}; - String separator = ","; - generateRandomData(10, 5, 1, 100, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test2() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test3() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "****"; - generateRandomData(100, 500, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 0.7, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test5() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,"; - generateRandomData(10, 10, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test6() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - 
generateRandomData(1000, 100, -10, 10, 0.4, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test7() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(1000, 100, -10, 10, 0.8, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test8() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(10000, 100, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java deleted file mode 100644 index 9a7ece6a4fa..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen.Identify; - -import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; -import org.junit.Test; - -import java.util.Random; - -public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void generateRandomCSV(int nrows, int ncols, double min, double max, double sparsity, String separator, - String[] naString) { - - sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); - StringBuilder sb = new StringBuilder(); - - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - if(sampleMatrix[r][c] != 0) { - row.append(sampleMatrix[r][c]).append(separator); - } - else { - Random rn = new Random(); - int rni = rn.nextInt(naString.length); - row.append(naString[rni]).append(separator); - } - } - - sb.append(row.substring(0, row.length() - separator.length())); - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(); - } - - @Test - public void test2() { - sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; - sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - runGenerateReaderTest(); - } - @Test - public void test3() { - sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; - sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naString = {"NaN"}; - generateRandomCSV(20, 20, -10, 10, 1, ",", naString); - runGenerateReaderTest(); - } - - @Test - public void test5() { - sampleRaw = "{\"name\":1, \"occupation\":2, 
\"user\":{\"name\":3,\"password\":4}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; - runGenerateReaderTest(); - } - - @Test - public void test6() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; - runGenerateReaderTest(); - } - - @Test - public void test7() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; - runGenerateReaderTest(); - } - - @Test - public void test8() { - sampleRaw = "1,1,10\n" + - "1,2,20\n" + - "1,3,30\n" + - "2,2,40\n" + - "3,2,50\n"; - - sampleMatrix = new double[][] {{10,20,30}, {0,40,0}, {0,50,0}}; - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java deleted file mode 100644 
index 7492467f5f6..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderCSVTest.java +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -import java.util.Random; - -public class MatrixGenerateReaderCSVTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void generateRandomCSV(int nrows, int ncols, double min, double max, double sparsity, String separator, - String[] naString) { - - sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); - StringBuilder sb = new StringBuilder(); - - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - if(sampleMatrix[r][c] != 0) { - row.append(sampleMatrix[r][c]).append(separator); - } - else { - Random rn = new Random(); - int rni = rn.nextInt(naString.length); - row.append(naString[rni]).append(separator); - } - } - - sb.append(row.substring(0, row.length() - separator.length())); - if(r != nrows - 1) - 
sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test2() { - String[] naString = {"NaN"}; - generateRandomCSV(1000, 10000, -10, 10, 1, ",", naString); - runGenerateReaderTest(); - } - - @Test - public void test3() { - String[] naString = {"NaN"}; - generateRandomCSV(5, 5, -10, 10, 1, ",,,", naString); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naString = {"Nan", "NAN", "", "inf", "null", "NULL"}; - generateRandomCSV(5, 5, -10, 10, 0.5, ",,", naString); - runGenerateReaderTest(); - } - - @Test - public void test5() { - sampleRaw = "1.0,2.0,3.0,4.0,5.0\n" + "6.,7.,8.,9.,10.\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test6() { - sampleRaw = "1.0,2.0,3.0,4.0,5.0\n" + "6.,7.,8.,9.,10.\n" + "11E0,12E0,13,14E0,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test7() { - sampleRaw = "1.0,2.0,3.0,4.0,5.0\n" + "6.,7.,8.,9.,10.\n" + "1.1E1,1.2E1,13,1.4E1,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test public void test8() { - sampleRaw = "1.0,2.0,3.0,4.0,5.0\n" + "60.0E-1,7.,80.0E-1,9.,100.0E-1\n" + "1.1E1,1.2E1,13,1.4E1,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test9() { - sampleRaw = ".1E1,.2E1,3.0,4.0,0.5E1\n" + "60.0E-1,7.,80.0E-1,9.,100.0E-1\n" + "1.1E1,1.2E1,13,1.4E1,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - 
runGenerateReaderTest(); - } - - @Test - public void test10() { - sampleRaw = "0.000001e6,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); - } - - @Test - public void test11() { - sampleRaw = "1,2,3,4,5,NAN\n" + "6,7,8,9,10,NAN\n" + "11,12,13,14,15,NAN"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5, 0}, {6, 7, 8, 9, 10, 0}, {11, 12, 13, 14, 15, 0}}; - runGenerateReaderTest(); - } - - @Test - public void test12() { - sampleRaw = "1,2,3,4,5,NAN,,\n" + "6,7,8,9,10,NAN,,\n" + "11,12,13,14,15,NAN,,"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5, 0, 0, 0}, {6, 7, 8, 9, 10, 0, 0, 0}, - {11, 12, 13, 14, 15, 0, 0, 0}}; - runGenerateReaderTest(); - } - - @Test - public void test13() { - String[] naString = {"Nan", "NAN", "", "inf", "null", "NULL"}; - generateRandomCSV(1000, 500, -10, 10, 0.5, ",,", naString); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderLibSVMTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderLibSVMTest.java deleted file mode 100644 index 272ce9865ea..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderLibSVMTest.java +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class MatrixGenerateReaderLibSVMTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixGenerateReaderLibSVMTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void generateRandomLIBSVM(int firstIndex, int nrows, int ncols, double min, double max, double sparsity, - String separator, String indexSeparator) { - - double[][] random = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); - sampleMatrix = new double[2 * nrows][ncols]; - StringBuilder sb = new StringBuilder(); - int indexRow = 0; - for(int r = 0; r < nrows; r++) { - StringBuilder row1 = new StringBuilder(); - StringBuilder row2 = new StringBuilder(); - row1.append("+1"); - - for(int c = 0; c < ncols - 1; c++) { - if(random[r][c] > 0) { - sampleMatrix[indexRow][c] = random[r][c]; - row1.append(separator).append(c + firstIndex).append(indexSeparator).append(random[r][c]); - } - else { - sampleMatrix[indexRow][c] = 0; - } - } - sampleMatrix[indexRow++][ncols - 1] = 1; - - row2.append("-1"); - for(int c = 0; c < ncols - 1; c++) { - if(random[r][c] < 0) { - sampleMatrix[indexRow][c] = random[r][c]; - row2.append(separator).append(c + firstIndex).append(indexSeparator).append(random[r][c]); - } - else { - sampleMatrix[indexRow][c] = 0; - } - } - - sampleMatrix[indexRow++][ncols - 1] = -1; - - sb.append(row1).append("\n"); - sb.append(row2); - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - - } - - // 
Index start from 0 - @Test - public void test0_1() { - sampleRaw = "+1 2:3 4:5 6:7\n" + "-1 8:-9 10:-11"; - sampleMatrix = new double[][] {{0, 0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}, {0, 0, 0, 0, 0, 0, 0, 0, -9, 0, -11, -1}}; - runGenerateReaderTest(); - } - - @Test - public void test0_10() { - sampleRaw = "-1 8:-9 10:-11\n" + "+1 2:3 4:5 6:7\n"; - sampleMatrix = new double[][] {{0, 0, 0, 0, 0, 0, 0, 0, -9, 0, -11, -1}, {0, 0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}}; - runGenerateReaderTest(); - } - - @Test - public void test0_2() { - generateRandomLIBSVM(0, 10, 10, -10, 10, 1, " ", ":"); - runGenerateReaderTest(); - } - - @Test - public void test0_3() { - generateRandomLIBSVM(0, 100, 10, -100, 100, 1, " ", ":"); - runGenerateReaderTest(); - } - - @Test - public void test0_4() { - generateRandomLIBSVM(0, 10, 10, -100, 100, 1, " ", ":"); - runGenerateReaderTest(); - } - - @Test - public void test0_5() { - generateRandomLIBSVM(0, 10, 10, -100, 100, 1, ",,,,", "::"); - runGenerateReaderTest(); - } - - @Test - public void test0_6() { - sampleRaw = "+1 2:3.0 4:5. 
6:7\n" + "-1 8:9.0E0 10:11e0"; - sampleMatrix = new double[][] {{0, 0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}, {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 11, -1}}; - runGenerateReaderTest(); - } - - @Test - public void test0_7() { - sampleRaw = "+10000e-4 2:3 4:5 6:7\n" + "-1 8:9 10:11"; - sampleMatrix = new double[][] {{0, 0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}, {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 11, -1}}; - runGenerateReaderTest(); - } - - @Test - public void test0_8() { - sampleRaw = "+10000e-4 2:3 4:5 6:7\n" + "-0.00001e5 8:9 10:11"; - sampleMatrix = new double[][] {{0, 0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}, {0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 11, -1}}; - runGenerateReaderTest(); - } - - // Index start from 1 - @Test - public void test2() { - sampleRaw = "+1 2:3 4:5 6:7\n" + "-1 8:9 10:11"; - sampleMatrix = new double[][] {{0, 3, 0, 5, 0, 7, 0, 0, 0, 0, +1}, {0, 0, 0, 0, 0, 0, 0, 9, 0, 11, -1}}; - runGenerateReaderTest(); - } - - @Test - public void test1_2() { - generateRandomLIBSVM(1, 10, 10, -10, 10, 1, " ", ":"); - runGenerateReaderTest(); - } - - @Test - public void test1_3() { - generateRandomLIBSVM(1, 10, 10, -100, 100, 1, " ", ":"); - runGenerateReaderTest(); - } - - @Test - public void test1_4() { - generateRandomLIBSVM(0, 10, 12, -100, 100, 1, ",,,,,,", ":::::"); - runGenerateReaderTest(); - } - - @Test - public void test1_5() { - generateRandomLIBSVM(1, 100, 50, -100, 100, 1, ",,,,", "::"); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java deleted file mode 100644 index 074555c3590..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixGenerateReaderMatrixMarketTest.java +++ /dev/null @@ -1,238 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class MatrixGenerateReaderMatrixMarketTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixGenerateReaderMatrixMarketTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void generateRandomMM(int firstIndex, int nrows, int ncols, double min, double max, double sparsity, - String separator) { - - sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); - StringBuilder sb = new StringBuilder(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(sampleMatrix[r][c] != 0) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + sampleMatrix[r][c]; - sb.append(rs); - if(r != nrows - 1 || c != ncols - 1) - sb.append("\n"); - } - } - } - sampleRaw = sb.toString(); - } - - private void generateRandomSymmetricMM(int firstIndex, int size, double min, double max, double sparsity, - String separator, boolean isUpperTriangular, boolean isSkew) { - - generateRandomSymmetric(size, min, max, sparsity, isSkew); - - int start, end; - StringBuilder sb = new StringBuilder(); - - for(int r = 0; r < size; r++) { - if(isUpperTriangular) { - start = r; - end = size; - } - else 
{ - start = 0; - end = r + 1; - } - for(int c = start; c < end; c++) { - if(sampleMatrix[r][c] != 0) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + sampleMatrix[r][c]; - sb.append(rs); - if(r != size - 1 || c != size - 1) - sb.append("\n"); - } - } - } - sampleRaw = sb.toString(); - } - - // Index from 0 - @Test - public void test0_1() { - sampleRaw = "0,1,1\n" + "0,2,4\n" + "1,2,2\n" + "2,3,3"; - sampleMatrix = new double[][] {{0, 1, 4, 0}, {0, 0, 2, 0}, {0, 0, 0, 3}}; - runGenerateReaderTest(); - } - - @Test - public void test0_2() { - sampleRaw = "0,0,-1\n" + "0,1,1\n" + "0,2,2\n" + "0,3,3\n" + "1,0,4\n" + "1,1,5\n" + "1,2,6\n" + "1,3,7"; - sampleMatrix = new double[][] {{-1, 1, 2, 3}, {4, 5, 6, 7}}; - runGenerateReaderTest(); - } - - @Test - public void test0_3() { - sampleRaw = "0,0,-1\n" + "0,1,1\n" + "0,2,2.0\n" + "0,3,3.\n" + "1,0,4e0\n" + "1,1,5\n" + "1,2,6\n" + "1,3,7"; - sampleMatrix = new double[][] {{-1, 1, 2, 3}, {4, 5, 6, 7}}; - runGenerateReaderTest(); - } - - @Test - public void test0_4() { - sampleRaw = "0,0,-1\n" + "0,1,0.00001e5\n" + "0,2,2.\n" + "0,3,3\n" + "1,0,4e0\n" + "1,1,5\n" + "1,2,6\n" + "1,3,7"; - sampleMatrix = new double[][] {{-1, 1, 2, 3}, {4, 5, 6, 7}}; - runGenerateReaderTest(); - } - - @Test - public void test0_5() { - generateRandomMM(0, 5, 10, -100, 100, 1, ","); - runGenerateReaderTest(); - } - - @Test public void test0_6() { - generateRandomMM(0, 10, 10, -100, 100, 1, ","); - runGenerateReaderTest(); - } - - @Test - public void test0_7() { - generateRandomMM(0, 10, 10, -100, 100, 1, " ,"); - runGenerateReaderTest(); - } - - @Test - public void test0_8() { - generateRandomMM(0, 10, 10, -100, 100, 0.5, ","); - runGenerateReaderTest(); - } - - // Index from 1 - @Test - public void test1() { - sampleRaw = "1,1,1\n" + "1,2,4\n" + "2,2,2\n" + "3,3,3"; - sampleMatrix = new double[][] {{1, 4, 0}, {0, 2, 0}, {0, 0, 3}}; - runGenerateReaderTest(); - } - - @Test - public void test1_2() { - 
generateRandomMM(1, 500, 1000, -100, 100, 1, ",,,,,"); - runGenerateReaderTest(); - } - - // Symmetric Tests: - // Symmetric Index from 0 - @Test - public void SymmetricTest0_1() { - sampleRaw = "0,0,1\n" + "1,0,2\n" + "1,1,3\n" + "2,0,4\n" + "2,1,5\n" + "2,2,6\n" + "3,0,7\n" + "3,1,8\n" + "3,2,9\n" + "3,3,10\n"; - sampleMatrix = new double[][] {{1, 0, 0, 0}, {2, 3, 0, 0}, {4, 5, 6, 0}, {7, 8, 9, 10}}; - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest0_2() { - sampleRaw = "0,0,1\n" + "0,1,2\n" + "0,2,3\n" + "0,0,1\n" + "1,0,2\n" + "1,1,3\n" + "2,0,4\n" + "2,1,5\n" + "2,2,6\n" + "3,0,7\n" + "3,1,8\n" + "3,2,9\n" + "3,3,10\n"; - sampleMatrix = new double[][] {{1, 0, 0, 0}, {2, 3, 0, 0}, {4, 5, 6, 0}, {7, 8, 9, 10}}; - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest0_3() { - generateRandomSymmetricMM(0, 5, -5, 5, 1, ",", true, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest0_4() { - generateRandomSymmetricMM(0, 50, -100, 100, 1, " ", true, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest0_5() { - generateRandomSymmetricMM(0, 5, -5, 5, 1, ",", false, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest0_6() { - generateRandomSymmetricMM(0, 50, -100, 100, 1, " ", false, false); - runGenerateReaderTest(); - } - - // Symmetric Index from 1 - @Test - public void SymmetricTest1_1() { - generateRandomSymmetricMM(1, 5, -5, 5, 1, ",", true, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest1_2() { - generateRandomSymmetricMM(1, 50, -100, 100, 1, " ", true, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest1_3() { - generateRandomSymmetricMM(1, 50, -5, 5, 1, ",", false, false); - runGenerateReaderTest(); - } - - @Test - public void SymmetricTest1_4() { - generateRandomSymmetricMM(1, 70, -100, 100, 1, " ", false, false); - runGenerateReaderTest(); - } - - // Skew-Symmetric Tests: - // Skew-Symmetric Index from 
0 - @Test - public void SkewSymmetricTest0_1() { - generateRandomSymmetricMM(0, 5, -100, 100, 1, ",", false, true); - runGenerateReaderTest(); - } - - @Test - public void SkewSymmetricTest0_2() { - generateRandomSymmetricMM(0, 5, -100, 100, 1, " ", true, true); - runGenerateReaderTest(); - } - - // Skew-Symmetric Index from 1 - @Test - public void SkewSymmetricTest0_3() { - generateRandomSymmetricMM(1, 5, -100, 100, 1, ",", false, true); - runGenerateReaderTest(); - } - - @Test - public void SkewSymmetricTest0_4() { - generateRandomSymmetricMM(1, 5, -100, 100, 1, " ", true, true); - runGenerateReaderTest(); - } - -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java new file mode 100644 index 00000000000..5ea4c199c56 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.iogen; + +import org.junit.Test; + +public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { + + private final static String TEST_NAME = "MatrixSingleRowFlatTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + // CSV Dataset + // 1. matrix and dataset are dense and "," is delim + @Test + public void test1() { + sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; + sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; + runGenerateReaderTest(); + } + + // 2. matrix and dataset are dense and ",a" is delim + @Test + public void test2() { + sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; + sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; + runGenerateReaderTest(); + } + + //3. matrix and dataset are dense and ",," is delim + @Test + public void test3() { + sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; + sampleMatrix = new double[][] {{1, 3, 5}, {6, 8, 10}, {11, 13,15}}; + runGenerateReaderTest(); + } + + //4. matrix and dataset contain empty/0 values and "," is delim + @Test + public void test4() { + sampleRaw = "1,2,,4,5\n" + ",7,8,9,10\n" + "11,12,,,\n" + "13,14,,,16"; + sampleMatrix = new double[][] {{1, 2, 5}, {0, 7, 10}, {11, 12, 0}, {13, 14, 16}}; + runGenerateReaderTest(); + } + + // LibSVM + //5. LibSVM with in-order col indexes and numeric col indexes + @Test + public void test5() { + sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(); + } + + //6. 
LibSVM with out-of-order col indexes and numeric col indexes + @Test + public void test6() { + sampleRaw = "+1 3:30 1:10 2:20\n" + "-1 5:50 6:60 4:40\n" + "+1 1:101 2:201 \n" + + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(); + } + + //7. Special LibSVM with in-order col indexes and none-numeric col indexes + // a -> 1, b->2, c->3, d->4, e->5, f->6 + @Test + public void test7() { + sampleRaw = "+1 a:10 b:20 c:30\n" + "-1 d:40 e:50 f:60\n" + "+1 a:101 b:201 \n" + + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(); + } + + //8. Special LibSVM with out-of-order col indexes and none-numeric col indexes + // a -> 1, b->2, c->3, d->4, e->5, f->6 + @Test + public void test8() { + sampleRaw = "+1 c:30 a:10 b:20\n" + "-1 e:50 f:60 d:40\n" + "+1 a:101 b:201 \n" + + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, + {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(); + } + + // MatrixMarket(MM) + //9. MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) + @Test + public void test9() { + sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n"+ "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + + "4,5,504\n" + "5,3,305"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } + + //10. MM with inorder dataset, (RowIndex,Col Index,Value). 
Row begin index: Row & Col begin index: (0,1) + @Test + public void test10() { + sampleRaw = "0,1,10\n" + "0,2,20\n" + "0,3,30\n"+ "0,5,50\n" + "1,1,101\n" + "1,2,201\n" + "3,1,104\n" + + "3,5,504\n" + "4,3,305"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } + + //11. MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,0) + @Test + public void test11() { + sampleRaw = "1,0,10\n" + "1,1,20\n" + "1,2,30\n"+ "1,4,50\n" + "2,0,101\n" + "2,1,201\n" + "4,0,104\n" + + "4,4,504\n" + "5,2,305"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } + + //12. MM with inorder dataset, (RowIndex,Col Index,Value). Row begin index: Row & Col begin index: (0,0) + @Test + public void test12() { + sampleRaw = "0,0,10\n" + "0,1,20\n" + "0,2,30\n"+ "0,4,50\n" + "1,0,101\n" + "1,1,201\n" + "3,0,104\n" + + "3,4,504\n" + "4,2,305"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } + + //13. MM with out-of-order dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) + @Test + public void test13() { + sampleRaw = "4,5,504\n" + "1,2,20\n" + "1,1,10\n" + "2,1,101\n" + "1,3,30\n"+ "1,5,50\n" + "2,2,201\n" + "4,1,104\n" + + "5,3,305"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } + + //14. MM with out-of-order dataset, (ColIndex,Row Index, Value). 
Row & Col begin index: (1,1) + @Test + public void test14() { + sampleRaw = "5,4,504\n" + "2,1,20\n" + "1,1,10\n" + "1,2,101\n" + "3,1,30\n"+ "5,1,50\n" + "2,2,201\n" + "1,4,104\n" + + "3,5,305\n"+"2,4,204"; + sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 204, 0}, {0, 0, 305}}; + runGenerateReaderTest(); + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java new file mode 100644 index 00000000000..856d626c835 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.iogen; + +import org.junit.Test; + +public class MatrixSingleRowNestedTest extends GenerateReaderMatrixTest { + + private final static String TEST_NAME = "MatrixSingleRowFlatTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + // JSON Dataset + //1. 
flat object, in-order values + @Test + public void test1() { + sampleRaw = "{\"a\":1,\"b\":2,\"c\":3,\"d\":4,\"e\":5}\n" + + "{\"a\":6,\"b\":7,\"c\":8,\"d\":9,\"e\":10}\n" + + "{\"a\":11,\"b\":12,\"c\":13,\"d\":14,\"e\":15}"; + sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; + runGenerateReaderTest(); + } + + //2. flat object, out-of-order values + @Test + public void test2() { + sampleRaw = "{\"b\":2,\"a\":1,\"e\":5,\"c\":3,\"d\":4}\n" + + "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + + "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + //3. nested object with unique attribute names + @Test + public void test3() { + sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; + sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; + runGenerateReaderTest(); + } + + //4. nested object with unique attribute names, out-of-order + @Test + public void test4() { + sampleRaw = "{\"a\":1,\"f\":5,\"b\":{\"c\":2,\"d\":3,\"e\":4}}\n" + + "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + + "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; + sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; + runGenerateReaderTest(); + } + + //5. nested object with repeated attribute names, out-of-order + @Test + public void test5() { + sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + // XML + //6. 
nested object with unique attribute names, in-order + // single type of object, "article" is an object + @Test + public void test6() { + sampleRaw = "
12345
\n" + + "
678910
\n" + + "
1112131415
"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + //6. nested object with unique attribute names, in-order + // multi types of object, "article", "book", and "homepage" are the object types + @Test + public void test7() { + sampleRaw = "
12345
\n" + + "678910\n" + + "1112131415"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + //7. nested object with unique attribute names, in-order + // multi types of object, "article", "book", and "homepage" are the object types + @Test + public void test8() { + sampleRaw = "
122022GIO45
\n" + + "671980DB910\n" + + "11122012CEP1415\n"; + sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; + runGenerateReaderTest(); + } +} From b92a88c44474812652c2d5b40a257285c145caa1 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 04:12:05 +0100 Subject: [PATCH 13/84] removed an old code --- .../runtime/iogen/MatrixGenerateReader.java | 298 ------------------ 1 file changed, 298 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java deleted file mode 100644 index e628ca1cedb..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/MatrixGenerateReader.java +++ /dev/null @@ -1,298 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.mapred.JobConf; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -public abstract class MatrixGenerateReader extends MatrixReader { - - protected static CustomProperties _props; - protected final FastStringTokenizer fastStringTokenizerDelim; - - public MatrixGenerateReader(CustomProperties _props) { - MatrixGenerateReader._props = _props; - fastStringTokenizerDelim = new FastStringTokenizer(_props.getDelim()); - } - - protected MatrixBlock computeSize(List files, FileSystem fs, long rlen, long clen) - throws IOException, DMLRuntimeException { - // allocate target matrix block based on given size; - return new MatrixBlock(getNumRows(files, fs), (int) clen, rlen * clen); - } - - private static int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { - int rows = 0; - String value; - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); - try { - // Row Regular - if(_props.getRowPattern().equals(CustomProperties.GRPattern.Regular)) { - // TODO: check the file has header? 
- while(br.readLine() != null) - rows++; - } - // Row Irregular - else { - FastStringTokenizer st = new FastStringTokenizer(_props.getDelim()); - while((value = br.readLine()) != null) { - st.reset(value); - int row = st.nextInt(); - rows = Math.max(rows, row); - } - rows++; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - } - return rows; - } - - @Override - public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) - throws IOException, DMLRuntimeException { - - MatrixBlock ret = null; - if(rlen >= 0 && clen >= 0) //otherwise allocated on read - ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); - - //prepare file access - JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fs = IOUtilFunctions.getFileSystem(path, job); - - //core read - ret = readMatrixFromHDFS(path, job, fs, ret, rlen, clen, blen); - - return ret; - } - - @Override - public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) - throws IOException, DMLRuntimeException { - - MatrixBlock ret = null; - if(rlen >= 0 && clen >= 0) //otherwise allocated on read - ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); - - return ret; - } - - @SuppressWarnings("unchecked") - private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, - long clen, int blen) throws IOException, DMLRuntimeException { - //prepare file paths in alphanumeric order - ArrayList files = new ArrayList<>(); - if(fs.getFileStatus(path).isDirectory()) { - for(FileStatus stat : fs.listStatus(path, IOUtilFunctions.hiddenFileFilter)) - files.add(stat.getPath()); - Collections.sort(files); - } - else - files.add(path); - - //determine matrix size via additional pass if required - if(dest == null) { - dest = computeSize(files, fs, rlen, clen); - rlen = dest.getNumRows(); - //clen = 
dest.getNumColumns(); - } - - //actual read of individual files - long lnnz = 0; - MutableInt row = new MutableInt(0); - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - lnnz += readMatrixFromInputStream(fs.open(files.get(fileNo)), path.toString(), dest, row, rlen, clen, blen); - } - - //post processing - dest.setNonZeros(lnnz); - - return dest; - } - - protected abstract long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException; - - public static class MatrixReaderRowRegularColRegular extends MatrixGenerateReader { - - public MatrixReaderRowRegularColRegular(CustomProperties _props) { - super(_props); - } - - @Override - protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException { - - String value = null; - int row = rowPos.intValue(); - double cellValue = 0; - int col = 0; - long lnnz = 0; - fastStringTokenizerDelim.setNaStrings(_props.getNaStrings()); - - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - - //TODO: separate implementation for Sparse and Dens Matrix Blocks - - // Read the data - try { - while((value = br.readLine()) != null) //foreach line - { - fastStringTokenizerDelim.reset(value); - while(col != -1) { - cellValue = fastStringTokenizerDelim.nextDouble(); - col = fastStringTokenizerDelim.getIndex(); - if(cellValue != 0) { - dest.appendValue(row, col, cellValue); - lnnz++; - } - } - row++; - col = 0; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - - rowPos.setValue(row); - return lnnz; - } - } - - public static class MatrixReaderRowRegularColIrregular extends MatrixGenerateReader { - - public MatrixReaderRowRegularColIrregular(CustomProperties _props) { - super(_props); - } - - @Override - protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, 
long clen, int blen) throws IOException { - - String value = null; - int row = rowPos.intValue(); - double cellValue = 0; - int col = 0; - long lnnz = 0; - - final FastStringTokenizer fastStringTokenizerIndexDelim = new FastStringTokenizer(_props.getIndexDelim()); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - - //TODO: separate implementation for Sparse and Dens Matrix Blocks - - // Read the data - try { - while((value = br.readLine()) != null) //foreach line - { - fastStringTokenizerDelim.reset(value); - String cellValueString = fastStringTokenizerDelim.nextToken(); - cellValue = UtilFunctions.parseToDouble(cellValueString, null); - dest.appendValue(row, (int) clen-_props.getFirstColIndex()-1, cellValue); - - while(col != -1) { - String nt = fastStringTokenizerDelim.nextToken(); - if(fastStringTokenizerDelim.getIndex() == -1) - break; - fastStringTokenizerIndexDelim.reset(nt); - col = fastStringTokenizerIndexDelim.nextInt(); - cellValue = fastStringTokenizerIndexDelim.nextDouble(); - if(cellValue != 0) { - dest.appendValue(row, col-_props.getFirstColIndex(), cellValue); - lnnz++; - } - } - row++; - col = 0; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - - rowPos.setValue(row); - return lnnz; - } - } - - public static class MatrixReaderRowIrregular extends MatrixGenerateReader { - - public MatrixReaderRowIrregular(CustomProperties _props) { - super(_props); - } - - @Override - protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException { - String value = null; - int row = rowPos.intValue(); - double cellValue = 0; - int col = 0; - long lnnz = 0; - - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - - //TODO: separate implementation for Sparse and Dens Matrix Blocks - - // Read the data - try { - while((value = br.readLine()) != null) //foreach line - { - fastStringTokenizerDelim.reset(value); - int ri = 
fastStringTokenizerDelim.nextInt(); - col = fastStringTokenizerDelim.nextInt(); - cellValue = fastStringTokenizerDelim.nextDouble(); - - if(cellValue != 0) { - dest.appendValue(ri-_props.getFirstColIndex(), col-_props.getFirstColIndex(), cellValue); - lnnz++; - } - row = Math.max(row, ri); - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - rowPos.setValue(row); - return lnnz; - } - } -} From d866f10afbc4af6bcd7fe9dca9882ece5b6146bf Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 04:25:05 +0100 Subject: [PATCH 14/84] Minor --- .../runtime/iogen/codegen/CodeGenTrie.java | 45 +++++++------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 6fd1aa4fd8a..3fdaf20e1d5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -115,37 +115,22 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos if(node.getChildren().size() > 0) { String currPosVariable; -// if(node.getKey() != null) { -// currPosVariable = getRandomName("curPos"); -// src.append("index = str.indexOf(\"" + node.getKey().replace("\"", "\\\"") + "\", " + currPos + "); \n"); -// src.append("if(index != -1) { \n"); -// src.append("int " + currPosVariable + " = index + " + node.getKey().length() + "; \n"); -// currPos = currPosVariable; -// -// for(String key : node.getChildren().keySet()) { -// CodeGenTrieNode child = node.getChildren().get(key); -// getJavaCode(child, src, currPos); -// } -// src.append("} \n"); -// } -// else { - for(String key : node.getChildren().keySet()) { - if(key.length()>0){ - currPosVariable = getRandomName("curPos"); - if(node.getKey() == null) - src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\"); \n"); - else - src.append("index = 
str.indexOf(\"" + key.replace("\"", "\\\"") + "\", " + currPos + "); \n"); - src.append("if(index != -1) { \n"); - src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); - currPos = currPosVariable; - } - CodeGenTrieNode child = node.getChildren().get(key); - getJavaCode(child, src, currPos); - if(key.length()>0) - src.append("} \n"); + for(String key : node.getChildren().keySet()) { + if(key.length() > 0) { + currPosVariable = getRandomName("curPos"); + if(node.getKey() == null) + src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\"); \n"); + else + src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\", " + currPos + "); \n"); + src.append("if(index != -1) { \n"); + src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); + currPos = currPosVariable; } - //} + CodeGenTrieNode child = node.getChildren().get(key); + getJavaCode(child, src, currPos); + if(key.length() > 0) + src.append("} \n"); + } } } } From 89b2ae03504c53cdfdff12be90905967165a62a6 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 11:48:23 +0100 Subject: [PATCH 15/84] Fix a bug in ReaderMapping Add tests for Frame flat data --- .../apache/sysds/runtime/iogen/RawIndex.java | 9 +- .../runtime/iogen/codegen/FrameCodeGen.java | 3 +- .../iogen/FrameSingleRowFlatTest.java | 139 ++++++++++++++++++ .../iogen/GenerateReaderFrameTest.java | 11 +- 4 files changed, 152 insertions(+), 10 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index c1d537b8d42..faaca72d24d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -150,8 +150,13 @@ else if(i==rawLength-2){ public Pair findValue(Object value, Types.ValueType valueType){ 
if(valueType.isNumeric()) return findValue(UtilFunctions.getDouble(value)); - else if(valueType == Types.ValueType.STRING) - return findValue(UtilFunctions.objectToString(value)); + else if(valueType == Types.ValueType.STRING){ + String os = UtilFunctions.objectToString(value); + if(os == null || os.length() == 0) + return null; + else + return findValue(UtilFunctions.objectToString(value)); + } // else if(valueType == Types.ValueType.BOOLEAN) // return findValue(UtilFunctions.objectToString()) else diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index a77325da851..e7bc2c4066d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -64,7 +64,8 @@ public FrameCodeGen(CustomProperties properties, String className) { src.append("int row = rl; \n"); src.append("long lnnz = 0; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); - src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); + if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) + src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); src.append("int index, endPos, strLen; \n"); src.append("try { \n"); src.append("while(reader.next(key, value)){ \n"); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java new file mode 100644 index 00000000000..9ef503e234f --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.iogen; + +import org.apache.sysds.common.Types; +import org.junit.Test; + +public class FrameSingleRowFlatTest extends GenerateReaderFrameTest { + + private final static String TEST_NAME = "FrameSingleRowFlatTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + + // CSV: Frame + // 1. dataset contain INT32 values + @Test + public void test1() { + sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; + data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; + runGenerateReaderTest(); + } + + // 2. 
dataset contain different value types + @Test + public void test2() { + sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; + data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; + runGenerateReaderTest(); + } + + @Test + public void test3() { + sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; + data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + @Test + public void test4() { + sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; + data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + @Test + public void test5() { + sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; + data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.FP64, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + // CSV with empty values + @Test + public void test6() { + sampleRaw = "1,2,a,,c\n" + "6,,aa,bb,cc\n" + ",12,13,14,15"; + data = new String[][] {{"1", "2", ""}, {"6", "0", "bb"}, {"0", "12", "14"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + // LibSVM + // with in-order col indexes and numeric col indexes + @Test + public void test7() { + sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; + + data = new String[][] {{"1", "10", "20", "30", "0", "", ""}, + {"-1", "0", "0", "0", "40", "50", "60"}, + {"1", "101", "201", "0", "0", "", ""}, + {"-1", "0", "0", "0", "0", "", "601"}, + {"-1", "0", 
"0", "0", "0", "501", ""}, + {"1", "0", "0", "301", "0", "", ""}}; + + schema = new Types.ValueType[] {Types.ValueType.FP32, Types.ValueType.INT32, Types.ValueType.INT64, + Types.ValueType.FP32, Types.ValueType.FP64, Types.ValueType.STRING, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + @Test + public void test8() { + sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:a 6:b\n" + "+1 1:101 2:201 \n" + + "-1 6:c \n" + "-1 5:d\n" + "+1 3:301"; + + data = new String[][] {{"1", "10", "20", "30", "0", "", ""}, + {"-1", "0", "0", "0", "40", "a", "b"}, + {"1", "101", "201", "0", "0", "", ""}, + {"-1", "0", "0", "0", "0", "", "c"}, + {"-1", "0", "0", "0", "0", "d", ""}, + {"1", "0", "0", "301", "0", "", ""}}; + + schema = new Types.ValueType[] {Types.ValueType.FP32, Types.ValueType.INT32, Types.ValueType.INT64, + Types.ValueType.FP32, Types.ValueType.FP64, Types.ValueType.STRING, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + // MatrixMarket(MM) + //MM with inorder dataset, (RowIndex,Col Index,Value). 
Row & Col begin index: (1,1) + @Test + public void test9() { + sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n"+ "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + + "4,5,504\n" + "5,3,305"; + data = new String[][] {{"10","20","30"}, + {"101","201",""}, + {"0","0",""}, + {"104", "0", ""}, + {"0", "0", "305"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.FP64, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + + +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index 72becdf14a6..70e42765b22 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -53,8 +53,6 @@ public abstract class GenerateReaderFrameTest extends AutomatedTestBase { // Types.ValueType.BOOLEAN }; - protected Types.ValueType[] types1= { Types.ValueType.BOOLEAN}; - protected abstract String getTestName(); @Override public void setUp() { @@ -63,7 +61,6 @@ public abstract class GenerateReaderFrameTest extends AutomatedTestBase { } protected String getRandomString(int length) { - //String alphabet1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"; String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; StringBuilder salt = new StringBuilder(); Random rnd = new Random(); @@ -168,11 +165,11 @@ protected void runGenerateReaderTest() { TestConfiguration config = getTestConfiguration(getTestName()); loadTestConfiguration(config); - FrameBlock sampleFrame = new FrameBlock(schema, names, data); + FrameBlock sampleFrame = new FrameBlock(schema, data); String HOME = SCRIPT_DIR + TEST_DIR; File directory = new File(HOME); - if (! 
directory.exists()){ + if(!directory.exists()) { directory.mkdir(); } String dataPath = HOME + "frame_data.raw"; @@ -180,8 +177,8 @@ protected void runGenerateReaderTest() { writeRawString(sampleRaw, dataPath); GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr= gr.getReader(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath,schema,names,data.length, clen); + FrameReader fr = gr.getReader(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath, schema, data.length, clen); } catch(Exception exception) { From 0110eb92c1560d6f9ca627f6d65c7d29d001b4e3 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 12:19:23 +0100 Subject: [PATCH 16/84] Add tests for Frame nested data --- .../iogen/FrameSingleRowNestedTest.java | 80 +++++++++++++++++++ .../iogen/GenerateReaderFrameTest.java | 1 - .../iogen/GenerateReaderMatrixTest.java | 3 +- 3 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java new file mode 100644 index 00000000000..24d3a404c7c --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.functions.iogen; + +import org.apache.sysds.common.Types; +import org.junit.Test; + +public class FrameSingleRowNestedTest extends GenerateReaderFrameTest { + + private final static String TEST_NAME = "FrameSingleRowNestedTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + // JSON Dataset + //1. flat object, in-order values + @Test + public void test1() { + sampleRaw = "{\"a\":1,\"b\":2,\"c\":3,\"d\":4,\"e\":5}\n" + + "{\"a\":6,\"b\":7,\"c\":8,\"d\":9,\"e\":10}\n" + + "{\"a\":11,\"b\":12,\"c\":13,\"d\":14,\"e\":15}"; + + data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; + runGenerateReaderTest(); + } + + //2. flat object, out-of-order values, contain different value types + @Test + public void test2() { + sampleRaw = "{\"b\":\"string\",\"a\":\"1\",\"e\":5,\"c\":3,\"d\":4}\n" + + "{\"d\":9,\"b\":\"string2\",\"c\":8,\"a\":\"6\",\"e\":10}\n" + + "{\"d\":14,\"a\":\"11\",\"e\":15,\"b\":\"string3\",\"c\":13}"; + + data = new String[][] {{"1", "string"}, {"6", "string2"}, {"11", "string3"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; + runGenerateReaderTest(); + } + //3. 
nested object with unique attribute names + @Test + public void test3() { + sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; + data = new String[][] {{"1", "2", "5"}, {"6", "7", "10"}, {"11", "12", "15"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64}; + runGenerateReaderTest(); + } + + //5. nested object with repeated attribute names, out-of-order + @Test + public void test5() { + sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64, + Types.ValueType.FP32, Types.ValueType.INT64}; + data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; + runGenerateReaderTest(); + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index 70e42765b22..8b113c978db 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -179,7 +179,6 @@ protected void runGenerateReaderTest() { FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath, schema, data.length, clen); - } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index 94cc2ea7044..ce1b5d8a04f 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java 
+++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -63,7 +63,8 @@ protected void generateRandomSymmetric(int size, double min, double max, double } } - @SuppressWarnings("unused") protected void runGenerateReaderTest() { + @SuppressWarnings("unused") + protected void runGenerateReaderTest() { Types.ExecMode oldPlatform = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; From a828d141cd4cd051765b9d942eade02ae3e11eca Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 13:00:02 +0100 Subject: [PATCH 17/84] Fix Code Style --- .../sysds/runtime/iogen/CustomProperties.java | 3 +- .../sysds/runtime/iogen/MappingTrieNode.java | 3 +- .../apache/sysds/runtime/iogen/RawIndex.java | 31 ------------------- .../iogen/codegen/CodeGenTrieNode.java | 3 +- .../runtime/iogen/codegen/FrameCodeGen.java | 6 ++-- .../runtime/iogen/codegen/MatrixCodeGen.java | 3 +- .../iogen/template/MatrixGenerateReader.java | 9 ++++-- .../iogen/GenerateReaderMatrixTest.java | 3 +- 8 files changed, 20 insertions(+), 41 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index a6933f70864..28f7e992d8b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -28,7 +28,8 @@ public class CustomProperties extends FileFormatProperties implements Serializab public enum IndexProperties { IDENTIFY, PREFIX, KEY; - @Override public String toString() { + @Override + public String toString() { return this.name().toUpperCase(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java index defab3cab37..2c1191818c7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrieNode.java @@ -27,7 +27,8 @@ public class MappingTrieNode { public enum Type { INNER, END, IGNORE; - @Override public String toString() { + @Override + public String toString() { return this.name().toUpperCase(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index faaca72d24d..597fbf3a89e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -302,37 +302,6 @@ private void addActualValueToList(String stringValue, Integer position, HashMap< } } - public void printBitSets() { - // String numberBitSetStrng; - String dotBitSetString=""; - String eBitSetString=""; - String plusMinusBitSetString=""; - // String minusBitSetStrng; - // for(int i=0; i reader = informat.getRecordReader(split, job, Reporter.NULL); \n"); @@ -84,7 +85,8 @@ public FrameCodeGen(CustomProperties properties, String className) { return javaTemplate.replace(code, src.toString()); } - @Override public String generateCodeCPP() { + @Override + public String generateCodeCPP() { return null; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 1ff4a017fbf..0c0a0e70d3f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -81,7 +81,8 @@ public String generateCodeJava() { return javaTemplate.replace(code, src.toString()); } - @Override public String generateCodeCPP() { + @Override + public String generateCodeCPP() { return null; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index 7cbba447452..8b42aafc79c 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -76,7 +76,8 @@ private static int getNumRows(List files, FileSystem fs) throws IOExceptio return rows; } - @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) + @Override + public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) throws IOException, DMLRuntimeException { MatrixBlock ret = null; @@ -94,7 +95,8 @@ private static int getNumRows(List files, FileSystem fs) throws IOExceptio return ret; } - @Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) + @Override + public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) throws IOException, DMLRuntimeException { MatrixBlock ret = null; @@ -104,7 +106,8 @@ private static int getNumRows(List files, FileSystem fs) throws IOExceptio return ret; } - @SuppressWarnings("unchecked") private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, + @SuppressWarnings("unchecked") + private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen, long clen, int blen) throws IOException, DMLRuntimeException { //prepare file paths in alphanumeric order diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index ce1b5d8a04f..b07e9c20203 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -44,7 +44,8 @@ public abstract class GenerateReaderMatrixTest extends AutomatedTestBase { protected abstract String getTestName(); - @Override public void setUp() 
{ + @Override + public void setUp() { TestUtils.clearAssertionInformation(); addTestConfiguration(getTestName(), new TestConfiguration(TEST_DIR, getTestName(), new String[] {"Y"})); } From 030f15e4cdd5bec6d3adfc58ddf7dbd98100456f Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 13:09:09 +0100 Subject: [PATCH 18/84] Minor rollback --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 97541ce5169..51f38c2b8fd 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ true ** false - -Xms3000m -Xmx9000m -Xmn300m + -Xms3000m -Xmx3000m -Xmn300m false From 87c4e8cc924a1f40023162dd42efb3ecd5225895 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 25 Jan 2022 16:38:05 +0100 Subject: [PATCH 19/84] Minor --- .../apache/sysds/runtime/iogen/GenerateReader.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 71e1da73041..bdc38a7791f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -29,6 +29,7 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import java.util.Random; /* Generate Reader has two steps: @@ -62,6 +63,15 @@ public GenerateReader(SampleProperties sampleProperties) throws Exception { } } + public String getRandomClassName() { + Random r = new Random(); + int low = 0; + int high = 100000000; + int result = r.nextInt(high - low) + low; + + return "GIOReader_" + result; + } + // Generate Reader for Matrix public static class GenerateReaderMatrix extends GenerateReader { @@ -77,7 +87,7 @@ public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix) throws E } public MatrixReader getReader() throws Exception { - String className = 
"GIOMatrixReader"; + String className = getRandomClassName(); MatrixCodeGen src = new MatrixCodeGen(properties, className); // constructor with arguments as CustomProperties @@ -102,7 +112,7 @@ public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame) throws Exce } public FrameReader getReader() throws Exception { - String className = "GIOFrameReader"; + String className = getRandomClassName(); FrameCodeGen src = new FrameCodeGen(properties, className); // constructor with arguments as CustomProperties From ee8c972711424e2fe9d04009898a899453875d57 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 26 Jan 2022 17:52:46 +0100 Subject: [PATCH 20/84] Init commit for multi row CodeGen --- .../sysds/runtime/iogen/CustomProperties.java | 6 + .../runtime/iogen/FormatIdentifying.java | 109 ++++++++++++- .../sysds/runtime/iogen/Hirschberg.java | 150 ++++++++++++++++++ .../sysds/runtime/iogen/MappingTrie.java | 1 + .../runtime/iogen/codegen/CodeGenTrie.java | 3 +- 5 files changed, 266 insertions(+), 3 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 28f7e992d8b..cba8a71fdf5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -45,6 +45,12 @@ public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex) { this.rowIndex = rowIndex; } + public CustomProperties(KeyTrie[] colKeyPattern, KeyTrie rowKeyPattern) { + this.colKeyPattern = colKeyPattern; + this.rowIndex = IndexProperties.KEY; + this.rowKeyPattern = rowKeyPattern; + } + public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex, KeyTrie rowKeyPattern) { this.colKeyPattern = colKeyPattern; this.rowIndex = rowIndex; diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index b73a6bf9a6c..a2fb9ed3791 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.BitSet; +import java.util.HashMap; import java.util.HashSet; public class FormatIdentifying { @@ -224,8 +225,9 @@ private void runIdentification() { properties.setRowIndexBegin(""); } else { + KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); colKeyPattern = buildColsKeyPatternMultiRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.KEY); + properties = new CustomProperties(colKeyPattern, rowDelimPattern); } } } @@ -349,6 +351,111 @@ private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { // 2. Build key pattern tree for each column // 3. Build key pattern for end of values + private ArrayList> findRowDelimiters(){ + ArrayList> keyPattern = new ArrayList<>(); + Hirschberg hirschberg = new Hirschberg(); + int misMatchPenalty = 3; + int gapPenalty = 2; + + //extract all lines are in record boundary + ArrayList recordBoundaries = new ArrayList<>(); + BitSet[] tmpUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + int[] minList = new int[nrows]; + HashMap maxColPos = new HashMap<>(); + int[] minColPos = new int[nrows]; + for(int r=0; r= 0; beginLine--) + if(usedLines[r].get(beginLine)) + break; + + StringBuilder sb = new StringBuilder(); + beginLine = Math.max(beginLine, 0); + + if(beginLine+1 == nlines) + continue; + + Integer subStrPos = 0; + if(maxColPos.containsKey(beginLine)) + subStrPos = maxColPos.get(beginLine); + + String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); + if(str.length() >0) { + sb.append(str).append("\n"); + } + for(int i = beginLine+1 ; i < minList[r]; i++){ + 
str = sampleRawIndexes.get(i).getRaw(); + if(str.length() > 0) + sb.append(str).append("\n"); + } + + str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); + if(str.length() > 0) + sb.append(str); + System.out.println(beginLine+" "+nlines+" "+minList[r]); + recordBoundaries.add(sb.toString()); + } + recordBoundaries.remove(recordBoundaries.size()-1); + + String str1 = recordBoundaries.get(0); + String str2 = recordBoundaries.get(1); + Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); + if(pattern != null) { + String intersect = pattern.getValue(); + ArrayList intersectPattern = pattern.getKey(); + for(int i = 2; i < recordBoundaries.size(); i++) { + pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); + if(pattern != null) { + intersect = pattern.getValue(); + intersectPattern = pattern.getKey(); + } + else + intersect = null; + } + if(intersect != null && intersect.length() > 0) { + keyPattern.add(intersectPattern); + return keyPattern; + } + } + return null; + } + + // Build key pattern tree for each column private KeyTrie[] buildColsKeyPatternMultiRow(){ Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java new file mode 100644 index 00000000000..8df88b61a5f --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import org.apache.sysds.runtime.matrix.data.Pair; + +import java.util.ArrayList; +import java.util.Arrays; + +public class Hirschberg { + + public Pair, String> getLCS(String x, String y, int pxy, int pgap) { + int i, j; // initialising variables + int m = x.length(); // length of gene1 + int n = y.length(); // length of gene2 + + // table for storing optimal substructure answers + int dp[][] = new int[n + m + 1][n + m + 1]; + + for(int[] x1 : dp) + Arrays.fill(x1, 0); + + // initialising the table + for(i = 0; i <= (n + m); i++) { + dp[i][0] = i * pgap; + dp[0][i] = i * pgap; + } + + // calculating the minimum penalty + for(i = 1; i <= m; i++) { + for(j = 1; j <= n; j++) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } + else { + dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); + } + } + } + + // Reconstructing the solution + int l = n + m; // maximum possible length + i = m; + j = n; + int xpos = l; + int ypos = l; + + // Final answers for the respective strings + int xans[] = new int[l + 1]; + int yans[] = new int[l + 1]; + + while(!(i == 0 || j == 0)) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j - 1] + pxy == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j] + pgap == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 
1); + yans[ypos--] = (int) '_'; + i--; + } + else if(dp[i][j - 1] + pgap == dp[i][j]) { + xans[xpos--] = (int) '_'; + yans[ypos--] = (int) y.charAt(j - 1); + j--; + } + } + while(xpos > 0) { + if(i > 0) + xans[xpos--] = (int) x.charAt(--i); + else + xans[xpos--] = (int) '_'; + } + while(ypos > 0) { + if(j > 0) + yans[ypos--] = (int) y.charAt(--j); + else + yans[ypos--] = (int) '_'; + } + // Since we have assumed the answer to be n+m long, we need to remove the extra + // gaps in the starting id represents the index from which the arrays xans, yans are useful + int id = 1; + for(i = l; i >= 1; i--) { + if((char) yans[i] == '_' && (char) xans[i] == '_') { + id = i + 1; + break; + } + } + + StringBuilder sb = new StringBuilder(); + ArrayList pattern = new ArrayList<>(); + for(i = id; i <= l; i++) { + if(xans[i] == yans[i]) + sb.append((char) xans[i]); + else { + if(sb.length() > 0) + pattern.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if(sb.length() > 0) + pattern.add(sb.toString()); + + // System.out.println(""); + // for(i = id; i <= l; i++) + // System.out.print((char) yans[i]); + // + sb = new StringBuilder(); + for(int bi = id; bi <= l; bi++) { + if(xans[bi] == yans[bi]) { + sb.append((char) xans[bi]); + //System.out.print((char) xans[bi]); + } + //else + //System.out.print("*"); + } + if(sb.length() > 0) + return new Pair<>(pattern, sb.toString()); + else + return null; + } +} + + diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 433da115b7f..a223295cbf5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -194,6 +194,7 @@ public String getIntersectOfChildren(MappingTrieNode node) { } else if(lastCount == 0) lastCount = count; + } if(lastCount != 0) return ssi; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 3fdaf20e1d5..7ed13884815 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -90,8 +90,7 @@ public String getJavaCode(){ getJavaCode(rootCol, src, "0"); break; case KEY: - // TODO: Generate code for split stream as records - // and then increase the row number + getJavaCode(rootCol, src, "0"); break; } From cc0394e5ff8f20029c55b620a8d5579685b841dc Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 26 Jan 2022 21:14:14 +0100 Subject: [PATCH 21/84] Update multi row CodeGen --- .../runtime/iogen/FormatIdentifying.java | 1 - .../sysds/runtime/iogen/GenerateReader.java | 1 + .../runtime/iogen/codegen/CodeGenTrie.java | 43 ++++++++++++++++++- .../runtime/iogen/codegen/MatrixCodeGen.java | 16 +++---- .../iogen/template/MatrixGenerateReader.java | 35 +++++++++++++++ 5 files changed, 86 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index a2fb9ed3791..0347a349735 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -427,7 +427,6 @@ private ArrayList> findRowDelimiters(){ str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); if(str.length() > 0) sb.append(str); - System.out.println(beginLine+" "+nlines+" "+minList[r]); recordBoundaries.add(sb.toString()); } recordBoundaries.remove(recordBoundaries.size()-1); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index bdc38a7791f..83013d1cf64 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -93,6 +93,7 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; + String js = src.generateCodeJava(); matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 7ed13884815..e37e61ad9bf 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -90,8 +90,27 @@ public String getJavaCode(){ getJavaCode(rootCol, src, "0"); break; case KEY: - + src.append("String strChunk, remainedStr = null; \n"); + src.append("int chunkSize = 2048; \n"); + src.append("int recordIndex = 0; \n"); + src.append("try { \n"); + src.append("do{ \n"); + src.append("strChunk = getStringChunkOfBufferReader(br, remainedStr, chunkSize); \n"); + src.append("System.out.println(strChunk); \n"); + src.append("if(strChunk == null || strChunk.length() == 0) break; \n"); + src.append("do { \n"); + ArrayList> kp = properties.getRowKeyPattern().getPrefixKeyPatterns(); + getJavaRowCode(src, kp, kp); getJavaCode(rootCol, src, "0"); + src.append("row++; \n"); + src.append("}while(true); \n"); + src.append("remainedStr = strChunk.substring(recordIndex); \n"); + + src.append("}while(true); \n"); + src.append("} \n"); + src.append("finally { \n"); + src.append("IOUtilFunctions.closeSilently(br); \n"); + src.append("} \n"); break; } @@ -132,4 +151,26 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos } } } + + private void getJavaRowCode(StringBuilder src, ArrayList> rowBeginPattern, + ArrayList> rowEndPattern){ + + // TODO: we 
have to extend it to multi patterns + // now, we assumed each row can have single pattern for begin and end + + for(ArrayList kb: rowBeginPattern){ + for(String k: kb){ + src.append("recordIndex = strChunk.indexOf(\""+k+"\", recordIndex); \n"); + src.append("if(recordIndex == -1) break; \n"); + } + src.append("recordIndex +="+ kb.get(kb.size() -1).length()+"; \n"); + break; + } + src.append("int recordBeginPos = recordIndex; \n"); + String endKey = rowEndPattern.get(0).get(0); + src.append("recordIndex = strChunk.indexOf(\""+endKey+"\", recordBeginPos);"); + src.append("if(recordIndex == -1) break; \n"); + src.append("str = strChunk.substring(recordBeginPos, recordIndex); \n"); + src.append("strLen = str.length(); \n"); + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 0c0a0e70d3f..272bd8b22b6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -63,18 +63,18 @@ public String generateCodeJava() { src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); - src.append("try { \n"); - src.append("while((str = br.readLine()) != null){ \n"); - src.append("strLen = str.length(); \n"); +// src.append("try { \n"); +// src.append("while((str = br.readLine()) != null){ \n"); +// src.append("strLen = str.length(); \n"); CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); src.append(trie.getJavaCode()); - src.append("} \n"); - src.append("} \n"); - src.append("finally { \n"); - src.append("IOUtilFunctions.closeSilently(br); \n"); - src.append("}"); +// src.append("} \n"); +// src.append("} \n"); +// src.append("finally { \n"); +// 
src.append("IOUtilFunctions.closeSilently(br); \n"); +// src.append("}"); src.append("rowPos.setValue(row); \n"); src.append("return lnnz; \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index 8b42aafc79c..ace1da89756 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -161,4 +161,39 @@ protected int getEndPos(String str, int strLen, int currPos, HashSet end } return endPos; } + + //src.append("String str; \n"); + // src.append("int row = rowPos.intValue(); \n"); + // src.append("long lnnz = 0; \n"); + // src.append("int index, endPos, strLen; \n"); + // src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); + // src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + // if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) + // src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); + // src.append("try { \n"); + // src.append("while((str = br.readLine()) != null){ \n"); + // src.append("strLen = str.length(); \n"); + + protected String getStringChunkOfBufferReader(BufferedReader br, String remainedStr,int chunkSize){ + StringBuilder sb = new StringBuilder(); + String str; + int readSize = 0; + try { + while((str = br.readLine()) != null && readSize0) { + if(remainedStr!=null && remainedStr.length() >0) + return remainedStr + sb; + else + return sb.toString(); + } + else + return null; + } } From f9d8f89a7e4f11967414dcf1225e65059f2e42e9 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 27 Jan 2022 13:58:34 +0100 Subject: [PATCH 22/84] support duplicate values --- .../runtime/iogen/FormatIdentifying.java | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 0347a349735..0a47beaa8ef 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -259,6 +259,53 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); KeyTrie[] colKeyPattens = new KeyTrie[ncols]; + // Clean prefix strings + for(int c =0; c< ncols; c++) { + ArrayList list = prefixStrings.getKey()[c]; + String token = null; + boolean flag = true; + for(int w = 1; w < windowSize && flag; w++) { + HashSet wts = new HashSet<>(); + for(String s : list) { + if(s.length() < w) + flag = false; + else + wts.add(s.substring(s.length()-w)); + } + + if(flag) { + if(wts.size() == 1) + token = wts.iterator().next(); + else { + for(String t : wts) { + int count = 0; + for(String s : list) { + if(s.endsWith(t)) + count++; + } + float percent = (float) count / list.size(); + if(percent >= 0.9) + token = t; + } + } + } + else if(wts.size() == 0) + token = ""; + } + if(token == null) + throw new RuntimeException("can't build a key pattern for the column: "+ c); + + if(token.length() > 0){ + ArrayList newList = new ArrayList<>(); + for(String s: list){ + if(s.endsWith(token)) + newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + } + + for(int c=0; c Date: Thu, 27 Jan 2022 11:50:12 +0100 Subject: [PATCH 23/84] Experiments VLDB2022 --- .../runtime/iogen/FormatIdentifying.java | 48 +-- .../sysds/runtime/iogen/GenerateReader.java | 2 + .../runtime/iogen/codegen/CodeGenTrie.java | 5 + .../sysds/runtime/iogen/codegen/myTest.java | 102 ++++++ .../iogen/exp/GIOFrameExperimentHDFS.java | 52 +++ .../iogen/exp/GIOGenerateRapidJSONCode.java | 81 +++++ .../iogen/exp/GIONestedExperimentHDFS.java | 79 +++++ .../iogen/exp/GIONestedExperimentStream.java | 85 +++++ 
.../iogen/exp/SYSDSFrameExperimentHDFS.java | 50 +++ .../apache/sysds/runtime/iogen/exp/Util.java | 99 ++++++ .../sysds/runtime/iogen/exp/resultPath.sh | 13 + .../sysds/runtime/iogen/exp/runGIOFrameExp.sh | 55 +++ .../iogen/exp/runGIOGenerateRapidJSONCode.sh | 53 +++ .../runtime/iogen/exp/runGIONestedExp.sh | 54 +++ .../runtime/iogen/exp/runSYSDSFrameExp.sh | 45 +++ .../iogen/FrameSingleRowNestedTest.java | 11 + .../iogen/GenerateReaderMatrixTest.java | 18 +- .../Identify/FrameGenerateReaderCSVTest.java | 121 +++++++ .../Identify/MatrixGRRowColIdentifyTest.java | 318 ++++++++++++++++++ .../iogen/MatrixMultiRowNestedTest.java | 128 +++++++ 20 files changed, 1367 insertions(+), 52 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentHDFS.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentStream.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java create mode 100644 
src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 0a47beaa8ef..c8bf03afea7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -259,54 +259,10 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - // Clean prefix strings - for(int c =0; c< ncols; c++) { - ArrayList list = prefixStrings.getKey()[c]; - String token = null; - boolean flag = true; - for(int w = 1; w < windowSize && flag; w++) { - HashSet wts = new HashSet<>(); - for(String s : list) { - if(s.length() < w) - flag = false; - else - wts.add(s.substring(s.length()-w)); - } - - if(flag) { - if(wts.size() == 1) - token = wts.iterator().next(); - else { - for(String t : wts) { - int count = 0; - for(String s : list) { - if(s.endsWith(t)) - count++; - } - float percent = (float) count / list.size(); - if(percent >= 0.9) - token = t; - } - } - } - else if(wts.size() == 0) - token = ""; - } - if(token == null) - throw new RuntimeException("can't build a key pattern for the column: "+ c); - - if(token.length() > 0){ - ArrayList newList = new ArrayList<>(); - for(String s: list){ - if(s.endsWith(token)) - newList.add(s); - } - prefixStrings.getKey()[c] = newList; - } - } + // clean prefix strings - for(int c=0; c keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java new file mode 100644 index 00000000000..1a2d1a58ce9 --- 
/dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java @@ -0,0 +1,102 @@ +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.commons.lang.mutable.MutableInt; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.HashSet; + +public class myTest extends MatrixGenerateReader { + public myTest(CustomProperties _props) { + super(_props); + } + + @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, + MutableInt rowPos, long rlen, long clen, int blen) throws IOException { + + String str; + int row = rowPos.intValue(); + long lnnz = 0; + int index, endPos, strLen; + HashSet[] endWithValueString = _props.endWithValueStrings(); + BufferedReader br = new BufferedReader(new InputStreamReader(is)); + String strChunk, remainedStr = null; + int chunkSize = 2048; + int recordIndex = 0; + try { + do{ + strChunk = getStringChunkOfBufferReader(br, remainedStr, chunkSize); + System.out.println(strChunk); + if(strChunk == null || strChunk.length() == 0) break; + do { + recordIndex = strChunk.indexOf("#index ", recordIndex); + if(recordIndex == -1) break; + //recordIndex +=7; + int recordBeginPos = recordIndex; + recordIndex = strChunk.indexOf("#index ", recordBeginPos + 7);if(recordIndex == -1) break; + str = strChunk.substring(recordBeginPos, recordIndex); + strLen = str.length(); + index = str.indexOf(" "); + if(index != -1) { + int curPos_75540091 = index + 1; + endPos = getEndPos(str, strLen, curPos_75540091, endWithValueString[1]); + String cellStr1 = str.substring(curPos_75540091,endPos); + if ( cellStr1.length() > 0 ){ + Double cellValue1; + try{ cellValue1= 
Double.parseDouble(cellStr1); } catch(Exception e){cellValue1 = 0d;} + if(cellValue1 != 0) { + dest.appendValue(row, 1, cellValue1); + lnnz++; + } + } + } + index = str.indexOf("#index "); + if(index != -1) { + int curPos_50855160 = index + 7; + endPos = getEndPos(str, strLen, curPos_50855160, endWithValueString[0]); + String cellStr0 = str.substring(curPos_50855160,endPos); + if ( cellStr0.length() > 0 ){ + Double cellValue0; + try{ cellValue0= Double.parseDouble(cellStr0); } catch(Exception e){cellValue0 = 0d;} + if(cellValue0 != 0) { + dest.appendValue(row, 0, cellValue0); + lnnz++; + } + } + } + index = str.indexOf("#index 1"); + if(index != -1) { + int curPos_36575074 = index + 8; + index = str.indexOf(",", curPos_36575074); + if(index != -1) { + int curPos_13302308 = index + 1; + endPos = getEndPos(str, strLen, curPos_13302308, endWithValueString[2]); + String cellStr2 = str.substring(curPos_13302308,endPos); + if ( cellStr2.length() > 0 ){ + Double cellValue2; + try{ cellValue2= Double.parseDouble(cellStr2); } catch(Exception e){cellValue2 = 0d;} + if(cellValue2 != 0) { + dest.appendValue(row, 2, cellValue2); + lnnz++; + } + } + } + } + row++; + }while(true); + remainedStr = strChunk.substring(recordIndex); + }while(true); + } + finally { + IOUtilFunctions.closeSilently(br); + } + rowPos.setValue(row); + return lnnz; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java new file mode 100755 index 00000000000..253bfec1a32 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java @@ -0,0 +1,52 @@ +package org.apache.sysds.runtime.iogen.exp; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.FrameReaderTextCSV; +import org.apache.sysds.runtime.iogen.GenerateReader; 
+import org.apache.sysds.runtime.matrix.data.FrameBlock; + +public class GIOFrameExperimentHDFS { + + public static void main(String[] args) throws Exception { + + String sampleRawFileName = args[0]; + String sampleFrameFileName = args[1]; + Integer sampleNRows = Integer.parseInt(args[2]); + String delimiter = args[3]; + String schemaFileName = args[4]; + String dataFileName = args[5]; + + Float percent = Float.parseFloat(args[6]); + String datasetName = args[7]; + String LOG_HOME =args[8]; + + if(delimiter.equals("\\t")) + delimiter = "\t"; + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + + FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); + FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); + FrameBlock sampleFrame = csv.readFrameFromHDFS(sampleFrameFileName, sampleSchema,-1,ncols); + + System.out.println(">>> "+ sampleFrame.getNumRows()+" "+ sampleFrame.getNumColumns()); + +// double tmpTime = System.nanoTime(); +// String sampleRaw = util.readEntireTextFile(sampleRawFileName); +// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); +// FrameReader fr = gr.getReader(); +// double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; +// +// tmpTime = System.nanoTime(); +// FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); +// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; +// +// String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+","+percent+","+ sampleNRows+","+ generateTime+","+readTime; +// util.addLog(LOG_HOME, log); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java new file mode 100755 index 00000000000..008ee5c92a2 --- /dev/null +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java @@ -0,0 +1,81 @@ +package org.apache.sysds.runtime.iogen.exp; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +import java.util.ArrayList; +import java.util.HashSet; + +public class GIOGenerateRapidJSONCode { + + public static void main(String[] args) throws Exception { + +// String sampleRawFileName = args[0]; +// String sampleFrameFileName = args[1]; +// Integer sampleNRows = Integer.parseInt(args[2]); +// String delimiter = args[3]; +// String schemaFileName = args[4]; +// String baseSrc = args[5]; +// +// Float percent = Float.parseFloat(args[6]); +// String datasetName = args[7]; +// String LOG_HOME =args[8]; +// +// +// if(delimiter.equals("\\t")) +// delimiter = "\t"; +// +// Util util = new Util(); +// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); +// int ncols = sampleSchema.length; +// +// ArrayList newSampleSchema = new ArrayList<>(); +// ArrayList> newSampleFrame = new ArrayList<>(); +// +// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); +// +// for(int c = 0; c < sampleFrameStrings[0].length; c++) { +// HashSet valueSet = new HashSet<>(); +// for(int r=0; r3){ +// ArrayList tempList = new ArrayList<>(); +// for(int r=0; r newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + + for(int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for(int r=0; r3){ + ArrayList tempList = new ArrayList<>(); + for(int r=0; r newSampleSchema = new ArrayList<>(); +// ArrayList> newSampleFrame = new ArrayList<>(); +// +// String[][] sampleFrameStrings = 
util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); +// +// for(int c = 0; c < sampleFrameStrings[0].length; c++) { +// HashSet valueSet = new HashSet<>(); +// for(int r=0; r3){ +// ArrayList tempList = new ArrayList<>(); +// for(int r=0; r> "+ schema.length); + + double tmpTime = System.nanoTime(); + FrameBlock frameBlock; + if(datasetName.equals("csv")) { + FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); + FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); + frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); + } + else if(datasetName.equals("mm")) { + FileFormatPropertiesMM mmpro = new FileFormatPropertiesMM(); + FrameReaderTextCell mm =new FrameReaderTextCell(); + frameBlock = mm.readFrameFromHDFS(dataFileName, schema, nrows, ncols); + } + else + throw new RuntimeException("Format not support!"); + + double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; + util.addLog(LOG_HOME, log); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java new file mode 100755 index 00000000000..f7236010f2a --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java @@ -0,0 +1,99 @@ +package org.apache.sysds.runtime.iogen.exp; + +import org.apache.sysds.common.Types; + +import java.io.BufferedReader; +import java.io.BufferedWriter; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +public class Util { + + // Load Random 2D data from file + private double[][] load2DData(String 
fileName, int nrows, int ncols) throws Exception { + + Path path = Paths.get(fileName); + FileChannel inStreamRegularFile = FileChannel.open(path); + int bufferSize = ncols * 8; + + double[][] result = new double[nrows][ncols]; + try { + for(int r = 0; r < nrows; r++) { + inStreamRegularFile.position((long) r * ncols * 8); + ByteBuffer buffer = ByteBuffer.allocateDirect(bufferSize); + inStreamRegularFile.read(buffer); + buffer.flip(); + + for(int c = 0; c < ncols; c++) { + result[r][c] = buffer.getDouble(); + } + } + inStreamRegularFile.close(); + } + catch(IOException e) { + throw new Exception("Can't read matrix from ByteArray", e); + } + return result; + } + + public String readEntireTextFile(String fileName) throws IOException { + String text = new String(Files.readAllBytes(Paths.get(fileName)), StandardCharsets.UTF_8); + return text; + } + + public void createLog(String fileName, String text) throws IOException { + BufferedWriter writer = new BufferedWriter(new FileWriter(fileName)); + writer.write(text); + writer.write("\n"); + writer.close(); + } + + public void addLog(String fileName, String log) { + try(Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName, true), "utf-8"))) { + writer.write(log); + writer.write("\n"); + } + catch(Exception ex) { + } + } + + public Types.ValueType[] getSchema(String fileName) throws IOException { + String[] sschema = readEntireTextFile(fileName).trim().split(","); + Types.ValueType[] result = new Types.ValueType[sschema.length]; + for(int i = 0; i < sschema.length; i++) + result[i] = Types.ValueType.valueOf(sschema[i]); + return result; + } + + public String[][] loadFrameData(String fileName, int nrows, int ncols, String delimiter) + throws IOException { + String[][] result = new String[nrows][ncols]; + + try(BufferedReader br = new BufferedReader(new FileReader(fileName))) { + String line; + int row = 0; + while((line = br.readLine()) != null) { + String[] data = line.split(delimiter); + 
for(int i = 0; i < data.length; i++) { + String[] value = data[i].split("::"); + if(value.length ==2) { + int col = Integer.parseInt(value[0]); + result[row][col] = value[1]; + } + } + row++; + } + } + return result; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh new file mode 100755 index 00000000000..ce1741ac469 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +path_1="$1/benchmark" +path_2="$path_1/$3" +mkdir -p $1 +mkdir -p "$path_1" +mkdir -p "$path_2" + +log_file="$path_2/$2.csv" +if test ! -f "$log_file"; then + touch $log_file + echo "dataset,data_nrows,data_ncols,col_index_percent,sample_nrows,generate_time,read_time" > $log_file +fi diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh new file mode 100755 index 00000000000..699bd64cc3d --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# Set properties +systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" +LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" +jar_file_path="$systemDS_Home/target/SystemDS.jar" +lib_files_path="$systemDS_Home/target/lib/*" +root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" +home_log="/media/sfathollahzadeh/Windows1/saeedData/FlatDatasets/LOG" +sep="_" +ncols=7 +result_path="GIOFrameExperiment" +declare -a datasets=("aminer_author") + +BASE_SCRIPT="time java\ + -Dlog4j.configuration=file:$LOG4JPROP\ + -Xms1g\ + -Xmx15g\ + -cp\ + $jar_file_path:$lib_files_path\ + org.apache.sysds.runtime.iogen.exp.GIOFrameExperimentHDFS\ + " + +for ro in 1 #2 3 4 5 +do + for d in "${datasets[@]}"; do + ./resultPath.sh $home_log $d$ro $result_path + data_file_name="$root_data_path/$d/$d.data" + 
for sr in 100 #20 30 40 50 60 70 80 90 100 + do + for p in 7 + do + schema_file_name="$root_data_path/$d/$d$sep$ncols.schema" + sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$ncols.raw" + sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$ncols.frame" + delimiter="," + SCRIPT="$BASE_SCRIPT\ + $sample_raw_fileName\ + $sample_frame_file_name\ + $sr\ + $delimiter\ + $schema_file_name\ + $data_file_name\ + $p\ + $d\ + $home_log/benchmark/$result_path/$d$ro.csv + " +# echo 3 > /proc/sys/vm/drop_caches && sync +# sleep 20 + #echo $SCRIPT + time $SCRIPT + done + done + done +done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh new file mode 100755 index 00000000000..cdf854d9619 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# Set properties +systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" +src_cpp="/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" +LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" +jar_file_path="$systemDS_Home/target/SystemDS.jar" +lib_files_path="$systemDS_Home/target/lib/*" +root_data_path="/media/sfathollahzadeh/Windows1/saeedData/NestedDatasets" +home_log="/media/sfathollahzadeh/Windows1/saeedData/NestedDatasets/LOG" +sep="_" +result_path="GIONestedExperiment" +declare -a datasets=("aminer") + +BASE_SCRIPT="time java\ + -Dlog4j.configuration=file:$LOG4JPROP\ + -Xms1g\ + -Xmx15g\ + -cp\ + $jar_file_path:$lib_files_path\ + org.apache.sysds.runtime.iogen.exp.GIOGenerateRapidJSONCode\ + " + +for ro in 1 #2 3 4 5 +do + for d in "${datasets[@]}"; do + ./resultPath.sh $home_log $d$ro $result_path + for sr in 100 + do + for p in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + do + schema_file_name="$root_data_path/$d/$d$sep$p.schema" + 
sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" + sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" + delimiter="\t" + SCRIPT="$BASE_SCRIPT\ + $sample_raw_fileName\ + $sample_frame_file_name\ + $sr\ + $delimiter\ + $schema_file_name\ + $src_cpp\ + $p\ + $d\ + $home_log/benchmark/$result_path/$d$ro.csv + " +# echo 3 > /proc/sys/vm/drop_caches && sync +# sleep 20 + time $SCRIPT + done + done + done +done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh new file mode 100755 index 00000000000..a6e035f2543 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash + +# Set properties +systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" +LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" +jar_file_path="$systemDS_Home/target/SystemDS.jar" +lib_files_path="$systemDS_Home/target/lib/*" +root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" +home_log="/media/sfathollahzadeh/Windows1/saeedData/GIOLog" +sep="_" +result_path="GIONestedExperiment" +declare -a datasets=("aminer_author") + +BASE_SCRIPT="time java\ + -Dlog4j.configuration=file:$LOG4JPROP\ + -Xms1g\ + -Xmx15g\ + -cp\ + $jar_file_path:$lib_files_path\ + org.apache.sysds.runtime.iogen.exp.GIONestedExperimentHDFS\ + " + +for ro in 1 #2 3 4 5 +do + for d in "${datasets[@]}"; do + ./resultPath.sh $home_log $d$ro $result_path + data_file_name="$root_data_path/$d/$d.data" + for sr in 300 #200 300 400 500 600 700 800 900 1000 + do + for p in 7 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + do + schema_file_name="$root_data_path/$d/$d$sep$p.schema" + sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" + sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" + delimiter="\t" + SCRIPT="$BASE_SCRIPT\ + $sample_raw_fileName\ + $sample_frame_file_name\ + $sr\ + 
$delimiter\ + $schema_file_name\ + $data_file_name\ + $p\ + $d\ + $home_log/benchmark/$result_path/$d$ro.csv + " + #echo 3 > /proc/sys/vm/drop_caches && sync + #sleep 20 + time $SCRIPT + #echo $SCRIPT + done + done + done +done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh new file mode 100755 index 00000000000..b8d5745f606 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +# Set properties +systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" +LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" +jar_file_path="$systemDS_Home/target/SystemDS.jar" +lib_files_path="$systemDS_Home/target/lib/*" +root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" +home_log="/media/sfathollahzadeh/Windows1/saeedData/GIOLog" +sep="_" +ncols=2001 +nrows=10001 +result_path="SYSDSFrameExperiment" +declare -a datasets=("csv") + +BASE_SCRIPT="time java\ + -Dlog4j.configuration=file:$LOG4JPROP\ + -Xms1g\ + -Xmx15g\ + -cp\ + $jar_file_path:$lib_files_path\ + org.apache.sysds.runtime.iogen.exp.SYSDSFrameExperimentHDFS\ + " + +for ro in 1 #2 3 4 5 +do + for d in "${datasets[@]}"; do + ./resultPath.sh $home_log $d$ro $result_path + data_file_name="$root_data_path/$d/$d.data" + + schema_file_name="$root_data_path/$d/$d.schema" + delimiter="," + SCRIPT="$BASE_SCRIPT\ + $delimiter\ + $schema_file_name\ + $data_file_name\ + $d\ + $home_log/benchmark/$result_path/$d$ro.csv\ + $nrows + " +# echo 3 > /proc/sys/vm/drop_caches && sync +# sleep 20 + time $SCRIPT + done +done diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java index 24d3a404c7c..b3e1bb934ab 100644 --- 
a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -77,4 +77,15 @@ public void test5() { data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; runGenerateReaderTest(); } + + @Test + public void test6() { + sampleRaw = "{\"index\":207,\"name\":\"Nuno Guimarães\",\"affiliations\":[\"ISCTEUniversity Institute of Lisbon, Lisbon, Portugal\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"mental state\",\"mental workload\",\"higher mental workload\",\"mental load\",\"mental workload evaluation\",\"mental workload pattern\",\"ecological reading situation\",\"reading condition\",\"visual user interface\",\"EEG signal\"]}\n"+ + "{\"index\":208,\"name\":\" Nguyen Minh Nhut\",\"affiliations\":[\"Data Mining Department, Institute for Infocomm Research (I2R), 1 Fusionopolis Way, Connexis (South Tower), Singapore 138632\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"system health monitoring\",\"sensor node\",\"adaptive classification system architecture\",\"effective health monitoring system\",\"proposed system\",\"real-time adaptive classification system\",\"adaptive sampling frequency\",\"different sampling\",\"different sampling rate\",\"individual sensor\"]}\n\n"+ + "{\"index\":209,\"name\":\"Louis Janus\",\"affiliations\":[\"\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"language instruction\"]}"; + schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64, + Types.ValueType.FP32, Types.ValueType.INT64}; + data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; + runGenerateReaderTest(); + } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java 
b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index b07e9c20203..40249604ba3 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -23,7 +23,9 @@ import org.apache.sysds.common.Types; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.iogen.FormatIdentifying; import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.iogen.codegen.myTest; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; @@ -89,12 +91,16 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); - MatrixReader mr = gr.getReader(); - MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); - - TestUtils.compareMatrices(sampleMB, matrixBlock, 0); + FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); + myTest mt = new myTest(formatIdentifying.getFormatProperties()); + mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); + int a = 100; + +// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); +// MatrixReader mr = gr.getReader(); +// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); +// +// TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } catch(Exception exception) { diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java 
b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java new file mode 100644 index 00000000000..3bc13086f66 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.iogen.Identify; + +import org.apache.sysds.test.functions.iogen.GenerateReaderFrameTest; +import org.junit.Test; + +public class FrameGenerateReaderCSVTest extends GenerateReaderFrameTest { + + private final static String TEST_NAME = "FrameGenerateReaderCSVTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + private void extractSampleRawCSV(String separator) { + int nrows = data.length; + int ncols = data[0].length; + StringBuilder sb = new StringBuilder(); + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + sb.append(data[r][c]); + if(c != ncols - 1) + sb.append(separator); + } + if(r != nrows - 1) + sb.append("\n"); + } + sampleRaw = sb.toString(); + } + + @Test + public void test1() { + String[] naStrings = {}; + String separator = ","; + generateRandomData(10, 5, 1, 100, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test2() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = ","; + generateRandomData(10, 10, -10, 10, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test3() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "****"; + generateRandomData(100, 500, -10, 10, 1, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test4() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = ","; + generateRandomData(10, 10, -10, 10, 0.7, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test5() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = ",,,,"; + generateRandomData(10, 10, -10, 10, 0.5, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test6() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + 
generateRandomData(1000, 100, -10, 10, 0.4, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test7() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + generateRandomData(1000, 100, -10, 10, 0.8, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } + + @Test + public void test8() { + String[] naStrings = {"NULL", "inf", "NaN"}; + String separator = "**"; + generateRandomData(10000, 100, -10, 10, 0.5, naStrings); + extractSampleRawCSV(separator); + runGenerateReaderTest(); + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java new file mode 100644 index 00000000000..e4ebee213ec --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -0,0 +1,318 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.iogen.Identify; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.iogen.exp.Util; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Random; + +public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { + + private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; + + @Override + protected String getTestName() { + return TEST_NAME; + } + + private void generateRandomCSV(int nrows, int ncols, double min, double max, double sparsity, String separator, + String[] naString) { + + sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); + StringBuilder sb = new StringBuilder(); + + for(int r = 0; r < nrows; r++) { + StringBuilder row = new StringBuilder(); + for(int c = 0; c < ncols; c++) { + if(sampleMatrix[r][c] != 0) { + row.append(sampleMatrix[r][c]).append(separator); + } + else { + Random rn = new Random(); + int rni = rn.nextInt(naString.length); + row.append(naString[rni]).append(separator); + } + } + + sb.append(row.substring(0, row.length() - separator.length())); + if(r != nrows - 1) + sb.append("\n"); + } + sampleRaw = sb.toString(); + } + + @Test + public void test1() { + sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; + sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; + runGenerateReaderTest(); + } + + @Test + public void test2() { + sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; + sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; + runGenerateReaderTest(); + } + @Test + public void test3() { + sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; + 
sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; + runGenerateReaderTest(); + } + + @Test + public void test4() { + String[] naString = {"NaN"}; + generateRandomCSV(20, 20, -10, 10, 1, ",", naString); + runGenerateReaderTest(); + } + + @Test + public void test5() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"name\":3,\"password\":4}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + + @Test + public void test6() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + + @Test + public void test7() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + + "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + runGenerateReaderTest(); + } + + @Test + public void test8() { + sampleRaw = "1,1,10\n" + + "1,2,20\n" + + "1,3,30\n" + + "2,1,40\n" + + 
"2,2,50\n" + + "2,3,60\n" + + "3,1,70\n" + + "3,2,80\n"+ + "3,3,90\n"; + + sampleMatrix = new double[][] {{10,20,30}, {40,50,60}, {70,80,90}}; + runGenerateReaderTest(); + } + + @Test + public void test9() { + sampleRaw = "
\n" + // 0 + "1\n" + //1 + "2\n" + // 2 + "3\n" + // 3 + "1980\n" + // 4 + "GIO\n" + // 5 + "
\n" + // 6 + "
\n" + // 7 + "10\n" + // 8 + "20\n" + // 9 + "30\n" + // 10 + "2000\n" + // 11 + "GIO2\n" + // 12 + "
\n" + // 13 + "
\n" + // 14 + "2010\n" + // 15 + "100\n" + // 16 + "200\n" + // 17 + "300\n" + // 18 + "800\n" + // 18 + "GIO3\n" + // 19 + "
\n" + // 20 + "
\n" + // 21 + "1000\n" + // 22 + "2000\n" + // 23 + "3000\n" + // 24 + "2222\n" + // 25 + "GIO4\n" + // 26 + "
"; // 27 + + sampleMatrix = new double[][] {{1,2,3,1980}, {10,20,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + runGenerateReaderTest(); + } + + @Test + public void test10() { + sampleRaw = "
\n" + + "1980 \n" + + "1 \n" + + "2 \n" + + "3 \n" + + "GIO \n" + + "
\n" + + " \n" + + "10 \n" + + "21 \n" + + "30 \n" + + "2000 \n" + + "GIO2 \n" + + "\n" + + " \n" + + "100 \n" + + "300 \n" + + "210 \n" + + "GIO3 \n" + + "200 \n" + + "\n" + + "
\n" + + "2222 \n" + + "1000 \n" + + "2000 \n" + + "3000 \n" + + "GIO4 \n" + + "
"; + + sampleMatrix = new double[][] {{1,2,3,1980}, {10,21,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + runGenerateReaderTest(); + } + + @Test + public void test11() { + sampleRaw = "#index 1\n" + + "#t 2,3\n" + + "#s 1980\n"+ + "#index 10\n\n" + + "#t 21,30\n" + + "#s 2000\n\n"+ + "#index 100\n" + + "#t 200,300\n" + + "#s 2222"; + + sampleMatrix = new double[][] {{1,2,3,1980}, {10,21,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + runGenerateReaderTest(); + } + + @Test + public void test12() { +// sampleRaw = "#index 1\n" + +// "#t 2,3\n" + +// "#s 1980\n"+ +// "#index 10\n\n" + +// "#t 21,30\n" + +// "#s 2000\n\n"+ +// "#index 100\n" + +// "#t 200,300\n" + +// "#s 2222"; +// +// sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; +// runGenerateReaderTest(); + + StringBuilder sb = new StringBuilder(" ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); + System.out.println(sb.reverse()); + } + + @Test + public void test13() throws Exception { + String sampleRawFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/sample_300_7.raw"; + String sampleFrameFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/sample_300_7.frame"; + Integer sampleNRows = 300; + String delimiter = "\\t"; + String schemaFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/aminer_author_7.schema"; + String dataFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/aminer_author.data"; + + Float percent = 7f;//Float.parseFloat(args[6]); + String datasetName = "aminer_author";//args[7]; + String LOG_HOME ="/media/sfathollahzadeh/Windows1/saeedData/GIOLog";//args[8]; + + if(delimiter.equals("\\t")) + delimiter = "\t"; + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int 
ncols = sampleSchema.length; + + ArrayList newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + + for(int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for(int r=0; r3){ + ArrayList tempList = new ArrayList<>(); + for(int r=0; r1\n" + + "2\n" + + "3\n" + + "\n" + + "\n" + + "4\n" + + "5\n" + + "6\n" + + "\n" + + "\n" + + "7\n" + + "8\n" + + "9\n" + + ""; + sampleMatrix = new double[][] {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; + runGenerateReaderTest(); + } + + //2. flat object, out-of-order values + @Test + public void test2() { + sampleRaw = "{\"b\":2,\"a\":1,\"e\":5,\"c\":3,\"d\":4}\n" + + "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + + "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + //3. nested object with unique attribute names + @Test + public void test3() { + sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; + sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; + runGenerateReaderTest(); + } + + //4. nested object with unique attribute names, out-of-order + @Test + public void test4() { + sampleRaw = "{\"a\":1,\"f\":5,\"b\":{\"c\":2,\"d\":3,\"e\":4}}\n" + + "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + + "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; + sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; + runGenerateReaderTest(); + } + + //5. 
nested object with repeated attribute names, out-of-order + @Test + public void test5() { + sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + + "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + + "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + // XML + //6. nested object with unique attribute names, in-order + // single type of object, "article" is an object + @Test + public void test6() { + sampleRaw = "
12345
\n" + + "
678910
\n" + + "
1112131415
"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + //6. nested object with unique attribute names, in-order + // multi types of object, "article", "book", and "homepage" are the object types + @Test + public void test7() { + sampleRaw = "
12345
\n" + + "678910\n" + + "1112131415"; + sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; + runGenerateReaderTest(); + } + + //7. nested object with unique attribute names, in-order + // multi types of object, "article", "book", and "homepage" are the object types + @Test + public void test8() { + sampleRaw = "
122022GIO45
\n" + + "671980DB910\n" + + "11122012CEP1415\n"; + sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; + runGenerateReaderTest(); + } +} From c4b8ec74c9d18de408b587815453131787754913 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 27 Jan 2022 14:11:04 +0100 Subject: [PATCH 24/84] minor merge --- .../runtime/iogen/FormatIdentifying.java | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index c8bf03afea7..dfd9f83cd56 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -259,10 +259,53 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - // clean prefix strings + // Clean prefix strings + for(int c =0; c< ncols; c++) { + ArrayList list = prefixStrings.getKey()[c]; + String token = null; + boolean flag = true; + for(int w = 1; w < windowSize && flag; w++) { + HashSet wts = new HashSet<>(); + for(String s : list) { + if(s.length() < w) + flag = false; + else + wts.add(s.substring(s.length()-w)); + } + if(flag) { + if(wts.size() == 1) + token = wts.iterator().next(); + else { + for(String t : wts) { + int count = 0; + for(String s : list) { + if(s.endsWith(t)) + count++; + } + float percent = (float) count / list.size(); + if(percent >= 0.9) + token = t; + } + } + } + else if(wts.size() == 0) + token = ""; + } + if(token == null) + throw new RuntimeException("can't build a key pattern for the column: "+ c); + + if(token.length() > 0){ + ArrayList newList = new ArrayList<>(); + for(String s: list){ + if(s.endsWith(token)) + newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + } - for(int c=0; c Date: Thu, 27 Jan 2022 17:29:35 +0100 
Subject: [PATCH 25/84] minor merge --- .../org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java | 4 ---- .../org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 6cf3dffa9c7..54dbd51f32d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -46,10 +46,6 @@ public CodeGenTrie(CustomProperties properties, String destination){ private void buildPrefixTree(){ for(int c=0; c< properties.getColKeyPattern().length; c++){ KeyTrie keyTrie = properties.getColKeyPattern()[c]; - - Gson gson = new Gson(); - System.out.println(gson.toJson(keyTrie.getPrefixKeyPatterns())); - Types.ValueType vt = properties.getSchema() == null ? Types.ValueType.FP64 : properties.getSchema()[c]; for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh index a6e035f2543..4bec122b1f6 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh @@ -25,7 +25,7 @@ do for d in "${datasets[@]}"; do ./resultPath.sh $home_log $d$ro $result_path data_file_name="$root_data_path/$d/$d.data" - for sr in 300 #200 300 400 500 600 700 800 900 1000 + for sr in 100 200 300 400 500 600 700 800 900 1000 do for p in 7 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 do From 409c06dc07d6cff5b4e33338ca657636b8dd0cc0 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 27 Jan 2022 19:15:36 +0100 Subject: [PATCH 26/84] UP a Test --- .../iogen/exp/GIOFlatFrameExperimentHDFS.java | 35 ++++++ 
.../iogen/exp/GIOFrameExperimentHDFS.java | 84 ++++++++------ .../iogen/exp/GIOGenerateRapidJSONCode.java | 101 ++++++++--------- .../sysds/runtime/iogen/exp/GIOMain.java | 29 +++++ .../iogen/exp/GIONestedExperimentHDFS.java | 79 ------------- .../iogen/exp/GIONestedExperimentStream.java | 104 ++++++++---------- .../iogen/exp/SYSDSFrameExperimentHDFS.java | 65 +++++------ .../sysds/runtime/iogen/exp/resultPath.sh | 2 +- .../sysds/runtime/iogen/exp/runGIOExp.sh | 67 +++++++++++ .../sysds/runtime/iogen/exp/runGIOFrameExp.sh | 55 --------- .../iogen/exp/runGIOGenerateRapidJSONCode.sh | 53 --------- .../runtime/iogen/exp/runGIONestedExp.sh | 54 --------- .../runtime/iogen/exp/runSYSDSFrameExp.sh | 45 -------- .../Identify/MatrixGRRowColIdentifyTest.java | 14 +-- 14 files changed, 303 insertions(+), 484 deletions(-) create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOMain.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentHDFS.java create mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java new file mode 100755 index 00000000000..e7dfebc1c00 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java @@ -0,0 +1,35 @@ +package org.apache.sysds.runtime.iogen.exp; + +import org.apache.sysds.common.Types; +import 
org.apache.sysds.runtime.io.FileFormatPropertiesCSV; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.FrameReaderTextCSV; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +public class GIOFlatFrameExperimentHDFS extends GIOMain { + + public static void main(String[] args) throws Exception { + getArgs(); + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); + FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); + FrameBlock sampleFrame = csv.readFrameFromHDFS(sampleFrameFileName, sampleSchema, -1, ncols); + + double tmpTime = System.nanoTime(); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + FrameReader fr = gr.getReader(); + double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + tmpTime = System.nanoTime(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); + double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + String log = datasetName + "," + frameBlock.getNumRows() + "," + frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; + util.addLog(LOG_HOME, log); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java index 253bfec1a32..7f8ce066f1c 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java @@ -1,52 +1,66 @@ package org.apache.sysds.runtime.iogen.exp; import org.apache.sysds.common.Types; -import 
org.apache.sysds.runtime.io.FileFormatPropertiesCSV; import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.io.FrameReaderTextCSV; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; -public class GIOFrameExperimentHDFS { +import java.util.ArrayList; +import java.util.HashSet; - public static void main(String[] args) throws Exception { +public class GIOFrameExperimentHDFS extends GIOMain { - String sampleRawFileName = args[0]; - String sampleFrameFileName = args[1]; - Integer sampleNRows = Integer.parseInt(args[2]); - String delimiter = args[3]; - String schemaFileName = args[4]; - String dataFileName = args[5]; + public static void main(String[] args) throws Exception { + getArgs(); - Float percent = Float.parseFloat(args[6]); - String datasetName = args[7]; - String LOG_HOME =args[8]; + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; - if(delimiter.equals("\\t")) - delimiter = "\t"; + ArrayList newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + for (int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for (int r = 0; r < sampleFrameStrings.length; r++) + valueSet.add(sampleFrameStrings[r][c]); + if (valueSet.size() > 3) { + ArrayList tempList = new ArrayList<>(); + for (int r = 0; r < sampleFrameStrings.length; r++) { + tempList.add(sampleFrameStrings[r][c]); + } + newSampleFrame.add(tempList); + newSampleSchema.add(sampleSchema[c]); + } + } - FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); - FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); - FrameBlock 
sampleFrame = csv.readFrameFromHDFS(sampleFrameFileName, sampleSchema,-1,ncols); + sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - System.out.println(">>> "+ sampleFrame.getNumRows()+" "+ sampleFrame.getNumColumns()); + for (int row = 0; row < sampleFrameStrings.length; row++) { + for (int col = 0; col < sampleFrameStrings[0].length; col++) { + sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); + } + } -// double tmpTime = System.nanoTime(); -// String sampleRaw = util.readEntireTextFile(sampleRawFileName); -// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); -// FrameReader fr = gr.getReader(); -// double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// tmpTime = System.nanoTime(); -// FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); -// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+","+percent+","+ sampleNRows+","+ generateTime+","+readTime; -// util.addLog(LOG_HOME, log); - } + sampleSchema = new Types.ValueType[newSampleSchema.size()]; + for (int i = 0; i < newSampleSchema.size(); i++) + sampleSchema[i] = newSampleSchema.get(i); + + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + + double tmpTime = System.nanoTime(); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + FrameReader fr = gr.getReader(); + double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + tmpTime = System.nanoTime(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); + double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + String log = datasetName + "," + frameBlock.getNumRows() + "," + 
frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; + util.addLog(LOG_HOME, log); + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java index 008ee5c92a2..8e763dec755 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java @@ -1,81 +1,66 @@ package org.apache.sysds.runtime.iogen.exp; import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; import java.util.ArrayList; import java.util.HashSet; -public class GIOGenerateRapidJSONCode { +public class GIOGenerateRapidJSONCode extends GIOMain { - public static void main(String[] args) throws Exception { - -// String sampleRawFileName = args[0]; -// String sampleFrameFileName = args[1]; -// Integer sampleNRows = Integer.parseInt(args[2]); -// String delimiter = args[3]; -// String schemaFileName = args[4]; -// String baseSrc = args[5]; -// -// Float percent = Float.parseFloat(args[6]); -// String datasetName = args[7]; -// String LOG_HOME =args[8]; -// + public static void main(String[] args) throws Exception { +// getArgs(); // -// if(delimiter.equals("\\t")) -// delimiter = "\t"; +// Util util = new Util(); +// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); +// int ncols = sampleSchema.length; // -// Util util = new Util(); -// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); -// int ncols = sampleSchema.length; +// ArrayList newSampleSchema = new ArrayList<>(); +// ArrayList> newSampleFrame = new ArrayList<>(); // -// ArrayList newSampleSchema = new ArrayList<>(); -// ArrayList> newSampleFrame = new ArrayList<>(); +// String[][] 
sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); // -// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); +// for (int c = 0; c < sampleFrameStrings[0].length; c++) { +// HashSet valueSet = new HashSet<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) +// valueSet.add(sampleFrameStrings[r][c]); +// if (valueSet.size() > 3) { +// ArrayList tempList = new ArrayList<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) { +// tempList.add(sampleFrameStrings[r][c]); +// } +// newSampleFrame.add(tempList); +// newSampleSchema.add(sampleSchema[c]); +// } +// } // -// for(int c = 0; c < sampleFrameStrings[0].length; c++) { -// HashSet valueSet = new HashSet<>(); -// for(int r=0; r3){ -// ArrayList tempList = new ArrayList<>(); -// for(int r=0; r newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); - - for(int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for(int r=0; r3){ - ArrayList tempList = new ArrayList<>(); - for(int r=0; r newSampleSchema = new ArrayList<>(); -// ArrayList> newSampleFrame = new ArrayList<>(); +// ArrayList newSampleSchema = new ArrayList<>(); +// ArrayList> newSampleFrame = new ArrayList<>(); // -// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); +// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); // -// for(int c = 0; c < sampleFrameStrings[0].length; c++) { -// HashSet valueSet = new HashSet<>(); -// for(int r=0; r3){ -// ArrayList tempList = new ArrayList<>(); -// for(int r=0; r valueSet = new HashSet<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) +// valueSet.add(sampleFrameStrings[r][c]); +// if (valueSet.size() > 3) { +// 
ArrayList tempList = new ArrayList<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) { +// tempList.add(sampleFrameStrings[r][c]); +// } +// newSampleFrame.add(tempList); +// newSampleSchema.add(sampleSchema[c]); +// } +// } // -// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; +// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; // -// for(int row=0; row> "+ schema.length); - - double tmpTime = System.nanoTime(); - FrameBlock frameBlock; - if(datasetName.equals("csv")) { - FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); - FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); - frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); - } - else if(datasetName.equals("mm")) { - FileFormatPropertiesMM mmpro = new FileFormatPropertiesMM(); - FrameReaderTextCell mm =new FrameReaderTextCell(); - frameBlock = mm.readFrameFromHDFS(dataFileName, schema, nrows, ncols); - } - else - throw new RuntimeException("Format not support!"); - - double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; - - String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; - util.addLog(LOG_HOME, log); +// getArgs(); +// if (percent < 1 || codeGen || sampleNRows < 100) +// return; +// +// Util util = new Util(); +// Types.ValueType[] schema = util.getSchema(schemaFileName); +// int ncols = schema.length; +// +// double tmpTime = System.nanoTime(); +// FrameBlock frameBlock; +// if(datasetName.equals("csv")) { +// FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, " ", false); +// FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); +// frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); +// } +// else if(datasetName.equals("mm")) { +// FrameReaderTextCell mm =new FrameReaderTextCell(); +// frameBlock = mm.readFrameFromHDFS(dataFileName, schema, nrows, ncols); +// } +// else +// throw 
new RuntimeException("Format not support!"); +// +// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; +// +// String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; +// util.addLog(LOG_HOME, log); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh index ce1741ac469..0ad69d46a0e 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh @@ -9,5 +9,5 @@ mkdir -p "$path_2" log_file="$path_2/$2.csv" if test ! -f "$log_file"; then touch $log_file - echo "dataset,data_nrows,data_ncols,col_index_percent,sample_nrows,generate_time,read_time" > $log_file + echo "dataset,data_nrows,data_ncols,col_selected_count,sample_nrows,generate_time,read_time" > $log_file fi diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh new file mode 100755 index 00000000000..0affe74b041 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +#SystemDS Paths: +#---------------------------------------------------------------- +systemDS_Home="/home/saeed/Documents/Github/systemds" +LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" +jar_file_path="$systemDS_Home/target/SystemDS.jar" +lib_files_path="$systemDS_Home/target/lib/*" +#----------------------------------------------------------------- +root_data_path="/home/saeed/Documents/Dataset/GIODataset/flat" +home_log="/home/saeed/Documents/ExpLog" +cpp_base_src="" #"/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" +sep="_" +nrows=-1 + +mx_mem="$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)))g" + +delimiter="\t" +declare -a datasets=("aminer_author") +declare -a main_classes=( 
"GIOFrameExperimentHDFS") + +for (( i = 0; i < 1; i++ )); do + for mc in "${main_classes[@]}"; do + for d in "${datasets[@]}"; do + ./resultPath.sh $home_log $d$i $mc + data_file_name="$root_data_path/$d/$d.data" + + for sr in 100 200 300 400 500 600 700 800 900 1000 + do + for p in 7 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + do + schema_file_name="$root_data_path/$d/$d$sep$p.schema" + sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" + sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" + SCRIPT="java\ + -Dlog4j.configuration=file:$LOG4JPROP\ + -Xms1g\ + -Xmx$mx_mem\ + -Xmn4000m\ + -DsampleRawFileName=$sample_raw_fileName\ + -DsampleFrameFileName=$sample_frame_file_name\ + -DsampleNRows=$sr\ + -Ddelimiter=$delimiter\ + -DschemaFileName=$schema_file_name\ + -DdataFileName=$data_file_name\ + -DdatasetName=$d\ + -DhomeLog=$home_log/benchmark/$mc/$d$i.csv\ + -DcppBaseSrc=$cpp_base_src\ + -Dnrows=$nrows\ + -cp\ + $jar_file_path:$lib_files_path\ + org.apache.sysds.runtime.iogen.exp.$mc\ + " + #echo 3 > /proc/sys/vm/drop_caches && sync + #sleep 20 + + #echo "++++++++++++++++++++++++++++++++++++++++++++" + #echo $SCRIPT + time $SCRIPT + done + done + done + done +done +#/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper +#/home/saeed/Documents/GIODataset/flat/aminer_paper/aminer_paper_5.schema + diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh deleted file mode 100755 index 699bd64cc3d..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOFrameExp.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# Set properties -systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" -LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" -jar_file_path="$systemDS_Home/target/SystemDS.jar" -lib_files_path="$systemDS_Home/target/lib/*" 
-root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" -home_log="/media/sfathollahzadeh/Windows1/saeedData/FlatDatasets/LOG" -sep="_" -ncols=7 -result_path="GIOFrameExperiment" -declare -a datasets=("aminer_author") - -BASE_SCRIPT="time java\ - -Dlog4j.configuration=file:$LOG4JPROP\ - -Xms1g\ - -Xmx15g\ - -cp\ - $jar_file_path:$lib_files_path\ - org.apache.sysds.runtime.iogen.exp.GIOFrameExperimentHDFS\ - " - -for ro in 1 #2 3 4 5 -do - for d in "${datasets[@]}"; do - ./resultPath.sh $home_log $d$ro $result_path - data_file_name="$root_data_path/$d/$d.data" - for sr in 100 #20 30 40 50 60 70 80 90 100 - do - for p in 7 - do - schema_file_name="$root_data_path/$d/$d$sep$ncols.schema" - sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$ncols.raw" - sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$ncols.frame" - delimiter="," - SCRIPT="$BASE_SCRIPT\ - $sample_raw_fileName\ - $sample_frame_file_name\ - $sr\ - $delimiter\ - $schema_file_name\ - $data_file_name\ - $p\ - $d\ - $home_log/benchmark/$result_path/$d$ro.csv - " -# echo 3 > /proc/sys/vm/drop_caches && sync -# sleep 20 - #echo $SCRIPT - time $SCRIPT - done - done - done -done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh deleted file mode 100755 index cdf854d9619..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOGenerateRapidJSONCode.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# Set properties -systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" -src_cpp="/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" -LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" -jar_file_path="$systemDS_Home/target/SystemDS.jar" -lib_files_path="$systemDS_Home/target/lib/*" 
-root_data_path="/media/sfathollahzadeh/Windows1/saeedData/NestedDatasets" -home_log="/media/sfathollahzadeh/Windows1/saeedData/NestedDatasets/LOG" -sep="_" -result_path="GIONestedExperiment" -declare -a datasets=("aminer") - -BASE_SCRIPT="time java\ - -Dlog4j.configuration=file:$LOG4JPROP\ - -Xms1g\ - -Xmx15g\ - -cp\ - $jar_file_path:$lib_files_path\ - org.apache.sysds.runtime.iogen.exp.GIOGenerateRapidJSONCode\ - " - -for ro in 1 #2 3 4 5 -do - for d in "${datasets[@]}"; do - ./resultPath.sh $home_log $d$ro $result_path - for sr in 100 - do - for p in 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 - do - schema_file_name="$root_data_path/$d/$d$sep$p.schema" - sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" - sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" - delimiter="\t" - SCRIPT="$BASE_SCRIPT\ - $sample_raw_fileName\ - $sample_frame_file_name\ - $sr\ - $delimiter\ - $schema_file_name\ - $src_cpp\ - $p\ - $d\ - $home_log/benchmark/$result_path/$d$ro.csv - " -# echo 3 > /proc/sys/vm/drop_caches && sync -# sleep 20 - time $SCRIPT - done - done - done -done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh deleted file mode 100755 index 4bec122b1f6..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIONestedExp.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -# Set properties -systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" -LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" -jar_file_path="$systemDS_Home/target/SystemDS.jar" -lib_files_path="$systemDS_Home/target/lib/*" -root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" -home_log="/media/sfathollahzadeh/Windows1/saeedData/GIOLog" -sep="_" -result_path="GIONestedExperiment" -declare -a datasets=("aminer_author") - -BASE_SCRIPT="time java\ - -Dlog4j.configuration=file:$LOG4JPROP\ - -Xms1g\ - -Xmx15g\ 
- -cp\ - $jar_file_path:$lib_files_path\ - org.apache.sysds.runtime.iogen.exp.GIONestedExperimentHDFS\ - " - -for ro in 1 #2 3 4 5 -do - for d in "${datasets[@]}"; do - ./resultPath.sh $home_log $d$ro $result_path - data_file_name="$root_data_path/$d/$d.data" - for sr in 100 200 300 400 500 600 700 800 900 1000 - do - for p in 7 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 - do - schema_file_name="$root_data_path/$d/$d$sep$p.schema" - sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" - sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" - delimiter="\t" - SCRIPT="$BASE_SCRIPT\ - $sample_raw_fileName\ - $sample_frame_file_name\ - $sr\ - $delimiter\ - $schema_file_name\ - $data_file_name\ - $p\ - $d\ - $home_log/benchmark/$result_path/$d$ro.csv - " - #echo 3 > /proc/sys/vm/drop_caches && sync - #sleep 20 - time $SCRIPT - #echo $SCRIPT - done - done - done -done diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh deleted file mode 100755 index b8d5745f606..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runSYSDSFrameExp.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# Set properties -systemDS_Home="/home/sfathollahzadeh/Documents/GitHub/systemds" -LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" -jar_file_path="$systemDS_Home/target/SystemDS.jar" -lib_files_path="$systemDS_Home/target/lib/*" -root_data_path="/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer" -home_log="/media/sfathollahzadeh/Windows1/saeedData/GIOLog" -sep="_" -ncols=2001 -nrows=10001 -result_path="SYSDSFrameExperiment" -declare -a datasets=("csv") - -BASE_SCRIPT="time java\ - -Dlog4j.configuration=file:$LOG4JPROP\ - -Xms1g\ - -Xmx15g\ - -cp\ - $jar_file_path:$lib_files_path\ - org.apache.sysds.runtime.iogen.exp.SYSDSFrameExperimentHDFS\ - " - -for ro in 1 #2 3 4 5 -do - for d in "${datasets[@]}"; do - ./resultPath.sh 
$home_log $d$ro $result_path - data_file_name="$root_data_path/$d/$d.data" - - schema_file_name="$root_data_path/$d/$d.schema" - delimiter="," - SCRIPT="$BASE_SCRIPT\ - $delimiter\ - $schema_file_name\ - $data_file_name\ - $d\ - $home_log/benchmark/$result_path/$d$ro.csv\ - $nrows - " -# echo 3 > /proc/sys/vm/drop_caches && sync -# sleep 20 - time $SCRIPT - done -done diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index e4ebee213ec..c90f9d9ba6b 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -251,16 +251,16 @@ public void test12() { @Test public void test13() throws Exception { - String sampleRawFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/sample_300_7.raw"; - String sampleFrameFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/sample_300_7.frame"; - Integer sampleNRows = 300; + String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/sample_100_5.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/sample_100_5.frame"; + Integer sampleNRows = 100; String delimiter = "\\t"; - String schemaFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/aminer_author_7.schema"; - String dataFileName = "/media/sfathollahzadeh/Windows1/saeedData/GIODataset/flat/aminer/aminer_author/aminer_author.data"; + String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/aminer_paper.schema"; + String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/aminer_paper.data"; Float percent = 7f;//Float.parseFloat(args[6]); - String datasetName 
= "aminer_author";//args[7]; - String LOG_HOME ="/media/sfathollahzadeh/Windows1/saeedData/GIOLog";//args[8]; + String datasetName = "aminer_paper";//args[7]; + String LOG_HOME ="/home/saeed/Documents/ExpLog";//args[8]; if(delimiter.equals("\\t")) delimiter = "\t"; From c1f1799c725604c0d1185bea17efddad88d8cfb5 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 27 Jan 2022 20:18:05 +0100 Subject: [PATCH 27/84] Fix duplicate string mapping bug --- .../apache/sysds/runtime/iogen/RawIndex.java | 518 +++++++++--------- .../iogen/FrameSingleRowFlatTest.java | 17 + 2 files changed, 276 insertions(+), 259 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 597fbf3a89e..40a5f3b3b09 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -28,142 +28,136 @@ import java.util.HashMap; public class RawIndex { - private final String raw; - private final int rawLength; - private final BitSet numberBitSet; - private final BitSet dotBitSet; - private final BitSet eBitSet; - private final BitSet plusMinusBitSet; - private BitSet reservedPositions; - private BitSet backupReservedPositions; - private HashMap>> actualNumericValues; - private HashMap>> dotActualNumericValues; - private HashMap>> dotEActualNumericValues; - - - public RawIndex(String raw) { - this.raw = raw; - this.rawLength = raw.length(); - this.numberBitSet = new BitSet(rawLength); - this.dotBitSet = new BitSet(rawLength); - this.eBitSet = new BitSet(rawLength); - this.plusMinusBitSet = new BitSet(rawLength); - this.reservedPositions = new BitSet(rawLength); - this.backupReservedPositions = new BitSet(rawLength); - this.actualNumericValues = null; - this.dotActualNumericValues = null; - this.dotEActualNumericValues = new HashMap<>(); - - for(int i = 0; i < this.rawLength; i++) { - switch(raw.charAt(i)) { - case '0': - case '1': 
- case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - this.numberBitSet.set(i); - break; - case '+': - case '-': - this.plusMinusBitSet.set(i); - break; - case '.': - this.dotBitSet.set(i); - break; - case 'e': - case 'E': - this.eBitSet.set(i); - break; - } - } - // Clean unnecessary sets - // Clean for "." - for(int i = dotBitSet.nextSetBit(0); i != -1; i = dotBitSet.nextSetBit(i + 1)) { - boolean flag = false; - if(i > 0) { - if(i< rawLength -2) { - flag = !numberBitSet.get(i - 1) && - !numberBitSet.get(i + 1) && - !plusMinusBitSet.get(i + 1) && - !eBitSet.get(i + 1); - } - } - else if( i== rawLength-1){ - flag = !numberBitSet.get(i - 1); - } - else if(i==0){ - if(i < rawLength-2){ - flag = !numberBitSet.get(i + 1) && - !plusMinusBitSet.get(i + 1) && - !eBitSet.get(i + 1); - } - else if( i== rawLength-1){ - flag = true; - } - } - - if(flag) - dotBitSet.set(i, false); - } - - // Clean for "+/-" - for(int i = plusMinusBitSet.nextSetBit(0); i != -1; i = plusMinusBitSet.nextSetBit(i + 1)) { - boolean flag; - if(i1 && i 0) - extractNumericDotEActualValues(); - } + private final String raw; + private final int rawLength; + private final BitSet numberBitSet; + private final BitSet dotBitSet; + private final BitSet eBitSet; + private final BitSet plusMinusBitSet; + private BitSet reservedPositions; + private BitSet backupReservedPositions; + private HashMap>> actualNumericValues; + private HashMap>> dotActualNumericValues; + private HashMap>> dotEActualNumericValues; + + + public RawIndex(String raw) { + this.raw = raw; + this.rawLength = raw.length(); + this.numberBitSet = new BitSet(rawLength); + this.dotBitSet = new BitSet(rawLength); + this.eBitSet = new BitSet(rawLength); + this.plusMinusBitSet = new BitSet(rawLength); + this.reservedPositions = new BitSet(rawLength); + this.backupReservedPositions = new BitSet(rawLength); + this.actualNumericValues = null; + this.dotActualNumericValues = null; + 
this.dotEActualNumericValues = new HashMap<>(); + + for (int i = 0; i < this.rawLength; i++) { + switch (raw.charAt(i)) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + this.numberBitSet.set(i); + break; + case '+': + case '-': + this.plusMinusBitSet.set(i); + break; + case '.': + this.dotBitSet.set(i); + break; + case 'e': + case 'E': + this.eBitSet.set(i); + break; + } + } + // Clean unnecessary sets + // Clean for "." + for (int i = dotBitSet.nextSetBit(0); i != -1; i = dotBitSet.nextSetBit(i + 1)) { + boolean flag = false; + if (i > 0) { + if (i < rawLength - 2) { + flag = !numberBitSet.get(i - 1) && + !numberBitSet.get(i + 1) && + !plusMinusBitSet.get(i + 1) && + !eBitSet.get(i + 1); + } + } else if (i == rawLength - 1) { + flag = !numberBitSet.get(i - 1); + } else if (i == 0) { + if (i < rawLength - 2) { + flag = !numberBitSet.get(i + 1) && + !plusMinusBitSet.get(i + 1) && + !eBitSet.get(i + 1); + } else if (i == rawLength - 1) { + flag = true; + } + } - public Pair findValue(Object value, Types.ValueType valueType){ - if(valueType.isNumeric()) - return findValue(UtilFunctions.getDouble(value)); - else if(valueType == Types.ValueType.STRING){ - String os = UtilFunctions.objectToString(value); - if(os == null || os.length() == 0) - return null; - else - return findValue(UtilFunctions.objectToString(value)); - } + if (flag) + dotBitSet.set(i, false); + } + + // Clean for "+/-" + for (int i = plusMinusBitSet.nextSetBit(0); i != -1; i = plusMinusBitSet.nextSetBit(i + 1)) { + boolean flag; + if (i < rawLength - 1) { + flag = numberBitSet.get(i + 1); + if (!flag && i < rawLength - 2) + flag = dotBitSet.get(i + 1) && numberBitSet.get(i + 2); + } else { + flag = false; + } + if (!flag) + plusMinusBitSet.set(i, false); + } + + // Clean for "e/E" + for (int i = eBitSet.nextSetBit(0); i != -1; i = eBitSet.nextSetBit(i + 1)) { + boolean flag = false; + if ((i == 1 && !numberBitSet.get(0)) || i == 
0 || i == rawLength - 1) { + flag = false; + } else if (i > 1 && i < rawLength - 2) { + flag = numberBitSet.get(i - 1) || (numberBitSet.get(i - 2) && dotBitSet.get(i - 1)); + if (flag) + flag = numberBitSet.get(i + 1) || (numberBitSet.get(i + 2) && plusMinusBitSet.get(i + 1)); + } else if (i == rawLength - 2) { + flag = numberBitSet.get(rawLength - 1); + } + if (!flag) + eBitSet.set(i, false); + } + if (numberBitSet.cardinality() > 0) + extractNumericDotEActualValues(); + } + + public Pair findValue(Object value, Types.ValueType valueType) { + if (valueType.isNumeric()) + return findValue(UtilFunctions.getDouble(value)); + else if (valueType == Types.ValueType.STRING) { + String os = UtilFunctions.objectToString(value); + if (os == null || os.length() == 0) + return null; + else + return findValue(os); + } // else if(valueType == Types.ValueType.BOOLEAN) // return findValue(UtilFunctions.objectToString()) else return null; } - public Pair findValue(double value){ + public Pair findValue(double value) { // extractNumericActualValues(); // if(actualNumericValues.containsKey(value)){ // return getValuePositionAndLength(actualNumericValues.get(value)); @@ -175,150 +169,156 @@ public Pair findValue(double value){ // } // // extractNumericDotEActualValues(); - if(dotEActualNumericValues.containsKey(value)){ - return getValuePositionAndLength(dotEActualNumericValues.get(value)); - } - return null; - } + if (dotEActualNumericValues.containsKey(value)) { + return getValuePositionAndLength(dotEActualNumericValues.get(value)); + } + return null; + } - private Pair findValue(String value){ - int index = this.raw.indexOf(value); - if(index == -1) - return null; - else { - for(int i= index; i(index, value.length()); - } - } + private Pair findValue(String value) { + int index = 0; + boolean flag; + do { + flag = true; + index = this.raw.indexOf(value, index); + if (index == -1) + return null; - private Pair getValuePositionAndLength(ArrayList> list){ - for(Pair p: list){ - 
if(!reservedPositions.get(p.getKey())) { - reservedPositions.set(p.getKey(), p.getKey()+p.getValue()); - return p; - } - } - return null; - } + for (int i = index; i < index + value.length(); i++) + if (reservedPositions.get(i)) { + flag = false; + break; + } + if (!flag) + index += value.length(); - private void extractNumericActualValues(){ - if(this.actualNumericValues == null) - this.actualNumericValues = new HashMap<>(); - else - return; - StringBuilder sb = new StringBuilder(); - BitSet nBitSet = (BitSet) numberBitSet.clone(); - nBitSet.or(plusMinusBitSet); - int pi = nBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for(int i = nBitSet.nextSetBit(pi+1); i != -1; i = nBitSet.nextSetBit(i + 1)) { - if(pi+sb.length() != i) { - addActualValueToList(sb.toString(), pi, actualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } - else - sb.append(raw.charAt(i)); - } - if(sb.length()>0) - addActualValueToList(sb.toString(), pi, actualNumericValues); - } + } while (!flag); - private void extractNumericDotActualValues(){ - if(this.dotActualNumericValues == null) - this.dotActualNumericValues = new HashMap<>(); - else - return; - - BitSet numericDotBitSet = (BitSet) numberBitSet.clone(); - numericDotBitSet.or(dotBitSet); - numericDotBitSet.or(plusMinusBitSet); - StringBuilder sb = new StringBuilder(); - int pi = numericDotBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for(int i = numericDotBitSet.nextSetBit(pi+1); i != -1; i = numericDotBitSet.nextSetBit(i + 1)) { - if(pi+sb.length() != i) { - addActualValueToList(sb.toString(), pi, dotActualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } - else - sb.append(raw.charAt(i)); - } - if(sb.length()>0) - addActualValueToList(sb.toString(), pi, dotActualNumericValues); - } + reservedPositions.set(index, index + value.length()); + return new Pair<>(index, value.length()); + + } - private void extractNumericDotEActualValues(){ + private Pair 
getValuePositionAndLength(ArrayList> list) { + for (Pair p : list) { + if (!reservedPositions.get(p.getKey())) { + reservedPositions.set(p.getKey(), p.getKey() + p.getValue()); + return p; + } + } + return null; + } + + private void extractNumericActualValues() { + if (this.actualNumericValues == null) + this.actualNumericValues = new HashMap<>(); + else + return; + StringBuilder sb = new StringBuilder(); + BitSet nBitSet = (BitSet) numberBitSet.clone(); + nBitSet.or(plusMinusBitSet); + int pi = nBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for (int i = nBitSet.nextSetBit(pi + 1); i != -1; i = nBitSet.nextSetBit(i + 1)) { + if (pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, actualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } else + sb.append(raw.charAt(i)); + } + if (sb.length() > 0) + addActualValueToList(sb.toString(), pi, actualNumericValues); + } + + private void extractNumericDotActualValues() { + if (this.dotActualNumericValues == null) + this.dotActualNumericValues = new HashMap<>(); + else + return; + + BitSet numericDotBitSet = (BitSet) numberBitSet.clone(); + numericDotBitSet.or(dotBitSet); + numericDotBitSet.or(plusMinusBitSet); + StringBuilder sb = new StringBuilder(); + int pi = numericDotBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for (int i = numericDotBitSet.nextSetBit(pi + 1); i != -1; i = numericDotBitSet.nextSetBit(i + 1)) { + if (pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } else + sb.append(raw.charAt(i)); + } + if (sb.length() > 0) + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + } + + private void extractNumericDotEActualValues() { // if(this.dotEActualNumericValues == null) // this.dotEActualNumericValues = new HashMap<>(); // else // return; - BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); - 
numericDotEBitSet.or(dotBitSet); - numericDotEBitSet.or(eBitSet); - numericDotEBitSet.or(plusMinusBitSet); - - StringBuilder sb = new StringBuilder(); - int pi = numericDotEBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for(int i = numericDotEBitSet.nextSetBit(pi+1); i != -1; i = numericDotEBitSet.nextSetBit(i + 1)) { - if(pi+sb.length() != i) { - addActualValueToList(sb.toString(), pi, dotEActualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } - else - sb.append(raw.charAt(i)); - } - if(sb.length()>0) - addActualValueToList(sb.toString(), pi, dotEActualNumericValues); - } + BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); + numericDotEBitSet.or(dotBitSet); + numericDotEBitSet.or(eBitSet); + numericDotEBitSet.or(plusMinusBitSet); - private void addActualValueToList(String stringValue, Integer position, HashMap>> list){ - try { - double d = UtilFunctions.getDouble(stringValue); - Pair pair = new Pair(position, stringValue.length()); - if(!list.containsKey(d)) { - ArrayList> tmpList = new ArrayList<>(); - tmpList.add(pair); - list.put(d, tmpList); - } - else - list.get(d).add(pair); - } - catch(Exception e){ - - } - } + StringBuilder sb = new StringBuilder(); + int pi = numericDotEBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); - public void cloneReservedPositions(){ - this.backupReservedPositions = (BitSet) this.reservedPositions.clone(); - } + for (int i = numericDotEBitSet.nextSetBit(pi + 1); i != -1; i = numericDotEBitSet.nextSetBit(i + 1)) { + if (pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } else + sb.append(raw.charAt(i)); + } + if (sb.length() > 0) + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + } - public void restoreReservedPositions(){ - this.reservedPositions = this.backupReservedPositions; - } + private void addActualValueToList(String stringValue, Integer 
position, HashMap>> list) { + try { + double d = UtilFunctions.getDouble(stringValue); + Pair pair = new Pair(position, stringValue.length()); + if (!list.containsKey(d)) { + ArrayList> tmpList = new ArrayList<>(); + tmpList.add(pair); + list.put(d, tmpList); + } else + list.get(d).add(pair); + } catch (Exception e) { - public String getSubString(int start, int end){ - return raw.substring(start, end); - } + } + } - public int getRawLength() { - return rawLength; - } + public void cloneReservedPositions() { + this.backupReservedPositions = (BitSet) this.reservedPositions.clone(); + } - public String getRaw() { - return raw; - } + public void restoreReservedPositions() { + this.reservedPositions = this.backupReservedPositions; + } + + public String getSubString(int start, int end) { + return raw.substring(start, end); + } + + public int getRawLength() { + return rawLength; + } + + public String getRaw() { + return raw; + } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java index 9ef503e234f..71cc092b0e5 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java @@ -135,5 +135,22 @@ public void test9() { runGenerateReaderTest(); } +// @Test +// public void test10() { +// sampleRaw = "30,\"Stationary wave solutions of a system of reaction-diffusion equations derived from the Fitzhugh-Nagumo equations\",1984,\"SIAM Journal on Applied Mathematics\",\"\",Gene A. Klaasen\",\"William C. 
Troy\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. of Tennessee, Knoxville\",\"Univ. of Pittsburgh, Pittsburgh, PA\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,\n" + +// "31,\"Up and running: the small business computer implementation cookbook\",1984,\"Up and running: the small business computer implementation cookbook\",\"\",Jess W. Curry, Jr.\",\"David M. Bonner\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Arthur Young\",\"Arthur Young\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + +// "32,\"Proc. IFIP working conference on Programming Languages and System Design\",1983,\"Proc. IFIP working conference on Programming Languages and System Design\",\"\",J Bormann\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Technical Univ. of Dresden, East Germany\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + +// "33,\"Fast automatic liveness analysis of hierarchical parallel systems\",1983,\"Proc. IFIP working conference on Programming Languages and System Design\",\"\",Johannes Rohrich\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. Karlsruhe, Karlsruhe, W. 
Germany\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + +// "34,\"Concatenable type declarations: their applications and implementaion\",1983,\"Proc. 
IFIP working conference on Programming Languages and System Design\",\"\",A Kreczmar\",\"A Salwicki\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. of Warsaw, Poland\",\"Univ. of Warsaw, Poland\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"; +// data = new String[][] {{"30","Stationary wave solutions of a system of reaction-diffusion equations derived from the Fitzhugh-Nagumo equations","1984", "SIAM Journal on Applied Mathematics",""}, +// {"31","Up and running: the small business computer implementation cookbook","1984", "Up and running: the small business computer implementation cookbook", ""}, +// {"32","Proc. IFIP working conference on Programming Languages and System Design","1983", "Proc. IFIP working conference on Programming Languages and System Design",""}, +// {"33","Fast automatic liveness analysis of hierarchical parallel systems","1983", "Proc. IFIP working conference on Programming Languages and System Design",""}, +// {"34","Concatenable type declarations: their applications and implementaion","1983", "Proc. IFIP working conference on Programming Languages and System Design",""} +// }; +// schema = new Types.ValueType[] {Types.ValueType.INT64, Types.ValueType.STRING, Types.ValueType.FP32, Types.ValueType.STRING, Types.ValueType.STRING}; +// runGenerateReaderTest(); +// } + } From bd4cbef23db970fbeba9605c9d1cfea7219102ae Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 27 Jan 2022 20:25:46 +0100 Subject: [PATCH 28/84] up --- src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh index 0affe74b041..0344d1e719d 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -16,7 +16,7 @@ nrows=-1 mx_mem="$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)))g" delimiter="\t" -declare -a datasets=("aminer_author") +declare -a datasets=("aminer_paper") declare -a main_classes=( 
"GIOFrameExperimentHDFS") for (( i = 0; i < 1; i++ )); do @@ -27,7 +27,7 @@ for (( i = 0; i < 1; i++ )); do for sr in 100 200 300 400 500 600 700 800 900 1000 do - for p in 7 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + for p in 5 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 do schema_file_name="$root_data_path/$d/$d$sep$p.schema" sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" From 28ae8b90247b292c44d1944ecc1c04ff05d457b5 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 28 Jan 2022 00:55:42 +0100 Subject: [PATCH 29/84] up --- .../sysds/runtime/iogen/GenerateReader.java | 2 +- .../runtime/iogen/codegen/CodeGenTrie.java | 5 +- .../sysds/runtime/iogen/codegen/myTest.java | 102 -------- .../sysds/runtime/iogen/codegen/mymain2.java | 218 ++++++++++++++++++ .../iogen/exp/SYSDSFrameExperimentHDFS.java | 38 +-- .../sysds/runtime/iogen/exp/runGIOExp.sh | 10 +- .../iogen/GenerateReaderMatrixTest.java | 7 +- 7 files changed, 250 insertions(+), 132 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 622a360d537..cd9d82e778f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -120,7 +120,7 @@ public FrameReader getReader() throws Exception { Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; String js = src.generateCodeJava(); - //System.out.println(js); + System.out.println(js); frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 54dbd51f32d..fbeb0c95921 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -133,7 +133,7 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos src.append(node.geValueCode(destination, currPos)); if(node.getChildren().size() > 0) { - String currPosVariable; + String currPosVariable = currPos; for(String key : node.getChildren().keySet()) { if(key.length() > 0) { currPosVariable = getRandomName("curPos"); @@ -143,10 +143,9 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); - currPos = currPosVariable; } CodeGenTrieNode child = node.getChildren().get(key); - getJavaCode(child, src, currPos); + getJavaCode(child, src, currPosVariable); if(key.length() > 0) src.append("} \n"); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java deleted file mode 100644 index 1a2d1a58ce9..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/myTest.java +++ /dev/null @@ -1,102 +0,0 @@ -package org.apache.sysds.runtime.iogen.codegen; - -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.util.HashSet; - -public class myTest extends MatrixGenerateReader { - 
public myTest(CustomProperties _props) { - super(_props); - } - - @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException { - - String str; - int row = rowPos.intValue(); - long lnnz = 0; - int index, endPos, strLen; - HashSet[] endWithValueString = _props.endWithValueStrings(); - BufferedReader br = new BufferedReader(new InputStreamReader(is)); - String strChunk, remainedStr = null; - int chunkSize = 2048; - int recordIndex = 0; - try { - do{ - strChunk = getStringChunkOfBufferReader(br, remainedStr, chunkSize); - System.out.println(strChunk); - if(strChunk == null || strChunk.length() == 0) break; - do { - recordIndex = strChunk.indexOf("#index ", recordIndex); - if(recordIndex == -1) break; - //recordIndex +=7; - int recordBeginPos = recordIndex; - recordIndex = strChunk.indexOf("#index ", recordBeginPos + 7);if(recordIndex == -1) break; - str = strChunk.substring(recordBeginPos, recordIndex); - strLen = str.length(); - index = str.indexOf(" "); - if(index != -1) { - int curPos_75540091 = index + 1; - endPos = getEndPos(str, strLen, curPos_75540091, endWithValueString[1]); - String cellStr1 = str.substring(curPos_75540091,endPos); - if ( cellStr1.length() > 0 ){ - Double cellValue1; - try{ cellValue1= Double.parseDouble(cellStr1); } catch(Exception e){cellValue1 = 0d;} - if(cellValue1 != 0) { - dest.appendValue(row, 1, cellValue1); - lnnz++; - } - } - } - index = str.indexOf("#index "); - if(index != -1) { - int curPos_50855160 = index + 7; - endPos = getEndPos(str, strLen, curPos_50855160, endWithValueString[0]); - String cellStr0 = str.substring(curPos_50855160,endPos); - if ( cellStr0.length() > 0 ){ - Double cellValue0; - try{ cellValue0= Double.parseDouble(cellStr0); } catch(Exception e){cellValue0 = 0d;} - if(cellValue0 != 0) { - dest.appendValue(row, 0, cellValue0); - lnnz++; - } - } - } - index = str.indexOf("#index 1"); - if(index != -1) 
{ - int curPos_36575074 = index + 8; - index = str.indexOf(",", curPos_36575074); - if(index != -1) { - int curPos_13302308 = index + 1; - endPos = getEndPos(str, strLen, curPos_13302308, endWithValueString[2]); - String cellStr2 = str.substring(curPos_13302308,endPos); - if ( cellStr2.length() > 0 ){ - Double cellValue2; - try{ cellValue2= Double.parseDouble(cellStr2); } catch(Exception e){cellValue2 = 0d;} - if(cellValue2 != 0) { - dest.appendValue(row, 2, cellValue2); - lnnz++; - } - } - } - } - row++; - }while(true); - remainedStr = strChunk.substring(recordIndex); - }while(true); - } - finally { - IOUtilFunctions.closeSilently(br); - } - rowPos.setValue(row); - return lnnz; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java new file mode 100644 index 00000000000..65ae88b4f93 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java @@ -0,0 +1,218 @@ +package org.apache.sysds.runtime.iogen.codegen; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.*; +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +import java.io.IOException; +import java.util.HashSet; + +public class mymain2 extends FrameGenerateReader { + public mymain2(CustomProperties _props) { + super(_props); + } + + @Override + protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first) throws IOException { + RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + int 
row = rl; + long lnnz = 0; + HashSet[] endWithValueString = _props.endWithValueStrings(); + int index, endPos, strLen; + try { + while(reader.next(key, value)){ + String str = value.toString(); + strLen = str.length(); + endPos = getEndPos(str, strLen, 0, endWithValueString[0]); + String cellStr0 = str.substring(0,endPos); + if ( cellStr0.length() > 0 ){ + Long cellValue0; + try{cellValue0= Long.parseLong(cellStr0); } catch(Exception e){cellValue0 = 0l;} + if(cellValue0 != 0) { + dest.set(row, 0, cellValue0); + lnnz++; + } + } + index = str.indexOf("\",\"\","); + if(index != -1) { + int curPos_87607716 = index + 5; + index = str.indexOf("\",", curPos_87607716); + if(index != -1) { + int curPos_57566352 = index + 2; + endPos = getEndPos(str, strLen, curPos_57566352, endWithValueString[5]); + String cellStr5 = str.substring(curPos_57566352,endPos); + String cellValue5 = cellStr5; + dest.set(row, 5, cellValue5); + } + } + index = str.indexOf("\",19"); + if(index != -1) { + int curPos_8998302 = index + 4; + index = str.indexOf(",\"", curPos_8998302); + if(index != -1) { + int curPos_70036865 = index + 2; + endPos = getEndPos(str, strLen, curPos_70036865, endWithValueString[3]); + String cellStr3 = str.substring(curPos_70036865,endPos); + String cellValue3 = cellStr3; + dest.set(row, 3, cellValue3); + } + } + index = str.indexOf(",\""); + if(index != -1) { + int curPos_33286870 = index + 2; + endPos = getEndPos(str, strLen, curPos_33286870, endWithValueString[1]); + String cellStr1 = str.substring(curPos_33286870,endPos); + String cellValue1 = cellStr1; + dest.set(row, 1, cellValue1); + } + index = str.indexOf("l\","); + if(index != -1) { + int curPos_44381926 = index + 3; + endPos = getEndPos(str, strLen, curPos_44381926, endWithValueString[4]); + String cellStr4 = str.substring(curPos_44381926,endPos); + String cellValue4 = cellStr4; + dest.set(row, 4, cellValue4); + } + index = str.indexOf("\","); + if(index != -1) { + int curPos_90282355 = index + 2; + endPos = 
getEndPos(str, strLen, curPos_90282355, endWithValueString[2]); + String cellStr2 = str.substring(curPos_90282355,endPos); + if ( cellStr2.length() > 0 ){ + Integer cellValue2; + try{ cellValue2= Integer.parseInt(cellStr2);} catch(Exception e){cellValue2 = 0;} + if(cellValue2 != 0) { + dest.set(row, 2, cellValue2); + lnnz++; + } + } + } + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\""); + if(index != -1) { + int curPos_86635269 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_86635269); + if(index != -1) { + int curPos_4908949 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_4908949); + if(index != -1) { + int curPos_99118963 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_99118963); + if(index != -1) { + int curPos_81981300 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_81981300); + if(index != -1) { + int curPos_7528404 = index + 50; + index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_7528404); + if(index != -1) { + int curPos_55594937 = index + 50; + endPos = getEndPos(str, strLen, curPos_55594937, endWithValueString[6]); + String cellStr6 = str.substring(curPos_55594937,endPos); + String cellValue6 = cellStr6; + dest.set(row, 6, cellValue6); + } + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_7528404); + if(index != -1) { + int curPos_20058273 = index + 50; + index = str.indexOf("\",", curPos_20058273); + if(index != -1) { + int curPos_57197559 = index + 2; + endPos = getEndPos(str, strLen, curPos_57197559, endWithValueString[7]); + String cellStr7 = 
str.substring(curPos_57197559,endPos); + String cellValue7 = cellStr7; + dest.set(row, 7, cellValue7); + } + index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_20058273); + if(index != -1) { + int curPos_54788108 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_54788108); + if(index != -1) { + int curPos_15575491 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_15575491); + if(index != -1) { + int curPos_50383789 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_50383789); + if(index != -1) { + int curPos_11954615 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_11954615); + if(index != -1) { + int curPos_44271891 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_44271891); + if(index != -1) { + int curPos_84341622 = index + 50; + index = str.indexOf(",", curPos_84341622); + if(index != -1) { + int curPos_62678472 = index + 1; + endPos = getEndPos(str, strLen, curPos_62678472, endWithValueString[9]); + String cellStr9 = str.substring(curPos_62678472,endPos); + if ( cellStr9.length() > 0 ){ + Long cellValue9; + try{cellValue9= Long.parseLong(cellStr9); } catch(Exception e){cellValue9 = 0l;} + if(cellValue9 != 0) { + dest.set(row, 9, cellValue9); + lnnz++; + } + } + } + } + } + } + } + } + } + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_20058273); + if(index != -1) { + int curPos_89818247 = index + 50; + index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_89818247); + if(index != 
-1) { + int curPos_51945105 = index + 50; + index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_51945105); + if(index != -1) { + int curPos_65787925 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_65787925); + if(index != -1) { + int curPos_67105752 = index + 50; + index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_67105752); + if(index != -1) { + int curPos_60302668 = index + 50; + index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_60302668); + if(index != -1) { + int curPos_58266363 = index + 50; + endPos = getEndPos(str, strLen, curPos_58266363, endWithValueString[8]); + String cellStr8 = str.substring(curPos_58266363,endPos); + if ( cellStr8.length() > 0 ){ + Long cellValue8; + try{cellValue8= Long.parseLong(cellStr8); } catch(Exception e){cellValue8 = 0l;} + if(cellValue8 != 0) { + dest.set(row, 8, cellValue8); + lnnz++; + } + } + } + } + } + } + } + } + } + } + } + } + } + } + row++; + }} + finally { + IOUtilFunctions.closeSilently(reader); + } + return row; + + + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java index 39bbf61d942..b673e41d9d6 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java @@ -9,20 +9,20 @@ public class SYSDSFrameExperimentHDFS extends GIOMain { public static void main(String[] args) throws Exception { -// getArgs(); -// if (percent < 1 || codeGen || sampleNRows < 100) -// return; -// -// Util util = new Util(); -// Types.ValueType[] schema = util.getSchema(schemaFileName); -// int ncols = schema.length; -// -// double 
tmpTime = System.nanoTime(); -// FrameBlock frameBlock; -// if(datasetName.equals("csv")) { -// FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, " ", false); -// FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); -// frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); + getArgs(); + + Util util = new Util(); + Types.ValueType[] schema = util.getSchema(schemaFileName); + int ncols = schema.length; + + System.out.println(">>>>>>>>>>>>>>>>>>> "+ncols); + + double tmpTime = System.nanoTime(); + FrameBlock frameBlock; + //if(datasetName.equals("csv")) { + FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, ",", false); + FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); + frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); // } // else if(datasetName.equals("mm")) { // FrameReaderTextCell mm =new FrameReaderTextCell(); @@ -30,10 +30,10 @@ public static void main(String[] args) throws Exception { // } // else // throw new RuntimeException("Format not support!"); -// -// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; -// util.addLog(LOG_HOME, log); + + double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; + + String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; + util.addLog(LOG_HOME, log); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh index 0344d1e719d..15259ed7649 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -17,7 +17,7 @@ mx_mem="$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)) delimiter="\t" declare -a datasets=("aminer_paper") -declare -a main_classes=( "GIOFrameExperimentHDFS") +declare -a 
main_classes=("GIOFrameExperimentHDFS") #SYSDSFrameExperimentHDFS GIOFrameExperimentHDFS for (( i = 0; i < 1; i++ )); do for mc in "${main_classes[@]}"; do @@ -25,10 +25,14 @@ for (( i = 0; i < 1; i++ )); do ./resultPath.sh $home_log $d$i $mc data_file_name="$root_data_path/$d/$d.data" - for sr in 100 200 300 400 500 600 700 800 900 1000 + for sr in 100 #200 300 400 500 600 700 800 900 1000 do - for p in 5 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + for p in 11 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 do + #schema_file_name="$root_data_path/$d/$d.schema" + #sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" + #sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" + schema_file_name="$root_data_path/$d/$d$sep$p.schema" sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index 40249604ba3..b14ae35225c 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -25,7 +25,6 @@ import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.iogen.FormatIdentifying; import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.iogen.codegen.myTest; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; @@ -92,9 +91,9 @@ protected void runGenerateReaderTest() { int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); - myTest mt = new myTest(formatIdentifying.getFormatProperties()); - mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), 
clen, -1, -1); - int a = 100; +// myTest mt = new myTest(formatIdentifying.getFormatProperties()); +// mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); +// int a = 100; // GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); // MatrixReader mr = gr.getReader(); From 72b750749a78ad7c1edde85002e83bb3f97046d6 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 28 Jan 2022 03:54:58 +0100 Subject: [PATCH 30/84] Fix a bug in code gen --- .../runtime/iogen/FormatIdentifying.java | 50 +++-- .../sysds/runtime/iogen/GenerateReader.java | 2 +- .../sysds/runtime/iogen/ReaderMapping.java | 19 +- .../sysds/runtime/iogen/codegen/mymain2.java | 174 ++++-------------- .../sysds/runtime/iogen/exp/runGIOExp.sh | 6 +- .../Identify/MatrixGRRowColIdentifyTest.java | 10 +- 6 files changed, 88 insertions(+), 173 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index dfd9f83cd56..0f3994ee9fc 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -27,6 +27,8 @@ import java.util.BitSet; import java.util.HashMap; import java.util.HashSet; +import java.util.Map; + public class FormatIdentifying { @@ -34,6 +36,7 @@ public class FormatIdentifying { private int[] mapRowPrevious; private int[][] mapCol; private int[][] mapLen; + private int NaN; private ArrayList sampleRawIndexes; private static int nrows; @@ -69,29 +72,21 @@ private void runIdentification() { nrows = mappingValues.getNrows(); ncols = mappingValues.getNcols(); nlines = mappingValues.getNlines(); + NaN = (ncols * nrows) - mappingValues.getNaN(); // Check the map row: // If all cells of a row mapped to a single line of sample raw, it is a single row mapping // If all cells of a row mapped to multiple lines of sample raw, it is a multi row 
mapping - boolean isSingleRow = true; - for(int r=0; r map = new HashMap<>(); + int nan = 0; + for (Integer t : list) { + if (t != -1) { + Integer val = map.get(t); + map.put(t, val == null ? 1 : val + 1); + } else + nan++; + } + if (map.size() == 0) + return nan; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) + max = e; + } + return max.getValue() + nan; + } + private KeyTrie[] buildColsKeyPatternSingleRow() { Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); @@ -284,7 +300,7 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { count++; } float percent = (float) count / list.size(); - if(percent >= 0.9) + if(percent >= 0.60) token = t; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index cd9d82e778f..622a360d537 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -120,7 +120,7 @@ public FrameReader getReader() throws Exception { Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; String js = src.generateCodeJava(); - System.out.println(js); + //System.out.println(js); frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index b95afd7c0b5..e39ca62347d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -40,6 +40,7 @@ public class ReaderMapping { private final int nrows; private final int ncols; private int nlines; + private int NaN; 
private ArrayList sampleRawIndexes; private MatrixBlock sampleMatrix; private FrameBlock sampleFrame; @@ -98,6 +99,7 @@ protected boolean findMapping(boolean isIndexMapping) { mapRow = new int[nrows][ncols]; mapCol = new int[nrows][ncols]; mapLen = new int[nrows][ncols]; + NaN = 0; // Set "-1" as default value for all defined matrix for(int r = 0; r < nrows; r++) @@ -107,8 +109,9 @@ protected boolean findMapping(boolean isIndexMapping) { int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { - if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && this.sampleFrame.get( - r, c) != null))) { + if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || + (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r,c)!=null) || + (schema[c].isNumeric() && this.sampleFrame.getDouble(r,c)!=0))))) { HashSet checkedLines = new HashSet<>(); while(checkedLines.size() < nlines) { RawIndex ri = sampleRawIndexes.get(itRow); @@ -128,19 +131,25 @@ protected boolean findMapping(boolean isIndexMapping) { } } } + else + NaN++; } } boolean flagMap = true; for(int r = 0; r < nrows && flagMap; r++) for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, - c) != null) || (this.isMatrix && this.sampleMatrix.getValue(r, c) != 0))) { + if(mapRow[r][c] == -1 && ( + (!this.isMatrix && this.sampleFrame.get(r,c) != null) || + (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r,c)!=null) || + (schema[c].isNumeric() && this.sampleFrame.getDouble(r,c)!=0))))) { flagMap = false; } return flagMap; } - + public int getNaN() { + return NaN; + } public int[][] getMapRow() { return mapRow; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java index 65ae88b4f93..8da7a5d4cad 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java @@ -40,51 +40,31 @@ protected int readFrameFromInputSplit(InputSplit split, InputFormat 0 ){ Integer cellValue2; try{ cellValue2= Integer.parseInt(cellStr2);} catch(Exception e){cellValue2 = 0;} @@ -93,119 +73,29 @@ protected int readFrameFromInputSplit(InputSplit split, InputFormat 0 ){ - Long cellValue9; - try{cellValue9= Long.parseLong(cellStr9); } catch(Exception e){cellValue9 = 0l;} - if(cellValue9 != 0) { - dest.set(row, 9, cellValue9); - lnnz++; - } - } - } - } - } - } - } - } - } - index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_20058273); - if(index != -1) { - int curPos_89818247 = index + 50; - index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_89818247); - if(index != -1) { - int curPos_51945105 = index + 50; - index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_51945105); - if(index != -1) { - int curPos_65787925 = index + 50; - index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_65787925); - if(index != -1) { - int curPos_67105752 = index + 50; - index = str.indexOf(",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"", curPos_67105752); - if(index != -1) { - int curPos_60302668 = index + 50; - index = str.indexOf("\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",", curPos_60302668); - if(index != -1) { - int curPos_58266363 = index + 50; - endPos = getEndPos(str, strLen, curPos_58266363, endWithValueString[8]); - String cellStr8 = str.substring(curPos_58266363,endPos); - if ( cellStr8.length() > 0 ){ - Long cellValue8; - try{cellValue8= Long.parseLong(cellStr8); } catch(Exception e){cellValue8 = 0l;} 
- if(cellValue8 != 0) { - dest.set(row, 8, cellValue8); - lnnz++; - } - } - } - } - } - } - } - } - } - } + int curPos_78452455 = index + 1; + endPos = getEndPos(str, strLen, curPos_78452455, endWithValueString[3]); + String cellStr3 = str.substring(curPos_78452455,endPos); + if ( cellStr3.length() > 0 ){ + Integer cellValue3; + try{ cellValue3= Integer.parseInt(cellStr3);} catch(Exception e){cellValue3 = 0;} + if(cellValue3 != 0) { + dest.set(row, 3, cellValue3); + lnnz++; } } } } + index = str.indexOf(",,,"); + if(index != -1) { + int curPos_8253849 = index + 3; + endPos = getEndPos(str, strLen, curPos_8253849, endWithValueString[5]); + String cellStr5 = str.substring(curPos_8253849,endPos); + String cellValue5 = cellStr5; + dest.set(row, 5, cellValue5); + } row++; }} finally { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh index 15259ed7649..f047d80e227 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -7,8 +7,8 @@ LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" jar_file_path="$systemDS_Home/target/SystemDS.jar" lib_files_path="$systemDS_Home/target/lib/*" #----------------------------------------------------------------- -root_data_path="/home/saeed/Documents/Dataset/GIODataset/flat" -home_log="/home/saeed/Documents/ExpLog" +root_data_path="/home/saeed/Documents/Dataset/GIODataset/json" +home_log="/home/saeed/Documents/ExpLog/json/" cpp_base_src="" #"/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" sep="_" nrows=-1 @@ -25,7 +25,7 @@ for (( i = 0; i < 1; i++ )); do ./resultPath.sh $home_log $d$i $mc data_file_name="$root_data_path/$d/$d.data" - for sr in 100 #200 300 400 500 600 700 800 900 1000 + for sr in 1000 #200 300 400 500 600 700 800 900 1000 do for p in 11 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 
1.0 do diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index c90f9d9ba6b..7fbefe48825 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -251,12 +251,12 @@ public void test12() { @Test public void test13() throws Exception { - String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/sample_100_5.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/sample_100_5.frame"; - Integer sampleNRows = 100; + String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/sample_1000_11.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/sample_1000_11.frame"; + Integer sampleNRows = 1000; String delimiter = "\\t"; - String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/aminer_paper.schema"; - String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper/aminer_paper.data"; + String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/aminer_paper_11.schema"; + String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/aminer_paper.data"; Float percent = 7f;//Float.parseFloat(args[6]); String datasetName = "aminer_paper";//args[7]; From 078266c179431e0b6817557212fc8eb90be4b4ef Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 29 Jan 2022 11:39:58 +0100 Subject: [PATCH 31/84] optimization --- .../sysds/runtime/iogen/CustomProperties.java | 3 +- .../runtime/iogen/FormatIdentifying.java | 111 +++-- .../sysds/runtime/iogen/GenerateReader.java | 4 +- .../sysds/runtime/iogen/Hirschberg.java | 415 +++++++++++++----- 
.../sysds/runtime/iogen/MappingTrie.java | 32 +- .../apache/sysds/runtime/iogen/RawIndex.java | 20 + .../runtime/iogen/codegen/CodeGenTrie.java | 13 +- .../sysds/runtime/iogen/exp/runGIOExp.sh | 6 +- .../Identify/MatrixGRRowColIdentifyTest.java | 8 +- 9 files changed, 448 insertions(+), 164 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index cba8a71fdf5..1a380627e28 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -64,7 +64,8 @@ public KeyTrie[] getColKeyPattern() { public HashSet[] endWithValueStrings(){ HashSet[] endWithValueString = new HashSet[colKeyPattern.length]; for(int i=0; i< colKeyPattern.length; i++) - endWithValueString[i] = colKeyPattern[i].getFirstSuffixKeyPatterns(); + if (colKeyPattern[i]!=null) + endWithValueString[i] = colKeyPattern[i].getFirstSuffixKeyPatterns(); return endWithValueString; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 0f3994ee9fc..a0576cd26b7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.iogen; +import org.apache.sysds.lops.Lop; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.Pair; @@ -270,6 +271,25 @@ private Integer mostCommonScore(int[] list) { return max.getValue() + nan; } + private Integer mostCommonValue(int[] list) { + Map map = new HashMap<>(); + for (Integer t : list) { + if (t != -1) { + Integer val = map.get(t); + map.put(t, val == null ? 
1 : val + 1); + } + } + if (map.size() == 0) + return -1; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) + max = e; + } + return max.getKey(); + } + private KeyTrie[] buildColsKeyPatternSingleRow() { Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); @@ -285,8 +305,13 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { for(String s : list) { if(s.length() < w) flag = false; - else - wts.add(s.substring(s.length()-w)); + else { + String subStr = s.substring(s.length() - w); + if (!subStr.contains(Lop.OPERAND_DELIMITOR)) + wts.add(subStr); + else + flag = false; + } } if(flag) { @@ -308,10 +333,23 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { else if(wts.size() == 0) token = ""; } - if(token == null) - throw new RuntimeException("can't build a key pattern for the column: "+ c); - - if(token.length() > 0){ + if(token == null) { + int[] listLength = new int[nrows]; + for (int r = 0; r< nrows; r++) + listLength[r] = mapCol[r][c]; + int commonLength = mostCommonValue(listLength); + if (commonLength == 0){ + ArrayList newList = new ArrayList<>(); + for(String s: list){ + if(s.length() == 0) + newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + else + throw new RuntimeException("can't build a key pattern for the column: " + c); + } + else if(token.length() > 0){ ArrayList newList = new ArrayList<>(); for(String s: list){ if(s.endsWith(token)) @@ -324,31 +362,48 @@ else if(wts.size() == 0) for(int c=0; c> keyPatterns; + ArrayList> keyPatterns = null; - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = trie.getAllSequentialKeys(); - check = false; - for(ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if(newCheck) - selectedKeyPatterns.add(keyPattern); - } - 
if(check) - keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = trie.reConstruct(); - if(!flagReconstruct) - break; + + for(String ps: prefixStrings.getKey()[c]) + trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); + + if (trie.getRoot().getChildren().size() == 1){ + String[] splitPattern= prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); + ArrayList reverseSplitPattern = new ArrayList<>(); + for (String ps: splitPattern) + if (ps.length() > 0) + reverseSplitPattern.add(ps); + if (reverseSplitPattern.size() == 0) + reverseSplitPattern.add(""); + check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], reverseSplitPattern); + if (check) { + keyPatterns = new ArrayList<>(); + keyPatterns.add(reverseSplitPattern); } - }while(!check); + } + else { + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = trie.getAllSequentialKeys(); + check = false; + for (ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if (newCheck) + selectedKeyPatterns.add(keyPattern); + } + if (check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = trie.reConstruct(); + if (!flagReconstruct) + break; + } + } while (!check); + } if(check){ colKeyPattens[c] = new KeyTrie(keyPatterns); @@ -379,7 +434,7 @@ public Pair, ArrayList> extractAllPrefixStringsOfACol int rowIndex = mapRow[r][colIndex]; if(rowIndex != -1) { rowIndexes.add(rowIndex); - String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); + String str = sampleRawIndexes.get(rowIndex).getRemainedTexts(mapCol[r][colIndex]);//sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); if(reverse) prefixStrings.add(new StringBuilder(str).reverse().toString()); else diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 622a360d537..26d52f789c1 
100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -93,7 +93,7 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String js = src.generateCodeJava(); + //String js = src.generateCodeJava(); matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } @@ -119,7 +119,7 @@ public FrameReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String js = src.generateCodeJava(); + //String js = src.generateCodeJava(); //System.out.println(js); frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java index 8df88b61a5f..ad58e57c280 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java @@ -26,125 +26,302 @@ public class Hirschberg { - public Pair, String> getLCS(String x, String y, int pxy, int pgap) { - int i, j; // initialising variables - int m = x.length(); // length of gene1 - int n = y.length(); // length of gene2 - - // table for storing optimal substructure answers - int dp[][] = new int[n + m + 1][n + m + 1]; - - for(int[] x1 : dp) - Arrays.fill(x1, 0); - - // initialising the table - for(i = 0; i <= (n + m); i++) { - dp[i][0] = i * pgap; - dp[0][i] = i * pgap; - } - - // calculating the minimum penalty - for(i = 1; i <= m; i++) { - for(j = 1; j <= n; j++) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; - } - else { - 
dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); - } - } - } - - // Reconstructing the solution - int l = n + m; // maximum possible length - i = m; - j = n; - int xpos = l; - int ypos = l; - - // Final answers for the respective strings - int xans[] = new int[l + 1]; - int yans[] = new int[l + 1]; - - while(!(i == 0 || j == 0)) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j - 1] + pxy == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j] + pgap == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) '_'; - i--; - } - else if(dp[i][j - 1] + pgap == dp[i][j]) { - xans[xpos--] = (int) '_'; - yans[ypos--] = (int) y.charAt(j - 1); - j--; - } - } - while(xpos > 0) { - if(i > 0) - xans[xpos--] = (int) x.charAt(--i); - else - xans[xpos--] = (int) '_'; - } - while(ypos > 0) { - if(j > 0) - yans[ypos--] = (int) y.charAt(--j); - else - yans[ypos--] = (int) '_'; - } - // Since we have assumed the answer to be n+m long, we need to remove the extra - // gaps in the starting id represents the index from which the arrays xans, yans are useful - int id = 1; - for(i = l; i >= 1; i--) { - if((char) yans[i] == '_' && (char) xans[i] == '_') { - id = i + 1; - break; - } - } - - StringBuilder sb = new StringBuilder(); - ArrayList pattern = new ArrayList<>(); - for(i = id; i <= l; i++) { - if(xans[i] == yans[i]) - sb.append((char) xans[i]); - else { - if(sb.length() > 0) - pattern.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if(sb.length() > 0) - pattern.add(sb.toString()); - - // System.out.println(""); - // for(i = id; i <= l; i++) - // System.out.print((char) yans[i]); - // - sb = new StringBuilder(); - for(int bi = id; bi <= l; bi++) { - if(xans[bi] == yans[bi]) { - sb.append((char) xans[bi]); - 
//System.out.print((char) xans[bi]); - } - //else - //System.out.print("*"); - } - if(sb.length() > 0) - return new Pair<>(pattern, sb.toString()); - else - return null; - } + public Pair, String> getLCS(String x, String y, int pxy, int pgap) { + int i, j; // initialising variables + int m = x.length(); // length of gene1 + int n = y.length(); // length of gene2 + + // table for storing optimal substructure answers + int dp[][] = new int[n + m + 1][n + m + 1]; + + for (int[] x1 : dp) + Arrays.fill(x1, 0); + + // initialising the table + for (i = 0; i <= (n + m); i++) { + dp[i][0] = i * pgap; + dp[0][i] = i * pgap; + } + + // calculating the minimum penalty + for (i = 1; i <= m; i++) { + for (j = 1; j <= n; j++) { + if (x.charAt(i - 1) == y.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } else { + dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); + } + } + } + + // Reconstructing the solution + int l = n + m; // maximum possible length + i = m; + j = n; + int xpos = l; + int ypos = l; + + // Final answers for the respective strings + int xans[] = new int[l + 1]; + int yans[] = new int[l + 1]; + + while (!(i == 0 || j == 0)) { + if (x.charAt(i - 1) == y.charAt(j - 1)) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } else if (dp[i - 1][j - 1] + pxy == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } else if (dp[i - 1][j] + pgap == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) '_'; + i--; + } else if (dp[i][j - 1] + pgap == dp[i][j]) { + xans[xpos--] = (int) '_'; + yans[ypos--] = (int) y.charAt(j - 1); + j--; + } + } + while (xpos > 0) { + if (i > 0) + xans[xpos--] = (int) x.charAt(--i); + else + xans[xpos--] = (int) '_'; + } + while (ypos > 0) { + if (j > 0) + yans[ypos--] = (int) y.charAt(--j); + else + yans[ypos--] = (int) '_'; + } + // Since we have assumed the answer to be n+m 
long, we need to remove the extra + // gaps in the starting id represents the index from which the arrays xans, yans are useful + int id = 1; + for (i = l; i >= 1; i--) { + if ((char) yans[i] == '_' && (char) xans[i] == '_') { + id = i + 1; + break; + } + } + + StringBuilder sb = new StringBuilder(); + ArrayList pattern = new ArrayList<>(); + for (i = id; i <= l; i++) { + if (xans[i] == yans[i]) + sb.append((char) xans[i]); + else { + if (sb.length() > 0) + pattern.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if (sb.length() > 0) + pattern.add(sb.toString()); + + // System.out.println(""); + // for(i = id; i <= l; i++) + // System.out.print((char) yans[i]); + // + sb = new StringBuilder(); + for (int bi = id; bi <= l; bi++) { + if (xans[bi] == yans[bi]) { + sb.append((char) xans[bi]); + System.out.print((char) xans[bi]); + } + //else + //System.out.print("*"); + } + System.out.println(); + if (sb.length() > 0) { +// StringBuilder stringBuilder = new StringBuilder(); +// for (String s: pattern){ +// stringBuilder.append(s).append("_"); +// } +// if (stringBuilder.length()>0) +// stringBuilder.deleteCharAt(stringBuilder.length()-1); + return new Pair<>(pattern, sb.toString()); + } + else + return null; + } +public Pair, String> getLCS(String x, String y) { + int i, j; // initialising variables + int m = x.length(); // length of gene1 + int n = y.length(); // length of gene2 + + // table for storing optimal substructure answers + int dp[][] = new int[n + m + 1][n + m + 1]; + + for (int[] x1 : dp) + Arrays.fill(x1, 0); + + // initialising the table + for (i = 0; i <= (n + m); i++) { + dp[i][0] = i; + dp[0][i] = i; + } + + // calculating the minimum penalty + for (i = 1; i <= m; i++) { + for (j = 1; j <= n; j++) { + if (x.charAt(i - 1) == y.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } else { + dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1], dp[i - 1][j]), dp[i][j - 1]); + } + } + } + + // Reconstructing the solution + int l = n + m; // maximum 
possible length + i = m; + j = n; + int xpos = l; + int ypos = l; + + // Final answers for the respective strings + int xans[] = new int[l + 1]; + int yans[] = new int[l + 1]; + + while (!(i == 0 || j == 0)) { + if (x.charAt(i - 1) == y.charAt(j - 1)) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } else if (dp[i - 1][j - 1] == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } else if (dp[i - 1][j] == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) '_'; + i--; + } else if (dp[i][j - 1] == dp[i][j]) { + xans[xpos--] = (int) '_'; + yans[ypos--] = (int) y.charAt(j - 1); + j--; + } + } + while (xpos > 0) { + if (i > 0) + xans[xpos--] = (int) x.charAt(--i); + else + xans[xpos--] = (int) '_'; + } + while (ypos > 0) { + if (j > 0) + yans[ypos--] = (int) y.charAt(--j); + else + yans[ypos--] = (int) '_'; + } + // Since we have assumed the answer to be n+m long, we need to remove the extra + // gaps in the starting id represents the index from which the arrays xans, yans are useful + int id = 1; + for (i = l; i >= 1; i--) { + if ((char) yans[i] == '_' && (char) xans[i] == '_') { + id = i + 1; + break; + } + } + + StringBuilder sb = new StringBuilder(); + ArrayList pattern = new ArrayList<>(); + for (i = id; i <= l; i++) { + if (xans[i] == yans[i]) + sb.append((char) xans[i]); + else { + if (sb.length() > 0) + pattern.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if (sb.length() > 0) + pattern.add(sb.toString()); + + // System.out.println(""); + // for(i = id; i <= l; i++) + // System.out.print((char) yans[i]); + // + sb = new StringBuilder(); + for (int bi = id; bi <= l; bi++) { + if (xans[bi] == yans[bi]) { + sb.append((char) xans[bi]); + System.out.print((char) xans[bi]); + } + //else + //System.out.print("*"); + } + System.out.println(); + if (sb.length() > 0) { +// StringBuilder stringBuilder = new StringBuilder(); +// 
for (String s: pattern){ +// stringBuilder.append(s).append("_"); +// } +// if (stringBuilder.length()>0) +// stringBuilder.deleteCharAt(stringBuilder.length()-1); + return new Pair<>(pattern, sb.toString()); + } + else + return null; +} + + public ArrayList getLCS(ArrayList list, int pxy, int pgap) { + if (list.size() < 2) + return null; + + +// +// +// if (pattern != null) { +// String intersect = pattern.getValue(); +// ArrayList intersectPattern = pattern.getKey(); +// for (int i = 2; i < list.size(); i++) { +// if (i==199) +// System.out.print(i+" >> " + list.get(i)+"\n"); +// pattern = getLCS(intersect, list.get(i)); +// if (pattern != null) { +// intersect = pattern.getValue(); +// intersectPattern = pattern.getKey(); +// } else +// intersect = null; +// } +// if (intersect != null) +// return intersectPattern; +// +// } + +// Hirschberg2 hirschberg2 = new Hirschberg2(); +// String str1 = list.get(0); +// String str2 = list.get(1); +// Pair, String pattern = hirschberg2.algC(str1.length(), str2.length(),str1, str2); +// if (pattern != null) { +// String intersect = pattern.getValue(); +// ArrayList intersectPattern = pattern.getKey(); +// for (int i = 2; i < list.size(); i++) { +// if (i==199) +// System.out.print(i+" >> " + list.get(i)+"\n"); +// pattern = getLCS(intersect, list.get(i)); +// if (pattern != null) { +// intersect = pattern.getValue(); +// intersectPattern = pattern.getKey(); +// } else +// intersect = null; +// } +// if (intersect != null) +// return intersectPattern; +// +// } + + return null; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index a223295cbf5..603c59a4c20 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.iogen; +import org.apache.sysds.lops.Lop; import 
org.apache.sysds.runtime.matrix.data.Pair; import java.util.ArrayList; @@ -74,6 +75,26 @@ public MappingTrieNode getFistMultiChildNode(MappingTrieNode node) { return node; } + public void insertKeys(ArrayList keys) { + MappingTrieNode currentNode = root; + int index = 0; + for(String key : keys) { + if(currentNode.getChildren().containsKey(key)) { + currentNode = currentNode.getChildren().get(key); + index++; + } + else + break; + } + + MappingTrieNode newNode; + for(int i = index; i < keys.size(); i++) { + newNode = new MappingTrieNode(); + currentNode.getChildren().put(keys.get(i), newNode); + currentNode = newNode; + } + } + public Set getAllSubStringsOfStringContainIntersect(String str, BitSet bitSet) { HashSet result = new HashSet<>(); StringBuilder sb = new StringBuilder(); @@ -98,7 +119,9 @@ private void getAllSubStrings(HashSet result, StringBuilder sb) { else { for(int j = 1; j <= Math.min(sb.length(), windowSize); j++) { for(int k = 0; k <= sb.length() - j; k++) { - result.add(sb.substring(k, k + j)); + String subStr = sb.substring(k, k + j); + if (!subStr.contains(Lop.OPERAND_DELIMITOR)) + result.add(subStr); } } } @@ -288,7 +311,12 @@ public ArrayList> getAllSequentialKeys() { for(Pair> n : k) if(n.getKey() != null) { if(level < keyLevel || keyLevel == 0) { - kl.add(n.getKey()); + String[] splitText = n.getKey().split(Lop.OPERAND_DELIMITOR,-1); + String str = splitText[0]; + if (str.length() == 0 && splitText.length >1) + str = splitText[1]; + + kl.add(str); level++; } else diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 40a5f3b3b09..cdbf7cbb897 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -20,6 +20,7 @@ package org.apache.sysds.runtime.iogen; import org.apache.sysds.common.Types; +import org.apache.sysds.lops.Lop; import org.apache.sysds.runtime.matrix.data.Pair; import 
org.apache.sysds.runtime.util.UtilFunctions; @@ -302,6 +303,25 @@ private void addActualValueToList(String stringValue, Integer position, HashMap< } } + public String getRemainedTexts(int endPos) { + StringBuilder sb = new StringBuilder(); + StringBuilder result = new StringBuilder(); + for (int i = 0; i < endPos; i++) { + if (!reservedPositions.get(i)) + sb.append(raw.charAt(i)); + else { + if (sb.length() > 0) { + result.append(Lop.OPERAND_DELIMITOR).append(sb); + sb = new StringBuilder(); + } + } + } + if (sb.length() > 0) + result.append(Lop.OPERAND_DELIMITOR).append(sb); + + return result.toString(); + } + public void cloneReservedPositions() { this.backupReservedPositions = (BitSet) this.reservedPositions.clone(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index fbeb0c95921..2e0878f275c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -19,7 +19,6 @@ package org.apache.sysds.runtime.iogen.codegen; -import com.google.gson.Gson; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.KeyTrie; @@ -47,15 +46,19 @@ private void buildPrefixTree(){ for(int c=0; c< properties.getColKeyPattern().length; c++){ KeyTrie keyTrie = properties.getColKeyPattern()[c]; Types.ValueType vt = properties.getSchema() == null ? 
Types.ValueType.FP64 : properties.getSchema()[c]; - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) - this.insert(rootCol, c, vt, keys); + if (keyTrie != null) { + for (ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + this.insert(rootCol, c, vt, keys); + } } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX){ KeyTrie keyTrie = properties.getRowKeyPattern(); Types.ValueType vt = Types.ValueType.FP32; - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) - this.insert(rootRow, -1, vt, keys); + if (keyTrie != null) { + for (ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + this.insert(rootRow, -1, vt, keys); + } } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh index f047d80e227..4f34d0f392f 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -7,8 +7,8 @@ LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" jar_file_path="$systemDS_Home/target/SystemDS.jar" lib_files_path="$systemDS_Home/target/lib/*" #----------------------------------------------------------------- -root_data_path="/home/saeed/Documents/Dataset/GIODataset/json" -home_log="/home/saeed/Documents/ExpLog/json/" +root_data_path="/home/saeed/Documents/Dataset/GIODataset/libsvm" +home_log="/home/saeed/Documents/ExpLog/libsvm/" cpp_base_src="" #"/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" sep="_" nrows=-1 @@ -25,7 +25,7 @@ for (( i = 0; i < 1; i++ )); do ./resultPath.sh $home_log $d$i $mc data_file_name="$root_data_path/$d/$d.data" - for sr in 1000 #200 300 400 500 600 700 800 900 1000 + for sr in 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 do for p in 11 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 do diff --git 
a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 7fbefe48825..48c11a5508c 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -251,12 +251,12 @@ public void test12() { @Test public void test13() throws Exception { - String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/sample_1000_11.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/sample_1000_11.frame"; + String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/sample_1000_11.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/sample_1000_11.frame"; Integer sampleNRows = 1000; String delimiter = "\\t"; - String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/aminer_paper_11.schema"; - String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/json/aminer_paper/aminer_paper.data"; + String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/aminer_paper_11.schema"; + String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/aminer_paper.data"; Float percent = 7f;//Float.parseFloat(args[6]); String datasetName = "aminer_paper";//args[7]; From 951f3c33c59d10aa848f781323c89c37af9f8c28 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 31 Jan 2022 23:01:08 +0100 Subject: [PATCH 32/84] optimization fix some code style fix some bug --- .../sysds/runtime/iogen/CustomProperties.java | 12 +++-- .../sysds/runtime/iogen/GenerateReader.java | 3 -- .../apache/sysds/runtime/iogen/KeyTrie.java | 10 ++-- .../sysds/runtime/iogen/MappingTrie.java | 45 ++++++++++++----- 
.../sysds/runtime/iogen/ReaderMapping.java | 13 +++-- .../runtime/iogen/codegen/CodeGenTrie.java | 48 +++++++++++-------- .../iogen/exp/GIOFrameExperimentHDFS.java | 2 +- .../sysds/runtime/iogen/exp/resultPath.sh | 8 +--- .../sysds/runtime/iogen/exp/runGIOExp.sh | 31 +++++++----- .../Identify/MatrixGRRowColIdentifyTest.java | 10 ++-- 10 files changed, 106 insertions(+), 76 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 1a380627e28..5fb2fd1cb6e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -21,6 +21,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatProperties; + import java.io.Serializable; import java.util.HashSet; @@ -28,6 +29,7 @@ public class CustomProperties extends FileFormatProperties implements Serializab public enum IndexProperties { IDENTIFY, PREFIX, KEY; + @Override public String toString() { return this.name().toUpperCase(); @@ -61,15 +63,15 @@ public KeyTrie[] getColKeyPattern() { return colKeyPattern; } - public HashSet[] endWithValueStrings(){ - HashSet[] endWithValueString = new HashSet[colKeyPattern.length]; - for(int i=0; i< colKeyPattern.length; i++) - if (colKeyPattern[i]!=null) + public HashSet[] endWithValueStrings() { + HashSet[] endWithValueString = new HashSet[colKeyPattern.length]; + for(int i = 0; i < colKeyPattern.length; i++) + if(colKeyPattern[i] != null) endWithValueString[i] = colKeyPattern[i].getFirstSuffixKeyPatterns(); return endWithValueString; } - public HashSet endWithValueStringsRow(){ + public HashSet endWithValueStringsRow() { return rowKeyPattern.getFirstSuffixKeyPatterns(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 26d52f789c1..bdc38a7791f 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -93,7 +93,6 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - //String js = src.generateCodeJava(); matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } @@ -119,8 +118,6 @@ public FrameReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - //String js = src.generateCodeJava(); - //System.out.println(js); frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java index 59c090527db..6be915f590e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/KeyTrie.java @@ -99,14 +99,14 @@ private void insertKeys(ArrayList keys, KeyTrieNode root) { } public ArrayList> getPrefixKeyPatterns() { - if(this.prefixKeyPattern!=null) + if(this.prefixKeyPattern != null) return prefixKeyPattern; else return getKeyPatterns(rootPrefixKeys); } public ArrayList> getReversePrefixKeyPatterns() { - if(this.prefixKeyPattern!=null) + if(this.prefixKeyPattern != null) return prefixKeyPattern; else { ArrayList> kps = getKeyPatterns(rootPrefixKeys); @@ -132,10 +132,10 @@ public ArrayList> getSuffixKeyPatterns() { return result; } - public HashSet getFirstSuffixKeyPatterns(){ + public HashSet getFirstSuffixKeyPatterns() { ArrayList> suffixKeyPattern = getSuffixKeyPatterns(); HashSet suffixString = new HashSet<>(); - for(ArrayList kp: 
suffixKeyPattern){ + for(ArrayList kp : suffixKeyPattern) { suffixString.add(kp.get(0)); } return suffixString; @@ -182,7 +182,7 @@ private void getSuffixKeyPatterns(KeyTrieNode node, ArrayList> public void insertPrefixKeysConcurrent(HashSet keys) { insertPrefixKeysConcurrent(rootPrefixKeys, keys); - ArrayList> ss =getPrefixKeyPatterns(); + ArrayList> ss = getPrefixKeyPatterns(); } private void insertPrefixKeysConcurrent(KeyTrieNode node, HashSet keys) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java index 603c59a4c20..a56736de3c7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingTrie.java @@ -35,7 +35,7 @@ public class MappingTrie { private MappingTrieNode root; private int keyLevel; private boolean inALine; - private int windowSize = 50; + private int windowSize = 100; public MappingTrie() { this.root = new MappingTrieNode(MappingTrieNode.Type.INNER); @@ -58,6 +58,7 @@ public void reverseInsert(String word, int rowIndex) { tmpList.add(rowIndex); this.insert(new StringBuilder(word).reverse().toString(), tmpList); } + public void insert(String word, ArrayList rowIndexes) { MappingTrieNode newNode; if(root.getChildren().containsKey(word)) @@ -69,6 +70,12 @@ public void insert(String word, ArrayList rowIndexes) { } public MappingTrieNode getFistMultiChildNode(MappingTrieNode node) { + + if(node.getNodeType() == MappingTrieNode.Type.INNER && node.getChildren().size() == 1) { + String nkey = node.getChildren().keySet().iterator().next(); + if(node.getChildren().get(nkey).getRowIndexes().size() > 1) + return node; + } if(node.getChildren().size() == 1 && node.getNodeType() != MappingTrieNode.Type.END) return getFistMultiChildNode(node.getChildren().get(node.getChildren().keySet().iterator().next())); else @@ -106,7 +113,7 @@ else if(sb.length() > 0) { sb = new StringBuilder(); } } - if(sb.length() 
> 0){ + if(sb.length() > 0) { getAllSubStrings(result, sb); } @@ -119,9 +126,7 @@ private void getAllSubStrings(HashSet result, StringBuilder sb) { else { for(int j = 1; j <= Math.min(sb.length(), windowSize); j++) { for(int k = 0; k <= sb.length() - j; k++) { - String subStr = sb.substring(k, k + j); - if (!subStr.contains(Lop.OPERAND_DELIMITOR)) - result.add(subStr); + result.add(sb.substring(k, k + j)); } } } @@ -132,8 +137,23 @@ public String getIntersectOfChildren(MappingTrieNode node) { return null; else { Set keys = node.getChildren().keySet(); - if(keys.size() == 1) - return String.valueOf(keys.iterator().next().charAt(0)); + if(keys.size() == 1) { + String[] splitText = keys.iterator().next().split(Lop.OPERAND_DELIMITOR, -1); + String str = splitText[0]; + if(str.length() == 0 && splitText.length > 1) + str = splitText[1]; + return String.valueOf(str.charAt(0)); + } + + Set newKeys = new HashSet<>(); + for(String k : keys) { + String[] splitText = k.split(Lop.OPERAND_DELIMITOR, -1); + String str = splitText[0]; + if(str.length() == 0 && splitText.length > 1) + str = splitText[1]; + newKeys.add(str); + } + keys = newKeys; boolean flag = false; int maxKeyLength = 0; @@ -250,7 +270,7 @@ public boolean reConstruct() { for(String k : node.getChildren().keySet()) { String key = k.substring(k.indexOf(intersect) + intersect.length()); - if(key.length() > 0) { + if(key.length() > 0 && !key.equals(Lop.OPERAND_DELIMITOR)) { intersectTrie.insert(key, node.getChildren().get(k).getRowIndexes()); intersectRowIndexes.addAll(node.getChildren().get(k).getRowIndexes()); } @@ -289,7 +309,6 @@ public ArrayList> getAllSequentialKeys() { int level = 0; for(Pair> n : k) { if(n.getKey() != null) { - if(level == keyLevel - 1 || keyLevel == 0) { indexOrder.add(new Pair<>(index, n.getValue().size())); break; @@ -311,9 +330,9 @@ public ArrayList> getAllSequentialKeys() { for(Pair> n : k) if(n.getKey() != null) { if(level < keyLevel || keyLevel == 0) { - String[] splitText = 
n.getKey().split(Lop.OPERAND_DELIMITOR,-1); + String[] splitText = n.getKey().split(Lop.OPERAND_DELIMITOR, -1); String str = splitText[0]; - if (str.length() == 0 && splitText.length >1) + if(str.length() == 0 && splitText.length > 1) str = splitText[1]; kl.add(str); @@ -353,9 +372,9 @@ public ArrayList> getAllSequentialKeys() { } // revert list and values of list - for(ArrayList l: distinctKeys){ + for(ArrayList l : distinctKeys) { Collections.reverse(l); - for(int i=0; i checkedLines = new HashSet<>(); while(checkedLines.size() < nlines) { RawIndex ri = sampleRawIndexes.get(itRow); @@ -138,10 +138,9 @@ protected boolean findMapping(boolean isIndexMapping) { boolean flagMap = true; for(int r = 0; r < nrows && flagMap; r++) for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && ( - (!this.isMatrix && this.sampleFrame.get(r,c) != null) || - (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r,c)!=null) || - (schema[c].isNumeric() && this.sampleFrame.getDouble(r,c)!=0))))) { + if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, + c) != null) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, + c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { flagMap = false; } return flagMap; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 2e0878f275c..543ba4e8d09 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -19,10 +19,12 @@ package org.apache.sysds.runtime.iogen.codegen; +import com.google.gson.Gson; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.KeyTrie; +import java.security.GeneralSecurityException; import java.util.ArrayList; import java.util.HashSet; import 
java.util.Random; @@ -33,7 +35,7 @@ public class CodeGenTrie { private final CustomProperties properties; private final String destination; - public CodeGenTrie(CustomProperties properties, String destination){ + public CodeGenTrie(CustomProperties properties, String destination) { this.rootCol = new CodeGenTrieNode(CodeGenTrieNode.NodeType.COL); this.rootRow = new CodeGenTrieNode(CodeGenTrieNode.NodeType.ROW); this.properties = properties; @@ -42,27 +44,29 @@ public CodeGenTrie(CustomProperties properties, String destination){ } // Build Trie for Col and Row Key Patterns - private void buildPrefixTree(){ - for(int c=0; c< properties.getColKeyPattern().length; c++){ + private void buildPrefixTree() { + for(int c = 0; c < properties.getColKeyPattern().length; c++) { KeyTrie keyTrie = properties.getColKeyPattern()[c]; Types.ValueType vt = properties.getSchema() == null ? Types.ValueType.FP64 : properties.getSchema()[c]; - if (keyTrie != null) { - for (ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + Gson gson = new Gson(); + System.out.println(gson.toJson(keyTrie.getPrefixKeyPatterns())); + if(keyTrie != null) { + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } } - if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX){ + if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { KeyTrie keyTrie = properties.getRowKeyPattern(); Types.ValueType vt = Types.ValueType.FP32; - if (keyTrie != null) { - for (ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + if(keyTrie != null) { + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootRow, -1, vt, keys); } } } - private void insert(CodeGenTrieNode root ,int index, Types.ValueType valueType, ArrayList keys) { + private void insert(CodeGenTrieNode root, int index, Types.ValueType valueType, ArrayList keys) { CodeGenTrieNode currentNode = root; int rci = 0; for(String key : keys) { @@ -73,18 +77,25 @@ private 
void insert(CodeGenTrieNode root ,int index, Types.ValueType valueType, else break; } - CodeGenTrieNode newNode; - for(int i = rci; i < keys.size(); i++) { - newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), root.getType()); - newNode.setRowIndexBeginPos(properties.getRowIndexBegin()); - currentNode.getChildren().put(keys.get(i), newNode); - currentNode = newNode; + if(rci == keys.size()) { + currentNode.setEndOfCondition(true); + currentNode.setColIndex(index); + } + else { + CodeGenTrieNode newNode; + for(int i = rci; i < keys.size(); i++) { + newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), + root.getType()); + newNode.setRowIndexBeginPos(properties.getRowIndexBegin()); + currentNode.getChildren().put(keys.get(i), newNode); + currentNode = newNode; + } } } - public String getJavaCode(){ + public String getJavaCode() { StringBuilder src = new StringBuilder(); - switch(properties.getRowIndex()){ + switch(properties.getRowIndex()) { case IDENTIFY: getJavaCode(rootCol, src, "0"); src.append("row++; \n"); @@ -121,7 +132,6 @@ public String getJavaCode(){ return src.toString(); } - public String getRandomName(String base) { Random r = new Random(); int low = 0; @@ -131,7 +141,7 @@ public String getRandomName(String base) { return base + "_" + result; } - private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos){ + private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos) { if(node.isEndOfCondition()) src.append(node.geValueCode(destination, currPos)); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java index 7f8ce066f1c..06fbedaf348 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java @@ 
-26,7 +26,7 @@ public static void main(String[] args) throws Exception { HashSet valueSet = new HashSet<>(); for (int r = 0; r < sampleFrameStrings.length; r++) valueSet.add(sampleFrameStrings[r][c]); - if (valueSet.size() > 3) { + if (valueSet.size() > 0) { ArrayList tempList = new ArrayList<>(); for (int r = 0; r < sampleFrameStrings.length; r++) { tempList.add(sampleFrameStrings[r][c]); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh index 0ad69d46a0e..a573a99b966 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh @@ -1,12 +1,6 @@ #!/usr/bin/env bash -path_1="$1/benchmark" -path_2="$path_1/$3" -mkdir -p $1 -mkdir -p "$path_1" -mkdir -p "$path_2" - -log_file="$path_2/$2.csv" +log_file="$1.csv" if test ! -f "$log_file"; then touch $log_file echo "dataset,data_nrows,data_ncols,col_selected_count,sample_nrows,generate_time,read_time" > $log_file diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh index 4f34d0f392f..2ceb544a83c 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh @@ -7,8 +7,9 @@ LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" jar_file_path="$systemDS_Home/target/SystemDS.jar" lib_files_path="$systemDS_Home/target/lib/*" #----------------------------------------------------------------- -root_data_path="/home/saeed/Documents/Dataset/GIODataset/libsvm" -home_log="/home/saeed/Documents/ExpLog/libsvm/" +format="json" +root_data_path="/home/saeed/Documents/Dataset/GIODataset/twitter/$format/" +#home_log="/home/saeed/Documents/ExpLog/json/" cpp_base_src="" #"/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" sep="_" 
nrows=-1 @@ -16,26 +17,34 @@ nrows=-1 mx_mem="$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)))g" delimiter="\t" -declare -a datasets=("aminer_paper") +declare -a datasets=("twitter") declare -a main_classes=("GIOFrameExperimentHDFS") #SYSDSFrameExperimentHDFS GIOFrameExperimentHDFS -for (( i = 0; i < 1; i++ )); do +for (( i = 1; i < 2; i++ )); do + for mc in "${main_classes[@]}"; do for d in "${datasets[@]}"; do - ./resultPath.sh $home_log $d$i $mc - data_file_name="$root_data_path/$d/$d.data" + #home_log="/home/saeed/Documents/ExpLog/$format/$d/Q$i/" + home_log="/home/saeed/Documents/ExpLog/GIO-$d-$format-Q$i" + ./resultPath.sh $home_log + data_file_name="$root_data_path/$d.data" for sr in 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 do - for p in 11 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 + for p in 1 #2 5 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 do #schema_file_name="$root_data_path/$d/$d.schema" #sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" #sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" - schema_file_name="$root_data_path/$d/$d$sep$p.schema" - sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" - sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" +# schema_file_name="$root_data_path/$d/$d$sep$p.schema" +# sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" +# sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" +# + schema_file_name="$root_data_path/Q$i/$d$sep$p.schema" + sample_raw_fileName="$root_data_path/Q$i/sample_$d$sr$sep$p.raw" + sample_frame_file_name="$root_data_path/Q$i/sample_$d$sr$sep$p.frame" + SCRIPT="java\ -Dlog4j.configuration=file:$LOG4JPROP\ -Xms1g\ @@ -48,7 +57,7 @@ for (( i = 0; i < 1; i++ )); do -DschemaFileName=$schema_file_name\ -DdataFileName=$data_file_name\ -DdatasetName=$d\ - -DhomeLog=$home_log/benchmark/$mc/$d$i.csv\ + -DhomeLog=$home_log.csv\ -DcppBaseSrc=$cpp_base_src\ -Dnrows=$nrows\ -cp\ diff --git 
a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 48c11a5508c..9601f0dde9c 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -251,12 +251,12 @@ public void test12() { @Test public void test13() throws Exception { - String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/sample_1000_11.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/sample_1000_11.frame"; + String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; Integer sampleNRows = 1000; String delimiter = "\\t"; - String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/aminer_paper_11.schema"; - String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/csv/aminer_paper/aminer_paper.data"; + String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/aminer_author_5.schema"; + String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/aminer_author.data"; Float percent = 7f;//Float.parseFloat(args[6]); String datasetName = "aminer_paper";//args[7]; @@ -278,7 +278,7 @@ public void test13() throws Exception { HashSet valueSet = new HashSet<>(); for(int r=0; r3){ + if(valueSet.size()>0){ ArrayList tempList = new ArrayList<>(); for(int r=0; r Date: Tue, 1 Feb 2022 19:28:09 +0100 Subject: [PATCH 33/84] up --- .../runtime/iogen/GIO/GIOIdentification.java | 35 ++++++ .../runtime/iogen/{exp => GIO}/Util.java | 46 +++----- .../iogen/exp/GIOFlatFrameExperimentHDFS.java | 1 + 
.../iogen/exp/GIOFrameExperimentHDFS.java | 103 +++++++++--------- .../iogen/exp/SYSDSFrameExperimentHDFS.java | 2 +- .../Identify/MatrixGRRowColIdentifyTest.java | 3 +- 6 files changed, 103 insertions(+), 87 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java rename src/main/java/org/apache/sysds/runtime/iogen/{exp => GIO}/Util.java (66%) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java new file mode 100644 index 00000000000..cf8b8148959 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java @@ -0,0 +1,35 @@ +package org.apache.sysds.runtime.iogen.GIO; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +public class GIOIdentification { + + public static void main(String[] args) throws Exception { + String sampleRawFileName; + String sampleFrameFileName; + String sampleRawDelimiter; + String schemaFileName; + + sampleRawFileName = System.getProperty("sampleRawFileName"); + sampleFrameFileName = System.getProperty("sampleFrameFileName"); + sampleRawDelimiter = System.getProperty("delimiter"); + if(sampleRawDelimiter.equals("\\t")) + sampleRawDelimiter = "\t"; + schemaFileName = System.getProperty("schemaFileName"); + + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, sampleRawDelimiter); + + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + gr.getReader(); + } +} diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java similarity index 66% rename from src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java rename to src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java index f7236010f2a..ea3347cb149 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java @@ -1,4 +1,4 @@ -package org.apache.sysds.runtime.iogen.exp; +package org.apache.sysds.runtime.iogen.GIO; import org.apache.sysds.common.Types; @@ -16,36 +16,10 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; public class Util { - // Load Random 2D data from file - private double[][] load2DData(String fileName, int nrows, int ncols) throws Exception { - - Path path = Paths.get(fileName); - FileChannel inStreamRegularFile = FileChannel.open(path); - int bufferSize = ncols * 8; - - double[][] result = new double[nrows][ncols]; - try { - for(int r = 0; r < nrows; r++) { - inStreamRegularFile.position((long) r * ncols * 8); - ByteBuffer buffer = ByteBuffer.allocateDirect(bufferSize); - inStreamRegularFile.read(buffer); - buffer.flip(); - - for(int c = 0; c < ncols; c++) { - result[r][c] = buffer.getDouble(); - } - } - inStreamRegularFile.close(); - } - catch(IOException e) { - throw new Exception("Can't read matrix from ByteArray", e); - } - return result; - } - public String readEntireTextFile(String fileName) throws IOException { String text = new String(Files.readAllBytes(Paths.get(fileName)), StandardCharsets.UTF_8); return text; @@ -75,25 +49,31 @@ public Types.ValueType[] getSchema(String fileName) throws IOException { return result; } - public String[][] loadFrameData(String fileName, int nrows, int ncols, String delimiter) + public String[][] loadFrameData(String fileName, int ncols, String delimiter) throws IOException { - String[][] result = new 
String[nrows][ncols]; + ArrayList sampleRawLines = new ArrayList<>(); try(BufferedReader br = new BufferedReader(new FileReader(fileName))) { String line; - int row = 0; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); + String[] colsData = new String[ncols]; for(int i = 0; i < data.length; i++) { String[] value = data[i].split("::"); if(value.length ==2) { int col = Integer.parseInt(value[0]); - result[row][col] = value[1]; + colsData[col] = value[1]; } } - row++; + sampleRawLines.add(colsData); } } + + int nrows = sampleRawLines.size(); + String[][] result = new String[nrows][ncols]; + for(int i=0; i< nrows; i++) + result[i] = sampleRawLines.get(i); + return result; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java index e7dfebc1c00..72f2ac9f82e 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java @@ -4,6 +4,7 @@ import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.io.FrameReaderTextCSV; +import org.apache.sysds.runtime.iogen.GIO.Util; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java index 06fbedaf348..b23017c99e6 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java @@ -2,6 +2,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.GIO.Util; import 
org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; @@ -11,56 +12,56 @@ public class GIOFrameExperimentHDFS extends GIOMain { public static void main(String[] args) throws Exception { - getArgs(); - - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - ArrayList newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); - - for (int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for (int r = 0; r < sampleFrameStrings.length; r++) - valueSet.add(sampleFrameStrings[r][c]); - if (valueSet.size() > 0) { - ArrayList tempList = new ArrayList<>(); - for (int r = 0; r < sampleFrameStrings.length; r++) { - tempList.add(sampleFrameStrings[r][c]); - } - newSampleFrame.add(tempList); - newSampleSchema.add(sampleSchema[c]); - } - } - - sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - - for (int row = 0; row < sampleFrameStrings.length; row++) { - for (int col = 0; col < sampleFrameStrings[0].length; col++) { - sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); - } - } - - sampleSchema = new Types.ValueType[newSampleSchema.size()]; - for (int i = 0; i < newSampleSchema.size(); i++) - sampleSchema[i] = newSampleSchema.get(i); - - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - - double tmpTime = System.nanoTime(); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr = gr.getReader(); - double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; - - tmpTime = System.nanoTime(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, 
sampleSchema.length); - double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; - - String log = datasetName + "," + frameBlock.getNumRows() + "," + frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; - util.addLog(LOG_HOME, log); +// getArgs(); +// +// Util util = new Util(); +// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); +// int ncols = sampleSchema.length; +// +// ArrayList newSampleSchema = new ArrayList<>(); +// ArrayList> newSampleFrame = new ArrayList<>(); +// +// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); +// +// for (int c = 0; c < sampleFrameStrings[0].length; c++) { +// HashSet valueSet = new HashSet<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) +// valueSet.add(sampleFrameStrings[r][c]); +// if (valueSet.size() > 0) { +// ArrayList tempList = new ArrayList<>(); +// for (int r = 0; r < sampleFrameStrings.length; r++) { +// tempList.add(sampleFrameStrings[r][c]); +// } +// newSampleFrame.add(tempList); +// newSampleSchema.add(sampleSchema[c]); +// } +// } +// +// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; +// +// for (int row = 0; row < sampleFrameStrings.length; row++) { +// for (int col = 0; col < sampleFrameStrings[0].length; col++) { +// sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); +// } +// } +// +// sampleSchema = new Types.ValueType[newSampleSchema.size()]; +// for (int i = 0; i < newSampleSchema.size(); i++) +// sampleSchema[i] = newSampleSchema.get(i); +// +// FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); +// +// double tmpTime = System.nanoTime(); +// String sampleRaw = util.readEntireTextFile(sampleRawFileName); +// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); +// FrameReader fr = gr.getReader(); +// double generateTime = 
(System.nanoTime() - tmpTime) / 1000000000.0; +// +// tmpTime = System.nanoTime(); +// FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); +// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; +// +// String log = datasetName + "," + frameBlock.getNumRows() + "," + frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; +// util.addLog(LOG_HOME, log); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java index b673e41d9d6..1f5c3249566 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java @@ -3,7 +3,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; import org.apache.sysds.runtime.io.FrameReaderTextCSV; -import org.apache.sysds.runtime.io.FrameReaderTextCell; +import org.apache.sysds.runtime.iogen.GIO.Util; import org.apache.sysds.runtime.matrix.data.FrameBlock; public class SYSDSFrameExperimentHDFS extends GIOMain { diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 9601f0dde9c..4315b3c47b3 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -22,12 +22,11 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.iogen.exp.Util; +import org.apache.sysds.runtime.iogen.GIO.Util; import org.apache.sysds.runtime.matrix.data.FrameBlock; 
import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; import org.junit.Test; -import java.io.IOException; import java.util.ArrayList; import java.util.HashSet; import java.util.Random; From bb4394cc129648b6bbcb9bca83a6593f5cb658c2 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 6 Feb 2022 22:34:56 +0100 Subject: [PATCH 34/84] Add Baselines --- .../runtime/io/FrameReaderJSONJackson.java | 139 +++++++ .../runtime/iogen/Baseline/SystemDSJSON.java | 45 +++ .../runtime/iogen/GIO/GIOIdentification.java | 36 +- .../sysds/runtime/iogen/GIO/GIORead.java | 74 ++++ .../apache/sysds/runtime/iogen/GIO/Util.java | 18 +- .../runtime/iogen/codegen/CodeGenTrie.java | 4 +- .../Identify/MatrixGRRowColIdentifyTest.java | 354 +++++++++--------- 7 files changed, 494 insertions(+), 176 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/GIO/GIORead.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java new file mode 100644 index 00000000000..3d3fa937f6f --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java @@ -0,0 +1,139 @@ +package org.apache.sysds.runtime.io; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.databind.node.ValueNode; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; 
+import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.UtilFunctions; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import static org.apache.sysds.runtime.io.FrameReader.*; + + +public class FrameReaderJSONJackson +{ + public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Map schemaMap, + long rlen, long clen) throws IOException, DMLRuntimeException + { + //prepare file access + JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fileSystem = IOUtilFunctions.getFileSystem(path, jobConf); + FileInputFormat.addInputPath(jobConf, path); + + //check existence and non-empty file + checkValidInputFile(fileSystem, path); + + Types.ValueType[] lschema = createOutputSchema(schema, clen); + String[] lnames = createOutputNamesFromSchemaMap(schemaMap); + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + + readJSONLFrameFromHDFS(path, jobConf, fileSystem, ret, schema, schemaMap); + return ret; + } + + + public void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, + Types.ValueType[] schema, Map schemaMap) throws IOException + { + TextInputFormat inputFormat = new TextInputFormat(); + inputFormat.configure(jobConf); + InputSplit[] splits = inputFormat.getSplits(jobConf, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + + for (int i = 0, rowPos = 0; i < splits.length; i++) { + rowPos = readJSONLFrameFromInputSplit(splits[i], inputFormat, jobConf, schema, 
schemaMap, dest, rowPos); + } + } + + + private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, + JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) + throws IOException + { + RecordReader reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + int row = currentRow; + try { + while (reader.next(key, value)) { + ObjectMapper mapper = new ObjectMapper(); + JsonNode root = mapper.readTree(value.toString()); + Map map = new HashMap<>(); + addKeys("", root, map, new ArrayList<>()); + + for (Map.Entry entry : schemaMap.entrySet()) { + String strCellValue = map.get(entry.getKey()); + if(strCellValue!=null){ + dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); + } + } + row++; + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + return row; + } + private static void addKeys(String currentPath, JsonNode jsonNode, Map map, List suffix) { + if (jsonNode.isObject()) { + ObjectNode objectNode = (ObjectNode) jsonNode; + Iterator> iter = objectNode.fields(); + String pathPrefix = currentPath.isEmpty() ? 
"" : currentPath + "/"; + + while (iter.hasNext()) { + Map.Entry entry = iter.next(); + addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, suffix); + } + } else if (jsonNode.isArray()) { + ArrayNode arrayNode = (ArrayNode) jsonNode; + for (int i = 0; i < arrayNode.size(); i++) { + suffix.add(i + 1); + addKeys(currentPath, arrayNode.get(i), map, suffix); + if (i + 1 (); + } + ValueNode valueNode = (ValueNode) jsonNode; + map.put("/"+currentPath, valueNode.asText()); + } + } + + private String[] createOutputNamesFromSchemaMap(Map schemaMap) { + String[] names = new String[schemaMap.size()]; + schemaMap.forEach((key, value) -> names[value] = key); + return names; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java b/src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java new file mode 100644 index 00000000000..3c4041202c4 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java @@ -0,0 +1,45 @@ +package org.apache.sysds.runtime.iogen.Baseline; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReaderJSONJackson; +import org.apache.sysds.runtime.io.FrameReaderJSONL; +import org.apache.sysds.runtime.iogen.GIO.Util; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.wink.json4j.JSONException; + +import java.io.IOException; +import java.util.Map; + +public class SystemDSJSON { + + public static void main(String[] args) throws IOException, JSONException { + + String schemaFileName; + String schemaMapFileName; + String dataFileName; + long nrows; + String config; + + schemaFileName = System.getProperty("schemaFileName"); + schemaMapFileName = System.getProperty("schemaMapFileName"); + dataFileName = System.getProperty("dataFileName"); + nrows = Long.parseLong(System.getProperty("nrows")); + config = System.getProperty("config"); + + Util util = new Util(); + Types.ValueType[] schema = util.getSchema(schemaFileName); + 
int ncols = schema.length; + Map schemaMap = util.getSchemaMap(schemaMapFileName); + + FrameBlock readBlock; + if(config.equals("SystemDS+Jason4j")) { + FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } + else if(config.equals("SystemDS+Jackson")) { + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } + + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java index cf8b8148959..5fce7b0d97d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIOIdentification.java @@ -4,6 +4,9 @@ import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; +import java.util.ArrayList; +import java.util.HashSet; + public class GIOIdentification { public static void main(String[] args) throws Exception { @@ -17,15 +20,44 @@ public static void main(String[] args) throws Exception { sampleRawDelimiter = System.getProperty("delimiter"); if(sampleRawDelimiter.equals("\\t")) sampleRawDelimiter = "\t"; - schemaFileName = System.getProperty("schemaFileName"); - + schemaFileName = System.getProperty("schemaFileName"); Util util = new Util(); Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); int ncols = sampleSchema.length; String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, sampleRawDelimiter); + ArrayList newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); + + for(int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) + 
valueSet.add(sampleFrameStrings[r][c]); + if(valueSet.size() > 1) { + ArrayList tempList = new ArrayList<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) { + tempList.add(sampleFrameStrings[r][c]); + } + newSampleFrame.add(tempList); + newSampleSchema.add(sampleSchema[c]); + } + } + + sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; + + for(int row = 0; row < sampleFrameStrings.length; row++) { + for(int col = 0; col < sampleFrameStrings[0].length; col++) { + sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); + } + } + + sampleSchema = new Types.ValueType[newSampleSchema.size()]; + for(int i = 0; i < newSampleSchema.size(); i++) + sampleSchema[i] = newSampleSchema.get(i); + + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIORead.java b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIORead.java new file mode 100644 index 00000000000..1f338adf103 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/GIO/GIORead.java @@ -0,0 +1,74 @@ +package org.apache.sysds.runtime.iogen.GIO; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; + +import java.util.ArrayList; +import java.util.HashSet; + +public class GIORead { + + public static void main(String[] args) throws Exception { + String sampleRawFileName; + String sampleFrameFileName; + String sampleRawDelimiter; + String schemaFileName; + String dataFileName; + long nrows; + + sampleRawFileName = System.getProperty("sampleRawFileName"); + sampleFrameFileName = System.getProperty("sampleFrameFileName"); + sampleRawDelimiter = System.getProperty("delimiter"); + if(sampleRawDelimiter.equals("\\t")) + sampleRawDelimiter = "\t"; + schemaFileName = 
System.getProperty("schemaFileName"); + dataFileName = System.getProperty("dataFileName"); + nrows = Long.parseLong(System.getProperty("nrows")); + + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, sampleRawDelimiter); + + ArrayList newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); + + for(int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) + valueSet.add(sampleFrameStrings[r][c]); + if(valueSet.size() > 1) { + ArrayList tempList = new ArrayList<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) { + tempList.add(sampleFrameStrings[r][c]); + } + newSampleFrame.add(tempList); + newSampleSchema.add(sampleSchema[c]); + } + } + + sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; + + for(int row = 0; row < sampleFrameStrings.length; row++) { + for(int col = 0; col < sampleFrameStrings[0].length; col++) { + sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); + } + } + + sampleSchema = new Types.ValueType[newSampleSchema.size()]; + for(int i = 0; i < newSampleSchema.size(); i++) + sampleSchema[i] = newSampleSchema.get(i); + + + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + FrameReader fr = gr.getReader(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, nrows, sampleSchema.length); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java index ea3347cb149..1e35240fd78 100755 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java @@ -17,11 +17,13 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; public class Util { public String readEntireTextFile(String fileName) throws IOException { - String text = new String(Files.readAllBytes(Paths.get(fileName)), StandardCharsets.UTF_8); + String text = Files.readString(Paths.get(fileName)); return text; } @@ -49,11 +51,23 @@ public Types.ValueType[] getSchema(String fileName) throws IOException { return result; } + public Map getSchemaMap(String fileName) throws IOException { + Map schemaMap = new HashMap<>(); + try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { + String line; + while((line = br.readLine()) != null) { + String[] colSchema = line.split(","); + schemaMap.put(colSchema[0], Integer.parseInt(colSchema[1])); + } + } + return schemaMap; + } + public String[][] loadFrameData(String fileName, int ncols, String delimiter) throws IOException { ArrayList sampleRawLines = new ArrayList<>(); - try(BufferedReader br = new BufferedReader(new FileReader(fileName))) { + try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { String line; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 543ba4e8d09..e3c4cbf166c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -48,9 +48,9 @@ private void buildPrefixTree() { for(int c = 0; c < properties.getColKeyPattern().length; c++) { KeyTrie keyTrie = properties.getColKeyPattern()[c]; Types.ValueType vt = properties.getSchema() == 
null ? Types.ValueType.FP64 : properties.getSchema()[c]; - Gson gson = new Gson(); - System.out.println(gson.toJson(keyTrie.getPrefixKeyPatterns())); + //Gson gson = new Gson(); if(keyTrie != null) { + //System.out.println(c+": "+gson.toJson(keyTrie.getPrefixKeyPatterns())); for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 4315b3c47b3..7daeba29c5f 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -21,22 +21,26 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.FrameReaderJSONL; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.iogen.GIO.Util; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; import org.junit.Test; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Random; public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; - @Override - protected String getTestName() { + @Override protected String getTestName() { return TEST_NAME; } @@ -66,203 +70,198 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou sampleRaw = sb.toString(); } - @Test - public void test1() { + @Test public void test1() { sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; 
runGenerateReaderTest(); } - @Test - public void test2() { + @Test public void test2() { sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; runGenerateReaderTest(); } - @Test - public void test3() { + + @Test public void test3() { sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; runGenerateReaderTest(); } - @Test - public void test4() { + @Test public void test4() { String[] naString = {"NaN"}; generateRandomCSV(20, 20, -10, 10, 1, ",", naString); runGenerateReaderTest(); } - @Test - public void test5() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"name\":3,\"password\":4}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + @Test public void test5() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"name\":3,\"password\":4}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; runGenerateReaderTest(); } - @Test - public void test6() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, 
\"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + @Test public void test6() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; runGenerateReaderTest(); } - @Test - public void test7() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + - "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + - "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + - "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + - "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12},{15,16},{19,20}}; + @Test public void test7() { + sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; + sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; runGenerateReaderTest(); } - @Test - public void test8() { - sampleRaw = "1,1,10\n" + - "1,2,20\n" + - "1,3,30\n" + - "2,1,40\n" + - "2,2,50\n" + - "2,3,60\n" + - "3,1,70\n" + - "3,2,80\n"+ - "3,3,90\n"; - - sampleMatrix 
= new double[][] {{10,20,30}, {40,50,60}, {70,80,90}}; + @Test public void test8() { + sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "2,1,40\n" + "2,2,50\n" + "2,3,60\n" + "3,1,70\n" + "3,2,80\n" + "3,3,90\n"; + + sampleMatrix = new double[][] {{10, 20, 30}, {40, 50, 60}, {70, 80, 90}}; runGenerateReaderTest(); } - @Test - public void test9() { + @Test public void test9() { sampleRaw = "
\n" + // 0 - "1\n" + //1 - "2\n" + // 2 - "3\n" + // 3 - "1980\n" + // 4 - "GIO\n" + // 5 - "
\n" + // 6 - "
\n" + // 7 - "10\n" + // 8 - "20\n" + // 9 - "30\n" + // 10 - "2000\n" + // 11 - "GIO2\n" + // 12 - "
\n" + // 13 - "
\n" + // 14 - "2010\n" + // 15 - "100\n" + // 16 - "200\n" + // 17 - "300\n" + // 18 - "800\n" + // 18 - "GIO3\n" + // 19 - "
\n" + // 20 - "
\n" + // 21 - "1000\n" + // 22 - "2000\n" + // 23 - "3000\n" + // 24 - "2222\n" + // 25 - "GIO4\n" + // 26 - "
"; // 27 - - sampleMatrix = new double[][] {{1,2,3,1980}, {10,20,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + "1\n" + //1 + "2\n" + // 2 + "3\n" + // 3 + "1980\n" + // 4 + "GIO\n" + // 5 + "\n" + // 6 + "
\n" + // 7 + "10\n" + // 8 + "20\n" + // 9 + "30\n" + // 10 + "2000\n" + // 11 + "GIO2\n" + // 12 + "
\n" + // 13 + "
\n" + // 14 + "2010\n" + // 15 + "100\n" + // 16 + "200\n" + // 17 + "300\n" + // 18 + "800\n" + // 18 + "GIO3\n" + // 19 + "
\n" + // 20 + "
\n" + // 21 + "1000\n" + // 22 + "2000\n" + // 23 + "3000\n" + // 24 + "2222\n" + // 25 + "GIO4\n" + // 26 + "
"; // 27 + + sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 20, 30, 2000}, {100, 200, 300, 2010}, + {1000, 2000, 3000, 2222}}; runGenerateReaderTest(); } - @Test - public void test10() { - sampleRaw = "
\n" + - "1980 \n" + - "1 \n" + - "2 \n" + - "3 \n" + - "GIO \n" + - "
\n" + - " \n" + - "10 \n" + - "21 \n" + - "30 \n" + - "2000 \n" + - "GIO2 \n" + - "\n" + - " \n" + - "100 \n" + - "300 \n" + - "210 \n" + - "GIO3 \n" + - "200 \n" + - "\n" + - "
\n" + - "2222 \n" + - "1000 \n" + - "2000 \n" + - "3000 \n" + - "GIO4 \n" + - "
"; - - sampleMatrix = new double[][] {{1,2,3,1980}, {10,21,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + @Test public void test10() { + sampleRaw = "
\n" + "1980 \n" + "1 \n" + "2 \n" + "3 \n" + "GIO \n" + "
\n" + " \n" + "10 \n" + "21 \n" + "30 \n" + "2000 \n" + "GIO2 \n" + "\n" + " \n" + "100 \n" + "300 \n" + "210 \n" + "GIO3 \n" + "200 \n" + "\n" + "
\n" + "2222 \n" + "1000 \n" + "2000 \n" + "3000 \n" + "GIO4 \n" + "
"; + + sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 21, 30, 2000}, {100, 200, 300, 2010}, + {1000, 2000, 3000, 2222}}; runGenerateReaderTest(); } - @Test - public void test11() { - sampleRaw = "#index 1\n" + - "#t 2,3\n" + - "#s 1980\n"+ - "#index 10\n\n" + - "#t 21,30\n" + - "#s 2000\n\n"+ - "#index 100\n" + - "#t 200,300\n" + - "#s 2222"; - - sampleMatrix = new double[][] {{1,2,3,1980}, {10,21,30,2000}, {100,200,300,2010},{1000,2000,3000,2222}}; + @Test public void test11() { + sampleRaw = "#index 1\n" + "#t 2,3\n" + "#s 1980\n" + "#index 10\n\n" + "#t 21,30\n" + "#s 2000\n\n" + "#index 100\n" + "#t 200,300\n" + "#s 2222"; + + sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 21, 30, 2000}, {100, 200, 300, 2010}, + {1000, 2000, 3000, 2222}}; runGenerateReaderTest(); - } + } - @Test - public void test12() { -// sampleRaw = "#index 1\n" + -// "#t 2,3\n" + -// "#s 1980\n"+ -// "#index 10\n\n" + -// "#t 21,30\n" + -// "#s 2000\n\n"+ -// "#index 100\n" + -// "#t 200,300\n" + -// "#s 2222"; -// -// sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; -// runGenerateReaderTest(); - - StringBuilder sb = new StringBuilder(" ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); + @Test public void test12() { + // sampleRaw = "#index 1\n" + + // "#t 2,3\n" + + // "#s 1980\n"+ + // "#index 10\n\n" + + // "#t 21,30\n" + + // "#s 2000\n\n"+ + // "#index 100\n" + + // "#t 200,300\n" + + // "#s 2222"; + // + // sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; + // runGenerateReaderTest(); + + StringBuilder sb = new StringBuilder( + " ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); System.out.println(sb.reverse()); } - @Test - public void test13() throws Exception { - String sampleRawFileName = 
"/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; - Integer sampleNRows = 1000; + // @Test + // public void test13() throws Exception { + // String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; + // String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; + // Integer sampleNRows = 1000; + // String delimiter = "\\t"; + // String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/aminer_author_5.schema"; + // String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/aminer_author.data"; + // + // Float percent = 7f;//Float.parseFloat(args[6]); + // String datasetName = "aminer_paper";//args[7]; + // String LOG_HOME ="/home/saeed/Documents/ExpLog";//args[8]; + // + // if(delimiter.equals("\\t")) + // delimiter = "\t"; + // + // Util util = new Util(); + // Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + // int ncols = sampleSchema.length; + // + // ArrayList newSampleSchema = new ArrayList<>(); + // ArrayList> newSampleFrame = new ArrayList<>(); + // + // String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + // + // for(int c = 0; c < sampleFrameStrings[0].length; c++) { + // HashSet valueSet = new HashSet<>(); + // for(int r=0; r0){ + // ArrayList tempList = new ArrayList<>(); + // for(int r=0; r newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new ArrayList<>(); - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); for(int c = 0; c < sampleFrameStrings[0].length; c++) { HashSet valueSet = new HashSet<>(); - for(int 
r=0; r0){ + if(valueSet.size() > 1) { ArrayList tempList = new ArrayList<>(); - for(int r=0; r schemaMap = new HashMap<>(); + schemaMap.put("/returnFlag",0); + schemaMap.put("/lineStatus",1); + schemaMap.put("/quantity",2); + schemaMap.put("/extendedPrice",3); + schemaMap.put("/discount",4); + schemaMap.put("/tax",5); + // Read FrameBlock + FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); - String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+","+percent+","+ sampleNRows+","+ generateTime+","+readTime; - util.addLog(LOG_HOME, log); + int a = 100; } } From 5c7190fe938ff6b626836f37e6cd829ea50755db Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 11 Feb 2022 03:27:50 +0100 Subject: [PATCH 35/84] Add Baselines --- .../sysds/runtime/io/FrameReaderJSONGson.java | 140 ++++ .../runtime/io/FrameReaderJSONJackson.java | 7 +- .../iogen/{GIO/GIORead.java => EXP/GIO.java} | 13 +- .../iogen/{GIO => EXP}/GIOIdentification.java | 2 +- .../sysds/runtime/iogen/EXP/SystemDSGson.java | 34 + .../SystemDSJackson.java} | 19 +- .../runtime/iogen/EXP/SystemDSJson4j.java | 33 + .../runtime/iogen/{GIO => EXP}/Util.java | 2 +- .../runtime/iogen/FormatIdentifying.java | 2 +- .../sysds/runtime/iogen/Hirschberg.java | 626 ++++++++++-------- .../runtime/iogen/codegen/CodeGenTrie.java | 6 +- .../iogen/exp/GIOFlatFrameExperimentHDFS.java | 36 - .../iogen/exp/GIOFrameExperimentHDFS.java | 67 -- .../iogen/exp/GIOGenerateRapidJSONCode.java | 66 -- .../sysds/runtime/iogen/exp/GIOMain.java | 29 - .../iogen/exp/GIONestedExperimentStream.java | 71 -- .../iogen/exp/SYSDSFrameExperimentHDFS.java | 39 -- .../sysds/runtime/iogen/exp/resultPath.sh | 7 - .../sysds/runtime/iogen/exp/runGIOExp.sh | 80 --- 19 files changed, 566 insertions(+), 713 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java rename src/main/java/org/apache/sysds/runtime/iogen/{GIO/GIORead.java => 
EXP/GIO.java} (88%) rename src/main/java/org/apache/sysds/runtime/iogen/{GIO => EXP}/GIOIdentification.java (98%) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java rename src/main/java/org/apache/sysds/runtime/iogen/{Baseline/SystemDSJSON.java => EXP/SystemDSJackson.java} (54%) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java rename src/main/java/org/apache/sysds/runtime/iogen/{GIO => EXP}/Util.java (98%) delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/exp/GIOMain.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentStream.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java new file mode 100644 index 00000000000..f569b742247 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java @@ -0,0 +1,140 @@ +package org.apache.sysds.runtime.io; + +import com.google.gson.JsonArray; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParser; +import com.google.gson.JsonPrimitive; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputFormat; 
+import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.UtilFunctions; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static org.apache.sysds.runtime.io.FrameReader.checkValidInputFile; +import static org.apache.sysds.runtime.io.FrameReader.createOutputFrameBlock; +import static org.apache.sysds.runtime.io.FrameReader.createOutputSchema; + +public class FrameReaderJSONGson +{ + public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Map schemaMap, + long rlen, long clen) throws IOException, DMLRuntimeException + { + //prepare file access + JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fileSystem = IOUtilFunctions.getFileSystem(path, jobConf); + FileInputFormat.addInputPath(jobConf, path); + + //check existence and non-empty file + checkValidInputFile(fileSystem, path); + + Types.ValueType[] lschema = createOutputSchema(schema, clen); + String[] lnames = createOutputNamesFromSchemaMap(schemaMap); + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + + readJSONLFrameFromHDFS(path, jobConf, fileSystem, ret, schema, schemaMap); + return ret; + } + + + public void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, + Types.ValueType[] schema, Map schemaMap) throws IOException + { + TextInputFormat inputFormat = new TextInputFormat(); + inputFormat.configure(jobConf); + InputSplit[] splits = 
inputFormat.getSplits(jobConf, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + + for (int i = 0, rowPos = 0; i < splits.length; i++) { + rowPos = readJSONLFrameFromInputSplit(splits[i], inputFormat, jobConf, schema, schemaMap, dest, rowPos); + } + } + + + private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, + JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) + throws IOException + { + RecordReader reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + int row = currentRow; + try { + while (reader.next(key, value)) { + JsonParser jsonParser = new JsonParser(); + JsonElement root= jsonParser.parse(value.toString()); + Map map = new HashMap<>(); + addKeys("", root, map, new ArrayList<>()); + for (Map.Entry entry : schemaMap.entrySet()) { + String strCellValue = map.get(entry.getKey()); + if(strCellValue!=null){ + dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); + } + } + row++; + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + return row; + } + + private static void addKeys(String currentPath, JsonElement jsonNode, Map map, List suffix) { + + if (jsonNode.isJsonObject()) { + JsonObject jsonObject = (JsonObject) jsonNode; + Set> iter = jsonObject.entrySet(); + String pathPrefix = currentPath.isEmpty() ? 
"" : currentPath + "/"; + for(Map.Entry entry: iter){ + addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, suffix); + } + } else if (jsonNode.isJsonArray()) { + JsonArray arrayNode = (JsonArray) jsonNode; + for (int i = 0; i < arrayNode.size(); i++) { + suffix.add(i + 1); + addKeys(currentPath+"-"+i, arrayNode.get(i), map, suffix); + if (i + 1 (); + } + JsonPrimitive valueNode = (JsonPrimitive) jsonNode; + map.put(currentPath, valueNode.getAsString()); + } + } + + + private String[] createOutputNamesFromSchemaMap(Map schemaMap) { + String[] names = new String[schemaMap.size()]; + schemaMap.forEach((key, value) -> names[value] = key); + return names; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java index 3d3fa937f6f..ab61ee4554b 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java @@ -84,7 +84,6 @@ private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat map = new HashMap<>(); addKeys("", root, map, new ArrayList<>()); - for (Map.Entry entry : schemaMap.entrySet()) { String strCellValue = map.get(entry.getKey()); if(strCellValue!=null){ @@ -113,14 +112,14 @@ private static void addKeys(String currentPath, JsonNode jsonNode, Map schemaMap = util.getSchemaMap(schemaMapFileName); + + FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); + FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java similarity index 54% rename from src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java rename to src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java index 
3c4041202c4..6850264315c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/Baseline/SystemDSJSON.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java @@ -1,16 +1,14 @@ -package org.apache.sysds.runtime.iogen.Baseline; +package org.apache.sysds.runtime.iogen.EXP; import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReaderJSONJackson; -import org.apache.sysds.runtime.io.FrameReaderJSONL; -import org.apache.sysds.runtime.iogen.GIO.Util; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.wink.json4j.JSONException; import java.io.IOException; import java.util.Map; -public class SystemDSJSON { +public class SystemDSJackson { public static void main(String[] args) throws IOException, JSONException { @@ -18,28 +16,19 @@ public static void main(String[] args) throws IOException, JSONException { String schemaMapFileName; String dataFileName; long nrows; - String config; schemaFileName = System.getProperty("schemaFileName"); schemaMapFileName = System.getProperty("schemaMapFileName"); dataFileName = System.getProperty("dataFileName"); nrows = Long.parseLong(System.getProperty("nrows")); - config = System.getProperty("config"); Util util = new Util(); Types.ValueType[] schema = util.getSchema(schemaFileName); int ncols = schema.length; Map schemaMap = util.getSchemaMap(schemaMapFileName); - FrameBlock readBlock; - if(config.equals("SystemDS+Jason4j")) { - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - else if(config.equals("SystemDS+Jackson")) { - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + FrameBlock readBlock = 
frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java new file mode 100644 index 00000000000..e19ac5eedff --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java @@ -0,0 +1,33 @@ +package org.apache.sysds.runtime.iogen.EXP; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReaderJSONL; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.wink.json4j.JSONException; + +import java.io.IOException; +import java.util.Map; + +public class SystemDSJson4j { + + public static void main(String[] args) throws IOException, JSONException { + + String schemaFileName; + String schemaMapFileName; + String dataFileName; + long nrows; + + schemaFileName = System.getProperty("schemaFileName"); + schemaMapFileName = System.getProperty("schemaMapFileName"); + dataFileName = System.getProperty("dataFileName"); + nrows = Long.parseLong(System.getProperty("nrows")); + + Util util = new Util(); + Types.ValueType[] schema = util.getSchema(schemaFileName); + int ncols = schema.length; + Map schemaMap = util.getSchemaMap(schemaMapFileName); + + FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java similarity index 98% rename from src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java rename to src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index 1e35240fd78..adb35dd0471 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/GIO/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -1,4 +1,4 @@ -package 
org.apache.sysds.runtime.iogen.GIO; +package org.apache.sysds.runtime.iogen.EXP; import org.apache.sysds.common.Types; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index a0576cd26b7..6378a0965a3 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -325,7 +325,7 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { count++; } float percent = (float) count / list.size(); - if(percent >= 0.60) + if(percent >= 1) token = t; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java index ad58e57c280..dabd478c813 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java @@ -26,302 +26,344 @@ public class Hirschberg { - public Pair, String> getLCS(String x, String y, int pxy, int pgap) { - int i, j; // initialising variables - int m = x.length(); // length of gene1 - int n = y.length(); // length of gene2 - - // table for storing optimal substructure answers - int dp[][] = new int[n + m + 1][n + m + 1]; - - for (int[] x1 : dp) - Arrays.fill(x1, 0); - - // initialising the table - for (i = 0; i <= (n + m); i++) { - dp[i][0] = i * pgap; - dp[0][i] = i * pgap; - } - - // calculating the minimum penalty - for (i = 1; i <= m; i++) { - for (j = 1; j <= n; j++) { - if (x.charAt(i - 1) == y.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; - } else { - dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); - } - } - } - - // Reconstructing the solution - int l = n + m; // maximum possible length - i = m; - j = n; - int xpos = l; - int ypos = l; - - // Final answers for the respective strings - int xans[] = new int[l + 1]; - int yans[] = new int[l + 1]; - - while (!(i == 
0 || j == 0)) { - if (x.charAt(i - 1) == y.charAt(j - 1)) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } else if (dp[i - 1][j - 1] + pxy == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } else if (dp[i - 1][j] + pgap == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) '_'; - i--; - } else if (dp[i][j - 1] + pgap == dp[i][j]) { - xans[xpos--] = (int) '_'; - yans[ypos--] = (int) y.charAt(j - 1); - j--; - } - } - while (xpos > 0) { - if (i > 0) - xans[xpos--] = (int) x.charAt(--i); - else - xans[xpos--] = (int) '_'; - } - while (ypos > 0) { - if (j > 0) - yans[ypos--] = (int) y.charAt(--j); - else - yans[ypos--] = (int) '_'; - } - // Since we have assumed the answer to be n+m long, we need to remove the extra - // gaps in the starting id represents the index from which the arrays xans, yans are useful - int id = 1; - for (i = l; i >= 1; i--) { - if ((char) yans[i] == '_' && (char) xans[i] == '_') { - id = i + 1; - break; - } - } - - StringBuilder sb = new StringBuilder(); - ArrayList pattern = new ArrayList<>(); - for (i = id; i <= l; i++) { - if (xans[i] == yans[i]) - sb.append((char) xans[i]); - else { - if (sb.length() > 0) - pattern.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if (sb.length() > 0) - pattern.add(sb.toString()); - - // System.out.println(""); - // for(i = id; i <= l; i++) - // System.out.print((char) yans[i]); - // - sb = new StringBuilder(); - for (int bi = id; bi <= l; bi++) { - if (xans[bi] == yans[bi]) { - sb.append((char) xans[bi]); - System.out.print((char) xans[bi]); - } - //else - //System.out.print("*"); - } - System.out.println(); - if (sb.length() > 0) { -// StringBuilder stringBuilder = new StringBuilder(); -// for (String s: pattern){ -// stringBuilder.append(s).append("_"); -// } -// if (stringBuilder.length()>0) -// stringBuilder.deleteCharAt(stringBuilder.length()-1); - return new 
Pair<>(pattern, sb.toString()); - } - else - return null; - } -public Pair, String> getLCS(String x, String y) { - int i, j; // initialising variables - int m = x.length(); // length of gene1 - int n = y.length(); // length of gene2 - - // table for storing optimal substructure answers - int dp[][] = new int[n + m + 1][n + m + 1]; - - for (int[] x1 : dp) - Arrays.fill(x1, 0); - - // initialising the table - for (i = 0; i <= (n + m); i++) { - dp[i][0] = i; - dp[0][i] = i; - } - - // calculating the minimum penalty - for (i = 1; i <= m; i++) { - for (j = 1; j <= n; j++) { - if (x.charAt(i - 1) == y.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; - } else { - dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1], dp[i - 1][j]), dp[i][j - 1]); - } - } - } - - // Reconstructing the solution - int l = n + m; // maximum possible length - i = m; - j = n; - int xpos = l; - int ypos = l; - - // Final answers for the respective strings - int xans[] = new int[l + 1]; - int yans[] = new int[l + 1]; - - while (!(i == 0 || j == 0)) { - if (x.charAt(i - 1) == y.charAt(j - 1)) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } else if (dp[i - 1][j - 1] == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } else if (dp[i - 1][j] == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) '_'; - i--; - } else if (dp[i][j - 1] == dp[i][j]) { - xans[xpos--] = (int) '_'; - yans[ypos--] = (int) y.charAt(j - 1); - j--; - } - } - while (xpos > 0) { - if (i > 0) - xans[xpos--] = (int) x.charAt(--i); - else - xans[xpos--] = (int) '_'; - } - while (ypos > 0) { - if (j > 0) - yans[ypos--] = (int) y.charAt(--j); - else - yans[ypos--] = (int) '_'; - } - // Since we have assumed the answer to be n+m long, we need to remove the extra - // gaps in the starting id represents the index from which the arrays xans, yans are useful - int id = 1; - for (i = l; i >= 1; i--) { - if ((char) 
yans[i] == '_' && (char) xans[i] == '_') { - id = i + 1; - break; - } - } - - StringBuilder sb = new StringBuilder(); - ArrayList pattern = new ArrayList<>(); - for (i = id; i <= l; i++) { - if (xans[i] == yans[i]) - sb.append((char) xans[i]); - else { - if (sb.length() > 0) - pattern.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if (sb.length() > 0) - pattern.add(sb.toString()); - - // System.out.println(""); - // for(i = id; i <= l; i++) - // System.out.print((char) yans[i]); - // - sb = new StringBuilder(); - for (int bi = id; bi <= l; bi++) { - if (xans[bi] == yans[bi]) { - sb.append((char) xans[bi]); - System.out.print((char) xans[bi]); - } - //else - //System.out.print("*"); - } - System.out.println(); - if (sb.length() > 0) { -// StringBuilder stringBuilder = new StringBuilder(); -// for (String s: pattern){ -// stringBuilder.append(s).append("_"); -// } -// if (stringBuilder.length()>0) -// stringBuilder.deleteCharAt(stringBuilder.length()-1); - return new Pair<>(pattern, sb.toString()); - } - else - return null; + public Pair, String> getLCS(String x, String y, int pxy, int pgap) { + int i, j; // initialising variables + int m = x.length(); // length of gene1 + int n = y.length(); // length of gene2 + + // table for storing optimal substructure answers + int dp[][] = new int[n + m + 1][n + m + 1]; + + for(int[] x1 : dp) + Arrays.fill(x1, 0); + + // initialising the table + for(i = 0; i <= (n + m); i++) { + dp[i][0] = i * pgap; + dp[0][i] = i * pgap; + } + + // calculating the minimum penalty + for(i = 1; i <= m; i++) { + for(j = 1; j <= n; j++) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } + else { + dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); + } + } + } + + // Reconstructing the solution + int l = n + m; // maximum possible length + i = m; + j = n; + int xpos = l; + int ypos = l; + + // Final answers for the respective strings + int xans[] = new int[l + 
1]; + int yans[] = new int[l + 1]; + + while(!(i == 0 || j == 0)) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j - 1] + pxy == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j] + pgap == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) '_'; + i--; + } + else if(dp[i][j - 1] + pgap == dp[i][j]) { + xans[xpos--] = (int) '_'; + yans[ypos--] = (int) y.charAt(j - 1); + j--; + } + } + while(xpos > 0) { + if(i > 0) + xans[xpos--] = (int) x.charAt(--i); + else + xans[xpos--] = (int) '_'; + } + while(ypos > 0) { + if(j > 0) + yans[ypos--] = (int) y.charAt(--j); + else + yans[ypos--] = (int) '_'; + } + // Since we have assumed the answer to be n+m long, we need to remove the extra + // gaps in the starting id represents the index from which the arrays xans, yans are useful + int id = 1; + for(i = l; i >= 1; i--) { + if((char) yans[i] == '_' && (char) xans[i] == '_') { + id = i + 1; + break; + } + } + + StringBuilder sb = new StringBuilder(); + ArrayList pattern = new ArrayList<>(); + for(i = id; i <= l; i++) { + if(xans[i] == yans[i]) + sb.append((char) xans[i]); + else { + if(sb.length() > 0) + pattern.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if(sb.length() > 0) + pattern.add(sb.toString()); + + // System.out.println(""); + // for(i = id; i <= l; i++) + // System.out.print((char) yans[i]); + // + sb = new StringBuilder(); + for(int bi = id; bi <= l; bi++) { + if(xans[bi] == yans[bi]) { + sb.append((char) xans[bi]); + System.out.print((char) xans[bi]); + } + //else + //System.out.print("*"); + } + System.out.println(); + if(sb.length() > 0) { + // StringBuilder stringBuilder = new StringBuilder(); + // for (String s: pattern){ + // stringBuilder.append(s).append("_"); + // } + // if (stringBuilder.length()>0) + // 
stringBuilder.deleteCharAt(stringBuilder.length()-1); + return new Pair<>(pattern, sb.toString()); + } + else + return null; + } + + public Pair, String> getLCS(String x, String y) { + int i, j; // initialising variables + int m = x.length(); // length of gene1 + int n = y.length(); // length of gene2 + + // table for storing optimal substructure answers + int dp[][] = new int[n + m + 1][n + m + 1]; + + for(int[] x1 : dp) + Arrays.fill(x1, 0); + + // initialising the table + for(i = 0; i <= (n + m); i++) { + dp[i][0] = i; + dp[0][i] = i; + } + + // calculating the minimum penalty + for(i = 1; i <= m; i++) { + for(j = 1; j <= n; j++) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + dp[i][j] = dp[i - 1][j - 1]; + } + else { + dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1], dp[i - 1][j]), dp[i][j - 1]); + } + } + } + + // Reconstructing the solution + int l = n + m; // maximum possible length + i = m; + j = n; + int xpos = l; + int ypos = l; + + // Final answers for the respective strings + int xans[] = new int[l + 1]; + int yans[] = new int[l + 1]; + + while(!(i == 0 || j == 0)) { + if(x.charAt(i - 1) == y.charAt(j - 1)) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j - 1] == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) y.charAt(j - 1); + i--; + j--; + } + else if(dp[i - 1][j] == dp[i][j]) { + xans[xpos--] = (int) x.charAt(i - 1); + yans[ypos--] = (int) '_'; + i--; + } + else if(dp[i][j - 1] == dp[i][j]) { + xans[xpos--] = (int) '_'; + yans[ypos--] = (int) y.charAt(j - 1); + j--; + } + } + while(xpos > 0) { + if(i > 0) + xans[xpos--] = (int) x.charAt(--i); + else + xans[xpos--] = (int) '_'; + } + while(ypos > 0) { + if(j > 0) + yans[ypos--] = (int) y.charAt(--j); + else + yans[ypos--] = (int) '_'; + } + // Since we have assumed the answer to be n+m long, we need to remove the extra + // gaps in the starting id represents the index from which the arrays xans, yans are 
useful + int id = 1; + for(i = l; i >= 1; i--) { + if((char) yans[i] == '_' && (char) xans[i] == '_') { + id = i + 1; + break; + } + } + + StringBuilder sb = new StringBuilder(); + ArrayList pattern = new ArrayList<>(); + for(i = id; i <= l; i++) { + if(xans[i] == yans[i]) + sb.append((char) xans[i]); + else { + if(sb.length() > 0) + pattern.add(sb.toString()); + sb = new StringBuilder(); + } + } + + if(sb.length() > 0) + pattern.add(sb.toString()); + + // System.out.println(""); + // for(i = id; i <= l; i++) + // System.out.print((char) yans[i]); + // + sb = new StringBuilder(); + for(int bi = id; bi <= l; bi++) { + if(xans[bi] == yans[bi]) { + sb.append((char) xans[bi]); + System.out.print((char) xans[bi]); + } + //else + //System.out.print("*"); + } + System.out.println(); + if(sb.length() > 0) { + // StringBuilder stringBuilder = new StringBuilder(); + // for (String s: pattern){ + // stringBuilder.append(s).append("_"); + // } + // if (stringBuilder.length()>0) + // stringBuilder.deleteCharAt(stringBuilder.length()-1); + return new Pair<>(pattern, sb.toString()); + } + else + return null; + } + + public static void main(String[] args) { + int misMatchPenalty = 3; + int gapPenalty = 2; + Hirschberg hirschberg = new Hirschberg(); + String s1 = "123"; + String s2 = "12369666"; + System.out.println(hirschberg.getLCS(s1, s2, misMatchPenalty, gapPenalty).getValue()); + } + + public ArrayList getLCS(ArrayList list, int pxy, int pgap) { + + if(list.size() < 2) + return null; + + String str1 = list.get(0); + String str2 = list.get(1); + + Pair, String> pattern = getLCS(str1, str2, pxy, pgap); + if(pattern != null) { + String intersect = pattern.getValue(); + ArrayList intersectPattern = pattern.getKey(); + for(int i = 2; i < list.size(); i++) { + pattern = getLCS(intersect, list.get(i)); + if(pattern != null) { + intersect = pattern.getValue(); + intersectPattern = pattern.getKey(); + } + else + intersect = null; + } + if(intersect != null) + return intersectPattern; + 
} + return null; + } } - - public ArrayList getLCS(ArrayList list, int pxy, int pgap) { - if (list.size() < 2) - return null; - - +// public ArrayList getLCS(ArrayList list, int pxy, int pgap) { +// if(list.size() < 2) +// return null; // +// if(pattern != null) { +// String intersect = pattern.getValue(); +// ArrayList intersectPattern = pattern.getKey(); +// for(int i = 2; i < list.size(); i++) { +// if(i == 199) +// System.out.print(i + " >> " + list.get(i) + "\n"); +// pattern = getLCS(intersect, list.get(i)); +// if(pattern != null) { +// intersect = pattern.getValue(); +// intersectPattern = pattern.getKey(); +// } +// else +// intersect = null; +// } +// if(intersect != null) +// return intersectPattern; // -// if (pattern != null) { -// String intersect = pattern.getValue(); -// ArrayList intersectPattern = pattern.getKey(); -// for (int i = 2; i < list.size(); i++) { -// if (i==199) -// System.out.print(i+" >> " + list.get(i)+"\n"); -// pattern = getLCS(intersect, list.get(i)); -// if (pattern != null) { -// intersect = pattern.getValue(); -// intersectPattern = pattern.getKey(); -// } else -// intersect = null; -// } -// if (intersect != null) -// return intersectPattern; +// } // -// } - -// Hirschberg2 hirschberg2 = new Hirschberg2(); -// String str1 = list.get(0); -// String str2 = list.get(1); -// Pair, String pattern = hirschberg2.algC(str1.length(), str2.length(),str1, str2); -// if (pattern != null) { -// String intersect = pattern.getValue(); -// ArrayList intersectPattern = pattern.getKey(); -// for (int i = 2; i < list.size(); i++) { -// if (i==199) -// System.out.print(i+" >> " + list.get(i)+"\n"); -// pattern = getLCS(intersect, list.get(i)); -// if (pattern != null) { -// intersect = pattern.getValue(); -// intersectPattern = pattern.getKey(); -// } else -// intersect = null; -// } -// if (intersect != null) -// return intersectPattern; +// Hirschberg2 hirschberg2 = new Hirschberg2(); +// String str1 = list.get(0); +// String str2 = 
list.get(1); +// Pair, String pattern = hirschberg2.algC(str1.length(), str2.length(), str1, str2); +// if(pattern != null) { +// String intersect = pattern.getValue(); +// ArrayList intersectPattern = pattern.getKey(); +// for(int i = 2; i < list.size(); i++) { +// if(i == 199) +// System.out.print(i + " >> " + list.get(i) + "\n"); +// pattern = getLCS(intersect, list.get(i)); +// if(pattern != null) { +// intersect = pattern.getValue(); +// intersectPattern = pattern.getKey(); +// } +// else +// intersect = null; +// } +// if(intersect != null) +// return intersectPattern; // -// } - - return null; - } -} +// } +// +// } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index e3c4cbf166c..b77a53e838f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -54,6 +54,8 @@ private void buildPrefixTree() { for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } +// else +// System.out.println(">>>>>>>>>>>>>>>>>>>>> "+c); } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { @@ -151,9 +153,9 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos if(key.length() > 0) { currPosVariable = getRandomName("curPos"); if(node.getKey() == null) - src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\"); \n"); + src.append("index = str.indexOf(\"" + key.replace("\\\"","\"").replace("\"", "\\\"") + "\"); \n"); else - src.append("index = str.indexOf(\"" + key.replace("\"", "\\\"") + "\", " + currPos + "); \n"); + src.append("index = str.indexOf(\"" + key.replace("\\\"","\"").replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); } diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java deleted file mode 100755 index 72f2ac9f82e..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFlatFrameExperimentHDFS.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.io.FrameReaderTextCSV; -import org.apache.sysds.runtime.iogen.GIO.Util; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -public class GIOFlatFrameExperimentHDFS extends GIOMain { - - public static void main(String[] args) throws Exception { - getArgs(); - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, delimiter, false); - FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); - FrameBlock sampleFrame = csv.readFrameFromHDFS(sampleFrameFileName, sampleSchema, -1, ncols); - - double tmpTime = System.nanoTime(); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr = gr.getReader(); - double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; - - tmpTime = System.nanoTime(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); - double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; - - String log = datasetName + "," + frameBlock.getNumRows() + "," + frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; - util.addLog(LOG_HOME, log); - } -} 
diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java deleted file mode 100755 index b23017c99e6..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOFrameExperimentHDFS.java +++ /dev/null @@ -1,67 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.GIO.Util; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -import java.util.ArrayList; -import java.util.HashSet; - -public class GIOFrameExperimentHDFS extends GIOMain { - - public static void main(String[] args) throws Exception { -// getArgs(); -// -// Util util = new Util(); -// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); -// int ncols = sampleSchema.length; -// -// ArrayList newSampleSchema = new ArrayList<>(); -// ArrayList> newSampleFrame = new ArrayList<>(); -// -// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); -// -// for (int c = 0; c < sampleFrameStrings[0].length; c++) { -// HashSet valueSet = new HashSet<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) -// valueSet.add(sampleFrameStrings[r][c]); -// if (valueSet.size() > 0) { -// ArrayList tempList = new ArrayList<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) { -// tempList.add(sampleFrameStrings[r][c]); -// } -// newSampleFrame.add(tempList); -// newSampleSchema.add(sampleSchema[c]); -// } -// } -// -// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; -// -// for (int row = 0; row < sampleFrameStrings.length; row++) { -// for (int col = 0; col < sampleFrameStrings[0].length; col++) { -// sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); -// } -// } -// -// sampleSchema = new 
Types.ValueType[newSampleSchema.size()]; -// for (int i = 0; i < newSampleSchema.size(); i++) -// sampleSchema[i] = newSampleSchema.get(i); -// -// FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); -// -// double tmpTime = System.nanoTime(); -// String sampleRaw = util.readEntireTextFile(sampleRawFileName); -// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); -// FrameReader fr = gr.getReader(); -// double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// tmpTime = System.nanoTime(); -// FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); -// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// String log = datasetName + "," + frameBlock.getNumRows() + "," + frameBlock.getNumColumns() + "," + sampleSchema.length + "," + sampleNRows + "," + generateTime + "," + readTime; -// util.addLog(LOG_HOME, log); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java deleted file mode 100755 index 8e763dec755..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOGenerateRapidJSONCode.java +++ /dev/null @@ -1,66 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -import java.util.ArrayList; -import java.util.HashSet; - -public class GIOGenerateRapidJSONCode extends GIOMain { - - public static void main(String[] args) throws Exception { -// getArgs(); -// -// Util util = new Util(); -// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); -// int ncols = sampleSchema.length; -// -// ArrayList newSampleSchema = new ArrayList<>(); -// ArrayList> newSampleFrame = new ArrayList<>(); -// -// String[][] 
sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); -// -// for (int c = 0; c < sampleFrameStrings[0].length; c++) { -// HashSet valueSet = new HashSet<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) -// valueSet.add(sampleFrameStrings[r][c]); -// if (valueSet.size() > 3) { -// ArrayList tempList = new ArrayList<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) { -// tempList.add(sampleFrameStrings[r][c]); -// } -// newSampleFrame.add(tempList); -// newSampleSchema.add(sampleSchema[c]); -// } -// } -// -// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; -// -// for (int row = 0; row < sampleFrameStrings.length; row++) { -// for (int col = 0; col < sampleFrameStrings[0].length; col++) { -// sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); -// } -// } -// -// sampleSchema = new Types.ValueType[newSampleSchema.size()]; -// for (int i = 0; i < newSampleSchema.size(); i++) -// sampleSchema[i] = newSampleSchema.get(i); -// -// FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); -// -// double tmpTime = System.nanoTime(); -// String sampleRaw = util.readEntireTextFile(sampleRawFileName); -// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); -// -// double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// int p = (int) (percent * 100); -// String sourceFileName = cppBaseSrc + "/source/FrameReaderGIO_" + p + ".cpp"; -// String headerFileName = cppBaseSrc + "/header/FrameReaderGIO_" + p + ".h"; -// -// gr.getReaderRapidJSON("FrameReaderGIO_" + p, sourceFileName, headerFileName); -// String log = datasetName + ",0," + ncols + "," + percent + "," + sampleNRows + "," + generateTime + ",0"; -// util.addLog(LOG_HOME, log); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOMain.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOMain.java 
deleted file mode 100644 index 570985243d0..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIOMain.java +++ /dev/null @@ -1,29 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -public class GIOMain { - protected static String sampleRawFileName; - protected static String sampleFrameFileName; - protected static Integer sampleNRows; - protected static String delimiter; - protected static String schemaFileName; - protected static String dataFileName; - protected static String datasetName; - protected static String cppBaseSrc; - protected static String LOG_HOME; - protected static Integer nrows; - - public static void getArgs(){ - sampleRawFileName = System.getProperty("sampleRawFileName"); - sampleFrameFileName = System.getProperty("sampleFrameFileName"); - sampleNRows = Integer.parseInt(System.getProperty("sampleNRows")); - delimiter = System.getProperty("delimiter"); - if(delimiter.equals("\\t")) - delimiter = "\t"; - schemaFileName = System.getProperty("schemaFileName"); - dataFileName = System.getProperty("dataFileName"); - datasetName = System.getProperty("datasetName"); - cppBaseSrc = System.getProperty("cppBaseSrc"); - LOG_HOME = System.getProperty("homeLog"); - nrows = Integer.parseInt(System.getProperty("nrows")); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentStream.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentStream.java deleted file mode 100755 index d8161c889f1..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/GIONestedExperimentStream.java +++ /dev/null @@ -1,71 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -import java.io.File; -import java.io.FileInputStream; -import java.io.InputStream; -import java.util.ArrayList; -import 
java.util.HashSet; - -public class GIONestedExperimentStream extends GIOMain { - - public static void main(String[] args) throws Exception { -// getArgs(); -// Util util = new Util(); -// Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); -// int ncols = sampleSchema.length; -// -// ArrayList newSampleSchema = new ArrayList<>(); -// ArrayList> newSampleFrame = new ArrayList<>(); -// -// String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); -// -// for (int c = 0; c < sampleFrameStrings[0].length; c++) { -// HashSet valueSet = new HashSet<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) -// valueSet.add(sampleFrameStrings[r][c]); -// if (valueSet.size() > 3) { -// ArrayList tempList = new ArrayList<>(); -// for (int r = 0; r < sampleFrameStrings.length; r++) { -// tempList.add(sampleFrameStrings[r][c]); -// } -// newSampleFrame.add(tempList); -// newSampleSchema.add(sampleSchema[c]); -// } -// } -// -// sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; -// -// for (int row = 0; row < sampleFrameStrings.length; row++) { -// for (int col = 0; col < sampleFrameStrings[0].length; col++) { -// sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); -// } -// } -// -// sampleSchema = new Types.ValueType[newSampleSchema.size()]; -// for (int i = 0; i < newSampleSchema.size(); i++) -// sampleSchema[i] = newSampleSchema.get(i); -// -// FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); -// -// double tmpTime = System.nanoTime(); -// String sampleRaw = util.readEntireTextFile(sampleRawFileName); -// GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); -// FrameReader fr = gr.getReader(codeGen); -// double generateTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// tmpTime = System.nanoTime(); -// File file = new File(dataFileName); -// InputStream is = new 
FileInputStream(file); -// -// FrameBlock frameBlock = fr.readFrameFromInputStream(is, gr.getProperties().getSchema(), nrows, gr.getProperties().getSchema().length); -// double readTime = (System.nanoTime() - tmpTime) / 1000000000.0; -// -// String log = datasetName+"-CodeGen("+codeGen+ ")," + frameBlock.getNumRows() + "," + ncols + "," + percent + "," + sampleNRows + "," + generateTime + "," + readTime; -// util.addLog(LOG_HOME, log); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java b/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java deleted file mode 100755 index 1f5c3249566..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/SYSDSFrameExperimentHDFS.java +++ /dev/null @@ -1,39 +0,0 @@ -package org.apache.sysds.runtime.iogen.exp; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; -import org.apache.sysds.runtime.io.FrameReaderTextCSV; -import org.apache.sysds.runtime.iogen.GIO.Util; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -public class SYSDSFrameExperimentHDFS extends GIOMain { - - public static void main(String[] args) throws Exception { - getArgs(); - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - - System.out.println(">>>>>>>>>>>>>>>>>>> "+ncols); - - double tmpTime = System.nanoTime(); - FrameBlock frameBlock; - //if(datasetName.equals("csv")) { - FileFormatPropertiesCSV csvpro = new FileFormatPropertiesCSV(false, ",", false); - FrameReaderTextCSV csv = new FrameReaderTextCSV(csvpro); - frameBlock = csv.readFrameFromHDFS(dataFileName, schema, -1, ncols); -// } -// else if(datasetName.equals("mm")) { -// FrameReaderTextCell mm =new FrameReaderTextCell(); -// frameBlock = mm.readFrameFromHDFS(dataFileName, schema, nrows, ncols); -// } -// else -// throw new RuntimeException("Format not support!"); - - double readTime = 
(System.nanoTime() - tmpTime) / 1000000000.0; - - String log= datasetName+","+ frameBlock.getNumRows()+","+ ncols+",1.0,0,0,"+readTime; - util.addLog(LOG_HOME, log); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh deleted file mode 100755 index a573a99b966..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/resultPath.sh +++ /dev/null @@ -1,7 +0,0 @@ -#!/usr/bin/env bash - -log_file="$1.csv" -if test ! -f "$log_file"; then - touch $log_file - echo "dataset,data_nrows,data_ncols,col_selected_count,sample_nrows,generate_time,read_time" > $log_file -fi diff --git a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh b/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh deleted file mode 100755 index 2ceb544a83c..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/exp/runGIOExp.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env bash - -#SystemDS Paths: -#---------------------------------------------------------------- -systemDS_Home="/home/saeed/Documents/Github/systemds" -LOG4JPROP="$systemDS_Home/scripts/perftest/conf/log4j.properties" -jar_file_path="$systemDS_Home/target/SystemDS.jar" -lib_files_path="$systemDS_Home/target/lib/*" -#----------------------------------------------------------------- -format="json" -root_data_path="/home/saeed/Documents/Dataset/GIODataset/twitter/$format/" -#home_log="/home/saeed/Documents/ExpLog/json/" -cpp_base_src="" #"/home/sfathollahzadeh/Documents/GitHub/papers/2022-icde-gIO/experiments/benchmark/RapidJSONCPP/src/at/tugraz" -sep="_" -nrows=-1 - -mx_mem="$(($(getconf _PHYS_PAGES) * $(getconf PAGE_SIZE) / (1024 * 1024 * 1024)))g" - -delimiter="\t" -declare -a datasets=("twitter") -declare -a main_classes=("GIOFrameExperimentHDFS") #SYSDSFrameExperimentHDFS GIOFrameExperimentHDFS - -for (( i = 1; i < 2; i++ )); do - - for mc in "${main_classes[@]}"; do - for d in "${datasets[@]}"; do - 
#home_log="/home/saeed/Documents/ExpLog/$format/$d/Q$i/" - home_log="/home/saeed/Documents/ExpLog/GIO-$d-$format-Q$i" - ./resultPath.sh $home_log - data_file_name="$root_data_path/$d.data" - - for sr in 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 - do - for p in 1 #2 5 #0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 - do - #schema_file_name="$root_data_path/$d/$d.schema" - #sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" - #sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" - -# schema_file_name="$root_data_path/$d/$d$sep$p.schema" -# sample_raw_fileName="$root_data_path/$d/sample_$sr$sep$p.raw" -# sample_frame_file_name="$root_data_path/$d/sample_$sr$sep$p.frame" -# - schema_file_name="$root_data_path/Q$i/$d$sep$p.schema" - sample_raw_fileName="$root_data_path/Q$i/sample_$d$sr$sep$p.raw" - sample_frame_file_name="$root_data_path/Q$i/sample_$d$sr$sep$p.frame" - - SCRIPT="java\ - -Dlog4j.configuration=file:$LOG4JPROP\ - -Xms1g\ - -Xmx$mx_mem\ - -Xmn4000m\ - -DsampleRawFileName=$sample_raw_fileName\ - -DsampleFrameFileName=$sample_frame_file_name\ - -DsampleNRows=$sr\ - -Ddelimiter=$delimiter\ - -DschemaFileName=$schema_file_name\ - -DdataFileName=$data_file_name\ - -DdatasetName=$d\ - -DhomeLog=$home_log.csv\ - -DcppBaseSrc=$cpp_base_src\ - -Dnrows=$nrows\ - -cp\ - $jar_file_path:$lib_files_path\ - org.apache.sysds.runtime.iogen.exp.$mc\ - " - #echo 3 > /proc/sys/vm/drop_caches && sync - #sleep 20 - - #echo "++++++++++++++++++++++++++++++++++++++++++++" - #echo $SCRIPT - time $SCRIPT - done - done - done - done -done -#/home/saeed/Documents/Dataset/GIODataset/flat/aminer_paper -#/home/saeed/Documents/GIODataset/flat/aminer_paper/aminer_paper_5.schema - From 2aae0ee36e4fdc297d23129694b2b50df914e4ae Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 14 Feb 2022 11:20:14 +0100 Subject: [PATCH 36/84] Update Exp branch --- pom.xml | 16 ++- .../runtime/io/FrameReaderJSONJackson.java | 6 +- 
.../sysds/runtime/io/FrameReaderJSONL.java | 3 +- .../apache/sysds/runtime/iogen/EXP/GIO.java | 10 +- .../sysds/runtime/iogen/EXP/SystemDSCSV.java | 35 +++++ .../runtime/iogen/codegen/CodeGenTrie.java | 5 - .../sysds/runtime/iogen/codegen/mymain2.java | 108 -------------- .../Identify/MatrixGRRowColIdentifyTest.java | 132 ++++++++++-------- 8 files changed, 132 insertions(+), 183 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java diff --git a/pom.xml b/pom.xml index 51f38c2b8fd..4be3b3ba96d 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ true ** false - -Xms3000m -Xmx3000m -Xmn300m + -Xms3000m -Xmx9000m -Xmn300m false @@ -1046,7 +1046,19 @@ com.fasterxml.jackson.core jackson-databind - 2.12.3 + 2.13.1 + + + + com.fasterxml.jackson.core + jackson-annotations + 2.13.1 + + + + com.fasterxml.jackson.core + jackson-core + 2.13.1 diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java index ab61ee4554b..5d0bf47d9f5 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java @@ -87,7 +87,11 @@ private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat entry : schemaMap.entrySet()) { String strCellValue = map.get(entry.getKey()); if(strCellValue!=null){ - dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); + try { + dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); + } + catch(Exception e){} + } } row++; diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java index f43ae5670c3..1f9d6c72c03 100644 --- 
a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java @@ -128,7 +128,8 @@ private static String getStringFromJSONPath(JSONObject jsonObject, String path) } if(temp == null){ - throw new IOException("Could not traverse the JSON path: '" + path + "'!"); + return null; + //throw new IOException("Could not traverse the JSON path: '" + path + "'!"); } return temp.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java index 2b1c234b9f9..619ef62013a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java @@ -2,7 +2,6 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.EXP.Util; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; @@ -71,13 +70,6 @@ public static void main(String[] args) throws Exception { GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, nrows, sampleSchema.length); -// -// for(int i=0; i< 10;i++){ -// System.out.print("Row "+i+"\t"); -// for(int j=0; j keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } -// else -// System.out.println(">>>>>>>>>>>>>>>>>>>>> "+c); } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java deleted file mode 100644 index 8da7a5d4cad..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/mymain2.java +++ /dev/null @@ -1,108 +0,0 @@ -package org.apache.sysds.runtime.iogen.codegen; - 
-import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.*; -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.IOUtilFunctions; -import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -import java.io.IOException; -import java.util.HashSet; - -public class mymain2 extends FrameGenerateReader { - public mymain2(CustomProperties _props) { - super(_props); - } - - @Override - protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, boolean first) throws IOException { - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row = rl; - long lnnz = 0; - HashSet[] endWithValueString = _props.endWithValueStrings(); - int index, endPos, strLen; - try { - while(reader.next(key, value)){ - String str = value.toString(); - strLen = str.length(); - endPos = getEndPos(str, strLen, 0, endWithValueString[0]); - String cellStr0 = str.substring(0,endPos); - if ( cellStr0.length() > 0 ){ - Long cellValue0; - try{cellValue0= Long.parseLong(cellStr0); } catch(Exception e){cellValue0 = 0l;} - if(cellValue0 != 0) { - dest.set(row, 0, cellValue0); - lnnz++; - } - } - index = str.indexOf(",,,\""); - if(index != -1) { - int curPos_60858111 = index + 4; - index = str.indexOf("\",", curPos_60858111); - if(index != -1) { - int curPos_2775125 = index + 2; - endPos = getEndPos(str, strLen, curPos_2775125, endWithValueString[6]); - String cellStr6 = str.substring(curPos_2775125,endPos); - String cellValue6 = cellStr6; - dest.set(row, 6, cellValue6); - } - } - index = str.indexOf(",\""); - if(index != -1) { - int curPos_63087344 = index + 2; - endPos = getEndPos(str, strLen, 
curPos_63087344, endWithValueString[1]); - String cellStr1 = str.substring(curPos_63087344,endPos); - String cellValue1 = cellStr1; - dest.set(row, 1, cellValue1); - } - index = str.indexOf("\","); - if(index != -1) { - int curPos_41366400 = index + 2; - endPos = getEndPos(str, strLen, curPos_41366400, endWithValueString[2]); - String cellStr2 = str.substring(curPos_41366400,endPos); - if ( cellStr2.length() > 0 ){ - Integer cellValue2; - try{ cellValue2= Integer.parseInt(cellStr2);} catch(Exception e){cellValue2 = 0;} - if(cellValue2 != 0) { - dest.set(row, 2, cellValue2); - lnnz++; - } - } - index = str.indexOf(",", curPos_41366400); - if(index != -1) { - int curPos_78452455 = index + 1; - endPos = getEndPos(str, strLen, curPos_78452455, endWithValueString[3]); - String cellStr3 = str.substring(curPos_78452455,endPos); - if ( cellStr3.length() > 0 ){ - Integer cellValue3; - try{ cellValue3= Integer.parseInt(cellStr3);} catch(Exception e){cellValue3 = 0;} - if(cellValue3 != 0) { - dest.set(row, 3, cellValue3); - lnnz++; - } - } - } - } - index = str.indexOf(",,,"); - if(index != -1) { - int curPos_8253849 = index + 3; - endPos = getEndPos(str, strLen, curPos_8253849, endWithValueString[5]); - String cellStr5 = str.substring(curPos_8253849,endPos); - String cellValue5 = cellStr5; - dest.set(row, 5, cellValue5); - } - row++; - }} - finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - - - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 7daeba29c5f..40424bc54f3 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -21,15 +21,14 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReader; +import 
org.apache.sysds.runtime.io.FrameReaderJSONJackson; import org.apache.sysds.runtime.io.FrameReaderJSONL; import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.iogen.GIO.Util; +import org.apache.sysds.runtime.iogen.EXP.Util; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; import org.junit.Test; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -258,74 +257,93 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou // } @Test public void test13() throws Exception { - String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.frame"; - String delimiter = "\\t"; - String schemaFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/tpch-json.schema"; - - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - ArrayList newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); - - for(int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) - valueSet.add(sampleFrameStrings[r][c]); - if(valueSet.size() > 1) { - ArrayList tempList = new ArrayList<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) { - tempList.add(sampleFrameStrings[r][c]); + ///home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/twitter-examples/F10 + for(int f=9;f<=9;f++) { + System.out.println("+++++++++++++++++++++ Q="+f); + 
String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/F"+f+"/sample-yelp-csv200.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/F"+f+"/sample-yelp-csv200.frame"; + String delimiter = "\\t"; + String schemaFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/F"+f+"/yelp-csv.schema"; + String dataFileName ="/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/yelp-csv.data"; + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + ArrayList newSampleSchema = new ArrayList<>(); + ArrayList> newSampleFrame = new ArrayList<>(); + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); + + for(int c = 0; c < sampleFrameStrings[0].length; c++) { + HashSet valueSet = new HashSet<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) + valueSet.add(sampleFrameStrings[r][c]); + if(valueSet.size() > 1) { + ArrayList tempList = new ArrayList<>(); + for(int r = 0; r < sampleFrameStrings.length; r++) { + tempList.add(sampleFrameStrings[r][c]); + } + newSampleFrame.add(tempList); + newSampleSchema.add(sampleSchema[c]); } - newSampleFrame.add(tempList); - newSampleSchema.add(sampleSchema[c]); } - } - sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; + sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - for(int row = 0; row < sampleFrameStrings.length; row++) { - for(int col = 0; col < sampleFrameStrings[0].length; col++) { - sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); + for(int row = 0; row < sampleFrameStrings.length; row++) { + for(int col = 0; col < sampleFrameStrings[0].length; col++) { + sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); + } } - } - sampleSchema = new 
Types.ValueType[newSampleSchema.size()]; - for(int i = 0; i < newSampleSchema.size(); i++) - sampleSchema[i] = newSampleSchema.get(i); + sampleSchema = new Types.ValueType[newSampleSchema.size()]; + for(int i = 0; i < newSampleSchema.size(); i++) + sampleSchema[i] = newSampleSchema.get(i); - //String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); + //String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - gr.getReader(); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + FrameReader fr =gr.getReader(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); + } } @Test public void test14() throws Exception { - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - - String FILENAME_SINGLE = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; - Types.ValueType[] schema = {Types.ValueType.STRING,Types.ValueType.STRING,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64}; - - Map schemaMap = new HashMap<>(); - schemaMap.put("/returnFlag",0); - schemaMap.put("/lineStatus",1); - schemaMap.put("/quantity",2); - schemaMap.put("/extendedPrice",3); - schemaMap.put("/discount",4); - schemaMap.put("/tax",5); - // Read FrameBlock - FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); - - int a = 100; +// FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); +// 
+// String FILENAME_SINGLE = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; +// Types.ValueType[] schema = {Types.ValueType.STRING,Types.ValueType.STRING,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64}; +// +// Map schemaMap = new HashMap<>(); +// schemaMap.put("/returnFlag",0); +// schemaMap.put("/lineStatus",1); +// schemaMap.put("/quantity",2); +// schemaMap.put("/extendedPrice",3); +// schemaMap.put("/discount",4); +// schemaMap.put("/tax",5); +// // Read FrameBlock +// FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); +// +// int a = 100; + + String schemaFileName ="/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schema"; + String schemaMapFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schemaMap"; + String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/twitter-json.data"; + long nrows = 1000; + + Util util = new Util(); + Types.ValueType[] schema = util.getSchema(schemaFileName); + int ncols = schema.length; + Map schemaMap = util.getSchemaMap(schemaMapFileName); + + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); } } From 3baaca0e34ad131146345dbc2e3cf573900636b1 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 17 Feb 2022 03:22:30 +0100 Subject: [PATCH 37/84] Add CodeGen for Spars Datasets --- .../sysds/runtime/iogen/CustomProperties.java | 10 ++ .../iogen/EXP/{GIO.java => GIOFrame.java} | 4 +- ...ation.java => GIOFrameIdentification.java} | 5 +- .../sysds/runtime/iogen/EXP/GIOMatrix.java | 32 ++++ .../iogen/EXP/GIOMatrixIdentification.java | 26 +++ 
.../apache/sysds/runtime/iogen/EXP/Util.java | 19 +- .../runtime/iogen/FormatIdentifying.java | 18 +- .../sysds/runtime/iogen/GenerateReader.java | 1 + .../sysds/runtime/iogen/Hirschberg.java | 80 +-------- .../runtime/iogen/codegen/CodeGenTrie.java | 167 ++++++++++++++++-- .../runtime/iogen/codegen/MatrixCodeGen.java | 30 ++-- .../iogen/template/MatrixGenerateReader.java | 22 +-- 12 files changed, 297 insertions(+), 117 deletions(-) rename src/main/java/org/apache/sysds/runtime/iogen/EXP/{GIO.java => GIOFrame.java} (97%) rename src/main/java/org/apache/sysds/runtime/iogen/EXP/{GIOIdentification.java => GIOFrameIdentification.java} (95%) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 5fb2fd1cb6e..23aad4a5651 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -23,6 +23,7 @@ import org.apache.sysds.runtime.io.FileFormatProperties; import java.io.Serializable; +import java.util.HashMap; import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { @@ -41,6 +42,7 @@ public String toString() { private IndexProperties rowIndex; private KeyTrie rowKeyPattern; private String rowIndexBegin; + private HashMap colKeyPatternMap; public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex) { this.colKeyPattern = colKeyPattern; @@ -110,4 +112,12 @@ public String getRowIndexBegin() { public void setRowIndexBegin(String rowIndexBegin) { this.rowIndexBegin = rowIndexBegin; } + + public HashMap getColKeyPatternMap() { + return colKeyPatternMap; + } + + public void setColKeyPatternMap(HashMap colKeyPatternMap) { + this.colKeyPatternMap 
= colKeyPatternMap; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java similarity index 97% rename from src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java rename to src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index 619ef62013a..5092b7af070 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIO.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -8,7 +8,7 @@ import java.util.ArrayList; import java.util.HashSet; -public class GIO { +public class GIOFrame { public static void main(String[] args) throws Exception { String sampleRawFileName; @@ -32,7 +32,7 @@ public static void main(String[] args) throws Exception { Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); int ncols = sampleSchema.length; - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, sampleRawDelimiter); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter); ArrayList newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new ArrayList<>(); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java similarity index 95% rename from src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOIdentification.java rename to src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java index 56d60532cd5..be189327b8f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java @@ -7,7 +7,7 @@ import java.util.ArrayList; import java.util.HashSet; -public class GIOIdentification { +public class GIOFrameIdentification { public static void main(String[] args) throws Exception { String sampleRawFileName; @@ -24,9 +24,8 @@ public static void main(String[] args) 
throws Exception { schemaFileName = System.getProperty("schemaFileName"); Util util = new Util(); Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, sampleRawDelimiter); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter); ArrayList newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new ArrayList<>(); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java new file mode 100644 index 00000000000..4deb63ae571 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java @@ -0,0 +1,32 @@ +package org.apache.sysds.runtime.iogen.EXP; + +import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +public class GIOMatrix { + + public static void main(String[] args) throws Exception { + String sampleRawFileName; + String sampleMatrixFileName; + String sampleRawDelimiter; + String dataFileName; + long nrows; + + sampleRawFileName = System.getProperty("sampleRawFileName"); + sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); + sampleRawDelimiter = System.getProperty("delimiter"); + if(sampleRawDelimiter.equals("\\t")) + sampleRawDelimiter = "\t"; + dataFileName = System.getProperty("dataFileName"); + nrows = Long.parseLong(System.getProperty("nrows")); + + Util util = new Util(); + MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader matrixReader = gr.getReader(); + MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, nrows, 
sampleMB.getNumColumns(), -1, -1); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java new file mode 100644 index 00000000000..e000009c1d6 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java @@ -0,0 +1,26 @@ +package org.apache.sysds.runtime.iogen.EXP; + +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; + +public class GIOMatrixIdentification { + + public static void main(String[] args) throws Exception { + String sampleRawFileName; + String sampleMatrixFileName; + String sampleRawDelimiter; + + sampleRawFileName = System.getProperty("sampleRawFileName"); + sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); + sampleRawDelimiter = System.getProperty("delimiter"); + if(sampleRawDelimiter.equals("\\t")) + sampleRawDelimiter = "\t"; + + Util util = new Util(); + MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + gr.getReader(); + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index adb35dd0471..e1edd83ef36 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -1,6 +1,8 @@ package org.apache.sysds.runtime.iogen.EXP; import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.util.DataConverter; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -63,16 +65,17 @@ public Map getSchemaMap(String fileName) throws IOException { return schemaMap; } - public String[][] 
loadFrameData(String fileName, int ncols, String delimiter) + public String[][] loadFrameData(String fileName,String delimiter) throws IOException { ArrayList sampleRawLines = new ArrayList<>(); - + int ncols = 0; try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { String line; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); + ncols=data.length; String[] colsData = new String[ncols]; - for(int i = 0; i < data.length; i++) { + for(int i = 0; i < ncols; i++) { String[] value = data[i].split("::"); if(value.length ==2) { int col = Integer.parseInt(value[0]); @@ -90,4 +93,14 @@ public String[][] loadFrameData(String fileName, int ncols, String delimiter) return result; } + + public MatrixBlock loadMatrixData(String fileName, String delimiter) throws IOException { + String[][] dataString = loadFrameData(fileName,delimiter); + double[][] data = new double[dataString.length][dataString[0].length]; + for(int i=0;i 0){ reverseSplitPattern.add(ps); if (reverseSplitPattern.size() == 0) reverseSplitPattern.add(""); - check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], reverseSplitPattern); - if (check) { - keyPatterns = new ArrayList<>(); - keyPatterns.add(reverseSplitPattern); + + int maxPatternLength = reverseSplitPattern.size(); + check = false; + for(int sp= 0; sp< maxPatternLength;sp++){ + ArrayList shortPattern = new ArrayList<>(); + for(int spi= maxPatternLength - sp-1; spi< maxPatternLength; spi++){ + shortPattern.add(reverseSplitPattern.get(spi)); + } + check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); + if (check) { + keyPatterns = new ArrayList<>(); + keyPatterns.add(shortPattern); + break; + } } } else { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index bdc38a7791f..eea8f2c0a82 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -93,6 +93,7 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; + String jc = src.generateCodeJava(); matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java index dabd478c813..e28c678a0c0 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java @@ -135,12 +135,12 @@ else if(dp[i][j - 1] + pgap == dp[i][j]) { for(int bi = id; bi <= l; bi++) { if(xans[bi] == yans[bi]) { sb.append((char) xans[bi]); - System.out.print((char) xans[bi]); + //System.out.print((char) xans[bi]); } - //else - //System.out.print("*"); +// else +// System.out.print("*"); } - System.out.println(); + //System.out.println(); if(sb.length() > 0) { // StringBuilder stringBuilder = new StringBuilder(); // for (String s: pattern){ @@ -263,34 +263,19 @@ else if(dp[i][j - 1] == dp[i][j]) { for(int bi = id; bi <= l; bi++) { if(xans[bi] == yans[bi]) { sb.append((char) xans[bi]); - System.out.print((char) xans[bi]); + //System.out.print((char) xans[bi]); } - //else - //System.out.print("*"); +// else +// System.out.print("*"); } - System.out.println(); + //System.out.println(); if(sb.length() > 0) { - // StringBuilder stringBuilder = new StringBuilder(); - // for (String s: pattern){ - // stringBuilder.append(s).append("_"); - // } - // if (stringBuilder.length()>0) - // stringBuilder.deleteCharAt(stringBuilder.length()-1); return new Pair<>(pattern, sb.toString()); } else return null; } - public static void main(String[] args) { - int misMatchPenalty = 3; - int gapPenalty = 2; - Hirschberg 
hirschberg = new Hirschberg(); - String s1 = "123"; - String s2 = "12369666"; - System.out.println(hirschberg.getLCS(s1, s2, misMatchPenalty, gapPenalty).getValue()); - } - public ArrayList getLCS(ArrayList list, int pxy, int pgap) { if(list.size() < 2) @@ -299,7 +284,7 @@ public ArrayList getLCS(ArrayList list, int pxy, int pgap) { String str1 = list.get(0); String str2 = list.get(1); - Pair, String> pattern = getLCS(str1, str2, pxy, pgap); + Pair, String> pattern = getLCS(str1, str2); if(pattern != null) { String intersect = pattern.getValue(); ArrayList intersectPattern = pattern.getKey(); @@ -318,52 +303,5 @@ public ArrayList getLCS(ArrayList list, int pxy, int pgap) { return null; } } -// public ArrayList getLCS(ArrayList list, int pxy, int pgap) { -// if(list.size() < 2) -// return null; -// -// if(pattern != null) { -// String intersect = pattern.getValue(); -// ArrayList intersectPattern = pattern.getKey(); -// for(int i = 2; i < list.size(); i++) { -// if(i == 199) -// System.out.print(i + " >> " + list.get(i) + "\n"); -// pattern = getLCS(intersect, list.get(i)); -// if(pattern != null) { -// intersect = pattern.getValue(); -// intersectPattern = pattern.getKey(); -// } -// else -// intersect = null; -// } -// if(intersect != null) -// return intersectPattern; -// -// } -// -// Hirschberg2 hirschberg2 = new Hirschberg2(); -// String str1 = list.get(0); -// String str2 = list.get(1); -// Pair, String pattern = hirschberg2.algC(str1.length(), str2.length(), str1, str2); -// if(pattern != null) { -// String intersect = pattern.getValue(); -// ArrayList intersectPattern = pattern.getKey(); -// for(int i = 2; i < list.size(); i++) { -// if(i == 199) -// System.out.print(i + " >> " + list.get(i) + "\n"); -// pattern = getLCS(intersect, list.get(i)); -// if(pattern != null) { -// intersect = pattern.getValue(); -// intersectPattern = pattern.getKey(); -// } -// else -// intersect = null; -// } -// if(intersect != null) -// return intersectPattern; -// -// } -// 
-// } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index d7c02a09676..ce7af9f263d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -19,12 +19,13 @@ package org.apache.sysds.runtime.iogen.codegen; -import com.google.gson.Gson; import org.apache.sysds.common.Types; +import org.apache.sysds.lops.Lop; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.KeyTrie; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.Random; @@ -33,13 +34,76 @@ public class CodeGenTrie { private final CodeGenTrieNode rootRow; private final CustomProperties properties; private final String destination; + private HashMap colKeyPatternMap; + private HashSet regexSet; + private final boolean isRegexBase; + private boolean isMatrix; public CodeGenTrie(CustomProperties properties, String destination) { this.rootCol = new CodeGenTrieNode(CodeGenTrieNode.NodeType.COL); this.rootRow = new CodeGenTrieNode(CodeGenTrieNode.NodeType.ROW); this.properties = properties; this.destination = destination; - buildPrefixTree(); + this.isMatrix = false; + + HashSet conditions = new HashSet<>(); + for(int c = 0; c < properties.getColKeyPattern().length; c++) { + KeyTrie keyTrie = properties.getColKeyPattern()[c]; + if(keyTrie != null) { + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { + conditions.add(keys.get(0)); + break; + } + } + } + if(conditions.size() < 2) { + buildPrefixTree(); + this.isRegexBase = false; + } + else { + this.colKeyPatternMap = new HashMap<>(); + this.regexSet = new HashSet<>(); + this.isRegexBase = true; + buildPrefixTreeRegex(); + } + + } + + // Build Trie for Col and Row Key Patterns + private void buildPrefixTreeRegex() { + for(int c = 0; c < 
properties.getColKeyPattern().length; c++) { + KeyTrie keyTrie = properties.getColKeyPattern()[c]; + if(keyTrie != null) { + StringBuilder ksb = new StringBuilder(); + StringBuilder sbr = new StringBuilder(); + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { + for(String ks : keys) { + ksb.append(ks).append(Lop.OPERAND_DELIMITOR); + String tmp = ks.replaceAll("\\s+", "\\\\s+"); + tmp = tmp.replaceAll("\\d+", "\\\\d+"); + sbr.append("(").append(tmp).append(")").append(Lop.OPERAND_DELIMITOR); + } + ksb.deleteCharAt(ksb.length() - 1); + sbr.deleteCharAt(sbr.length() - 1); + break; + } + if(ksb.length() == 0) + colKeyPatternMap.put("", c); + else + colKeyPatternMap.put(ksb.toString(), c); + regexSet.add(sbr.toString()); + + } + } + + if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { + KeyTrie keyTrie = properties.getRowKeyPattern(); + Types.ValueType vt = Types.ValueType.FP32; + if(keyTrie != null) { + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + this.insert(rootRow, -1, vt, keys); + } + } } // Build Trie for Col and Row Key Patterns @@ -47,10 +111,14 @@ private void buildPrefixTree() { for(int c = 0; c < properties.getColKeyPattern().length; c++) { KeyTrie keyTrie = properties.getColKeyPattern()[c]; Types.ValueType vt = properties.getSchema() == null ? 
Types.ValueType.FP64 : properties.getSchema()[c]; + //Gson gson = new Gson(); if(keyTrie != null) { + //System.out.println(gson.toJson(keyTrie.getReversePrefixKeyPatterns())); for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } + //else + // System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>> "+c); } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { @@ -108,7 +176,6 @@ public String getJavaCode() { src.append("try { \n"); src.append("do{ \n"); src.append("strChunk = getStringChunkOfBufferReader(br, remainedStr, chunkSize); \n"); - src.append("System.out.println(strChunk); \n"); src.append("if(strChunk == null || strChunk.length() == 0) break; \n"); src.append("do { \n"); ArrayList> kp = properties.getRowKeyPattern().getPrefixKeyPatterns(); @@ -139,6 +206,68 @@ public String getRandomName(String base) { } private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos) { + if(!isRegexBase) + getJavaCodeIndexOf(node, src, currPos); + else + getJavaCodeRegex(src); + } + + private void getJavaCodeRegex(StringBuilder src) { + + // TODO: for fist item start with "" + //src.append("List allMatches = new ArrayList(); \n"); + for(String s : regexSet) { + if(s.equals("()")) { + src.append("int colIndex0 = getColIndex(colKeyPatternMap, \"\"); \n"); + src.append("endPos = getEndPos(str, strLen, 0, endWithValueString[colIndex0]); \n"); + src.append("String cellStr0 = str.substring(0, endPos); \n"); + src.append("if ( cellStr0.length() > 0 ){\n"); + if(isMatrix) { + src.append("Double cellValue0; \n"); + src.append("try{cellValue0 = Double.parseDouble(cellStr0); } catch(Exception e){cellValue0= 0d;}\n"); + src.append("if(cellValue0!= 0) { \n"); + src.append(destination).append("(row, colIndex0 , cellValue0); \n"); + src.append("lnnz++;\n"); + src.append("} \n"); + } + else { + src.append(destination).append("(row, colIndex0 , UtilFunctions.stringToObject(properties.getSchema()[colIndex0], cellStr)0); 
\n"); + } + src.append("}\n"); + } + else { + int groupCount = s.split(Lop.OPERAND_DELIMITOR).length; + if(groupCount > 1) + break; + src.append("Matcher matcher = Pattern.compile(\"" + s.replace("\\", "\\\\") + "\").matcher(str); \n"); + src.append("while(matcher.find()) { \n"); + src.append("String key = ").append("matcher.group(1);\n"); + src.append("int currPos = matcher.end();\n"); + src.append("int colIndex = getColIndex(colKeyPatternMap, key); \n"); + src.append("if(colIndex!=-1) { \n"); + //src.append("Types.ValueType vt = pair.getValue();\n"); + src.append("endPos = getEndPos(str, strLen, currPos, endWithValueString[colIndex]); \n"); + src.append("String cellStr = str.substring(currPos, endPos); \n"); + src.append("if ( cellStr.length() > 0 ){\n"); + if(isMatrix) { + src.append("Double cellValue; \n"); + src.append("try{cellValue = Double.parseDouble(cellStr); } catch(Exception e){cellValue= 0d;}\n"); + src.append("if(cellValue!= 0) { \n"); + src.append(destination).append("(row, colIndex , cellValue); \n"); + src.append("lnnz++;\n"); + src.append("} \n"); + } + else { + src.append(destination).append("(row, colIndex , UtilFunctions.stringToObject(properties.getSchema()[colIndex], cellStr)); \n"); + } + src.append("}\n"); + src.append("}\n"); + src.append("}\n"); + } + } + } + + private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, String currPos) { if(node.isEndOfCondition()) src.append(node.geValueCode(destination, currPos)); @@ -148,14 +277,16 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos if(key.length() > 0) { currPosVariable = getRandomName("curPos"); if(node.getKey() == null) - src.append("index = str.indexOf(\"" + key.replace("\\\"","\"").replace("\"", "\\\"") + "\"); \n"); + src.append( + "index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\"); \n"); else - src.append("index = str.indexOf(\"" + key.replace("\\\"","\"").replace("\"", "\\\"") + "\", " + currPos + "); 
\n"); + src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"") + .replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); } CodeGenTrieNode child = node.getChildren().get(key); - getJavaCode(child, src, currPosVariable); + getJavaCodeIndexOf(child, src, currPosVariable); if(key.length() > 0) src.append("} \n"); } @@ -163,24 +294,36 @@ private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos } private void getJavaRowCode(StringBuilder src, ArrayList> rowBeginPattern, - ArrayList> rowEndPattern){ + ArrayList> rowEndPattern) { // TODO: we have to extend it to multi patterns // now, we assumed each row can have single pattern for begin and end - for(ArrayList kb: rowBeginPattern){ - for(String k: kb){ - src.append("recordIndex = strChunk.indexOf(\""+k+"\", recordIndex); \n"); + for(ArrayList kb : rowBeginPattern) { + for(String k : kb) { + src.append("recordIndex = strChunk.indexOf(\"" + k + "\", recordIndex); \n"); src.append("if(recordIndex == -1) break; \n"); } - src.append("recordIndex +="+ kb.get(kb.size() -1).length()+"; \n"); + src.append("recordIndex +=" + kb.get(kb.size() - 1).length() + "; \n"); break; } src.append("int recordBeginPos = recordIndex; \n"); String endKey = rowEndPattern.get(0).get(0); - src.append("recordIndex = strChunk.indexOf(\""+endKey+"\", recordBeginPos);"); + src.append("recordIndex = strChunk.indexOf(\"" + endKey + "\", recordBeginPos);"); src.append("if(recordIndex == -1) break; \n"); src.append("str = strChunk.substring(recordBeginPos, recordIndex); \n"); src.append("strLen = str.length(); \n"); } + + public void setMatrix(boolean matrix) { + isMatrix = matrix; + } + + public boolean isRegexBase() { + return isRegexBase; + } + + public HashMap getColKeyPatternMap() { + return colKeyPatternMap; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 272bd8b22b6..11b0a1c7da1 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -30,6 +30,10 @@ public MatrixCodeGen(CustomProperties properties, String className) { // 1. set java code template javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import java.util.HashMap;\n" + + "import java.util.HashSet;\n" + + "import java.util.regex.Matcher;\n" + + "import java.util.regex.Pattern; \n"+ "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + @@ -37,8 +41,6 @@ public MatrixCodeGen(CustomProperties properties, String className) { "import java.io.IOException;\n" + "import java.io.InputStream;\n" + "import java.io.InputStreamReader;\n" + - "import java.util.HashSet; \n" + - "public class "+className+" extends MatrixGenerateReader {\n"+ " public "+className+"(CustomProperties _props) {\n"+ @@ -55,26 +57,32 @@ public MatrixCodeGen(CustomProperties properties, String className) { @Override public String generateCodeJava() { StringBuilder src = new StringBuilder(); + CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); + trie.setMatrix(true); src.append("String str; \n"); src.append("int row = rowPos.intValue(); \n"); src.append("long lnnz = 0; \n"); src.append("int index, endPos, strLen; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + if(trie.isRegexBase()) { + properties.setColKeyPatternMap(trie.getColKeyPatternMap()); + src.append( + "HashMap colKeyPatternMap = _props.getColKeyPatternMap(); \n"); + } 
if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); -// src.append("try { \n"); -// src.append("while((str = br.readLine()) != null){ \n"); -// src.append("strLen = str.length(); \n"); + src.append("try { \n"); + src.append("while((str = br.readLine()) != null){ \n"); + src.append("strLen = str.length(); \n"); - CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); src.append(trie.getJavaCode()); -// src.append("} \n"); -// src.append("} \n"); -// src.append("finally { \n"); -// src.append("IOUtilFunctions.closeSilently(br); \n"); -// src.append("}"); + src.append("} \n"); + src.append("} \n"); + src.append("finally { \n"); + src.append("IOUtilFunctions.closeSilently(br); \n"); + src.append("}"); src.append("rowPos.setValue(row); \n"); src.append("return lnnz; \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index ace1da89756..90c530c845f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -30,6 +30,7 @@ import org.apache.hadoop.fs.Path; import org.apache.sysds.runtime.io.IOUtilFunctions; import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.matrix.data.Pair; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -41,8 +42,11 @@ import java.io.Writer; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public abstract class MatrixGenerateReader extends MatrixReader { @@ -162,17 +166,13 @@ protected int getEndPos(String str, int strLen, int currPos, HashSet end return endPos; } - //src.append("String str; \n"); - 
// src.append("int row = rowPos.intValue(); \n"); - // src.append("long lnnz = 0; \n"); - // src.append("int index, endPos, strLen; \n"); - // src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); - // src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); - // if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) - // src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); - // src.append("try { \n"); - // src.append("while((str = br.readLine()) != null){ \n"); - // src.append("strLen = str.length(); \n"); + + protected int getColIndex(HashMap colKeyPatternMap, String key){ + if(colKeyPatternMap.containsKey(key)) + return colKeyPatternMap.get(key); + else + return -1; + } protected String getStringChunkOfBufferReader(BufferedReader br, String remainedStr,int chunkSize){ StringBuilder sb = new StringBuilder(); From 350282a984ad225524a3b628932bf695c6d99745 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 17 Feb 2022 19:17:48 +0100 Subject: [PATCH 38/84] Update GIO Exp --- .../sysds/runtime/iogen/EXP/GIOFrame.java | 34 +---------------- .../iogen/EXP/GIOFrameIdentification.java | 37 +------------------ .../sysds/runtime/iogen/EXP/SystemDSCSV.java | 1 - .../apache/sysds/runtime/iogen/EXP/Util.java | 21 ++++++++--- .../sysds/runtime/iogen/GenerateReader.java | 2 - .../runtime/iogen/codegen/CodeGenTrie.java | 7 +++- 6 files changed, 24 insertions(+), 78 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index 5092b7af070..7a58e596e54 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -32,40 +32,8 @@ public static void main(String[] args) throws Exception { Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); int ncols = sampleSchema.length; - 
String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter); - - ArrayList newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - for(int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) - valueSet.add(sampleFrameStrings[r][c]); - if(valueSet.size() > 1) { - ArrayList tempList = new ArrayList<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) { - tempList.add(sampleFrameStrings[r][c]); - } - newSampleFrame.add(tempList); - newSampleSchema.add(sampleSchema[c]); - } - } - - sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - - for(int row = 0; row < sampleFrameStrings.length; row++) { - for(int col = 0; col < sampleFrameStrings[0].length; col++) { - sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); - } - } - - sampleSchema = new Types.ValueType[newSampleSchema.size()]; - for(int i = 0; i < newSampleSchema.size(); i++) - sampleSchema[i] = newSampleSchema.get(i); - - + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); FrameReader fr = gr.getReader(); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java index be189327b8f..75127daff57 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java @@ -4,9 +4,6 @@ import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; -import java.util.ArrayList; 
-import java.util.HashSet; - public class GIOFrameIdentification { public static void main(String[] args) throws Exception { @@ -24,39 +21,9 @@ public static void main(String[] args) throws Exception { schemaFileName = System.getProperty("schemaFileName"); Util util = new Util(); Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter); - - ArrayList newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - for(int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) - valueSet.add(sampleFrameStrings[r][c]); - if(valueSet.size() > 1) { - ArrayList tempList = new ArrayList<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) { - tempList.add(sampleFrameStrings[r][c]); - } - newSampleFrame.add(tempList); - newSampleSchema.add(sampleSchema[c]); - } - } - - sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - - for(int row = 0; row < sampleFrameStrings.length; row++) { - for(int col = 0; col < sampleFrameStrings[0].length; col++) { - sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); - } - } - - sampleSchema = new Types.ValueType[newSampleSchema.size()]; - for(int i = 0; i < newSampleSchema.size(); i++) - sampleSchema[i] = newSampleSchema.get(i); - - + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java index 4fc831ab2b1..aa326c5e187 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java @@ -8,7 +8,6 @@ import org.apache.wink.json4j.JSONException; import java.io.IOException; -import java.util.Map; public class SystemDSCSV { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index e1edd83ef36..97989a0ea98 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -65,17 +65,15 @@ public Map getSchemaMap(String fileName) throws IOException { return schemaMap; } - public String[][] loadFrameData(String fileName,String delimiter) + public String[][] loadFrameData(String fileName,String delimiter, int ncols) throws IOException { ArrayList sampleRawLines = new ArrayList<>(); - int ncols = 0; try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { String line; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); - ncols=data.length; String[] colsData = new String[ncols]; - for(int i = 0; i < ncols; i++) { + for(int i = 0; i < data.length; i++) { String[] value = data[i].split("::"); if(value.length ==2) { int col = Integer.parseInt(value[0]); @@ -95,11 +93,22 @@ public String[][] loadFrameData(String fileName,String delimiter) } public MatrixBlock loadMatrixData(String fileName, String delimiter) throws IOException { - String[][] dataString = loadFrameData(fileName,delimiter); + int ncols = 0; + try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { + String line; + while((line = br.readLine()) != null) { + String[] data = line.split(delimiter); + ncols = Math.max(ncols, Integer.parseInt( data[data.length-1].split("::")[0])); + } + } + String[][] dataString = loadFrameData(fileName,delimiter, ncols+1); double[][] data = new double[dataString.length][dataString[0].length]; for(int i=0;i keys : 
keyTrie.getReversePrefixKeyPatterns()) { conditions.add(keys.get(0)); + //Gson gson = new Gson(); + //System.out.println(c+" >> "+ gson.toJson(keys)); + break; } } } - if(conditions.size() < 2) { + + if(conditions.size() < 150) { buildPrefixTree(); this.isRegexBase = false; } From d84c3e982d7c0426a64dd229b42e68b1b4e3ff7a Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 19 Feb 2022 00:14:22 +0100 Subject: [PATCH 39/84] Update GIO Exp --- .../apache/sysds/runtime/iogen/codegen/CodeGenTrie.java | 9 +-------- .../sysds/runtime/iogen/codegen/CodeGenTrieNode.java | 4 +++- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index e14f9e2be19..084ebf55041 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -53,9 +53,6 @@ public CodeGenTrie(CustomProperties properties, String destination) { if(keyTrie != null) { for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { conditions.add(keys.get(0)); - //Gson gson = new Gson(); - //System.out.println(c+" >> "+ gson.toJson(keys)); - break; } } @@ -116,14 +113,10 @@ private void buildPrefixTree() { for(int c = 0; c < properties.getColKeyPattern().length; c++) { KeyTrie keyTrie = properties.getColKeyPattern()[c]; Types.ValueType vt = properties.getSchema() == null ? 
Types.ValueType.FP64 : properties.getSchema()[c]; - //Gson gson = new Gson(); if(keyTrie != null) { - //System.out.println(gson.toJson(keyTrie.getReversePrefixKeyPatterns())); for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) this.insert(rootCol, c, vt, keys); } - //else - // System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>> "+c); } if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { @@ -171,7 +164,7 @@ public String getJavaCode() { src.append("row++; \n"); break; case PREFIX: - getJavaCode(rootRow, src, "0"); + getJavaCodeIndexOf(rootRow, src, "0"); getJavaCode(rootCol, src, "0"); break; case KEY: diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 6fde8a81f25..54f029796dc 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -76,10 +76,12 @@ private String getRowPrefixValueCode(String currPos){ String subStr; src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueStringRow); \n"); subStr = "str.substring("+currPos+",endPos)"; + src.append("try{ \n"); if(rowIndexBeginPos.length() > 0) - src.append("row = ").append("Integer.parseInt("+subStr+") "+rowIndexBeginPos+"; \n"); + src.append("row = ").append("Integer.parseInt(" + subStr + ") " + rowIndexBeginPos + "; \n"); else src.append("row = ").append("Integer.parseInt("+subStr+"); \n"); + src.append("} catch(Exception e){} \n"); return src.toString(); } From 1d3895adaed1f80110b7281c67d6bea04187a7e5 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 26 Feb 2022 16:12:28 +0100 Subject: [PATCH 40/84] Update GIO Exp --- .../sysds/runtime/iogen/EXP/GIOFrame.java | 21 ++-- .../iogen/EXP/GIOFrameIdentification.java | 4 +- .../sysds/runtime/iogen/EXP/GIOMatrix.java | 22 ++-- .../iogen/EXP/GIOMatrixIdentification.java | 4 +- 
.../sysds/runtime/iogen/EXP/SystemDS.java | 110 ++++++++++++++++++ .../sysds/runtime/iogen/EXP/SystemDSCSV.java | 34 ------ .../runtime/iogen/codegen/CodeGenTrie.java | 11 +- 7 files changed, 149 insertions(+), 57 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index 7a58e596e54..4e5e2b9ce5a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -4,6 +4,7 @@ import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.wink.json4j.JSONObject; import java.util.ArrayList; import java.util.HashSet; @@ -16,19 +17,25 @@ public static void main(String[] args) throws Exception { String sampleRawDelimiter; String schemaFileName; String dataFileName; - long nrows; + long rows = -1; sampleRawFileName = System.getProperty("sampleRawFileName"); sampleFrameFileName = System.getProperty("sampleFrameFileName"); - sampleRawDelimiter = System.getProperty("delimiter"); - if(sampleRawDelimiter.equals("\\t")) - sampleRawDelimiter = "\t"; + sampleRawDelimiter = "\t"; schemaFileName = System.getProperty("schemaFileName"); dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); + Util util = new Util(); + // read and parse mtd file + String mtdFileName = dataFileName + ".mtd"; + try { + String mtd = util.readEntireTextFile(mtdFileName); + mtd = mtd.replace("\n", "").replace("\r", ""); + mtd = mtd.toLowerCase().trim(); + JSONObject jsonObject = new JSONObject(mtd); + if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); + } catch (Exception exception) 
{} - Util util = new Util(); Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); int ncols = sampleSchema.length; @@ -37,7 +44,7 @@ public static void main(String[] args) throws Exception { String sampleRaw = util.readEntireTextFile(sampleRawFileName); GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); FrameReader fr = gr.getReader(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, nrows, sampleSchema.length); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java index 75127daff57..54593553a9b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java @@ -14,9 +14,7 @@ public static void main(String[] args) throws Exception { sampleRawFileName = System.getProperty("sampleRawFileName"); sampleFrameFileName = System.getProperty("sampleFrameFileName"); - sampleRawDelimiter = System.getProperty("delimiter"); - if(sampleRawDelimiter.equals("\\t")) - sampleRawDelimiter = "\t"; + sampleRawDelimiter = "\t"; schemaFileName = System.getProperty("schemaFileName"); Util util = new Util(); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java index 4deb63ae571..17bd0eccdbb 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java @@ -3,6 +3,7 @@ import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.wink.json4j.JSONObject; public class GIOMatrix { @@ -11,22 
+12,29 @@ public static void main(String[] args) throws Exception { String sampleMatrixFileName; String sampleRawDelimiter; String dataFileName; - long nrows; + long rows = -1; sampleRawFileName = System.getProperty("sampleRawFileName"); sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); - sampleRawDelimiter = System.getProperty("delimiter"); - if(sampleRawDelimiter.equals("\\t")) - sampleRawDelimiter = "\t"; + sampleRawDelimiter = "\t"; dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); - Util util = new Util(); + // read and parse mtd file + String mtdFileName = dataFileName + ".mtd"; + try { + String mtd = util.readEntireTextFile(mtdFileName); + mtd = mtd.replace("\n", "").replace("\r", ""); + mtd = mtd.toLowerCase().trim(); + JSONObject jsonObject = new JSONObject(mtd); + if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); + } catch (Exception exception) {} + + MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); String sampleRaw = util.readEntireTextFile(sampleRawFileName); GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); MatrixReader matrixReader = gr.getReader(); - MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, nrows, sampleMB.getNumColumns(), -1, -1); + MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, rows, sampleMB.getNumColumns(), -1, -1); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java index e000009c1d6..73b08fd480d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java @@ -12,9 +12,7 @@ public static void main(String[] args) throws Exception { sampleRawFileName = 
System.getProperty("sampleRawFileName"); sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); - sampleRawDelimiter = System.getProperty("delimiter"); - if(sampleRawDelimiter.equals("\\t")) - sampleRawDelimiter = "\t"; + sampleRawDelimiter = "\t"; Util util = new Util(); MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java new file mode 100644 index 00000000000..f9bdc5f9a0b --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -0,0 +1,110 @@ +package org.apache.sysds.runtime.iogen.EXP; + +import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.*; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.wink.json4j.JSONException; +import org.apache.wink.json4j.JSONObject; + +import java.io.IOException; +import java.util.Map; + +public class SystemDS { + + public static void main(String[] args) throws IOException, JSONException { + + String schemaFileName; + String dataFileName; + String dataType = null; + String valueType; + String sep = null; + String indSep = null; + boolean header = false; + long cols = -1; + long rows = -1; + String format = null; + String config = null; + String schemaMapFileName = null; + + + Util util = new Util(); + dataFileName = System.getProperty("dataFileName"); + // read and parse mtd file + String mtdFileName = dataFileName + ".mtd"; + try { + String mtd = util.readEntireTextFile(mtdFileName); + mtd = mtd.replace("\n", "").replace("\r", ""); + mtd = mtd.toLowerCase().trim(); + JSONObject jsonObject = new JSONObject(mtd); + if (jsonObject.containsKey("data_type")) dataType = jsonObject.getString("data_type"); + + if (jsonObject.containsKey("value_type")) valueType = jsonObject.getString("value_type"); + + if (jsonObject.containsKey("format")) format = jsonObject.getString("format"); + 
+ if (jsonObject.containsKey("cols")) cols = jsonObject.getLong("cols"); + + if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); + + if (jsonObject.containsKey("header")) header = jsonObject.getBoolean("header"); + } catch (Exception exception) { + } + + if (dataType.equalsIgnoreCase("matrix")) { + MatrixReader matrixReader = null; + switch (format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + matrixReader = new ReaderTextCSV(propertiesCSV); + break; + case "libsvm": + FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); + matrixReader = new ReaderTextLIBSVM(propertiesLIBSVM); + break; + case "mm": + matrixReader = new ReaderTextCell(Types.FileFormat.MM); + break; + } + if (matrixReader == null) throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + } else { + schemaFileName = System.getProperty("schemaFileName"); + Types.ValueType[] schema = util.getSchema(schemaFileName); + cols = schema.length; + FrameBlock frameBlock = null; + + + switch (format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + FrameReader frameReader = new FrameReaderTextCSV(propertiesCSV); + frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); + break; + case "json": + schemaMapFileName = System.getProperty("schemaMapFileName"); + Map schemaMap = util.getSchemaMap(schemaMapFileName); + config = System.getProperty("config"); + switch (config.toLowerCase()) { + case "gson": + FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); + frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + + case "jackson": + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + frameBlock = 
frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + case "json4j": + FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + default: + throw new IOException("JSON Config don't support!!" + config); + } + break; + } + + } + + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java deleted file mode 100644 index aa326c5e187..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSCSV.java +++ /dev/null @@ -1,34 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FileFormatPropertiesCSV; -import org.apache.sysds.runtime.io.FrameReaderJSONGson; -import org.apache.sysds.runtime.io.FrameReaderTextCSV; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONException; - -import java.io.IOException; - -public class SystemDSCSV { - - public static void main(String[] args) throws IOException, JSONException { - - String schemaFileName; - String schemaMapFileName; - String dataFileName; - long nrows; - - schemaFileName = System.getProperty("schemaFileName"); - schemaMapFileName = System.getProperty("schemaMapFileName"); - dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(false, "\t", false); - FrameReaderTextCSV frameReaderTextCSV = new FrameReaderTextCSV(propertiesCSV); - FrameBlock readBlock = frameReaderTextCSV.readFrameFromHDFS(dataFileName, schema, nrows, ncols); - - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 084ebf55041..8abb2ff47bd 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -53,6 +53,8 @@ public CodeGenTrie(CustomProperties properties, String destination) { if(keyTrie != null) { for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { conditions.add(keys.get(0)); + //Gson gson=new Gson(); + //System.out.println(gson.toJson(keys)); break; } } @@ -222,14 +224,16 @@ private void getJavaCodeRegex(StringBuilder src) { src.append("if ( cellStr0.length() > 0 ){\n"); if(isMatrix) { src.append("Double cellValue0; \n"); - src.append("try{cellValue0 = Double.parseDouble(cellStr0); } catch(Exception e){cellValue0= 0d;}\n"); + src.append( + "try{cellValue0 = Double.parseDouble(cellStr0); } catch(Exception e){cellValue0= 0d;}\n"); src.append("if(cellValue0!= 0) { \n"); src.append(destination).append("(row, colIndex0 , cellValue0); \n"); src.append("lnnz++;\n"); src.append("} \n"); } else { - src.append(destination).append("(row, colIndex0 , UtilFunctions.stringToObject(properties.getSchema()[colIndex0], cellStr)0); \n"); + src.append(destination).append( + "(row, colIndex0 , UtilFunctions.stringToObject(properties.getSchema()[colIndex0], cellStr)0); \n"); } src.append("}\n"); } @@ -256,7 +260,8 @@ private void getJavaCodeRegex(StringBuilder src) { src.append("} \n"); } else { - src.append(destination).append("(row, colIndex , UtilFunctions.stringToObject(properties.getSchema()[colIndex], cellStr)); \n"); + src.append(destination).append( + "(row, colIndex , UtilFunctions.stringToObject(properties.getSchema()[colIndex], cellStr)); \n"); } src.append("}\n"); src.append("}\n"); From d26aafd1926aab2fd242436b6f0c8eb788380d2f Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 26 Feb 2022 22:36:29 +0100 Subject: [PATCH 41/84] Update GIO Exp --- 
.../org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 8abb2ff47bd..653d7f932f5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -53,14 +53,14 @@ public CodeGenTrie(CustomProperties properties, String destination) { if(keyTrie != null) { for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { conditions.add(keys.get(0)); - //Gson gson=new Gson(); - //System.out.println(gson.toJson(keys)); +// Gson gson=new Gson(); +// System.out.println(c+" >> "+gson.toJson(keys)); break; } } } - if(conditions.size() < 150) { + if(conditions.size() < 100) { buildPrefixTree(); this.isRegexBase = false; } From 4b98c6d22343c3e9556d65895d5074b5faff37b0 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 27 Feb 2022 01:07:33 +0100 Subject: [PATCH 42/84] Update GIO, Move from 2D Array to MatrixBlock --- pom.xml | 2 +- .../apache/sysds/runtime/iogen/EXP/Util.java | 31 +- .../runtime/iogen/FormatIdentifying.java | 1586 ++++++++--------- .../sysds/runtime/iogen/ReaderMapping.java | 38 +- .../iogen/GenerateReaderMatrixTest.java | 16 +- .../Identify/MatrixGRRowColIdentifyTest.java | 336 ++-- .../iogen/MatrixSingleRowFlatTest.java | 45 + 7 files changed, 1073 insertions(+), 981 deletions(-) diff --git a/pom.xml b/pom.xml index 4be3b3ba96d..0868f44de6d 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ true ** false - -Xms3000m -Xmx9000m -Xmn300m + -Xms3000m -Xmx18000m -Xmn300m false diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index 97989a0ea98..d6a230ef22d 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -94,22 +94,33 @@ public String[][] loadFrameData(String fileName,String delimiter, int ncols) public MatrixBlock loadMatrixData(String fileName, String delimiter) throws IOException { int ncols = 0; + int nrows = 0; try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { String line; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); ncols = Math.max(ncols, Integer.parseInt( data[data.length-1].split("::")[0])); + nrows++; } } - String[][] dataString = loadFrameData(fileName,delimiter, ncols+1); - double[][] data = new double[dataString.length][dataString[0].length]; - for(int i=0;i sampleRawIndexes; - - private static int nrows; - private static int ncols; - private int nlines; - private int windowSize = 20; - private int suffixStringLength = 50; - private ReaderMapping mappingValues; - private CustomProperties properties; - - - public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { - this.mappingValues = new ReaderMapping(raw, matrix); - this.runIdentification(); - } - - public FormatIdentifying(String raw, FrameBlock frame) throws Exception { - this.mappingValues = new ReaderMapping(raw, frame); - this.runIdentification(); - } - - private void runIdentification() { - - mapRow = mappingValues.getMapRow(); - mapCol = mappingValues.getMapCol(); - mapLen = mappingValues.getMapLen(); - sampleRawIndexes = mappingValues.getSampleRawIndexes(); - mapRowPrevious = new int[ncols]; - - for(int c=0; c< ncols; c++) - mapRowPrevious[c] = 0; - - nrows = mappingValues.getNrows(); - ncols = mappingValues.getNcols(); - nlines = mappingValues.getNlines(); - NaN = (ncols * nrows) - mappingValues.getNaN(); - - // Check the map row: - // If all cells of a row mapped to a single line of sample raw, it is a single row mapping - // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping - - boolean 
isSingleRow = false; - int missedCount = 0; - for(int r=0; r beginPos = new HashSet<>(); - KeyTrie rowKeyPattern = null; - - // Select two none zero row as a row index candidate - - int index = 0; - for(int r=1; r1) - break; - } - - for(int c=0; c< Math.min(numberOfSelectedCols, ncols); c++){ - Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); - ArrayList prefixStrings = colPrefixString.getKey(); - ArrayList prefixStringRowIndexes = colPrefixString.getValue(); - ArrayList prefixRawIndex = new ArrayList<>(); - - MappingTrie trie = new MappingTrie(); - int ri = 0; - for(String ps: prefixStrings ) - trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); - - do { - flag = trie.reConstruct(); - }while(flag); - - ArrayList> keyPatterns = trie.getAllSequentialKeys(); - for(ArrayList kp: keyPatterns){ - for(String ps: prefixStrings){ - StringBuilder sb = new StringBuilder(); - int currPos = 0; - for(String k: kp){ - sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); - currPos += sb.length() + k.length(); - } - prefixRawIndex.add(new RawIndex(sb.toString())); - } - } - - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - if(!flag) { - begin = 1; - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - } - if(!flag) { - beginPos.clear(); - break; - } - else - beginPos.add(begin); - if(c== numberOfSelectedCols -1){ - ArrayList rowPrefixStrings = new ArrayList<>(); - MappingTrie rowTrie = new MappingTrie(); - rowKeyPattern = new KeyTrie(); - for(int si: selectedRowIndex) { - for(int ci = 0; ci < ncols; ci++) { - int cri = mapRow[si][ci]; - if(cri != -1) { - String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); - RawIndex rawIndex = new RawIndex(str); - Pair pair = rawIndex.findValue(si + begin); - if(pair != null) { - String pstr = str.substring(0, pair.getKey()); - if(pstr.length() > 0) { - rowPrefixStrings.add(pstr); - rowTrie.insert(pstr, 1); - } - 
rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); - } - } - } - } - - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = rowTrie.getAllSequentialKeys(); - check = false; - for(ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); - check |= newCheck; - if(newCheck) - selectedKeyPatterns.add(keyPattern); - } - if(check) - keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = rowTrie.reConstruct(); - if(!flagReconstruct) - break; - } - }while(!check); - - if(keyPatterns.size() == 0){ - ArrayList> kpl = new ArrayList<>(); - ArrayList kpli = new ArrayList<>(); - kpli.add(""); - kpl.add(kpli); - keyPatterns = kpl; - } - rowKeyPattern.setPrefixKeyPattern(keyPatterns); - } - } - - if(beginPos.size() == 1){ - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); - Integer bpos = beginPos.iterator().next(); - if(bpos>0) - properties.setRowIndexBegin("-"+bpos); - else - properties.setRowIndexBegin(""); - } - else { - KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); - colKeyPattern = buildColsKeyPatternMultiRow(); - properties = new CustomProperties(colKeyPattern, rowDelimPattern); - } - } - } - - private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex){ - for(int r=0;r map = new HashMap<>(); - int nan = 0; - for (Integer t : list) { - if (t != -1) { - Integer val = map.get(t); - map.put(t, val == null ? 
1 : val + 1); - } else - nan++; - } - if (map.size() == 0) - return nan; - - Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) - max = e; - } - return max.getValue() + nan; - } - - private Integer mostCommonValue(int[] list) { - Map map = new HashMap<>(); - for (Integer t : list) { - if (t != -1) { - Integer val = map.get(t); - map.put(t, val == null ? 1 : val + 1); - } - } - if (map.size() == 0) - return -1; - - Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) - max = e; - } - return max.getKey(); - } - - private KeyTrie[] buildColsKeyPatternSingleRow() { - Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); - ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); - KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - - // Clean prefix strings - for(int c =0; c< ncols; c++) { - ArrayList list = prefixStrings.getKey()[c]; - String token = null; - boolean flag = true; - for(int w = 1; w < windowSize && flag; w++) { - HashSet wts = new HashSet<>(); - for(String s : list) { - if(s.length() < w) - flag = false; - else { - String subStr = s.substring(s.length() - w); - if (!subStr.contains(Lop.OPERAND_DELIMITOR)) - wts.add(subStr); - else - flag = false; - } - } - - if(flag) { - if(wts.size() == 1) - token = wts.iterator().next(); - else { - for(String t : wts) { - int count = 0; - for(String s : list) { - if(s.endsWith(t)) - count++; - } - float percent = (float) count / list.size(); - if(percent >= 1) - token = t; - } - } - } - else if(wts.size() == 0) - token = ""; - } - if(token == null) { - int[] listLength = new int[nrows]; - for (int r = 0; r< nrows; r++) - listLength[r] = mapCol[r][c]; - int commonLength = mostCommonValue(listLength); - if (commonLength == 0){ - ArrayList newList = new ArrayList<>(); - for(String s: list){ - if(s.length() == 0) - newList.add(s); - } - prefixStrings.getKey()[c] 
= newList; - } - else - throw new RuntimeException("can't build a key pattern for the column: " + c); - } - else if(token.length() > 0){ - ArrayList newList = new ArrayList<>(); - for(String s: list){ - if(s.endsWith(token)) - newList.add(s); - } - prefixStrings.getKey()[c] = newList; - } - } - - for(int c=0; c> keyPatterns = null; - - - for(String ps: prefixStrings.getKey()[c]) - trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); - - if (trie.getRoot().getChildren().size() == 1){ - String[] splitPattern= prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); - ArrayList reverseSplitPattern = new ArrayList<>(); - for (String ps: splitPattern) - if (ps.length() > 0) - reverseSplitPattern.add(ps); - if (reverseSplitPattern.size() == 0) - reverseSplitPattern.add(""); - - int maxPatternLength = reverseSplitPattern.size(); - check = false; - for(int sp= 0; sp< maxPatternLength;sp++){ - ArrayList shortPattern = new ArrayList<>(); - for(int spi= maxPatternLength - sp-1; spi< maxPatternLength; spi++){ - shortPattern.add(reverseSplitPattern.get(spi)); - } - check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); - if (check) { - keyPatterns = new ArrayList<>(); - keyPatterns.add(shortPattern); - break; - } - } - } - else { - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = trie.getAllSequentialKeys(); - check = false; - for (ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if (newCheck) - selectedKeyPatterns.add(keyPattern); - } - if (check) - keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = trie.reConstruct(); - if (!flagReconstruct) - break; - } - } while (!check); - } - - if(check){ - colKeyPattens[c] = new KeyTrie(keyPatterns); - for(String suffix: suffixStrings[c]) { - colKeyPattens[c].insertSuffixKeys(suffix.substring(0,Math.min(suffixStringLength, suffix.length())).toCharArray()); - } - } - } - 
return colKeyPattens; - } - - // Get all prefix strings of a column - public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { - ArrayList[] prefixStrings = new ArrayList[ncols]; - ArrayList[] rowIndexes = new ArrayList[ncols]; - for(int c=0; c< ncols; c++){ - Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); - prefixStrings[c] = pair.getKey(); - rowIndexes[c] = pair.getValue(); - } - return new Pair<>(prefixStrings, rowIndexes); - } - - public Pair, ArrayList> extractAllPrefixStringsOfAColSingleLine(int colIndex, boolean reverse) { - ArrayList prefixStrings = new ArrayList(); - ArrayList rowIndexes = new ArrayList(); - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][colIndex]; - if(rowIndex != -1) { - rowIndexes.add(rowIndex); - String str = sampleRawIndexes.get(rowIndex).getRemainedTexts(mapCol[r][colIndex]);//sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); - if(reverse) - prefixStrings.add(new StringBuilder(str).reverse().toString()); - else - prefixStrings.add(str); - } - } - return new Pair<>(prefixStrings, rowIndexes); - } - - private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { - ArrayList[] result = new ArrayList[ncols]; - for(int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][c]; - if(rowIndex == -1) - continue; - String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); - result[c].add(str); - } - } - return result; - } - - ///////////////////////////////////////////////////////////////////////////// - // Methods For Multi Lines Mapping // - //////////////////////////////////////////////////////////////////////////// - // This implementation is for nested datasets are scattered on multiple lines - // The following steps are required: - // 1. Extract all prefix strings per column - // 2. Build key pattern tree for each column - // 3. 
Build key pattern for end of values - - private ArrayList> findRowDelimiters(){ - ArrayList> keyPattern = new ArrayList<>(); - Hirschberg hirschberg = new Hirschberg(); - int misMatchPenalty = 3; - int gapPenalty = 2; - - //extract all lines are in record boundary - ArrayList recordBoundaries = new ArrayList<>(); - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - int[] minList = new int[nrows]; - HashMap maxColPos = new HashMap<>(); - int[] minColPos = new int[nrows]; - for(int r=0; r= 0; beginLine--) - if(usedLines[r].get(beginLine)) - break; - - StringBuilder sb = new StringBuilder(); - beginLine = Math.max(beginLine, 0); - - if(beginLine+1 == nlines) - continue; - - Integer subStrPos = 0; - if(maxColPos.containsKey(beginLine)) - subStrPos = maxColPos.get(beginLine); - - String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); - if(str.length() >0) { - sb.append(str).append("\n"); - } - for(int i = beginLine+1 ; i < minList[r]; i++){ - str = sampleRawIndexes.get(i).getRaw(); - if(str.length() > 0) - sb.append(str).append("\n"); - } - - str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); - if(str.length() > 0) - sb.append(str); - recordBoundaries.add(sb.toString()); - } - recordBoundaries.remove(recordBoundaries.size()-1); - - String str1 = recordBoundaries.get(0); - String str2 = recordBoundaries.get(1); - Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); - if(pattern != null) { - String intersect = pattern.getValue(); - ArrayList intersectPattern = pattern.getKey(); - for(int i = 2; i < recordBoundaries.size(); i++) { - pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); - if(pattern != null) { - intersect = pattern.getValue(); - intersectPattern = pattern.getKey(); - } - else - intersect = null; - } - if(intersect != null && intersect.length() > 0) { - keyPattern.add(intersectPattern); - return 
keyPattern; - } - } - return null; - } - - - // Build key pattern tree for each column - private KeyTrie[] buildColsKeyPatternMultiRow(){ - Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); - ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); - - KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - for(int c=0; c intersect = new HashSet<>(); - intersect.add(colDelim); - - KeyTrie trie = new KeyTrie(colDelim); - ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); - boolean check; - do { - ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); - check = false; - for(ArrayList keyPattern: keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if(newCheck){ - trie.setAPrefixPath(keyPattern); - } - } - - if(!check){ - remainedPrefixes.clear(); - boolean flag = true; - for(ArrayList keyPattern: keyPatterns){ - ArrayList remainedPrefix = new ArrayList<>(); - for(String ps : prefixStrings.getKey()[c]) - remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); - - intersect = findStartWithIntersectOfStrings(remainedPrefix); - if(intersect != null) { - trie.insertPrefixKeysConcurrent(intersect); - } - else { - remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); - flag = false; - break; - } - } - if(!flag) - break; - } - } - while(!check); - - // Suffix pattern is based on char, so we need to extract all chars of a string - for(String suffix: suffixStrings[c]) { - trie.insertSuffixKeys(suffix.toCharArray()); - } - colKeyPattens[c] = trie; - } - return colKeyPattens; - } - - // Extract prefix strings: - private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse){ - - ArrayList[] result = new ArrayList[ncols]; - Pair[] minmax = new Pair[ncols]; - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - for(int r=0; r(); - int min = 0; - int max = 0; - for(int r=0; r=0; i--) - if(usedLines[r].get(i)) 
{ - lastLine = i; - break; - } - for(int i= lastLine; i 0 ) - sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); - } - String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][c]); - if(str.length() > 0 && !str.equals("\n")) - sb.append(str); - else if(lastLine < rowIndex) - sb.deleteCharAt(sb.length()-1); - - - if(reverse) - result[c].add(sb.reverse().toString()); - else - result[c].add(sb.toString()); - max = Math.max(max, sb.length()); - if(sb.length()< min || min == 0) - min = sb.length(); - minmax[c] = new Pair<>(min, max); - } - } - return new Pair<>(result, minmax); - } - - private String findStartWithIntersectOfStrings(ArrayList strList, int minLength){ - StringBuilder sb = new StringBuilder(); - int i = 0; - boolean flag = true; - do { - char ch = strList.get(0).charAt(i); - for(int j=1; j findStartWithIntersectOfStrings(ArrayList strList){ - // 1. Extract all substrings - // 2. Find intersection of substrings - - HashSet[] substrings = new HashSet[strList.size()]; - for(int i=0; i< strList.size(); i++) - substrings[i] = new HashSet<>(); - - for(int w = windowSize; w > 2; w--) { - for(int i=0; i totalIntersect = new HashSet<>(substrings[0]); - for(int r=1; r 0) - return totalIntersect; - - } - return null; - } - - private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys){ - if(keys.size() == 1){ - String k = keys.get(0); - if (k.length() == 0) - return true; - } - - for(String ps: prefixStrings){ - int currentPos = 0; - int patternCount = 0; - do { - currentPos = getIndexOfKeyPatternOnString(ps, keys, currentPos).getKey(); - if(currentPos == -1) - break; - else { - patternCount++; - currentPos++; - } - }while(true); - if(patternCount!=1) - return false; - } - return true; - } - - // Check the sequential list of keys are on a string - private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { - - int currPos = beginPos; - boolean flag = true; - int startPos = -1; - for(String k : key) { - 
int index = str.indexOf(k, currPos); - if(index != -1) - currPos = index + k.length(); - else { - flag = false; - break; - } - if(startPos==-1) - startPos = currPos; - } - if(flag) - return new Pair<>(startPos, currPos+key.get(key.size()-1).length()); - else - return new Pair<>(-1,-1); - } - - private ArrayList getAllSubstringsOfAString(String str,int size){ - ArrayList result = new ArrayList<>(); - if(str == null) - return result; - for(int i = 0; i <= str.length() - size; i++){ - String s = str.substring(i, i + size); - if(!s.contains("\n")) - result.add(s); - } - return result; - } - - private String getRemainedSubstring(String str, ArrayList keys){ - boolean flag = true; - int currPos = 0; - for(String k : keys) { - int index = str.indexOf(k, currPos); - if(index != -1) - currPos = index + k.length(); - else { - flag = false; - break; - } - } - if(flag) - return str.substring(currPos); - else - return null; - } - - private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { - ArrayList[] result = new ArrayList[ncols]; - for(int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][c]; - if(rowIndex == -1) - continue; - StringBuilder sb = new StringBuilder(); - String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); - boolean enter = false; - if(str.length() > 0) { - sb.append(str); - enter = true; - } - - for(int i = rowIndex + 1; i < nlines; i++) { - str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); - if(str.length() > 0 && !enter) { - sb.append(str); - break; - } - } - if(sb.length() > 0) - sb.deleteCharAt(sb.length() - 1); - result[c].add(sb.toString()); - } - } - return result; - } + private MatrixBlock mapRow; + private int[] mapRowPrevious; + private MatrixBlock mapCol; + private MatrixBlock mapLen; + private MatrixBlock mapHas; + private int NaN; + private ArrayList 
sampleRawIndexes; + + private static int nrows; + private static int ncols; + private int nlines; + private int windowSize = 20; + private int suffixStringLength = 50; + private ReaderMapping mappingValues; + private CustomProperties properties; + + + public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { + this.mappingValues = new ReaderMapping(raw, matrix); + this.runIdentification(); + } + + public FormatIdentifying(String raw, FrameBlock frame) throws Exception { + this.mappingValues = new ReaderMapping(raw, frame); + this.runIdentification(); + } + + private void runIdentification() { + + mapRow = mappingValues.getMapRow(); + mapCol = mappingValues.getMapCol(); + mapLen = mappingValues.getMapLen(); + mapHas = mappingValues.getMapHas(); + sampleRawIndexes = mappingValues.getSampleRawIndexes(); + mapRowPrevious = new int[ncols]; + + for (int c = 0; c < ncols; c++) + mapRowPrevious[c] = 0; + + nrows = mappingValues.getNrows(); + ncols = mappingValues.getNcols(); + nlines = mappingValues.getNlines(); + NaN = (ncols * nrows) - mappingValues.getNaN(); + + // Check the map row: + // If all cells of a row mapped to a single line of sample raw, it is a single row mapping + // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping + + boolean isSingleRow = false; + int missedCount = 0; + for (int r = 0; r < nrows; r++) + missedCount += ncols - mostCommonScore(r); + if ((float) missedCount / NaN < 0.07) isSingleRow = true; + + KeyTrie[] colKeyPattern; + + if (isSingleRow) { + colKeyPattern = buildColsKeyPatternSingleRow(); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); + } else { + + // Check the row index is a prefix string in sample raw + // if the row indexes are in the prefix of values, so we need to build a key pattern + // to extract row indexes + // for understanding row indexes are in sample raw we check just 3 column of data + // for build a key pattern related 
to row indexes we just selected a row + boolean flag; + int numberOfSelectedCols = 3; + int begin = 0; + boolean check, flagReconstruct; + int[] selectedRowIndex = new int[2]; + HashSet beginPos = new HashSet<>(); + KeyTrie rowKeyPattern = null; + + // Select two none zero row as a row index candidate + + int index = 0; + for (int r = 1; r < nrows; r++) { + for (int c = 0; c < ncols; c++) + if (mapHas.getValue(r, c) != 0) { + selectedRowIndex[index++] = r; + break; + } + if (index > 1) break; + } + + for (int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + ArrayList prefixStrings = colPrefixString.getKey(); + ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + ArrayList prefixRawIndex = new ArrayList<>(); + + MappingTrie trie = new MappingTrie(); + int ri = 0; + for (String ps : prefixStrings) + trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + + do { + flag = trie.reConstruct(); + } while (flag); + + ArrayList> keyPatterns = trie.getAllSequentialKeys(); + for (ArrayList kp : keyPatterns) { + for (String ps : prefixStrings) { + StringBuilder sb = new StringBuilder(); + int currPos = 0; + for (String k : kp) { + sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + currPos += sb.length() + k.length(); + } + prefixRawIndex.add(new RawIndex(sb.toString())); + } + } + + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + if (!flag) { + begin = 1; + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + } + if (!flag) { + beginPos.clear(); + break; + } else beginPos.add(begin); + if (c == numberOfSelectedCols - 1) { + ArrayList rowPrefixStrings = new ArrayList<>(); + MappingTrie rowTrie = new MappingTrie(); + rowKeyPattern = new KeyTrie(); + for (int si : selectedRowIndex) { + for (int ci = 0; ci < ncols; ci++) { + int cri = (int) mapRow.getValue(si, ci); + if (mapHas.getValue(si, ci) == 1) { + String str = 
sampleRawIndexes.get(cri).getSubString(0, (int) mapCol.getValue(si, ci)); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(si + begin); + if (pair != null) { + String pstr = str.substring(0, pair.getKey()); + if (pstr.length() > 0) { + rowPrefixStrings.add(pstr); + rowTrie.insert(pstr, 1); + } + rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); + } + } + } + } + + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = rowTrie.getAllSequentialKeys(); + check = false; + for (ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); + check |= newCheck; + if (newCheck) selectedKeyPatterns.add(keyPattern); + } + if (check) keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = rowTrie.reConstruct(); + if (!flagReconstruct) break; + } + } while (!check); + + if (keyPatterns.size() == 0) { + ArrayList> kpl = new ArrayList<>(); + ArrayList kpli = new ArrayList<>(); + kpli.add(""); + kpl.add(kpli); + keyPatterns = kpl; + } + rowKeyPattern.setPrefixKeyPattern(keyPatterns); + } + } + + if (beginPos.size() == 1) { + colKeyPattern = buildColsKeyPatternSingleRow(); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); + Integer bpos = beginPos.iterator().next(); + if (bpos > 0) properties.setRowIndexBegin("-" + bpos); + else properties.setRowIndexBegin(""); + } else { + KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); + colKeyPattern = buildColsKeyPatternMultiRow(); + properties = new CustomProperties(colKeyPattern, rowDelimPattern); + } + } + } + + private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex) { + for (int r = 0; r < nrows; r++) { + int rowIndex = (int) this.mapRow.getValue(r, colIndex); + if (rowIndex != -1) { + boolean flag = false; + for (RawIndex ri : prefixRawIndex) { + if (ri.findValue(r + beginPos) != null) { + 
flag = true; + break; + } + } + if (!flag) return false; + } + } + return true; + } + + public CustomProperties getFormatProperties() { + return properties; + } + + private Integer mostCommonScore(int rowIndex) { + Map map = new HashMap<>(); + int nan = 0; + for (int c = 0; c < ncols; c++) { + if (mapHas.getValue(rowIndex, c) != 0) { + int t = (int) mapRow.getValue(rowIndex, c); + Integer val = map.get(t); + map.put(t, val == null ? 1 : val + 1); + } else nan++; + } + if (map.size() == 0) return nan; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) max = e; + } + return max.getValue() + nan; + } + + private Integer mostCommonValue(int[] list) { + Map map = new HashMap<>(); + for (Integer t : list) { + if (t != -1) { + Integer val = map.get(t); + map.put(t, val == null ? 1 : val + 1); + } + } + if (map.size() == 0) return -1; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) max = e; + } + return max.getKey(); + } + + private KeyTrie[] buildColsKeyPatternSingleRow() { + Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; + + // Clean prefix strings + for (int c = 0; c < ncols; c++) { + ArrayList list = prefixStrings.getKey()[c]; + String token = null; + boolean flag = true; + for (int w = 1; w < windowSize && flag; w++) { + HashSet wts = new HashSet<>(); + for (String s : list) { + if (s.length() < w) flag = false; + else { + String subStr = s.substring(s.length() - w); + if (!subStr.contains(Lop.OPERAND_DELIMITOR)) wts.add(subStr); + else flag = false; + } + } + + if (flag) { + if (wts.size() == 1) token = wts.iterator().next(); + else { + for (String t : wts) { + int count = 0; + for (String s : list) { + if (s.endsWith(t)) count++; + } + float percent = (float) count / 
list.size(); + if (percent >= 1) token = t; + } + } + } else if (wts.size() == 0) token = ""; + } + if (token == null) { + int[] listLength = new int[nrows]; + for (int r = 0; r < nrows; r++) { + if (mapHas.getValue(r, c) == 1) listLength[r] = (int) mapCol.getValue(r, c); + else listLength[r] = -1; + } + int commonLength = mostCommonValue(listLength); + if (commonLength == 0) { + ArrayList newList = new ArrayList<>(); + for (String s : list) { + if (s.length() == 0) newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } else throw new RuntimeException("can't build a key pattern for the column: " + c); + } else if (token.length() > 0) { + ArrayList newList = new ArrayList<>(); + for (String s : list) { + if (s.endsWith(token)) newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + } + + for (int c = 0; c < ncols; c++) { + MappingTrie trie = new MappingTrie(); + int ri = 0; + boolean check; + boolean flagReconstruct; + ArrayList> keyPatterns = null; + + + for (String ps : prefixStrings.getKey()[c]) + trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); + + if (trie.getRoot().getChildren().size() == 1) { + String[] splitPattern = prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); + ArrayList reverseSplitPattern = new ArrayList<>(); + for (String ps : splitPattern) + if (ps.length() > 0) reverseSplitPattern.add(ps); + if (reverseSplitPattern.size() == 0) reverseSplitPattern.add(""); + + int maxPatternLength = reverseSplitPattern.size(); + check = false; + for (int sp = 0; sp < maxPatternLength; sp++) { + ArrayList shortPattern = new ArrayList<>(); + for (int spi = maxPatternLength - sp - 1; spi < maxPatternLength; spi++) { + shortPattern.add(reverseSplitPattern.get(spi)); + } + check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); + if (check) { + keyPatterns = new ArrayList<>(); + keyPatterns.add(shortPattern); + break; + } + } + } else { + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + 
keyPatterns = trie.getAllSequentialKeys(); + check = false; + for (ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if (newCheck) selectedKeyPatterns.add(keyPattern); + } + if (check) keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = trie.reConstruct(); + if (!flagReconstruct) break; + } + } while (!check); + } + + if (check) { + colKeyPattens[c] = new KeyTrie(keyPatterns); + for (String suffix : suffixStrings[c]) { + colKeyPattens[c].insertSuffixKeys(suffix.substring(0, Math.min(suffixStringLength, suffix.length())).toCharArray()); + } + } + } + return colKeyPattens; + } + + // Get all prefix strings of a column + public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { + ArrayList[] prefixStrings = new ArrayList[ncols]; + ArrayList[] rowIndexes = new ArrayList[ncols]; + for (int c = 0; c < ncols; c++) { + Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); + prefixStrings[c] = pair.getKey(); + rowIndexes[c] = pair.getValue(); + } + return new Pair<>(prefixStrings, rowIndexes); + } + + public Pair, ArrayList> extractAllPrefixStringsOfAColSingleLine(int colIndex, boolean reverse) { + ArrayList prefixStrings = new ArrayList(); + ArrayList rowIndexes = new ArrayList(); + for (int r = 0; r < nrows; r++) { + int rowIndex = (int) mapRow.getValue(r, colIndex); + if (mapHas.getValue(r, colIndex) == 1) { + rowIndexes.add(rowIndex); + String str = sampleRawIndexes.get(rowIndex).getRemainedTexts((int) mapCol.getValue(r, colIndex));//sampleRawIndexes.get(rowIndex) + // .getSubString(0, + // mapCol[r][colIndex]); + if (reverse) prefixStrings.add(new StringBuilder(str).reverse().toString()); + else prefixStrings.add(str); + } + } + return new Pair<>(prefixStrings, rowIndexes); + } + + private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { + ArrayList[] result = new ArrayList[ncols]; + for (int c = 0; c < 
ncols; c++) { + result[c] = new ArrayList<>(); + for (int r = 0; r < nrows; r++) { + int rowIndex = (int) mapRow.getValue(r, c); + if (mapHas.getValue(r,c) != 1) continue; + String str = sampleRawIndexes.get(rowIndex).getRaw().substring((int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); + result[c].add(str); + } + } + return result; + } + + ///////////////////////////////////////////////////////////////////////////// + // Methods For Multi Lines Mapping // + //////////////////////////////////////////////////////////////////////////// + // This implementation is for nested datasets are scattered on multiple lines + // The following steps are required: + // 1. Extract all prefix strings per column + // 2. Build key pattern tree for each column + // 3. Build key pattern for end of values + + private ArrayList> findRowDelimiters() { + ArrayList> keyPattern = new ArrayList<>(); + Hirschberg hirschberg = new Hirschberg(); + int misMatchPenalty = 3; + int gapPenalty = 2; + + //extract all lines are in record boundary + ArrayList recordBoundaries = new ArrayList<>(); + BitSet[] tmpUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + int[] minList = new int[nrows]; + HashMap maxColPos = new HashMap<>(); + int[] minColPos = new int[nrows]; + for (int r = 0; r < nrows; r++) + tmpUsedLines[r] = new BitSet(); + + for (int r = 0; r < nrows; r++) { + int min = nlines; + int minPos = 0; + for (int c = 0; c < ncols; c++) + if (mapHas.getValue(r, c) != 0) { + tmpUsedLines[r].set((int) mapRow.getValue(r, c)); + if (mapRow.getValue(r, c) <= min) { + min = (int) mapRow.getValue(r, c); + if (minPos != 0) minPos = (int) Math.min(minPos, mapCol.getValue(r, c)); + else minPos = (int) mapCol.getValue(r, c); + + } + if (maxColPos.containsKey((int) mapRow.getValue(r, c))) + maxColPos.put((int) mapRow.getValue(r, c), (int) Math.max(maxColPos.get((int) mapRow.getValue(r, c)), (int) mapCol.getValue(r, c) + mapLen.getValue(r, c))); + else maxColPos.put((int) 
mapRow.getValue(r, c), (int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); + } + minList[r] = min; + minColPos[r] = minPos; + } + + for (int r = 0; r < nrows; r++) { + usedLines[r] = new BitSet(nlines); + for (int i = 0; i < nrows; i++) { + if (i != r) usedLines[r].or(tmpUsedLines[i]); + } + } + + for (int r = 0; r < nrows; r++) { + int beginLine = minList[r]; + for (; beginLine >= 0; beginLine--) + if (usedLines[r].get(beginLine)) break; + + StringBuilder sb = new StringBuilder(); + beginLine = Math.max(beginLine, 0); + + if (beginLine + 1 == nlines) continue; + + Integer subStrPos = 0; + if (maxColPos.containsKey(beginLine)) subStrPos = maxColPos.get(beginLine); + + String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); + if (str.length() > 0) { + sb.append(str).append("\n"); + } + for (int i = beginLine + 1; i < minList[r]; i++) { + str = sampleRawIndexes.get(i).getRaw(); + if (str.length() > 0) sb.append(str).append("\n"); + } + + str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); + if (str.length() > 0) sb.append(str); + recordBoundaries.add(sb.toString()); + } + recordBoundaries.remove(recordBoundaries.size() - 1); + + String str1 = recordBoundaries.get(0); + String str2 = recordBoundaries.get(1); + Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); + if (pattern != null) { + String intersect = pattern.getValue(); + ArrayList intersectPattern = pattern.getKey(); + for (int i = 2; i < recordBoundaries.size(); i++) { + pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); + if (pattern != null) { + intersect = pattern.getValue(); + intersectPattern = pattern.getKey(); + } else intersect = null; + } + if (intersect != null && intersect.length() > 0) { + keyPattern.add(intersectPattern); + return keyPattern; + } + } + return null; + } + + + // Build key pattern tree for each column + private KeyTrie[] buildColsKeyPatternMultiRow() { + Pair[], 
Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); + + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; + for (int c = 0; c < ncols; c++) { + // 1. Build Prefix Key Pattern + String colDelim = findStartWithIntersectOfStrings(prefixStrings.getKey()[c], prefixStrings.getValue()[c].getKey()); + + HashSet intersect = new HashSet<>(); + intersect.add(colDelim); + + KeyTrie trie = new KeyTrie(colDelim); + ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); + boolean check; + do { + ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); + check = false; + for (ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if (newCheck) { + trie.setAPrefixPath(keyPattern); + } + } + + if (!check) { + remainedPrefixes.clear(); + boolean flag = true; + for (ArrayList keyPattern : keyPatterns) { + ArrayList remainedPrefix = new ArrayList<>(); + for (String ps : prefixStrings.getKey()[c]) + remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); + + intersect = findStartWithIntersectOfStrings(remainedPrefix); + if (intersect != null) { + trie.insertPrefixKeysConcurrent(intersect); + } else { + remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); + flag = false; + break; + } + } + if (!flag) break; + } + } while (!check); + + // Suffix pattern is based on char, so we need to extract all chars of a string + for (String suffix : suffixStrings[c]) { + trie.insertSuffixKeys(suffix.toCharArray()); + } + colKeyPattens[c] = trie; + } + return colKeyPattens; + } + + // Extract prefix strings: + private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse) { + + ArrayList[] result = new ArrayList[ncols]; + Pair[] minmax = new Pair[ncols]; + BitSet[] tmpUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + for (int r = 0; r < nrows; r++) + tmpUsedLines[r] = new 
BitSet(); + + for (int r = 0; r < nrows; r++) + for (int c = 0; c < ncols; c++) + if (mapHas.getValue(r, c) != 0) tmpUsedLines[r].set((int) mapRow.getValue(r, c)); + + for (int r = 0; r < nrows; r++) { + usedLines[r] = new BitSet(nlines); + for (int i = 0; i < nrows; i++) { + if (i != r) usedLines[r].or(tmpUsedLines[i]); + } + } + + // extract prefix strings + for (int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + int min = 0; + int max = 0; + for (int r = 0; r < nrows; r++) { + int rowIndex = (int) mapRow.getValue(r, c); + if (mapHas.getValue(r,c) != 1) continue; + StringBuilder sb = new StringBuilder(); + int lastLine = 0; + + for (int i = rowIndex - 1; i >= 0; i--) + if (usedLines[r].get(i)) { + lastLine = i; + break; + } + for (int i = lastLine; i < rowIndex; i++) { + if (sampleRawIndexes.get(i).getRawLength() > 0) sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); + } + String str = sampleRawIndexes.get(rowIndex).getSubString(0, (int) mapCol.getValue(r, c)); + if (str.length() > 0 && !str.equals("\n")) sb.append(str); + else if (lastLine < rowIndex) sb.deleteCharAt(sb.length() - 1); + + + if (reverse) result[c].add(sb.reverse().toString()); + else result[c].add(sb.toString()); + max = Math.max(max, sb.length()); + if (sb.length() < min || min == 0) min = sb.length(); + minmax[c] = new Pair<>(min, max); + } + } + return new Pair<>(result, minmax); + } + + private String findStartWithIntersectOfStrings(ArrayList strList, int minLength) { + StringBuilder sb = new StringBuilder(); + int i = 0; + boolean flag = true; + do { + char ch = strList.get(0).charAt(i); + for (int j = 1; j < Math.min(strList.size(), minLength); j++) { + char cch = strList.get(j).charAt(i); + if (ch != cch || ch == '\n') { + flag = false; + break; + } + } + if (flag) sb.append(ch); + i++; + } while (flag && i < minLength); + return sb.toString(); + + } + + private HashSet findStartWithIntersectOfStrings(ArrayList strList) { + // 1. Extract all substrings + // 2. 
Find intersection of substrings + + HashSet[] substrings = new HashSet[strList.size()]; + for (int i = 0; i < strList.size(); i++) + substrings[i] = new HashSet<>(); + + for (int w = windowSize; w > 2; w--) { + for (int i = 0; i < strList.size(); i++) { + substrings[i].clear(); + substrings[i].addAll(getAllSubstringsOfAString(strList.get(i), w)); + } + + HashSet totalIntersect = new HashSet<>(substrings[0]); + for (int r = 1; r < substrings.length; r++) + totalIntersect.retainAll(substrings[r]); + + if (totalIntersect.size() > 0) return totalIntersect; + + } + return null; + } + + private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys) { + if (keys.size() == 1) { + String k = keys.get(0); + if (k.length() == 0) return true; + } + + for (String ps : prefixStrings) { + int currentPos = 0; + int patternCount = 0; + do { + currentPos = getIndexOfKeyPatternOnString(ps, keys, currentPos).getKey(); + if (currentPos == -1) break; + else { + patternCount++; + currentPos++; + } + } while (true); + if (patternCount != 1) return false; + } + return true; + } + + // Check the sequential list of keys are on a string + private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { + + int currPos = beginPos; + boolean flag = true; + int startPos = -1; + for (String k : key) { + int index = str.indexOf(k, currPos); + if (index != -1) currPos = index + k.length(); + else { + flag = false; + break; + } + if (startPos == -1) startPos = currPos; + } + if (flag) return new Pair<>(startPos, currPos + key.get(key.size() - 1).length()); + else return new Pair<>(-1, -1); + } + + private ArrayList getAllSubstringsOfAString(String str, int size) { + ArrayList result = new ArrayList<>(); + if (str == null) return result; + for (int i = 0; i <= str.length() - size; i++) { + String s = str.substring(i, i + size); + if (!s.contains("\n")) result.add(s); + } + return result; + } + + private String getRemainedSubstring(String str, ArrayList keys) { + 
boolean flag = true; + int currPos = 0; + for (String k : keys) { + int index = str.indexOf(k, currPos); + if (index != -1) currPos = index + k.length(); + else { + flag = false; + break; + } + } + if (flag) return str.substring(currPos); + else return null; + } + + private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { + ArrayList[] result = new ArrayList[ncols]; + for (int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + + for (int r = 0; r < nrows; r++) { + int rowIndex = (int) mapRow.getValue(r, c); + if (mapHas.getValue(r,c) != 1) continue; + StringBuilder sb = new StringBuilder(); + String str = sampleRawIndexes.get(rowIndex).getRaw().substring((int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); + boolean enter = false; + if (str.length() > 0) { + sb.append(str); + enter = true; + } + + for (int i = rowIndex + 1; i < nlines; i++) { + str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); + if (str.length() > 0 && !enter) { + sb.append(str); + break; + } + } + if (sb.length() > 0) sb.deleteCharAt(sb.length() - 1); + result[c].add(sb.toString()); + } + } + return result; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index de6c1e98611..1bcddd8b2ed 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -33,9 +33,10 @@ public class ReaderMapping { - private int[][] mapRow; - private int[][] mapCol; - private int[][] mapLen; + private MatrixBlock mapRow; + private MatrixBlock mapCol; + private MatrixBlock mapLen; + private MatrixBlock mapHas; private boolean mapped; private final int nrows; private final int ncols; @@ -96,16 +97,12 @@ private void runMapping(boolean isIndexMapping) { } protected boolean findMapping(boolean isIndexMapping) { - mapRow = new int[nrows][ncols]; - 
mapCol = new int[nrows][ncols]; - mapLen = new int[nrows][ncols]; + mapRow = new MatrixBlock(nrows, ncols, true); + mapCol = new MatrixBlock(nrows, ncols, true); + mapLen = new MatrixBlock(nrows, ncols, true); + mapHas = new MatrixBlock(nrows, ncols, true); NaN = 0; - // Set "-1" as default value for all defined matrix - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) - mapRow[r][c] = mapCol[r][c] = mapLen[r][c] = -1; - int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { @@ -118,9 +115,10 @@ protected boolean findMapping(boolean isIndexMapping) { Pair pair = this.isMatrix ? ri.findValue( sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); if(pair != null) { - mapRow[r][c] = itRow; - mapCol[r][c] = pair.getKey(); - mapLen[r][c] = pair.getValue(); + mapRow.setValue(r,c, itRow); + mapCol.setValue(r,c,pair.getKey()); + mapLen.setValue(r,c,pair.getValue()); + mapHas.setValue(r,c,1); break; } else { @@ -138,7 +136,7 @@ protected boolean findMapping(boolean isIndexMapping) { boolean flagMap = true; for(int r = 0; r < nrows && flagMap; r++) for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, + if(mapHas.getDouble(r,c) == -1 && ((!this.isMatrix && this.sampleFrame.get(r, c) != null) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { flagMap = false; @@ -150,18 +148,22 @@ public int getNaN() { return NaN; } - public int[][] getMapRow() { + public MatrixBlock getMapRow() { return mapRow; } - public int[][] getMapCol() { + public MatrixBlock getMapCol() { return mapCol; } - public int[][] getMapLen() { + public MatrixBlock getMapLen() { return mapLen; } + public MatrixBlock getMapHas() { + return mapHas; + } + public ArrayList getSampleRawIndexes() { return sampleRawIndexes; } diff --git 
a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index b14ae35225c..ec38386f6c1 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -90,16 +90,12 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); -// myTest mt = new myTest(formatIdentifying.getFormatProperties()); -// mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); -// int a = 100; - -// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); -// MatrixReader mr = gr.getReader(); -// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); -// -// TestUtils.compareMatrices(sampleMB, matrixBlock, 0); + //FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader mr = gr.getReader(); + MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); + + TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } catch(Exception exception) { diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 40424bc54f3..6f9727579d7 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -19,21 +19,36 @@ package 
org.apache.sysds.test.functions.iogen.Identify; +import com.google.gson.Gson; import org.apache.sysds.common.Types; +import org.apache.sysds.lops.Lop; +import org.apache.sysds.runtime.io.FileFormatPropertiesLIBSVM; import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.io.FrameReaderJSONJackson; import org.apache.sysds.runtime.io.FrameReaderJSONL; +import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.io.ReaderTextLIBSVM; +import org.apache.sysds.runtime.iogen.FormatIdentifying; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.iogen.EXP.Util; +import org.apache.sysds.runtime.iogen.Hirschberg; +import org.apache.sysds.runtime.iogen.MappingTrie; import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; import org.junit.Test; +import java.io.IOException; +import java.security.GeneralSecurityException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; +import java.util.List; import java.util.Map; import java.util.Random; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { @@ -170,101 +185,22 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou runGenerateReaderTest(); } - @Test public void test12() { - // sampleRaw = "#index 1\n" + - // "#t 2,3\n" + - // "#s 1980\n"+ - // "#index 10\n\n" + - // "#t 21,30\n" + - // "#s 2000\n\n"+ - // "#index 100\n" + - // "#t 200,300\n" + - // "#s 2222"; - // - // sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; - // runGenerateReaderTest(); + @Test public void test101() throws IOException { - StringBuilder sb = new StringBuilder( - " ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD 
gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); - System.out.println(sb.reverse()); + FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(" ", ":"); + ReaderTextLIBSVM readerTextLIBSVM = new ReaderTextLIBSVM(propertiesLIBSVM); + MatrixBlock mb = readerTextLIBSVM.readMatrixFromHDFS("/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/susy-libsvm/susy-libsvm.data",-1,18,-1,-1); } - // @Test - // public void test13() throws Exception { - // String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; - // String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; - // Integer sampleNRows = 1000; - // String delimiter = "\\t"; - // String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/aminer_author_5.schema"; - // String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/aminer_author.data"; - // - // Float percent = 7f;//Float.parseFloat(args[6]); - // String datasetName = "aminer_paper";//args[7]; - // String LOG_HOME ="/home/saeed/Documents/ExpLog";//args[8]; - // - // if(delimiter.equals("\\t")) - // delimiter = "\t"; - // - // Util util = new Util(); - // Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - // int ncols = sampleSchema.length; - // - // ArrayList newSampleSchema = new ArrayList<>(); - // ArrayList> newSampleFrame = new ArrayList<>(); - // - // String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); - // - // for(int c = 0; c < sampleFrameStrings[0].length; c++) { - // HashSet valueSet = new HashSet<>(); - // for(int r=0; r0){ - // ArrayList tempList = new ArrayList<>(); - // for(int r=0; r newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new ArrayList<>(); - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, 
delimiter); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,0); for(int c = 0; c < sampleFrameStrings[0].length; c++) { HashSet valueSet = new HashSet<>(); @@ -308,42 +244,214 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou String sampleRaw = util.readEntireTextFile(sampleRawFileName); GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr =gr.getReader(); + FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); + int a = 100; } } + + @Test public void test14() throws Exception { -// FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + ///home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/twitter-examples/F10 + for(int f = 1; f <= 784; f++) { + System.out.println("+++++++++++++++++++++ Q=" + f); + String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/F" + f + "/sample-mnist8m-libsvm200.raw"; + String sampleMatrixFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/F" + f + "/sample-mnist8m-libsvm200.matrix"; + String delimiter = "\\t"; + String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/mnist8m-libsvm.data"; + + Util util = new Util(); + + MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, delimiter); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader mr = gr.getReader(); +// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); + +// FormatIdentifying fi = new FormatIdentifying(sampleRaw,sampleMB); +// +// myregex mr = new myregex(fi.getFormatProperties()); +// 
mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); + + int a = 100; + + } + } + + @Test public void test15() throws Exception { + String str = "0 1:9.728614687919616699e-01 2:6.538545489311218262e-01 3:1.176224589347839355e+00 4:1.157156467437744141e+00 5:-1.739873170852661133e+00 6:-8.743090629577636719e-01 7:5.677649974822998047e-01 8:-1.750000417232513428e-01 9:8.100607395172119141e-01 10:-2.525521218776702881e-01 11:1.921887040138244629e+00 12:8.896374106407165527e-01 13:4.107718467712402344e-01 14:1.145620822906494141e+00 15:1.932632088661193848e+00 16:9.944640994071960449e-01 17:1.367815494537353516e+00 18:4.071449860930442810e-02"; + String str1="0 1:0.30151 2:0.30151 3:0.30151 4:0.30151 5:0.30151 6:0.30151 7:0.30151 8:0.30151 9:0.30151 10:0.30151 11:0.30151"; + + +// String str = " 123:"; +// String s= str.replaceAll("\\d+","\\\\d+"); +// System.out.println(s); + + //(?<=^|[\w\d]\s)([\w\d]+)(?=\s|$) + + String regex = "(\\d+:)";//"(?<=\\d:)(.*?)(?=\\d:)"; //(.*?)(\d+:) + +// String regex="\\d+:"; + + List allMatches = new ArrayList(); + + for(int i=0;i<10000000;i++) { + Matcher m = Pattern.compile(regex).matcher(str1); + while(m.find()) { + String s = m.group(1) + " ";//+ m.group(3);//+" "+ m.group(5); + //System.out.println(s); + //allMatches.add(m.group(5)); + } + } + + + // -// String FILENAME_SINGLE = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; -// Types.ValueType[] schema = {Types.ValueType.STRING,Types.ValueType.STRING,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64}; +// Pattern p = Pattern.compile(regex); // -// Map schemaMap = new HashMap<>(); -// schemaMap.put("/returnFlag",0); -// schemaMap.put("/lineStatus",1); -// schemaMap.put("/quantity",2); -// schemaMap.put("/extendedPrice",3); -// schemaMap.put("/discount",4); -// schemaMap.put("/tax",5); -// // Read FrameBlock -// FrameBlock readBlock = 
frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); +// // Find match between given string +// // and regular expression +// // using Pattern.matcher() +// Matcher m = p.matcher(str); // -// int a = 100; +// // Get the subsequence +// // using find() method +// while(m.find()) { +// System.out.println(m.group()+" "+m.start()+" "+ m.end()+" "); +// } + + // int misMatchPenalty = 3; + // int gapPenalty = 2; + // Hirschberg hirschberg = new Hirschberg(); + + // ArrayList list = new ArrayList<>(); + // for(int i=0;i<100000000;i++){ + // list.add(" "+i+":"+i+"--"); + // } + // + // ArrayList ll = hirschberg.getLCS(list, misMatchPenalty,gapPenalty); + // Gson gson = new Gson(); + // System.out.println(gson.toJson(ll)); + // + // + // + //// List allMatches = new ArrayList(); + //// Matcher m = Pattern.compile("\\s\\w:").matcher(str); + //// while (m.find()) { + //// + //// allMatches.add(m.group()); + //// } + //// for(String s: allMatches) + //// System.out.println(s); + //// + + //--------------------------------------------- + // Regex to extract the string + // between two delimiters + // String regex = "\\[(.*?)\\]"; + // + // // Compile the Regex. 
+ // Pattern p = Pattern.compile(regex); + // + // // Find match between given string + // // and regular expression + // // using Pattern.matcher() + // Matcher m = p.matcher(str); + // + // // Get the subsequence + // // using find() method + // while (m.find()) + // { + // System.out.println(m.group(1)); + // } + // //---------------------------------------------- + // Pattern.compile() + // MappingTrie mappingTrie = new MappingTrie(); + // for(int i=0;i<1000000;i++){ + // mappingTrie.insert(" "+i+":",i); + // } + // + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 123:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 124:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 125:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 256233:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 58296:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 10000:",0); + // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 9658263:",0); + // + // boolean flag=false; + // do { + // flag = mappingTrie.reConstruct(); + // }while(flag); + // + // ArrayList> myList = mappingTrie.getAllSequentialKeys(); + // Gson gson = new Gson(); + // System.out.println(gson.toJson(myList.get(0))); + + } + + + @Test public void test16() throws Exception { + ///home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/twitter-examples/F10 + for(int f = 1; f <= 2; f++) { + System.out.println("+++++++++++++++++++++ Q=" + f); + String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + "/sample-yelp-csv200.raw"; + String sampleFrameFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + "/sample-yelp-csv200.frame"; + String delimiter = "\\t"; + String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/yelp-csv.data"; + String schemaFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + 
"/yelp-csv.schema"; + + Util util = new Util(); + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; - String schemaFileName ="/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schema"; - String schemaMapFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schemaMap"; - String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/twitter-json.data"; - long nrows = 1000; + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,ncols); - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - Map schemaMap = util.getSchemaMap(schemaMapFileName); + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + FrameReader fr = gr.getReader(); - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + //FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, ncols); + int a = 100; + } + } + + + + @Test public void test17() throws Exception { + + MatrixBlock m = new MatrixBlock(10,10,true); + + for(int f = 2; f <= 2; f++) { + System.out.println("+++++++++++++++++++++ Q=" + f); + String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/F" + f + "/sample-queen" + + "-mm200.raw"; + String sampleMatrixFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/F" + f + "/sample-queen-mm200.matrix"; + String delimiter = "\\t"; + String dataFileName = 
"/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/queen-mm.data"; + + Util util = new Util(); + + MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, delimiter); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader mr = gr.getReader(); + // MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); + + // FormatIdentifying fi = new FormatIdentifying(sampleRaw,sampleMB); + // + // myregex mr = new myregex(fi.getFormatProperties()); + // mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); + + int a = 100; + + } } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index 5ea4c199c56..abc5328671b 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -21,6 +21,8 @@ import org.junit.Test; +import java.util.Random; + public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { private final static String TEST_NAME = "MatrixSingleRowFlatTest"; @@ -160,4 +162,47 @@ public void test14() { sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 204, 0}, {0, 0, 305}}; runGenerateReaderTest(); } + + + public static int getRandomNumber() { + Random r = new Random(); + int low = 0; + int high = 100000000; + int result = r.nextInt(high - low) + low; + return result; + } + + @Test + public void test15() { + + Integer[][] data = new Integer[1000][33554432]; + +// for(int i=0;i<1000;i++){ +// for(int j=Math.max(i-5,0);j<=i;j++) +// data[i][j] = getRandomNumber(); +// } +// StringBuilder sb = new StringBuilder(); +// +// int r=2; +// int c=1000000; +// sampleMatrix = new 
double[r][c]; +// for(int i=0;i Date: Sun, 27 Feb 2022 01:46:09 +0100 Subject: [PATCH 43/84] Revert "Update GIO, Move from 2D Array to MatrixBlock" This reverts commit 4b98c6d22343c3e9556d65895d5074b5faff37b0. --- pom.xml | 2 +- .../apache/sysds/runtime/iogen/EXP/Util.java | 31 +- .../runtime/iogen/FormatIdentifying.java | 1586 +++++++++-------- .../sysds/runtime/iogen/ReaderMapping.java | 38 +- .../iogen/GenerateReaderMatrixTest.java | 16 +- .../Identify/MatrixGRRowColIdentifyTest.java | 336 ++-- .../iogen/MatrixSingleRowFlatTest.java | 45 - 7 files changed, 981 insertions(+), 1073 deletions(-) diff --git a/pom.xml b/pom.xml index 0868f44de6d..4be3b3ba96d 100644 --- a/pom.xml +++ b/pom.xml @@ -61,7 +61,7 @@ true ** false - -Xms3000m -Xmx18000m -Xmn300m + -Xms3000m -Xmx9000m -Xmn300m false diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index d6a230ef22d..97989a0ea98 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -94,33 +94,22 @@ public String[][] loadFrameData(String fileName,String delimiter, int ncols) public MatrixBlock loadMatrixData(String fileName, String delimiter) throws IOException { int ncols = 0; - int nrows = 0; try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { String line; while((line = br.readLine()) != null) { String[] data = line.split(delimiter); ncols = Math.max(ncols, Integer.parseInt( data[data.length-1].split("::")[0])); - nrows++; } } - MatrixBlock mbd = new MatrixBlock(nrows, ncols+1, true); - - try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { - String line; - int r=0; - while((line = br.readLine()) != null) { - String[] data = line.split(delimiter); - for(int i = 0; i < data.length; i++) { - String[] value = data[i].split("::"); - if(value.length ==2) { - int col = 
Integer.parseInt(value[0]); - double v = Double.parseDouble(value[1]); - mbd.setValue(r, col, v); - } - } - r++; - } - } - return mbd; + String[][] dataString = loadFrameData(fileName,delimiter, ncols+1); + double[][] data = new double[dataString.length][dataString[0].length]; + for(int i=0;i sampleRawIndexes; - - private static int nrows; - private static int ncols; - private int nlines; - private int windowSize = 20; - private int suffixStringLength = 50; - private ReaderMapping mappingValues; - private CustomProperties properties; - - - public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { - this.mappingValues = new ReaderMapping(raw, matrix); - this.runIdentification(); - } - - public FormatIdentifying(String raw, FrameBlock frame) throws Exception { - this.mappingValues = new ReaderMapping(raw, frame); - this.runIdentification(); - } - - private void runIdentification() { - - mapRow = mappingValues.getMapRow(); - mapCol = mappingValues.getMapCol(); - mapLen = mappingValues.getMapLen(); - mapHas = mappingValues.getMapHas(); - sampleRawIndexes = mappingValues.getSampleRawIndexes(); - mapRowPrevious = new int[ncols]; - - for (int c = 0; c < ncols; c++) - mapRowPrevious[c] = 0; - - nrows = mappingValues.getNrows(); - ncols = mappingValues.getNcols(); - nlines = mappingValues.getNlines(); - NaN = (ncols * nrows) - mappingValues.getNaN(); - - // Check the map row: - // If all cells of a row mapped to a single line of sample raw, it is a single row mapping - // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping - - boolean isSingleRow = false; - int missedCount = 0; - for (int r = 0; r < nrows; r++) - missedCount += ncols - mostCommonScore(r); - if ((float) missedCount / NaN < 0.07) isSingleRow = true; - - KeyTrie[] colKeyPattern; - - if (isSingleRow) { - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); - } else { - 
- // Check the row index is a prefix string in sample raw - // if the row indexes are in the prefix of values, so we need to build a key pattern - // to extract row indexes - // for understanding row indexes are in sample raw we check just 3 column of data - // for build a key pattern related to row indexes we just selected a row - boolean flag; - int numberOfSelectedCols = 3; - int begin = 0; - boolean check, flagReconstruct; - int[] selectedRowIndex = new int[2]; - HashSet beginPos = new HashSet<>(); - KeyTrie rowKeyPattern = null; - - // Select two none zero row as a row index candidate - - int index = 0; - for (int r = 1; r < nrows; r++) { - for (int c = 0; c < ncols; c++) - if (mapHas.getValue(r, c) != 0) { - selectedRowIndex[index++] = r; - break; - } - if (index > 1) break; - } - - for (int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { - Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); - ArrayList prefixStrings = colPrefixString.getKey(); - ArrayList prefixStringRowIndexes = colPrefixString.getValue(); - ArrayList prefixRawIndex = new ArrayList<>(); - - MappingTrie trie = new MappingTrie(); - int ri = 0; - for (String ps : prefixStrings) - trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); - - do { - flag = trie.reConstruct(); - } while (flag); - - ArrayList> keyPatterns = trie.getAllSequentialKeys(); - for (ArrayList kp : keyPatterns) { - for (String ps : prefixStrings) { - StringBuilder sb = new StringBuilder(); - int currPos = 0; - for (String k : kp) { - sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); - currPos += sb.length() + k.length(); - } - prefixRawIndex.add(new RawIndex(sb.toString())); - } - } - - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - if (!flag) { - begin = 1; - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - } - if (!flag) { - beginPos.clear(); - break; - } else beginPos.add(begin); - if (c == numberOfSelectedCols - 1) { - ArrayList rowPrefixStrings = new 
ArrayList<>(); - MappingTrie rowTrie = new MappingTrie(); - rowKeyPattern = new KeyTrie(); - for (int si : selectedRowIndex) { - for (int ci = 0; ci < ncols; ci++) { - int cri = (int) mapRow.getValue(si, ci); - if (mapHas.getValue(si, ci) == 1) { - String str = sampleRawIndexes.get(cri).getSubString(0, (int) mapCol.getValue(si, ci)); - RawIndex rawIndex = new RawIndex(str); - Pair pair = rawIndex.findValue(si + begin); - if (pair != null) { - String pstr = str.substring(0, pair.getKey()); - if (pstr.length() > 0) { - rowPrefixStrings.add(pstr); - rowTrie.insert(pstr, 1); - } - rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); - } - } - } - } - - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = rowTrie.getAllSequentialKeys(); - check = false; - for (ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); - check |= newCheck; - if (newCheck) selectedKeyPatterns.add(keyPattern); - } - if (check) keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = rowTrie.reConstruct(); - if (!flagReconstruct) break; - } - } while (!check); - - if (keyPatterns.size() == 0) { - ArrayList> kpl = new ArrayList<>(); - ArrayList kpli = new ArrayList<>(); - kpli.add(""); - kpl.add(kpli); - keyPatterns = kpl; - } - rowKeyPattern.setPrefixKeyPattern(keyPatterns); - } - } - - if (beginPos.size() == 1) { - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); - Integer bpos = beginPos.iterator().next(); - if (bpos > 0) properties.setRowIndexBegin("-" + bpos); - else properties.setRowIndexBegin(""); - } else { - KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); - colKeyPattern = buildColsKeyPatternMultiRow(); - properties = new CustomProperties(colKeyPattern, rowDelimPattern); - } - } - } - - private boolean checkPrefixRowIndex(int colIndex, int 
beginPos, ArrayList prefixRawIndex) { - for (int r = 0; r < nrows; r++) { - int rowIndex = (int) this.mapRow.getValue(r, colIndex); - if (rowIndex != -1) { - boolean flag = false; - for (RawIndex ri : prefixRawIndex) { - if (ri.findValue(r + beginPos) != null) { - flag = true; - break; - } - } - if (!flag) return false; - } - } - return true; - } - - public CustomProperties getFormatProperties() { - return properties; - } - - private Integer mostCommonScore(int rowIndex) { - Map map = new HashMap<>(); - int nan = 0; - for (int c = 0; c < ncols; c++) { - if (mapHas.getValue(rowIndex, c) != 0) { - int t = (int) mapRow.getValue(rowIndex, c); - Integer val = map.get(t); - map.put(t, val == null ? 1 : val + 1); - } else nan++; - } - if (map.size() == 0) return nan; - - Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) max = e; - } - return max.getValue() + nan; - } - - private Integer mostCommonValue(int[] list) { - Map map = new HashMap<>(); - for (Integer t : list) { - if (t != -1) { - Integer val = map.get(t); - map.put(t, val == null ? 
1 : val + 1); - } - } - if (map.size() == 0) return -1; - - Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) max = e; - } - return max.getKey(); - } - - private KeyTrie[] buildColsKeyPatternSingleRow() { - Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); - ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); - KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - - // Clean prefix strings - for (int c = 0; c < ncols; c++) { - ArrayList list = prefixStrings.getKey()[c]; - String token = null; - boolean flag = true; - for (int w = 1; w < windowSize && flag; w++) { - HashSet wts = new HashSet<>(); - for (String s : list) { - if (s.length() < w) flag = false; - else { - String subStr = s.substring(s.length() - w); - if (!subStr.contains(Lop.OPERAND_DELIMITOR)) wts.add(subStr); - else flag = false; - } - } - - if (flag) { - if (wts.size() == 1) token = wts.iterator().next(); - else { - for (String t : wts) { - int count = 0; - for (String s : list) { - if (s.endsWith(t)) count++; - } - float percent = (float) count / list.size(); - if (percent >= 1) token = t; - } - } - } else if (wts.size() == 0) token = ""; - } - if (token == null) { - int[] listLength = new int[nrows]; - for (int r = 0; r < nrows; r++) { - if (mapHas.getValue(r, c) == 1) listLength[r] = (int) mapCol.getValue(r, c); - else listLength[r] = -1; - } - int commonLength = mostCommonValue(listLength); - if (commonLength == 0) { - ArrayList newList = new ArrayList<>(); - for (String s : list) { - if (s.length() == 0) newList.add(s); - } - prefixStrings.getKey()[c] = newList; - } else throw new RuntimeException("can't build a key pattern for the column: " + c); - } else if (token.length() > 0) { - ArrayList newList = new ArrayList<>(); - for (String s : list) { - if (s.endsWith(token)) newList.add(s); - } - prefixStrings.getKey()[c] = newList; - } - } - - for (int c = 0; c < ncols; c++) { - 
MappingTrie trie = new MappingTrie(); - int ri = 0; - boolean check; - boolean flagReconstruct; - ArrayList> keyPatterns = null; - - - for (String ps : prefixStrings.getKey()[c]) - trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); - - if (trie.getRoot().getChildren().size() == 1) { - String[] splitPattern = prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); - ArrayList reverseSplitPattern = new ArrayList<>(); - for (String ps : splitPattern) - if (ps.length() > 0) reverseSplitPattern.add(ps); - if (reverseSplitPattern.size() == 0) reverseSplitPattern.add(""); - - int maxPatternLength = reverseSplitPattern.size(); - check = false; - for (int sp = 0; sp < maxPatternLength; sp++) { - ArrayList shortPattern = new ArrayList<>(); - for (int spi = maxPatternLength - sp - 1; spi < maxPatternLength; spi++) { - shortPattern.add(reverseSplitPattern.get(spi)); - } - check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); - if (check) { - keyPatterns = new ArrayList<>(); - keyPatterns.add(shortPattern); - break; - } - } - } else { - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = trie.getAllSequentialKeys(); - check = false; - for (ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if (newCheck) selectedKeyPatterns.add(keyPattern); - } - if (check) keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = trie.reConstruct(); - if (!flagReconstruct) break; - } - } while (!check); - } - - if (check) { - colKeyPattens[c] = new KeyTrie(keyPatterns); - for (String suffix : suffixStrings[c]) { - colKeyPattens[c].insertSuffixKeys(suffix.substring(0, Math.min(suffixStringLength, suffix.length())).toCharArray()); - } - } - } - return colKeyPattens; - } - - // Get all prefix strings of a column - public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { - ArrayList[] prefixStrings = new 
ArrayList[ncols]; - ArrayList[] rowIndexes = new ArrayList[ncols]; - for (int c = 0; c < ncols; c++) { - Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); - prefixStrings[c] = pair.getKey(); - rowIndexes[c] = pair.getValue(); - } - return new Pair<>(prefixStrings, rowIndexes); - } - - public Pair, ArrayList> extractAllPrefixStringsOfAColSingleLine(int colIndex, boolean reverse) { - ArrayList prefixStrings = new ArrayList(); - ArrayList rowIndexes = new ArrayList(); - for (int r = 0; r < nrows; r++) { - int rowIndex = (int) mapRow.getValue(r, colIndex); - if (mapHas.getValue(r, colIndex) == 1) { - rowIndexes.add(rowIndex); - String str = sampleRawIndexes.get(rowIndex).getRemainedTexts((int) mapCol.getValue(r, colIndex));//sampleRawIndexes.get(rowIndex) - // .getSubString(0, - // mapCol[r][colIndex]); - if (reverse) prefixStrings.add(new StringBuilder(str).reverse().toString()); - else prefixStrings.add(str); - } - } - return new Pair<>(prefixStrings, rowIndexes); - } - - private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { - ArrayList[] result = new ArrayList[ncols]; - for (int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - for (int r = 0; r < nrows; r++) { - int rowIndex = (int) mapRow.getValue(r, c); - if (mapHas.getValue(r,c) != 1) continue; - String str = sampleRawIndexes.get(rowIndex).getRaw().substring((int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); - result[c].add(str); - } - } - return result; - } - - ///////////////////////////////////////////////////////////////////////////// - // Methods For Multi Lines Mapping // - //////////////////////////////////////////////////////////////////////////// - // This implementation is for nested datasets are scattered on multiple lines - // The following steps are required: - // 1. Extract all prefix strings per column - // 2. Build key pattern tree for each column - // 3. 
Build key pattern for end of values - - private ArrayList> findRowDelimiters() { - ArrayList> keyPattern = new ArrayList<>(); - Hirschberg hirschberg = new Hirschberg(); - int misMatchPenalty = 3; - int gapPenalty = 2; - - //extract all lines are in record boundary - ArrayList recordBoundaries = new ArrayList<>(); - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - int[] minList = new int[nrows]; - HashMap maxColPos = new HashMap<>(); - int[] minColPos = new int[nrows]; - for (int r = 0; r < nrows; r++) - tmpUsedLines[r] = new BitSet(); - - for (int r = 0; r < nrows; r++) { - int min = nlines; - int minPos = 0; - for (int c = 0; c < ncols; c++) - if (mapHas.getValue(r, c) != 0) { - tmpUsedLines[r].set((int) mapRow.getValue(r, c)); - if (mapRow.getValue(r, c) <= min) { - min = (int) mapRow.getValue(r, c); - if (minPos != 0) minPos = (int) Math.min(minPos, mapCol.getValue(r, c)); - else minPos = (int) mapCol.getValue(r, c); - - } - if (maxColPos.containsKey((int) mapRow.getValue(r, c))) - maxColPos.put((int) mapRow.getValue(r, c), (int) Math.max(maxColPos.get((int) mapRow.getValue(r, c)), (int) mapCol.getValue(r, c) + mapLen.getValue(r, c))); - else maxColPos.put((int) mapRow.getValue(r, c), (int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); - } - minList[r] = min; - minColPos[r] = minPos; - } - - for (int r = 0; r < nrows; r++) { - usedLines[r] = new BitSet(nlines); - for (int i = 0; i < nrows; i++) { - if (i != r) usedLines[r].or(tmpUsedLines[i]); - } - } - - for (int r = 0; r < nrows; r++) { - int beginLine = minList[r]; - for (; beginLine >= 0; beginLine--) - if (usedLines[r].get(beginLine)) break; - - StringBuilder sb = new StringBuilder(); - beginLine = Math.max(beginLine, 0); - - if (beginLine + 1 == nlines) continue; - - Integer subStrPos = 0; - if (maxColPos.containsKey(beginLine)) subStrPos = maxColPos.get(beginLine); - - String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); - if 
(str.length() > 0) { - sb.append(str).append("\n"); - } - for (int i = beginLine + 1; i < minList[r]; i++) { - str = sampleRawIndexes.get(i).getRaw(); - if (str.length() > 0) sb.append(str).append("\n"); - } - - str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); - if (str.length() > 0) sb.append(str); - recordBoundaries.add(sb.toString()); - } - recordBoundaries.remove(recordBoundaries.size() - 1); - - String str1 = recordBoundaries.get(0); - String str2 = recordBoundaries.get(1); - Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); - if (pattern != null) { - String intersect = pattern.getValue(); - ArrayList intersectPattern = pattern.getKey(); - for (int i = 2; i < recordBoundaries.size(); i++) { - pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); - if (pattern != null) { - intersect = pattern.getValue(); - intersectPattern = pattern.getKey(); - } else intersect = null; - } - if (intersect != null && intersect.length() > 0) { - keyPattern.add(intersectPattern); - return keyPattern; - } - } - return null; - } - - - // Build key pattern tree for each column - private KeyTrie[] buildColsKeyPatternMultiRow() { - Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); - ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); - - KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - for (int c = 0; c < ncols; c++) { - // 1. 
Build Prefix Key Pattern - String colDelim = findStartWithIntersectOfStrings(prefixStrings.getKey()[c], prefixStrings.getValue()[c].getKey()); - - HashSet intersect = new HashSet<>(); - intersect.add(colDelim); - - KeyTrie trie = new KeyTrie(colDelim); - ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); - boolean check; - do { - ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); - check = false; - for (ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if (newCheck) { - trie.setAPrefixPath(keyPattern); - } - } - - if (!check) { - remainedPrefixes.clear(); - boolean flag = true; - for (ArrayList keyPattern : keyPatterns) { - ArrayList remainedPrefix = new ArrayList<>(); - for (String ps : prefixStrings.getKey()[c]) - remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); - - intersect = findStartWithIntersectOfStrings(remainedPrefix); - if (intersect != null) { - trie.insertPrefixKeysConcurrent(intersect); - } else { - remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); - flag = false; - break; - } - } - if (!flag) break; - } - } while (!check); - - // Suffix pattern is based on char, so we need to extract all chars of a string - for (String suffix : suffixStrings[c]) { - trie.insertSuffixKeys(suffix.toCharArray()); - } - colKeyPattens[c] = trie; - } - return colKeyPattens; - } - - // Extract prefix strings: - private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse) { - - ArrayList[] result = new ArrayList[ncols]; - Pair[] minmax = new Pair[ncols]; - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - for (int r = 0; r < nrows; r++) - tmpUsedLines[r] = new BitSet(); - - for (int r = 0; r < nrows; r++) - for (int c = 0; c < ncols; c++) - if (mapHas.getValue(r, c) != 0) tmpUsedLines[r].set((int) mapRow.getValue(r, c)); - - for (int r = 0; r < nrows; r++) { - usedLines[r] = new 
BitSet(nlines); - for (int i = 0; i < nrows; i++) { - if (i != r) usedLines[r].or(tmpUsedLines[i]); - } - } - - // extract prefix strings - for (int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - int min = 0; - int max = 0; - for (int r = 0; r < nrows; r++) { - int rowIndex = (int) mapRow.getValue(r, c); - if (mapHas.getValue(r,c) != 1) continue; - StringBuilder sb = new StringBuilder(); - int lastLine = 0; - - for (int i = rowIndex - 1; i >= 0; i--) - if (usedLines[r].get(i)) { - lastLine = i; - break; - } - for (int i = lastLine; i < rowIndex; i++) { - if (sampleRawIndexes.get(i).getRawLength() > 0) sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); - } - String str = sampleRawIndexes.get(rowIndex).getSubString(0, (int) mapCol.getValue(r, c)); - if (str.length() > 0 && !str.equals("\n")) sb.append(str); - else if (lastLine < rowIndex) sb.deleteCharAt(sb.length() - 1); - - - if (reverse) result[c].add(sb.reverse().toString()); - else result[c].add(sb.toString()); - max = Math.max(max, sb.length()); - if (sb.length() < min || min == 0) min = sb.length(); - minmax[c] = new Pair<>(min, max); - } - } - return new Pair<>(result, minmax); - } - - private String findStartWithIntersectOfStrings(ArrayList strList, int minLength) { - StringBuilder sb = new StringBuilder(); - int i = 0; - boolean flag = true; - do { - char ch = strList.get(0).charAt(i); - for (int j = 1; j < Math.min(strList.size(), minLength); j++) { - char cch = strList.get(j).charAt(i); - if (ch != cch || ch == '\n') { - flag = false; - break; - } - } - if (flag) sb.append(ch); - i++; - } while (flag && i < minLength); - return sb.toString(); - - } - - private HashSet findStartWithIntersectOfStrings(ArrayList strList) { - // 1. Extract all substrings - // 2. 
Find intersection of substrings - - HashSet[] substrings = new HashSet[strList.size()]; - for (int i = 0; i < strList.size(); i++) - substrings[i] = new HashSet<>(); - - for (int w = windowSize; w > 2; w--) { - for (int i = 0; i < strList.size(); i++) { - substrings[i].clear(); - substrings[i].addAll(getAllSubstringsOfAString(strList.get(i), w)); - } - - HashSet totalIntersect = new HashSet<>(substrings[0]); - for (int r = 1; r < substrings.length; r++) - totalIntersect.retainAll(substrings[r]); - - if (totalIntersect.size() > 0) return totalIntersect; - - } - return null; - } - - private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys) { - if (keys.size() == 1) { - String k = keys.get(0); - if (k.length() == 0) return true; - } - - for (String ps : prefixStrings) { - int currentPos = 0; - int patternCount = 0; - do { - currentPos = getIndexOfKeyPatternOnString(ps, keys, currentPos).getKey(); - if (currentPos == -1) break; - else { - patternCount++; - currentPos++; - } - } while (true); - if (patternCount != 1) return false; - } - return true; - } - - // Check the sequential list of keys are on a string - private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { - - int currPos = beginPos; - boolean flag = true; - int startPos = -1; - for (String k : key) { - int index = str.indexOf(k, currPos); - if (index != -1) currPos = index + k.length(); - else { - flag = false; - break; - } - if (startPos == -1) startPos = currPos; - } - if (flag) return new Pair<>(startPos, currPos + key.get(key.size() - 1).length()); - else return new Pair<>(-1, -1); - } - - private ArrayList getAllSubstringsOfAString(String str, int size) { - ArrayList result = new ArrayList<>(); - if (str == null) return result; - for (int i = 0; i <= str.length() - size; i++) { - String s = str.substring(i, i + size); - if (!s.contains("\n")) result.add(s); - } - return result; - } - - private String getRemainedSubstring(String str, ArrayList keys) { - 
boolean flag = true; - int currPos = 0; - for (String k : keys) { - int index = str.indexOf(k, currPos); - if (index != -1) currPos = index + k.length(); - else { - flag = false; - break; - } - } - if (flag) return str.substring(currPos); - else return null; - } - - private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { - ArrayList[] result = new ArrayList[ncols]; - for (int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - - for (int r = 0; r < nrows; r++) { - int rowIndex = (int) mapRow.getValue(r, c); - if (mapHas.getValue(r,c) != 1) continue; - StringBuilder sb = new StringBuilder(); - String str = sampleRawIndexes.get(rowIndex).getRaw().substring((int) (mapCol.getValue(r, c) + mapLen.getValue(r, c))); - boolean enter = false; - if (str.length() > 0) { - sb.append(str); - enter = true; - } - - for (int i = rowIndex + 1; i < nlines; i++) { - str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); - if (str.length() > 0 && !enter) { - sb.append(str); - break; - } - } - if (sb.length() > 0) sb.deleteCharAt(sb.length() - 1); - result[c].add(sb.toString()); - } - } - return result; - } + private int[][] mapRow; + private int[] mapRowPrevious; + private int[][] mapCol; + private int[][] mapLen; + private int NaN; + private ArrayList sampleRawIndexes; + + private static int nrows; + private static int ncols; + private int nlines; + private int windowSize = 20; + private int suffixStringLength = 50; + private ReaderMapping mappingValues; + private CustomProperties properties; + + + public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { + this.mappingValues = new ReaderMapping(raw, matrix); + this.runIdentification(); + } + + public FormatIdentifying(String raw, FrameBlock frame) throws Exception { + this.mappingValues = new ReaderMapping(raw, frame); + this.runIdentification(); + } + + private void runIdentification() { + + mapRow = mappingValues.getMapRow(); + 
mapCol = mappingValues.getMapCol(); + mapLen = mappingValues.getMapLen(); + sampleRawIndexes = mappingValues.getSampleRawIndexes(); + mapRowPrevious = new int[ncols]; + + for(int c=0; c< ncols; c++) + mapRowPrevious[c] = 0; + + nrows = mappingValues.getNrows(); + ncols = mappingValues.getNcols(); + nlines = mappingValues.getNlines(); + NaN = (ncols * nrows) - mappingValues.getNaN(); + + // Check the map row: + // If all cells of a row mapped to a single line of sample raw, it is a single row mapping + // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping + + boolean isSingleRow = false; + int missedCount = 0; + for(int r=0; r beginPos = new HashSet<>(); + KeyTrie rowKeyPattern = null; + + // Select two none zero row as a row index candidate + + int index = 0; + for(int r=1; r1) + break; + } + + for(int c=0; c< Math.min(numberOfSelectedCols, ncols); c++){ + Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + ArrayList prefixStrings = colPrefixString.getKey(); + ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + ArrayList prefixRawIndex = new ArrayList<>(); + + MappingTrie trie = new MappingTrie(); + int ri = 0; + for(String ps: prefixStrings ) + trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + + do { + flag = trie.reConstruct(); + }while(flag); + + ArrayList> keyPatterns = trie.getAllSequentialKeys(); + for(ArrayList kp: keyPatterns){ + for(String ps: prefixStrings){ + StringBuilder sb = new StringBuilder(); + int currPos = 0; + for(String k: kp){ + sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + currPos += sb.length() + k.length(); + } + prefixRawIndex.add(new RawIndex(sb.toString())); + } + } + + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + if(!flag) { + begin = 1; + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + } + if(!flag) { + beginPos.clear(); + break; + } + else + beginPos.add(begin); + if(c== numberOfSelectedCols -1){ + ArrayList 
rowPrefixStrings = new ArrayList<>(); + MappingTrie rowTrie = new MappingTrie(); + rowKeyPattern = new KeyTrie(); + for(int si: selectedRowIndex) { + for(int ci = 0; ci < ncols; ci++) { + int cri = mapRow[si][ci]; + if(cri != -1) { + String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(si + begin); + if(pair != null) { + String pstr = str.substring(0, pair.getKey()); + if(pstr.length() > 0) { + rowPrefixStrings.add(pstr); + rowTrie.insert(pstr, 1); + } + rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); + } + } + } + } + + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = rowTrie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); + check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); + } + if(check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = rowTrie.reConstruct(); + if(!flagReconstruct) + break; + } + }while(!check); + + if(keyPatterns.size() == 0){ + ArrayList> kpl = new ArrayList<>(); + ArrayList kpli = new ArrayList<>(); + kpli.add(""); + kpl.add(kpli); + keyPatterns = kpl; + } + rowKeyPattern.setPrefixKeyPattern(keyPatterns); + } + } + + if(beginPos.size() == 1){ + colKeyPattern = buildColsKeyPatternSingleRow(); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); + Integer bpos = beginPos.iterator().next(); + if(bpos>0) + properties.setRowIndexBegin("-"+bpos); + else + properties.setRowIndexBegin(""); + } + else { + KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); + colKeyPattern = buildColsKeyPatternMultiRow(); + properties = new CustomProperties(colKeyPattern, rowDelimPattern); + } + } + } + + private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex){ + 
for(int r=0;r map = new HashMap<>(); + int nan = 0; + for (Integer t : list) { + if (t != -1) { + Integer val = map.get(t); + map.put(t, val == null ? 1 : val + 1); + } else + nan++; + } + if (map.size() == 0) + return nan; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) + max = e; + } + return max.getValue() + nan; + } + + private Integer mostCommonValue(int[] list) { + Map map = new HashMap<>(); + for (Integer t : list) { + if (t != -1) { + Integer val = map.get(t); + map.put(t, val == null ? 1 : val + 1); + } + } + if (map.size() == 0) + return -1; + + Map.Entry max = null; + for (Map.Entry e : map.entrySet()) { + if (max == null || e.getValue() > max.getValue()) + max = e; + } + return max.getKey(); + } + + private KeyTrie[] buildColsKeyPatternSingleRow() { + Pair[], ArrayList[]> prefixStrings = extractAllPrefixStringsOfColsSingleLine(false); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsSingleLine(); + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; + + // Clean prefix strings + for(int c =0; c< ncols; c++) { + ArrayList list = prefixStrings.getKey()[c]; + String token = null; + boolean flag = true; + for(int w = 1; w < windowSize && flag; w++) { + HashSet wts = new HashSet<>(); + for(String s : list) { + if(s.length() < w) + flag = false; + else { + String subStr = s.substring(s.length() - w); + if (!subStr.contains(Lop.OPERAND_DELIMITOR)) + wts.add(subStr); + else + flag = false; + } + } + + if(flag) { + if(wts.size() == 1) + token = wts.iterator().next(); + else { + for(String t : wts) { + int count = 0; + for(String s : list) { + if(s.endsWith(t)) + count++; + } + float percent = (float) count / list.size(); + if(percent >= 1) + token = t; + } + } + } + else if(wts.size() == 0) + token = ""; + } + if(token == null) { + int[] listLength = new int[nrows]; + for (int r = 0; r< nrows; r++) + listLength[r] = mapCol[r][c]; + int commonLength = mostCommonValue(listLength); + if 
(commonLength == 0){ + ArrayList newList = new ArrayList<>(); + for(String s: list){ + if(s.length() == 0) + newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + else + throw new RuntimeException("can't build a key pattern for the column: " + c); + } + else if(token.length() > 0){ + ArrayList newList = new ArrayList<>(); + for(String s: list){ + if(s.endsWith(token)) + newList.add(s); + } + prefixStrings.getKey()[c] = newList; + } + } + + for(int c=0; c> keyPatterns = null; + + + for(String ps: prefixStrings.getKey()[c]) + trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); + + if (trie.getRoot().getChildren().size() == 1){ + String[] splitPattern= prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); + ArrayList reverseSplitPattern = new ArrayList<>(); + for (String ps: splitPattern) + if (ps.length() > 0) + reverseSplitPattern.add(ps); + if (reverseSplitPattern.size() == 0) + reverseSplitPattern.add(""); + + int maxPatternLength = reverseSplitPattern.size(); + check = false; + for(int sp= 0; sp< maxPatternLength;sp++){ + ArrayList shortPattern = new ArrayList<>(); + for(int spi= maxPatternLength - sp-1; spi< maxPatternLength; spi++){ + shortPattern.add(reverseSplitPattern.get(spi)); + } + check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); + if (check) { + keyPatterns = new ArrayList<>(); + keyPatterns.add(shortPattern); + break; + } + } + } + else { + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = trie.getAllSequentialKeys(); + check = false; + for (ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if (newCheck) + selectedKeyPatterns.add(keyPattern); + } + if (check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = trie.reConstruct(); + if (!flagReconstruct) + break; + } + } while (!check); + } + + if(check){ + colKeyPattens[c] = new KeyTrie(keyPatterns); + for(String 
suffix: suffixStrings[c]) { + colKeyPattens[c].insertSuffixKeys(suffix.substring(0,Math.min(suffixStringLength, suffix.length())).toCharArray()); + } + } + } + return colKeyPattens; + } + + // Get all prefix strings of a column + public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { + ArrayList[] prefixStrings = new ArrayList[ncols]; + ArrayList[] rowIndexes = new ArrayList[ncols]; + for(int c=0; c< ncols; c++){ + Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); + prefixStrings[c] = pair.getKey(); + rowIndexes[c] = pair.getValue(); + } + return new Pair<>(prefixStrings, rowIndexes); + } + + public Pair, ArrayList> extractAllPrefixStringsOfAColSingleLine(int colIndex, boolean reverse) { + ArrayList prefixStrings = new ArrayList(); + ArrayList rowIndexes = new ArrayList(); + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][colIndex]; + if(rowIndex != -1) { + rowIndexes.add(rowIndex); + String str = sampleRawIndexes.get(rowIndex).getRemainedTexts(mapCol[r][colIndex]);//sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); + if(reverse) + prefixStrings.add(new StringBuilder(str).reverse().toString()); + else + prefixStrings.add(str); + } + } + return new Pair<>(prefixStrings, rowIndexes); + } + + private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { + ArrayList[] result = new ArrayList[ncols]; + for(int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][c]; + if(rowIndex == -1) + continue; + String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); + result[c].add(str); + } + } + return result; + } + + ///////////////////////////////////////////////////////////////////////////// + // Methods For Multi Lines Mapping // + //////////////////////////////////////////////////////////////////////////// + // This implementation is for nested datasets are scattered on multiple 
lines + // The following steps are required: + // 1. Extract all prefix strings per column + // 2. Build key pattern tree for each column + // 3. Build key pattern for end of values + + private ArrayList> findRowDelimiters(){ + ArrayList> keyPattern = new ArrayList<>(); + Hirschberg hirschberg = new Hirschberg(); + int misMatchPenalty = 3; + int gapPenalty = 2; + + //extract all lines are in record boundary + ArrayList recordBoundaries = new ArrayList<>(); + BitSet[] tmpUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + int[] minList = new int[nrows]; + HashMap maxColPos = new HashMap<>(); + int[] minColPos = new int[nrows]; + for(int r=0; r= 0; beginLine--) + if(usedLines[r].get(beginLine)) + break; + + StringBuilder sb = new StringBuilder(); + beginLine = Math.max(beginLine, 0); + + if(beginLine+1 == nlines) + continue; + + Integer subStrPos = 0; + if(maxColPos.containsKey(beginLine)) + subStrPos = maxColPos.get(beginLine); + + String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); + if(str.length() >0) { + sb.append(str).append("\n"); + } + for(int i = beginLine+1 ; i < minList[r]; i++){ + str = sampleRawIndexes.get(i).getRaw(); + if(str.length() > 0) + sb.append(str).append("\n"); + } + + str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, minColPos[r]); + if(str.length() > 0) + sb.append(str); + recordBoundaries.add(sb.toString()); + } + recordBoundaries.remove(recordBoundaries.size()-1); + + String str1 = recordBoundaries.get(0); + String str2 = recordBoundaries.get(1); + Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); + if(pattern != null) { + String intersect = pattern.getValue(); + ArrayList intersectPattern = pattern.getKey(); + for(int i = 2; i < recordBoundaries.size(); i++) { + pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); + if(pattern != null) { + intersect = pattern.getValue(); + intersectPattern = 
pattern.getKey(); + } + else + intersect = null; + } + if(intersect != null && intersect.length() > 0) { + keyPattern.add(intersectPattern); + return keyPattern; + } + } + return null; + } + + + // Build key pattern tree for each column + private KeyTrie[] buildColsKeyPatternMultiRow(){ + Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); + ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); + + KeyTrie[] colKeyPattens = new KeyTrie[ncols]; + for(int c=0; c intersect = new HashSet<>(); + intersect.add(colDelim); + + KeyTrie trie = new KeyTrie(colDelim); + ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); + boolean check; + do { + ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); + check = false; + for(ArrayList keyPattern: keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); + check |= newCheck; + if(newCheck){ + trie.setAPrefixPath(keyPattern); + } + } + + if(!check){ + remainedPrefixes.clear(); + boolean flag = true; + for(ArrayList keyPattern: keyPatterns){ + ArrayList remainedPrefix = new ArrayList<>(); + for(String ps : prefixStrings.getKey()[c]) + remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); + + intersect = findStartWithIntersectOfStrings(remainedPrefix); + if(intersect != null) { + trie.insertPrefixKeysConcurrent(intersect); + } + else { + remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); + flag = false; + break; + } + } + if(!flag) + break; + } + } + while(!check); + + // Suffix pattern is based on char, so we need to extract all chars of a string + for(String suffix: suffixStrings[c]) { + trie.insertSuffixKeys(suffix.toCharArray()); + } + colKeyPattens[c] = trie; + } + return colKeyPattens; + } + + // Extract prefix strings: + private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse){ + + ArrayList[] result = new ArrayList[ncols]; + Pair[] minmax = new Pair[ncols]; + BitSet[] tmpUsedLines = new 
BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + for(int r=0; r(); + int min = 0; + int max = 0; + for(int r=0; r=0; i--) + if(usedLines[r].get(i)) { + lastLine = i; + break; + } + for(int i= lastLine; i 0 ) + sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); + } + String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][c]); + if(str.length() > 0 && !str.equals("\n")) + sb.append(str); + else if(lastLine < rowIndex) + sb.deleteCharAt(sb.length()-1); + + + if(reverse) + result[c].add(sb.reverse().toString()); + else + result[c].add(sb.toString()); + max = Math.max(max, sb.length()); + if(sb.length()< min || min == 0) + min = sb.length(); + minmax[c] = new Pair<>(min, max); + } + } + return new Pair<>(result, minmax); + } + + private String findStartWithIntersectOfStrings(ArrayList strList, int minLength){ + StringBuilder sb = new StringBuilder(); + int i = 0; + boolean flag = true; + do { + char ch = strList.get(0).charAt(i); + for(int j=1; j findStartWithIntersectOfStrings(ArrayList strList){ + // 1. Extract all substrings + // 2. 
Find intersection of substrings + + HashSet[] substrings = new HashSet[strList.size()]; + for(int i=0; i< strList.size(); i++) + substrings[i] = new HashSet<>(); + + for(int w = windowSize; w > 2; w--) { + for(int i=0; i totalIntersect = new HashSet<>(substrings[0]); + for(int r=1; r 0) + return totalIntersect; + + } + return null; + } + + private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys){ + if(keys.size() == 1){ + String k = keys.get(0); + if (k.length() == 0) + return true; + } + + for(String ps: prefixStrings){ + int currentPos = 0; + int patternCount = 0; + do { + currentPos = getIndexOfKeyPatternOnString(ps, keys, currentPos).getKey(); + if(currentPos == -1) + break; + else { + patternCount++; + currentPos++; + } + }while(true); + if(patternCount!=1) + return false; + } + return true; + } + + // Check the sequential list of keys are on a string + private Pair getIndexOfKeyPatternOnString(String str, ArrayList key, int beginPos) { + + int currPos = beginPos; + boolean flag = true; + int startPos = -1; + for(String k : key) { + int index = str.indexOf(k, currPos); + if(index != -1) + currPos = index + k.length(); + else { + flag = false; + break; + } + if(startPos==-1) + startPos = currPos; + } + if(flag) + return new Pair<>(startPos, currPos+key.get(key.size()-1).length()); + else + return new Pair<>(-1,-1); + } + + private ArrayList getAllSubstringsOfAString(String str,int size){ + ArrayList result = new ArrayList<>(); + if(str == null) + return result; + for(int i = 0; i <= str.length() - size; i++){ + String s = str.substring(i, i + size); + if(!s.contains("\n")) + result.add(s); + } + return result; + } + + private String getRemainedSubstring(String str, ArrayList keys){ + boolean flag = true; + int currPos = 0; + for(String k : keys) { + int index = str.indexOf(k, currPos); + if(index != -1) + currPos = index + k.length(); + else { + flag = false; + break; + } + } + if(flag) + return str.substring(currPos); + else + return 
null; + } + + private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { + ArrayList[] result = new ArrayList[ncols]; + for(int c = 0; c < ncols; c++) { + result[c] = new ArrayList<>(); + + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][c]; + if(rowIndex == -1) + continue; + StringBuilder sb = new StringBuilder(); + String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); + boolean enter = false; + if(str.length() > 0) { + sb.append(str); + enter = true; + } + + for(int i = rowIndex + 1; i < nlines; i++) { + str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); + if(str.length() > 0 && !enter) { + sb.append(str); + break; + } + } + if(sb.length() > 0) + sb.deleteCharAt(sb.length() - 1); + result[c].add(sb.toString()); + } + } + return result; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index 1bcddd8b2ed..de6c1e98611 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -33,10 +33,9 @@ public class ReaderMapping { - private MatrixBlock mapRow; - private MatrixBlock mapCol; - private MatrixBlock mapLen; - private MatrixBlock mapHas; + private int[][] mapRow; + private int[][] mapCol; + private int[][] mapLen; private boolean mapped; private final int nrows; private final int ncols; @@ -97,12 +96,16 @@ private void runMapping(boolean isIndexMapping) { } protected boolean findMapping(boolean isIndexMapping) { - mapRow = new MatrixBlock(nrows, ncols, true); - mapCol = new MatrixBlock(nrows, ncols, true); - mapLen = new MatrixBlock(nrows, ncols, true); - mapHas = new MatrixBlock(nrows, ncols, true); + mapRow = new int[nrows][ncols]; + mapCol = new int[nrows][ncols]; + mapLen = new int[nrows][ncols]; NaN = 0; + // Set "-1" as default value for all defined 
matrix + for(int r = 0; r < nrows; r++) + for(int c = 0; c < ncols; c++) + mapRow[r][c] = mapCol[r][c] = mapLen[r][c] = -1; + int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { @@ -115,10 +118,9 @@ protected boolean findMapping(boolean isIndexMapping) { Pair pair = this.isMatrix ? ri.findValue( sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); if(pair != null) { - mapRow.setValue(r,c, itRow); - mapCol.setValue(r,c,pair.getKey()); - mapLen.setValue(r,c,pair.getValue()); - mapHas.setValue(r,c,1); + mapRow[r][c] = itRow; + mapCol[r][c] = pair.getKey(); + mapLen[r][c] = pair.getValue(); break; } else { @@ -136,7 +138,7 @@ protected boolean findMapping(boolean isIndexMapping) { boolean flagMap = true; for(int r = 0; r < nrows && flagMap; r++) for(int c = 0; c < ncols && flagMap; c++) - if(mapHas.getDouble(r,c) == -1 && ((!this.isMatrix && this.sampleFrame.get(r, + if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, c) != null) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { flagMap = false; @@ -148,22 +150,18 @@ public int getNaN() { return NaN; } - public MatrixBlock getMapRow() { + public int[][] getMapRow() { return mapRow; } - public MatrixBlock getMapCol() { + public int[][] getMapCol() { return mapCol; } - public MatrixBlock getMapLen() { + public int[][] getMapLen() { return mapLen; } - public MatrixBlock getMapHas() { - return mapHas; - } - public ArrayList getSampleRawIndexes() { return sampleRawIndexes; } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index ec38386f6c1..b14ae35225c 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ 
b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -90,12 +90,16 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - //FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); - MatrixReader mr = gr.getReader(); - MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); - - TestUtils.compareMatrices(sampleMB, matrixBlock, 0); + FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); +// myTest mt = new myTest(formatIdentifying.getFormatProperties()); +// mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); +// int a = 100; + +// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); +// MatrixReader mr = gr.getReader(); +// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); +// +// TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } catch(Exception exception) { diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 6f9727579d7..40424bc54f3 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -19,36 +19,21 @@ package org.apache.sysds.test.functions.iogen.Identify; -import com.google.gson.Gson; import org.apache.sysds.common.Types; -import org.apache.sysds.lops.Lop; -import org.apache.sysds.runtime.io.FileFormatPropertiesLIBSVM; import org.apache.sysds.runtime.io.FrameReader; import 
org.apache.sysds.runtime.io.FrameReaderJSONJackson; import org.apache.sysds.runtime.io.FrameReaderJSONL; -import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.io.ReaderTextLIBSVM; -import org.apache.sysds.runtime.iogen.FormatIdentifying; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.iogen.EXP.Util; -import org.apache.sysds.runtime.iogen.Hirschberg; -import org.apache.sysds.runtime.iogen.MappingTrie; import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; import org.junit.Test; -import java.io.IOException; -import java.security.GeneralSecurityException; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; import java.util.Random; -import java.util.regex.Matcher; -import java.util.regex.Pattern; public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { @@ -185,22 +170,101 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou runGenerateReaderTest(); } - @Test public void test101() throws IOException { + @Test public void test12() { + // sampleRaw = "#index 1\n" + + // "#t 2,3\n" + + // "#s 1980\n"+ + // "#index 10\n\n" + + // "#t 21,30\n" + + // "#s 2000\n\n"+ + // "#index 100\n" + + // "#t 200,300\n" + + // "#s 2222"; + // + // sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; + // runGenerateReaderTest(); - FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(" ", ":"); - ReaderTextLIBSVM readerTextLIBSVM = new ReaderTextLIBSVM(propertiesLIBSVM); - MatrixBlock mb = readerTextLIBSVM.readMatrixFromHDFS("/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/susy-libsvm/susy-libsvm.data",-1,18,-1,-1); + StringBuilder sb = new 
StringBuilder( + " ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); + System.out.println(sb.reverse()); } + // @Test + // public void test13() throws Exception { + // String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; + // String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; + // Integer sampleNRows = 1000; + // String delimiter = "\\t"; + // String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/aminer_author_5.schema"; + // String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/aminer_author.data"; + // + // Float percent = 7f;//Float.parseFloat(args[6]); + // String datasetName = "aminer_paper";//args[7]; + // String LOG_HOME ="/home/saeed/Documents/ExpLog";//args[8]; + // + // if(delimiter.equals("\\t")) + // delimiter = "\t"; + // + // Util util = new Util(); + // Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + // int ncols = sampleSchema.length; + // + // ArrayList newSampleSchema = new ArrayList<>(); + // ArrayList> newSampleFrame = new ArrayList<>(); + // + // String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); + // + // for(int c = 0; c < sampleFrameStrings[0].length; c++) { + // HashSet valueSet = new HashSet<>(); + // for(int r=0; r0){ + // ArrayList tempList = new ArrayList<>(); + // for(int r=0; r newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new ArrayList<>(); - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,0); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); for(int c = 0; c < sampleFrameStrings[0].length; c++) { HashSet valueSet = new HashSet<>(); @@ -244,214 +308,42 @@ private void generateRandomCSV(int nrows, int ncols, 
double min, double max, dou String sampleRaw = util.readEntireTextFile(sampleRawFileName); GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr = gr.getReader(); + FrameReader fr =gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); - int a = 100; } } - - @Test public void test14() throws Exception { - ///home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/twitter-examples/F10 - for(int f = 1; f <= 784; f++) { - System.out.println("+++++++++++++++++++++ Q=" + f); - String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/F" + f + "/sample-mnist8m-libsvm200.raw"; - String sampleMatrixFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/F" + f + "/sample-mnist8m-libsvm200.matrix"; - String delimiter = "\\t"; - String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/mnist8m-libsvm/mnist8m-libsvm.data"; - - Util util = new Util(); - - MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, delimiter); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); - MatrixReader mr = gr.getReader(); -// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); - -// FormatIdentifying fi = new FormatIdentifying(sampleRaw,sampleMB); -// -// myregex mr = new myregex(fi.getFormatProperties()); -// mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); - - int a = 100; - - } - } - - @Test public void test15() throws Exception { - String str = "0 1:9.728614687919616699e-01 2:6.538545489311218262e-01 3:1.176224589347839355e+00 4:1.157156467437744141e+00 5:-1.739873170852661133e+00 6:-8.743090629577636719e-01 7:5.677649974822998047e-01 
8:-1.750000417232513428e-01 9:8.100607395172119141e-01 10:-2.525521218776702881e-01 11:1.921887040138244629e+00 12:8.896374106407165527e-01 13:4.107718467712402344e-01 14:1.145620822906494141e+00 15:1.932632088661193848e+00 16:9.944640994071960449e-01 17:1.367815494537353516e+00 18:4.071449860930442810e-02"; - String str1="0 1:0.30151 2:0.30151 3:0.30151 4:0.30151 5:0.30151 6:0.30151 7:0.30151 8:0.30151 9:0.30151 10:0.30151 11:0.30151"; - - -// String str = " 123:"; -// String s= str.replaceAll("\\d+","\\\\d+"); -// System.out.println(s); - - //(?<=^|[\w\d]\s)([\w\d]+)(?=\s|$) - - String regex = "(\\d+:)";//"(?<=\\d:)(.*?)(?=\\d:)"; //(.*?)(\d+:) - -// String regex="\\d+:"; - - List allMatches = new ArrayList(); - - for(int i=0;i<10000000;i++) { - Matcher m = Pattern.compile(regex).matcher(str1); - while(m.find()) { - String s = m.group(1) + " ";//+ m.group(3);//+" "+ m.group(5); - //System.out.println(s); - //allMatches.add(m.group(5)); - } - } - - - +// FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); // -// Pattern p = Pattern.compile(regex); +// String FILENAME_SINGLE = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; +// Types.ValueType[] schema = {Types.ValueType.STRING,Types.ValueType.STRING,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64}; // -// // Find match between given string -// // and regular expression -// // using Pattern.matcher() -// Matcher m = p.matcher(str); +// Map schemaMap = new HashMap<>(); +// schemaMap.put("/returnFlag",0); +// schemaMap.put("/lineStatus",1); +// schemaMap.put("/quantity",2); +// schemaMap.put("/extendedPrice",3); +// schemaMap.put("/discount",4); +// schemaMap.put("/tax",5); +// // Read FrameBlock +// FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); // -// // Get the subsequence -// // using find() method -// while(m.find()) { -// 
System.out.println(m.group()+" "+m.start()+" "+ m.end()+" "); -// } - - // int misMatchPenalty = 3; - // int gapPenalty = 2; - // Hirschberg hirschberg = new Hirschberg(); - - // ArrayList list = new ArrayList<>(); - // for(int i=0;i<100000000;i++){ - // list.add(" "+i+":"+i+"--"); - // } - // - // ArrayList ll = hirschberg.getLCS(list, misMatchPenalty,gapPenalty); - // Gson gson = new Gson(); - // System.out.println(gson.toJson(ll)); - // - // - // - //// List allMatches = new ArrayList(); - //// Matcher m = Pattern.compile("\\s\\w:").matcher(str); - //// while (m.find()) { - //// - //// allMatches.add(m.group()); - //// } - //// for(String s: allMatches) - //// System.out.println(s); - //// - - //--------------------------------------------- - // Regex to extract the string - // between two delimiters - // String regex = "\\[(.*?)\\]"; - // - // // Compile the Regex. - // Pattern p = Pattern.compile(regex); - // - // // Find match between given string - // // and regular expression - // // using Pattern.matcher() - // Matcher m = p.matcher(str); - // - // // Get the subsequence - // // using find() method - // while (m.find()) - // { - // System.out.println(m.group(1)); - // } - // //---------------------------------------------- - // Pattern.compile() - // MappingTrie mappingTrie = new MappingTrie(); - // for(int i=0;i<1000000;i++){ - // mappingTrie.insert(" "+i+":",i); - // } - // - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 123:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 124:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 125:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 256233:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 58296:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 10000:",0); - // mappingTrie.insert(","+Lop.OPERAND_DELIMITOR+" 9658263:",0); - // - // boolean flag=false; - // do { - // flag = mappingTrie.reConstruct(); - // }while(flag); - // - // ArrayList> myList = 
mappingTrie.getAllSequentialKeys(); - // Gson gson = new Gson(); - // System.out.println(gson.toJson(myList.get(0))); - - } - - - @Test public void test16() throws Exception { - ///home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/twitter-examples/F10 - for(int f = 1; f <= 2; f++) { - System.out.println("+++++++++++++++++++++ Q=" + f); - String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + "/sample-yelp-csv200.raw"; - String sampleFrameFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + "/sample-yelp-csv200.frame"; - String delimiter = "\\t"; - String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/yelp-csv.data"; - String schemaFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/yelp-csv/Q" + f + "/yelp-csv.schema"; - - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; +// int a = 100; - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,ncols); + String schemaFileName ="/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schema"; + String schemaMapFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schemaMap"; + String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/twitter-json.data"; + long nrows = 1000; - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); - FrameReader fr = gr.getReader(); + Util util = new Util(); + Types.ValueType[] schema = util.getSchema(schemaFileName); + int ncols = schema.length; + Map schemaMap = 
util.getSchemaMap(schemaMapFileName); - //FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, ncols); - int a = 100; - } - } - - - - @Test public void test17() throws Exception { - - MatrixBlock m = new MatrixBlock(10,10,true); - - for(int f = 2; f <= 2; f++) { - System.out.println("+++++++++++++++++++++ Q=" + f); - String sampleRawFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/F" + f + "/sample-queen" + - "-mm200.raw"; - String sampleMatrixFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/F" + f + "/sample-queen-mm200.matrix"; - String delimiter = "\\t"; - String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/queen-mm/queen-mm.data"; - - Util util = new Util(); - - MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, delimiter); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); - MatrixReader mr = gr.getReader(); - // MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); - - // FormatIdentifying fi = new FormatIdentifying(sampleRaw,sampleMB); - // - // myregex mr = new myregex(fi.getFormatProperties()); - // mr.readMatrixFromHDFS(dataFileName, -1, sampleMB.getNumColumns(), -1, -1); - - int a = 100; - - } + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index abc5328671b..5ea4c199c56 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ 
b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -21,8 +21,6 @@ import org.junit.Test; -import java.util.Random; - public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { private final static String TEST_NAME = "MatrixSingleRowFlatTest"; @@ -162,47 +160,4 @@ public void test14() { sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 204, 0}, {0, 0, 305}}; runGenerateReaderTest(); } - - - public static int getRandomNumber() { - Random r = new Random(); - int low = 0; - int high = 100000000; - int result = r.nextInt(high - low) + low; - return result; - } - - @Test - public void test15() { - - Integer[][] data = new Integer[1000][33554432]; - -// for(int i=0;i<1000;i++){ -// for(int j=Math.max(i-5,0);j<=i;j++) -// data[i][j] = getRandomNumber(); -// } -// StringBuilder sb = new StringBuilder(); -// -// int r=2; -// int c=1000000; -// sampleMatrix = new double[r][c]; -// for(int i=0;i Date: Sun, 27 Feb 2022 02:31:34 +0100 Subject: [PATCH 44/84] Update GIO EXp, SystemDS Reader --- .../java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index f9bdc5f9a0b..58ad3bc933c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -26,8 +26,8 @@ public static void main(String[] args) throws IOException, JSONException { String config = null; String schemaMapFileName = null; - Util util = new Util(); + schemaFileName = System.getProperty("schemaFileName"); dataFileName = System.getProperty("dataFileName"); // read and parse mtd file String mtdFileName = dataFileName + ".mtd"; @@ -47,6 +47,10 @@ public static void main(String[] args) throws IOException, JSONException { if (jsonObject.containsKey("rows")) rows = 
jsonObject.getLong("rows"); if (jsonObject.containsKey("header")) header = jsonObject.getBoolean("header"); + + if (jsonObject.containsKey("schema_path")) schemaFileName = jsonObject.getString("schema_path"); + + } catch (Exception exception) { } @@ -68,12 +72,10 @@ public static void main(String[] args) throws IOException, JSONException { if (matrixReader == null) throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); } else { - schemaFileName = System.getProperty("schemaFileName"); Types.ValueType[] schema = util.getSchema(schemaFileName); cols = schema.length; FrameBlock frameBlock = null; - switch (format) { case "csv": FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); From c09aac0c482b8ada70ccaf8a024c0456eedfa493 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 27 Feb 2022 03:43:54 +0100 Subject: [PATCH 45/84] Update GIO EXp --- src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index 58ad3bc933c..89c78aab9ff 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -34,7 +34,7 @@ public static void main(String[] args) throws IOException, JSONException { try { String mtd = util.readEntireTextFile(mtdFileName); mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = mtd.toLowerCase().trim(); + mtd = mtd.trim(); JSONObject jsonObject = new JSONObject(mtd); if (jsonObject.containsKey("data_type")) dataType = jsonObject.getString("data_type"); From 6c1e602ae9f7398d0fc5d5bc5bea8979a1ed2aff Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 27 Feb 2022 11:57:54 +0100 Subject: [PATCH 46/84] Update GIO Exp --- 
.../java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index 89c78aab9ff..c6af316fb2d 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -50,6 +50,10 @@ public static void main(String[] args) throws IOException, JSONException { if (jsonObject.containsKey("schema_path")) schemaFileName = jsonObject.getString("schema_path"); + if (jsonObject.containsKey("sep")) sep = jsonObject.getString("sep"); + + if (jsonObject.containsKey("indSep")) indSep = jsonObject.getString("indSep"); + } catch (Exception exception) { } From 5fe869abec1625320b8b3487fd2debc38ff20e8b Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 27 Feb 2022 15:23:54 +0100 Subject: [PATCH 47/84] Update GIO Exp --- .../java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index c6af316fb2d..f9faa672ba1 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -64,17 +64,19 @@ public static void main(String[] args) throws IOException, JSONException { case "csv": FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); matrixReader = new ReaderTextCSV(propertiesCSV); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); break; case "libsvm": FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); matrixReader = new ReaderTextLIBSVM(propertiesLIBSVM); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); break; case "mm": - 
matrixReader = new ReaderTextCell(Types.FileFormat.MM); + matrixReader = new ReaderTextCell(Types.FileFormat.MM, true); break; } if (matrixReader == null) throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); } else { Types.ValueType[] schema = util.getSchema(schemaFileName); cols = schema.length; From 28b9f0e6813392c85ef9bf1859b289c4e262e346 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 6 Apr 2022 23:44:48 +0200 Subject: [PATCH 48/84] Update for new custom properties --- .../sysds/runtime/iogen/CustomProperties.java | 6 +- .../runtime/iogen/FormatIdentifying.java | 103 +++++++++++++----- 2 files changed, 83 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 23aad4a5651..76e645c6b44 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -29,14 +29,18 @@ public class CustomProperties extends FileFormatProperties implements Serializable { public enum IndexProperties { + IDENTITY, EXIST,SEQSCATTER,ARRAY, IDENTIFY, PREFIX, KEY; - @Override public String toString() { return this.name().toUpperCase(); } } + private IndexProperties rowIndexProperties; + private IndexProperties colIndexProperties; + + private KeyTrie[] colKeyPattern; private Types.ValueType[] schema; private IndexProperties rowIndex; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 9a1d6c5552f..c277781ce10 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -30,11 +30,10 @@ 
import java.util.HashSet; import java.util.Map; - public class FormatIdentifying { private int[][] mapRow; - private int[] mapRowPrevious; + private int[] mapRowPrevious; private int[][] mapCol; private int[][] mapLen; private int NaN; @@ -48,7 +47,6 @@ public class FormatIdentifying { private ReaderMapping mappingValues; private CustomProperties properties; - public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { this.mappingValues = new ReaderMapping(raw, matrix); this.runIdentification(); @@ -67,7 +65,7 @@ private void runIdentification() { sampleRawIndexes = mappingValues.getSampleRawIndexes(); mapRowPrevious = new int[ncols]; - for(int c=0; c< ncols; c++) + for(int c = 0; c < ncols; c++) mapRowPrevious[c] = 0; nrows = mappingValues.getNrows(); @@ -75,23 +73,57 @@ private void runIdentification() { nlines = mappingValues.getNlines(); NaN = (ncols * nrows) - mappingValues.getNaN(); + // Index properties: + // 1. Identity: + // 2. Exist: + // 3. Sequential Scattered: + // 4. 
Array: + + /* supported formats by row and column indexes: + # | row | col | Value | example + -------------------------------------- + 1 | Identity | Identity | Exist | csv, JSON/XML L + 2 | Identity | Exist | Exist | LibSVm + 3 | Identity | Exist | Not-Exist | LibSVM+Pattern + 4 | Exist | Exist | Exist | MM Coordinate General + 5 | Array | Array | Exist | MM Array + 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric + 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric + 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern + 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern + 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner + */ + + // First, check the properties of row-index + boolean identity = isRowIndexIdentity(); + if(identity){ + KeyTrie[] colKeyPattern; + + // TODO: change method name from buildColsKeyPatternSingleRow to buildColPatternRowIdentity + colKeyPattern = buildColsKeyPatternSingleRow(); + + + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); + } + // Check the map row: // If all cells of a row mapped to a single line of sample raw, it is a single row mapping // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping boolean isSingleRow = false; int missedCount = 0; - for(int r=0; r1) + if(index > 1) break; } - for(int c=0; c< Math.min(numberOfSelectedCols, ncols); c++){ + for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); ArrayList prefixStringRowIndexes = colPrefixString.getValue(); @@ -127,19 +159,20 @@ private void runIdentification() { MappingTrie trie = new MappingTrie(); int ri = 0; - for(String ps: prefixStrings ) + for(String ps : prefixStrings) trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); do { flag = trie.reConstruct(); - 
}while(flag); + } + while(flag); ArrayList> keyPatterns = trie.getAllSequentialKeys(); - for(ArrayList kp: keyPatterns){ - for(String ps: prefixStrings){ + for(ArrayList kp : keyPatterns) { + for(String ps : prefixStrings) { StringBuilder sb = new StringBuilder(); int currPos = 0; - for(String k: kp){ + for(String k : kp) { sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); currPos += sb.length() + k.length(); } @@ -147,7 +180,7 @@ private void runIdentification() { } } - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + flag = checkPrefixRowIndex(c, begin, prefixRawIndex); if(!flag) { begin = 1; flag = checkPrefixRowIndex(c, begin, prefixRawIndex); @@ -158,11 +191,11 @@ private void runIdentification() { } else beginPos.add(begin); - if(c== numberOfSelectedCols -1){ + if(c == numberOfSelectedCols - 1) { ArrayList rowPrefixStrings = new ArrayList<>(); MappingTrie rowTrie = new MappingTrie(); rowKeyPattern = new KeyTrie(); - for(int si: selectedRowIndex) { + for(int si : selectedRowIndex) { for(int ci = 0; ci < ncols; ci++) { int cri = mapRow[si][ci]; if(cri != -1) { @@ -198,9 +231,10 @@ private void runIdentification() { if(!flagReconstruct) break; } - }while(!check); + } + while(!check); - if(keyPatterns.size() == 0){ + if(keyPatterns.size() == 0) { ArrayList> kpl = new ArrayList<>(); ArrayList kpli = new ArrayList<>(); kpli.add(""); @@ -211,12 +245,12 @@ private void runIdentification() { } } - if(beginPos.size() == 1){ + if(beginPos.size() == 1) { colKeyPattern = buildColsKeyPatternSingleRow(); properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); Integer bpos = beginPos.iterator().next(); - if(bpos>0) - properties.setRowIndexBegin("-"+bpos); + if(bpos > 0) + properties.setRowIndexBegin("-" + bpos); else properties.setRowIndexBegin(""); } @@ -228,6 +262,25 @@ private void runIdentification() { } } + //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + 
// check row-index Identity + private boolean isRowIndexIdentity() { + boolean identity = false; + int missedCount = 0; + for(int r = 0; r < nrows; r++) + missedCount += ncols - mostCommonScore(mapRow[r]); + if((float) missedCount / NaN < 0.07) + identity = true; + return identity; + } + // check col-index Identity + private boolean isColIndexIdentity(){ + return false; + } + + //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex){ for(int r=0;r Date: Mon, 30 May 2022 16:05:29 +0200 Subject: [PATCH 49/84] Extend ReaderMapping t support symmetric, skew-symmetric and pattern properties of matrix --- .../runtime/iogen/FormatIdentifying.java | 558 +++++++++++++----- .../sysds/runtime/iogen/ReaderMapping.java | 124 +++- .../runtime/iogen/RowIndexStructure.java | 67 +++ .../Identify/MatrixGRRowColIdentifyTest.java | 2 +- .../iogen/MatrixSingleRowFlatTest.java | 41 ++ 5 files changed, 640 insertions(+), 152 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index c277781ce10..49e33fffce7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -33,7 +33,6 @@ public class FormatIdentifying { private int[][] mapRow; - private int[] mapRowPrevious; private int[][] mapCol; private int[][] mapLen; private int NaN; @@ -59,51 +58,65 @@ public FormatIdentifying(String raw, FrameBlock frame) throws Exception { private void runIdentification() { + /* Index properties: + 1. Identity: + 2. Exist: + 3. Sequential Scattered: + 4. 
Array: + + supported formats by row and column indexes: + # | row | col | Value | example + -------------------------------------- + 1 | Identity | Identity | Exist | csv, JSON/XML L + 2 | Identity | Exist | Exist | LibSVM + 3 | Identity | Exist | Not-Exist | LibSVM+Pattern + 4 | Exist | Exist | Exist | MM Coordinate General + 5 | Array | Array | Exist | MM Array + 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric + 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric + 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern + 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern + 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner + + strategy for checking the structure of indexes and values: + 1. map values: + 1.a values are full exist in the source + 1.b values are partially exist in the dataset (we have to check the Symmetric, Skew-Symmetric, and so on) + 1.c values are not exist in the source, in this case we have to check static value(s) + 2. map indexes: + 2.a after finding value properties the next step is looking for index maps, row index is in the first order + 2.b column index mapping + */ + + // value mapping mapRow = mappingValues.getMapRow(); mapCol = mappingValues.getMapCol(); mapLen = mappingValues.getMapLen(); - sampleRawIndexes = mappingValues.getSampleRawIndexes(); - mapRowPrevious = new int[ncols]; - for(int c = 0; c < ncols; c++) - mapRowPrevious[c] = 0; + // save line by line index of string(index for Int, Long, float, Double, String, Boolean) + sampleRawIndexes = mappingValues.getSampleRawIndexes(); + // matrix/frame properties for analysis and create datastructures nrows = mappingValues.getNrows(); ncols = mappingValues.getNcols(); nlines = mappingValues.getNlines(); NaN = (ncols * nrows) - mappingValues.getNaN(); - // Index properties: - // 1. Identity: - // 2. Exist: - // 3. Sequential Scattered: - // 4. 
Array: - - /* supported formats by row and column indexes: - # | row | col | Value | example - -------------------------------------- - 1 | Identity | Identity | Exist | csv, JSON/XML L - 2 | Identity | Exist | Exist | LibSVm - 3 | Identity | Exist | Not-Exist | LibSVM+Pattern - 4 | Exist | Exist | Exist | MM Coordinate General - 5 | Array | Array | Exist | MM Array - 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric - 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric - 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern - 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern - 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner - */ + // analysis mapping of values + // 1. check (exist, partially exist, not exist) + // 2. check the records represented in single/multilines + // 3. check the Symmetric, Skew-Symmetric, Pattern, and Array + + // First, check the properties of row-index boolean identity = isRowIndexIdentity(); - if(identity){ + if(identity) { KeyTrie[] colKeyPattern; // TODO: change method name from buildColsKeyPatternSingleRow to buildColPatternRowIdentity colKeyPattern = buildColsKeyPatternSingleRow(); - - - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); + properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTITY); } // Check the map row: @@ -273,21 +286,291 @@ private boolean isRowIndexIdentity() { identity = true; return identity; } - // check col-index Identity - private boolean isColIndexIdentity(){ - return false; + + // check roe-index Exist + // 1. row-index exist and can be reachable with a pattern + // 2. row-index exist but there is no pattern for it + // 3. row-index exist but just not for all cells! 
row-index appeared when the text broken newline="\n" + private RowIndexStructure isRowIndexExist() { + // Check the row index is a prefix string in sample raw + // if the row indexes are in the prefix of values, so we need to build a key pattern to extract row indexes + // for understanding row indexes are in sample raw we check just 3 column of data + // for build a key pattern related to row indexes we just selected a row + + //public enum IndexProperties { + // IDENTITY, + // CELLWISEEXIST, + // CELLWISEEXISTPATTERNLESS, + // ROWWISEEXIST, *** + // SEQSCATTER, + // ARRAY; + // @Override + // public String toString() { + // return this.name().toUpperCase(); + // } + // } + RowIndexStructure rowIndexStructure = new RowIndexStructure(); + BitSet[] bitSets = new BitSet[nrows]; + int[] rowCardinality = new int[nrows]; + int[] rowNZ = new int[nrows]; + boolean isCellWise = true; + boolean isSeqScatter = true; + boolean isExist = true; + for(int r = 0; r < nrows; r++) { + bitSets[r] = new BitSet(nlines); + rowNZ[r] = 0; + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1) { + bitSets[r].set(mapRow[r][c]); + rowNZ[r]++; + } + } + rowCardinality[r] = bitSets[r].cardinality(); + } + // check for Cell Wise + for(int r = 0; r < nrows && isCellWise; r++) + isCellWise = rowCardinality[r] == rowNZ[r]; + + // check for Sequential: + for(int r = 0; r < nrows && isSeqScatter; r++) { + BitSet bitSet = bitSets[r]; + int beginIndex = bitSet.nextSetBit(0); + for(int i = bitSet.nextSetBit(beginIndex + 1); i != -1 && isSeqScatter; i = bitSet.nextSetBit(i + 1)) + isSeqScatter = i == ++beginIndex; + } + + // check exist: + int begin; + if(isCellWise) { + for(int c = 0; c < ncols; c++) { + begin = checkRowIndexesOnColumnRaw(c, 0); + if(begin == -1) { + isExist = false; + break; + } + } + } + else { + ArrayList list = new ArrayList<>(); + for(int r = 0; r < nrows; r++) { + BitSet bitSet = bitSets[r]; + for(int i = bitSet.nextSetBit(0); i != -1 && isSeqScatter; i = 
bitSet.nextSetBit(i + 1)) + list.add(sampleRawIndexes.get(i)); + begin = checkRowIndexOnRaws(r, 0, list); + if(begin == -1) { + isExist = false; + break; + } + } + } + +// if(isCellWise && isExist) { +// rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); +// } +// else if(!isCellWise && isExist) { +// rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.ROWWISEEXIST); +// } +// else if(isCellWise && !isExist) { +// +// } + + // RowIndexStructure rowIndexStructure = null; + // int numberOfSelectedCols = 3; + // MatrixBlock rowIndexMB = new MatrixBlock(nrows, ncols, false); + // int scol = Math.min(ncols - numberOfSelectedCols, ncols); + // for(int r=0; r beginPos = new HashSet<>(); +// KeyTrie rowPattern = null; +// +// // Select two none zero row as a row index candidate +// int index = 0; +// for(int r = 1; r < nrows; r++) { +// for(int c = 0; c < ncols; c++) +// if(mapRow[r][c] != -1) { +// selectedRowIndex[index++] = r; +// break; +// } +// if(index > 1) +// break; +// } + +// // CELLWISEEXIST: when row index exist in each cell value +// for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { +// +// Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); +// ArrayList prefixStrings = colPrefixString.getKey(); +// ArrayList prefixStringRowIndexes = colPrefixString.getValue(); +// ArrayList prefixRawIndex = new ArrayList<>(); +// +// MappingTrie trie = new MappingTrie(); +// int ri = 0; +// for(String ps : prefixStrings) +// trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); +// +// do { +// flag = trie.reConstruct(); +// } +// while(flag); +// +// ArrayList> keyPatterns = trie.getAllSequentialKeys(); +// for(ArrayList kp : keyPatterns) { +// for(String ps : prefixStrings) { +// StringBuilder sb = new StringBuilder(); +// int currPos = 0; +// for(String k : kp) { +// sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); +// currPos += sb.length() + k.length(); +// } +// 
prefixRawIndex.add(new RawIndex(sb.toString())); +// } +// } +// +// flag = checkPrefixRowIndex(c, begin, prefixRawIndex); +// if(!flag) { +// begin = 1; +// flag = checkPrefixRowIndex(c, begin, prefixRawIndex); +// } +// if(!flag) { +// beginPos.clear(); +// break; +// } +// else +// beginPos.add(begin); +// if(c == numberOfSelectedCols - 1) { +// ArrayList rowPrefixStrings = new ArrayList<>(); +// MappingTrie rowTrie = new MappingTrie(); +// rowPattern = new KeyTrie(); +// for(int si : selectedRowIndex) { +// for(int ci = 0; ci < ncols; ci++) { +// int cri = mapRow[si][ci]; +// if(cri != -1) { +// String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); +// RawIndex rawIndex = new RawIndex(str); +// Pair pair = rawIndex.findValue(si + begin); +// if(pair != null) { +// String pstr = str.substring(0, pair.getKey()); +// if(pstr.length() > 0) { +// rowPrefixStrings.add(pstr); +// rowTrie.insert(pstr, 1); +// } +// rowPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); +// } +// } +// } +// } +// +// do { +// ArrayList> selectedKeyPatterns = new ArrayList<>(); +// keyPatterns = rowTrie.getAllSequentialKeys(); +// check = false; +// for(ArrayList keyPattern : keyPatterns) { +// boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); +// check |= newCheck; +// if(newCheck) +// selectedKeyPatterns.add(keyPattern); +// } +// if(check) +// keyPatterns = selectedKeyPatterns; +// else { +// flagReconstruct = rowTrie.reConstruct(); +// if(!flagReconstruct) +// break; +// } +// } +// while(!check); +// +// if(keyPatterns.size() == 0) { +// ArrayList> kpl = new ArrayList<>(); +// ArrayList kpli = new ArrayList<>(); +// kpli.add(""); +// kpl.add(kpli); +// keyPatterns = kpl; +// } +// rowPattern.setPrefixKeyPattern(keyPatterns); +// } +// } + // if(beginPos.size() == 1) { + // rowIndexStructure = new RowIndexStructure(); + // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); + 
// rowIndexStructure.setKeyPattern(rowPattern); + // Integer bpos = beginPos.iterator().next(); + // if(bpos > 0) + // rowIndexStructure.setRowIndexBegin("-" + bpos); + // else + // rowIndexStructure.setRowIndexBegin(""); + // } + // return rowIndexStructure; + return null; } - //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + private int checkRowIndexesOnColumnRaw(int colIndex, int beginPos) { + int nne = 0; + for(int r = 0; r < nrows; r++) { + RawIndex raw = sampleRawIndexes.get(mapRow[r][colIndex]); + raw.cloneReservedPositions(); + Pair pair = raw.findValue(r + beginPos); + raw.restoreReservedPositions(); + if(pair == null) + nne++; + } + + if(nne > nrows * 0.3) { + if(beginPos == 1) + return -1; + else + return checkRowIndexesOnColumnRaw(colIndex, 1); + } + else + return beginPos; + } + private int checkRowIndexOnRaws(int rowIndex, int beginPos, ArrayList list) { + int nne = 0; + for(RawIndex raw : list) { + raw.cloneReservedPositions(); + Pair pair = raw.findValue(rowIndex + beginPos); + if(pair == null) + nne++; + raw.restoreReservedPositions(); + } - private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex){ - for(int r=0;r list.size() * 0.3) { + if(beginPos == 1) + return -1; + else + return checkRowIndexOnRaws(rowIndex, 1, list); + } + else + return beginPos; + } + + //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex) { + for(int r = 0; r < nrows; r++) { int rowIndex = this.mapRow[r][colIndex]; - if(rowIndex!=-1){ + if(rowIndex != -1) { boolean flag = false; - for(RawIndex ri: prefixRawIndex) { - if(ri.findValue(r+ beginPos) != null) { + for(RawIndex ri : prefixRawIndex) { + if(ri.findValue(r + beginPos) != null) { flag = true; break; } @@ -306,19 +589,20 @@ public CustomProperties getFormatProperties() { private Integer 
mostCommonScore(int[] list) { Map map = new HashMap<>(); int nan = 0; - for (Integer t : list) { - if (t != -1) { + for(Integer t : list) { + if(t != -1) { Integer val = map.get(t); map.put(t, val == null ? 1 : val + 1); - } else + } + else nan++; } - if (map.size() == 0) + if(map.size() == 0) return nan; Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) + for(Map.Entry e : map.entrySet()) { + if(max == null || e.getValue() > max.getValue()) max = e; } return max.getValue() + nan; @@ -326,18 +610,18 @@ private Integer mostCommonScore(int[] list) { private Integer mostCommonValue(int[] list) { Map map = new HashMap<>(); - for (Integer t : list) { - if (t != -1) { + for(Integer t : list) { + if(t != -1) { Integer val = map.get(t); map.put(t, val == null ? 1 : val + 1); } } - if (map.size() == 0) + if(map.size() == 0) return -1; Map.Entry max = null; - for (Map.Entry e : map.entrySet()) { - if (max == null || e.getValue() > max.getValue()) + for(Map.Entry e : map.entrySet()) { + if(max == null || e.getValue() > max.getValue()) max = e; } return max.getKey(); @@ -349,7 +633,7 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { KeyTrie[] colKeyPattens = new KeyTrie[ncols]; // Clean prefix strings - for(int c =0; c< ncols; c++) { + for(int c = 0; c < ncols; c++) { ArrayList list = prefixStrings.getKey()[c]; String token = null; boolean flag = true; @@ -360,7 +644,7 @@ private KeyTrie[] buildColsKeyPatternSingleRow() { flag = false; else { String subStr = s.substring(s.length() - w); - if (!subStr.contains(Lop.OPERAND_DELIMITOR)) + if(!subStr.contains(Lop.OPERAND_DELIMITOR)) wts.add(subStr); else flag = false; @@ -388,12 +672,12 @@ else if(wts.size() == 0) } if(token == null) { int[] listLength = new int[nrows]; - for (int r = 0; r< nrows; r++) + for(int r = 0; r < nrows; r++) listLength[r] = mapCol[r][c]; int commonLength = mostCommonValue(listLength); - if (commonLength == 0){ + if(commonLength == 0) { 
ArrayList newList = new ArrayList<>(); - for(String s: list){ + for(String s : list) { if(s.length() == 0) newList.add(s); } @@ -402,9 +686,9 @@ else if(wts.size() == 0) else throw new RuntimeException("can't build a key pattern for the column: " + c); } - else if(token.length() > 0){ + else if(token.length() > 0) { ArrayList newList = new ArrayList<>(); - for(String s: list){ + for(String s : list) { if(s.endsWith(token)) newList.add(s); } @@ -412,35 +696,34 @@ else if(token.length() > 0){ } } - for(int c=0; c> keyPatterns = null; - - for(String ps: prefixStrings.getKey()[c]) + for(String ps : prefixStrings.getKey()[c]) trie.reverseInsert(ps, prefixStrings.getValue()[c].get(ri++)); - if (trie.getRoot().getChildren().size() == 1){ - String[] splitPattern= prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); + if(trie.getRoot().getChildren().size() == 1) { + String[] splitPattern = prefixStrings.getKey()[c].get(0).split(Lop.OPERAND_DELIMITOR); ArrayList reverseSplitPattern = new ArrayList<>(); - for (String ps: splitPattern) - if (ps.length() > 0) + for(String ps : splitPattern) + if(ps.length() > 0) reverseSplitPattern.add(ps); - if (reverseSplitPattern.size() == 0) + if(reverseSplitPattern.size() == 0) reverseSplitPattern.add(""); int maxPatternLength = reverseSplitPattern.size(); check = false; - for(int sp= 0; sp< maxPatternLength;sp++){ + for(int sp = 0; sp < maxPatternLength; sp++) { ArrayList shortPattern = new ArrayList<>(); - for(int spi= maxPatternLength - sp-1; spi< maxPatternLength; spi++){ + for(int spi = maxPatternLength - sp - 1; spi < maxPatternLength; spi++) { shortPattern.add(reverseSplitPattern.get(spi)); } check = checkKeyPatternIsUnique(prefixStrings.getKey()[c], shortPattern); - if (check) { + if(check) { keyPatterns = new ArrayList<>(); keyPatterns.add(shortPattern); break; @@ -452,26 +735,27 @@ else if(token.length() > 0){ ArrayList> selectedKeyPatterns = new ArrayList<>(); keyPatterns = trie.getAllSequentialKeys(); check = false; - 
for (ArrayList keyPattern : keyPatterns) { + for(ArrayList keyPattern : keyPatterns) { boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); check |= newCheck; - if (newCheck) + if(newCheck) selectedKeyPatterns.add(keyPattern); } - if (check) + if(check) keyPatterns = selectedKeyPatterns; else { flagReconstruct = trie.reConstruct(); - if (!flagReconstruct) + if(!flagReconstruct) break; } - } while (!check); + } + while(!check); } - if(check){ + if(check) { colKeyPattens[c] = new KeyTrie(keyPatterns); - for(String suffix: suffixStrings[c]) { - colKeyPattens[c].insertSuffixKeys(suffix.substring(0,Math.min(suffixStringLength, suffix.length())).toCharArray()); + for(String suffix : suffixStrings[c]) { + colKeyPattens[c].insertSuffixKeys(suffix.substring(0, Math.min(suffixStringLength, suffix.length())).toCharArray()); } } } @@ -482,7 +766,7 @@ else if(token.length() > 0){ public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { ArrayList[] prefixStrings = new ArrayList[ncols]; ArrayList[] rowIndexes = new ArrayList[ncols]; - for(int c=0; c< ncols; c++){ + for(int c = 0; c < ncols; c++) { Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, reverse); prefixStrings[c] = pair.getKey(); rowIndexes[c] = pair.getValue(); @@ -497,7 +781,7 @@ public Pair, ArrayList> extractAllPrefixStringsOfACol int rowIndex = mapRow[r][colIndex]; if(rowIndex != -1) { rowIndexes.add(rowIndex); - String str = sampleRawIndexes.get(rowIndex).getRemainedTexts(mapCol[r][colIndex]);//sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][colIndex]); + String str = sampleRawIndexes.get(rowIndex).getRemainedTexts(mapCol[r][colIndex]); if(reverse) prefixStrings.add(new StringBuilder(str).reverse().toString()); else @@ -531,7 +815,7 @@ private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { // 2. Build key pattern tree for each column // 3. 
Build key pattern for end of values - private ArrayList> findRowDelimiters(){ + private ArrayList> findRowDelimiters() { ArrayList> keyPattern = new ArrayList<>(); Hirschberg hirschberg = new Hirschberg(); int misMatchPenalty = 3; @@ -544,36 +828,36 @@ private ArrayList> findRowDelimiters(){ int[] minList = new int[nrows]; HashMap maxColPos = new HashMap<>(); int[] minColPos = new int[nrows]; - for(int r=0; r> findRowDelimiters(){ StringBuilder sb = new StringBuilder(); beginLine = Math.max(beginLine, 0); - if(beginLine+1 == nlines) + if(beginLine + 1 == nlines) continue; Integer subStrPos = 0; @@ -595,10 +879,10 @@ private ArrayList> findRowDelimiters(){ subStrPos = maxColPos.get(beginLine); String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); - if(str.length() >0) { + if(str.length() > 0) { sb.append(str).append("\n"); } - for(int i = beginLine+1 ; i < minList[r]; i++){ + for(int i = beginLine + 1; i < minList[r]; i++) { str = sampleRawIndexes.get(i).getRaw(); if(str.length() > 0) sb.append(str).append("\n"); @@ -609,7 +893,7 @@ private ArrayList> findRowDelimiters(){ sb.append(str); recordBoundaries.add(sb.toString()); } - recordBoundaries.remove(recordBoundaries.size()-1); + recordBoundaries.remove(recordBoundaries.size() - 1); String str1 = recordBoundaries.get(0); String str2 = recordBoundaries.get(1); @@ -634,14 +918,13 @@ private ArrayList> findRowDelimiters(){ return null; } - // Build key pattern tree for each column - private KeyTrie[] buildColsKeyPatternMultiRow(){ + private KeyTrie[] buildColsKeyPatternMultiRow() { Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - for(int c=0; c> keyPatterns = trie.getPrefixKeyPatterns(); check = false; - for(ArrayList keyPattern: keyPatterns) { + for(ArrayList keyPattern : keyPatterns) { boolean newCheck = 
checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); check |= newCheck; - if(newCheck){ + if(newCheck) { trie.setAPrefixPath(keyPattern); } } - if(!check){ + if(!check) { remainedPrefixes.clear(); boolean flag = true; - for(ArrayList keyPattern: keyPatterns){ + for(ArrayList keyPattern : keyPatterns) { ArrayList remainedPrefix = new ArrayList<>(); for(String ps : prefixStrings.getKey()[c]) remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); @@ -687,7 +970,7 @@ private KeyTrie[] buildColsKeyPatternMultiRow(){ while(!check); // Suffix pattern is based on char, so we need to extract all chars of a string - for(String suffix: suffixStrings[c]) { + for(String suffix : suffixStrings[c]) { trie.insertSuffixKeys(suffix.toCharArray()); } colKeyPattens[c] = trie; @@ -696,62 +979,61 @@ private KeyTrie[] buildColsKeyPatternMultiRow(){ } // Extract prefix strings: - private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse){ + private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse) { ArrayList[] result = new ArrayList[ncols]; Pair[] minmax = new Pair[ncols]; BitSet[] tmpUsedLines = new BitSet[nlines]; BitSet[] usedLines = new BitSet[nlines]; - for(int r=0; r(); int min = 0; int max = 0; - for(int r=0; r=0; i--) + for(int i = rowIndex - 1; i >= 0; i--) if(usedLines[r].get(i)) { lastLine = i; break; } - for(int i= lastLine; i 0 ) + for(int i = lastLine; i < rowIndex; i++) { + if(sampleRawIndexes.get(i).getRawLength() > 0) sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); } String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][c]); if(str.length() > 0 && !str.equals("\n")) sb.append(str); else if(lastLine < rowIndex) - sb.deleteCharAt(sb.length()-1); - + sb.deleteCharAt(sb.length() - 1); if(reverse) result[c].add(sb.reverse().toString()); else result[c].add(sb.toString()); max = Math.max(max, sb.length()); - if(sb.length()< min || min == 0) + if(sb.length() < min || min == 0) min = sb.length(); 
minmax[c] = new Pair<>(min, max); } @@ -759,15 +1041,15 @@ else if(lastLine < rowIndex) return new Pair<>(result, minmax); } - private String findStartWithIntersectOfStrings(ArrayList strList, int minLength){ + private String findStartWithIntersectOfStrings(ArrayList strList, int minLength) { StringBuilder sb = new StringBuilder(); int i = 0; boolean flag = true; do { char ch = strList.get(0).charAt(i); - for(int j=1; j strList, int mi if(flag) sb.append(ch); i++; - }while(flag && i< minLength); + } + while(flag && i < minLength); return sb.toString(); } - private HashSet findStartWithIntersectOfStrings(ArrayList strList){ + private HashSet findStartWithIntersectOfStrings(ArrayList strList) { // 1. Extract all substrings // 2. Find intersection of substrings HashSet[] substrings = new HashSet[strList.size()]; - for(int i=0; i< strList.size(); i++) + for(int i = 0; i < strList.size(); i++) substrings[i] = new HashSet<>(); for(int w = windowSize; w > 2; w--) { - for(int i=0; i totalIntersect = new HashSet<>(substrings[0]); - for(int r=1; r 0) - return totalIntersect; + return totalIntersect; } return null; } - private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys){ - if(keys.size() == 1){ + private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys) { + if(keys.size() == 1) { String k = keys.get(0); - if (k.length() == 0) + if(k.length() == 0) return true; } - for(String ps: prefixStrings){ + for(String ps : prefixStrings) { int currentPos = 0; int patternCount = 0; do { @@ -823,8 +1106,9 @@ private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayLi patternCount++; currentPos++; } - }while(true); - if(patternCount!=1) + } + while(true); + if(patternCount != 1) return false; } return true; @@ -844,20 +1128,20 @@ private Pair getIndexOfKeyPatternOnString(String str, ArrayLis flag = false; break; } - if(startPos==-1) + if(startPos == -1) startPos = currPos; } if(flag) - return new Pair<>(startPos, 
currPos+key.get(key.size()-1).length()); + return new Pair<>(startPos, currPos + key.get(key.size() - 1).length()); else - return new Pair<>(-1,-1); + return new Pair<>(-1, -1); } - private ArrayList getAllSubstringsOfAString(String str,int size){ + private ArrayList getAllSubstringsOfAString(String str, int size) { ArrayList result = new ArrayList<>(); if(str == null) - return result; - for(int i = 0; i <= str.length() - size; i++){ + return result; + for(int i = 0; i <= str.length() - size; i++) { String s = str.substring(i, i + size); if(!s.contains("\n")) result.add(s); @@ -865,7 +1149,7 @@ private ArrayList getAllSubstringsOfAString(String str,int size){ return result; } - private String getRemainedSubstring(String str, ArrayList keys){ + private String getRemainedSubstring(String str, ArrayList keys) { boolean flag = true; int currPos = 0; for(String k : keys) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index de6c1e98611..1429debe4ff 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -37,6 +37,16 @@ public class ReaderMapping { private int[][] mapCol; private int[][] mapLen; private boolean mapped; + private boolean fullMap; + private boolean upperTriangularMap; + private boolean lowerTriangularMap; + private boolean symmetricMap; + private boolean skewSymmetricMap; + private boolean symmetricUpperMap; + private boolean skewSymmetricUpperMap; + private boolean patternMap; + private Double patternValueMap; + private final int nrows; private final int ncols; private int nlines; @@ -47,8 +57,7 @@ public class ReaderMapping { private Types.ValueType[] schema; private final boolean isMatrix; - public ReaderMapping(int nlines, int nrows, int ncols, ArrayList sampleRawIndexes, MatrixBlock matrix) - throws Exception { + public ReaderMapping(int nlines, int nrows, int 
ncols, ArrayList sampleRawIndexes, MatrixBlock matrix) throws Exception { this.nlines = nlines; this.nrows = nrows; this.ncols = ncols; @@ -109,14 +118,12 @@ protected boolean findMapping(boolean isIndexMapping) { int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { - if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, - c) != 0) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, + if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { HashSet checkedLines = new HashSet<>(); while(checkedLines.size() < nlines) { RawIndex ri = sampleRawIndexes.get(itRow); - Pair pair = this.isMatrix ? ri.findValue( - sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); + Pair pair = this.isMatrix ? ri.findValue(sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); if(pair != null) { mapRow[r][c] = itRow; mapCol[r][c] = pair.getKey(); @@ -135,15 +142,104 @@ protected boolean findMapping(boolean isIndexMapping) { NaN++; } } - boolean flagMap = true; - for(int r = 0; r < nrows && flagMap; r++) - for(int c = 0; c < ncols && flagMap; c++) - if(mapRow[r][c] == -1 && ((!this.isMatrix && this.sampleFrame.get(r, - c) != null) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, - c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { - flagMap = false; + + // analysis mapping of values + // 1. check (exist, partially exist, not exist) + // 2. check the records represented in single/multilines + // 3. 
check the Symmetric, Skew-Symmetric, Pattern, and Array + + int fullMap = 0; + int upperTriangular = 0; + int upperTriangularZeros = 0; + int lowerTriangular = 0; + int lowerTriangularZeros = 0; + boolean singleLineRecord = true; + + // check full map + for(int r = 0; r < nrows; r++) + for(int c = 0; c < ncols; c++) + if(mapRow[r][c] != -1) + fullMap++; + + // check for upper and lower triangular + if(nrows == ncols) { + this.upperTriangularMap = true; + this.lowerTriangularMap = true; + this.symmetricMap = true; + this.symmetricUpperMap = true; + this.skewSymmetricMap = true; + this.skewSymmetricUpperMap = true; + this.patternMap = false; + this.patternValueMap = null; + + if(this.isMatrix) { + for(int r = 0; r < nrows; r++) { + // upper triangular check + for(int c = r; c < ncols && this.upperTriangularMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] == -1) + this.upperTriangularMap = false; + + for(int c = 0; c < r && this.upperTriangularMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0) + this.upperTriangularMap = false; + + // lower triangular check + for(int c = 0; c <= r && this.lowerTriangularMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] == -1) + this.lowerTriangularMap = false; + + for(int c = r + 1; c < ncols && this.lowerTriangularMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0) + this.lowerTriangularMap = false; + + // Symmetric check + for(int c = 0; c <= r && this.symmetricMap; c++) + if(this.sampleMatrix.getValue(r, c) != this.sampleMatrix.getValue(c, r)) + this.symmetricMap = false; + + if(this.symmetricMap) { + for(int c = 0; c <= r && this.symmetricUpperMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] != -1) { + this.symmetricUpperMap = false; + break; + } + } + + // Skew-Symmetric check + for(int c = 0; c <= r && this.skewSymmetricMap; c++) + if(this.sampleMatrix.getValue(r, c) != this.sampleMatrix.getValue(c, r) * -1) + this.skewSymmetricMap = false; + + 
if(this.skewSymmetricMap) { + for(int c = 0; c <= r && this.skewSymmetricUpperMap; c++) + if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] != -1) { + this.skewSymmetricUpperMap = false; + break; + } + } + // pattern check + HashSet patternValueSet = new HashSet<>(); + for(int c = 0; c < ncols; c++) + patternValueSet.add(this.sampleMatrix.getValue(r, c)); + if(patternValueSet.size() == 1) { + this.patternMap = true; + this.patternValueMap = patternValueSet.iterator().next(); + } } - return flagMap; + } + + } + + System.out.println("upperTriangularMap=" + upperTriangularMap); + System.out.println("lowerTriangularMap=" + lowerTriangularMap); + System.out.println("symmetric=" + symmetricMap); + System.out.println("symmetricUpperMap=" + symmetricUpperMap); + System.out.println("skewSymmetricMap = " + skewSymmetricMap); + System.out.println("skewSymmetricUpperMap=" + skewSymmetricUpperMap); + System.out.println("patternMap=" + patternMap); + System.out.println("patternValueMap=" + patternValueMap); + + return false; } public int getNaN() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java new file mode 100644 index 00000000000..eb586dc80b8 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +public class RowIndexStructure { + + public enum IndexProperties { + Identity, + CellWiseExist, + CellWiseExistPatternLess, + RowWiseExist, + SeqScatter, + ArrayRowMajor, + ArrayColMajor, + ArrayShapeRowMajor, + ArrayShapeCol; + @Override + public String toString() { + return this.name().toUpperCase(); + } + } + + private IndexProperties properties; + private KeyTrie keyPattern; + private String rowIndexBegin; + + public IndexProperties getProperties() { + return properties; + } + + public void setProperties(IndexProperties properties) { + this.properties = properties; + } + + public KeyTrie getKeyPattern() { + return keyPattern; + } + + public void setKeyPattern(KeyTrie keyPattern) { + this.keyPattern = keyPattern; + } + + public String getRowIndexBegin() { + return rowIndexBegin; + } + + public void setRowIndexBegin(String rowIndexBegin) { + this.rowIndexBegin = rowIndexBegin; + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 40424bc54f3..2f1463de0a0 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -273,7 +273,7 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou ArrayList newSampleSchema = new ArrayList<>(); ArrayList> newSampleFrame = new 
ArrayList<>(); - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,ncols); for(int c = 0; c < sampleFrameStrings[0].length; c++) { HashSet valueSet = new HashSet<>(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index 5ea4c199c56..b239a6f835d 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -160,4 +160,45 @@ public void test14() { sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 204, 0}, {0, 0, 305}}; runGenerateReaderTest(); } + + + //========================= + @Test + public void test15() { + sampleRaw = "0,1,2,3\n" + "10,0,20,30\n" + "100,200,0,300\n"+"1000,2000,3000,0"; + sampleMatrix = new double[][] {{0,1,2,3}, {10,0,20,30}, {100,200,300,0},{1000,2000,3000,0}}; + runGenerateReaderTest(); + } + + //upper-triangular + @Test + public void test16() { + sampleRaw = "1,2,3,4\n" + "0,20,30,40\n" + "0,0,300,400\n"+"0,0,0,4000"; + sampleMatrix = new double[][] {{1,2,3,4}, {0,20,30,40}, {0,0,300,400},{0,0,0,4000}}; + runGenerateReaderTest(); + } + + //lower-triangular + @Test + public void test17() { + sampleRaw = "1,0,0,0\n" + "10,20,0,0\n" + "100,200,300,0\n"+"1000,2000,3000,4000"; + sampleMatrix = new double[][] {{1,0,0,0}, {10,20,0,0}, {100,200,300,0},{1000,2000,3000,4000}}; + runGenerateReaderTest(); + } + + //symmetric + @Test + public void test19() { + sampleRaw = "1,2,3,4\n" + "2,2,4,5\n" + "3,4,3,6\n"+"4,5,6,4"; + sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; + runGenerateReaderTest(); + } + + //symmetric-upper + @Test + public void test20() { + sampleRaw = "1,2,3,4\n" + "0,2,4,5\n" + "0,0,3,6\n"+"0,0,0,4"; + sampleMatrix = 
new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; + runGenerateReaderTest(); + } } From d377fd0b41c1de25c323779f06f644f0bcb0f176 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 7 Jun 2022 15:45:31 +0200 Subject: [PATCH 50/84] Fix Symmetric and Skew-Symmetric properties bugs --- .../sysds/runtime/iogen/ReaderMapping.java | 136 +++++++++++------- .../iogen/MatrixSingleRowFlatTest.java | 8 ++ 2 files changed, 90 insertions(+), 54 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index 1429debe4ff..11edaf8d486 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -24,6 +24,7 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.runtime.util.UtilFunctions; import java.io.BufferedReader; import java.io.InputStream; @@ -42,10 +43,9 @@ public class ReaderMapping { private boolean lowerTriangularMap; private boolean symmetricMap; private boolean skewSymmetricMap; - private boolean symmetricUpperMap; - private boolean skewSymmetricUpperMap; private boolean patternMap; - private Double patternValueMap; + private Object patternValueMap; + private Types.ValueType patternValueType; private final int nrows; private final int ncols; @@ -166,82 +166,110 @@ protected boolean findMapping(boolean isIndexMapping) { this.upperTriangularMap = true; this.lowerTriangularMap = true; this.symmetricMap = true; - this.symmetricUpperMap = true; this.skewSymmetricMap = true; - this.skewSymmetricUpperMap = true; this.patternMap = false; this.patternValueMap = null; - if(this.isMatrix) { - for(int r = 0; r < nrows; r++) { - // upper triangular check - for(int c = r; c < ncols && this.upperTriangularMap; c++) - 
if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] == -1) - this.upperTriangularMap = false; - - for(int c = 0; c < r && this.upperTriangularMap; c++) - if(this.sampleMatrix.getValue(r, c) != 0) - this.upperTriangularMap = false; - - // lower triangular check - for(int c = 0; c <= r && this.lowerTriangularMap; c++) - if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] == -1) - this.lowerTriangularMap = false; - - for(int c = r + 1; c < ncols && this.lowerTriangularMap; c++) - if(this.sampleMatrix.getValue(r, c) != 0) - this.lowerTriangularMap = false; - - // Symmetric check - for(int c = 0; c <= r && this.symmetricMap; c++) - if(this.sampleMatrix.getValue(r, c) != this.sampleMatrix.getValue(c, r)) - this.symmetricMap = false; - - if(this.symmetricMap) { - for(int c = 0; c <= r && this.symmetricUpperMap; c++) - if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] != -1) { - this.symmetricUpperMap = false; - break; - } - } + // pattern check for Frame: in Frame the schema must be same for all columns + boolean homoSchema = true; + Types.ValueType vtc0 = null; + if(!this.isMatrix) { + vtc0 = this.sampleFrame.getSchema()[0]; + for(int c = 1; c < ncols && homoSchema; c++) + homoSchema = this.sampleFrame.getSchema()[c] == vtc0; + } - // Skew-Symmetric check - for(int c = 0; c <= r && this.skewSymmetricMap; c++) - if(this.sampleMatrix.getValue(r, c) != this.sampleMatrix.getValue(c, r) * -1) - this.skewSymmetricMap = false; - - if(this.skewSymmetricMap) { - for(int c = 0; c <= r && this.skewSymmetricUpperMap; c++) - if(this.sampleMatrix.getValue(r, c) != 0 && mapRow[r][c] != -1) { - this.skewSymmetricUpperMap = false; - break; - } - } - // pattern check + for(int r = 0; r < nrows; r++) { + // upper triangular check + for(int c = r; c < ncols && this.upperTriangularMap; c++) + if(this.checkValueIsNotNullZero(r, c) && mapRow[r][c] == -1) + this.upperTriangularMap = false; + + for(int c = 0; c < r && this.upperTriangularMap; c++) + 
if(this.checkValueIsNotNullZero(r, c)) + this.upperTriangularMap = false; + + // lower triangular check + for(int c = 0; c <= r && this.lowerTriangularMap; c++) + if(this.checkValueIsNotNullZero(r, c) && mapRow[r][c] == -1) + this.lowerTriangularMap = false; + + for(int c = r + 1; c < ncols && this.lowerTriangularMap; c++) + if(this.checkValueIsNotNullZero(r, c)) + this.lowerTriangularMap = false; + + // Symmetric check + for(int c = 0; c <= r && this.symmetricMap; c++) + this.symmetricMap = this.checkSymmetricValue(r, c, 1); + + // Skew-Symmetric check + for(int c = 0; c <= r && this.skewSymmetricMap; c++) + this.skewSymmetricMap = this.checkSymmetricValue(r, c, -1); + + // pattern check for Matrix + if(this.isMatrix) { HashSet patternValueSet = new HashSet<>(); for(int c = 0; c < ncols; c++) patternValueSet.add(this.sampleMatrix.getValue(r, c)); if(patternValueSet.size() == 1) { + this.patternValueType = Types.ValueType.FP64; this.patternMap = true; this.patternValueMap = patternValueSet.iterator().next(); } } + else { + if(homoSchema) { + HashSet patternValueSet = new HashSet<>(); + for(int c = 0; c < ncols; c++) + patternValueSet.add(this.sampleFrame.get(r, c)); + if(patternValueSet.size() == 1) { + this.patternValueType = vtc0; + this.patternMap = true; + this.patternValueMap = patternValueSet.iterator().next(); + } + } + } } - } System.out.println("upperTriangularMap=" + upperTriangularMap); System.out.println("lowerTriangularMap=" + lowerTriangularMap); System.out.println("symmetric=" + symmetricMap); - System.out.println("symmetricUpperMap=" + symmetricUpperMap); System.out.println("skewSymmetricMap = " + skewSymmetricMap); - System.out.println("skewSymmetricUpperMap=" + skewSymmetricUpperMap); System.out.println("patternMap=" + patternMap); - System.out.println("patternValueMap=" + patternValueMap); + System.out.println("patternValueType = "+patternValueType); + System.out.println("patternValueMap=" + UtilFunctions.objectToString(patternValueType)); + 
return false; } + private boolean checkValueIsNotNullZero(int r, int c) { + boolean result; + if(this.isMatrix) + result = this.sampleMatrix.getValue(r, c) != 0; + else { + if(this.sampleFrame.getSchema()[c].isNumeric()) + result = this.sampleFrame.getDouble(r, c) != 0; + else + result = this.sampleFrame.get(r, c) != null; + } + return result; + } + + // Symmetric checks just available for numeric values in the frame representations + private boolean checkSymmetricValue(int r, int c, int a) { + boolean result; + if(this.isMatrix) + result = this.sampleMatrix.getValue(r, c) == this.sampleMatrix.getValue(c, r) * a; + else if(this.sampleFrame.getSchema()[c].isNumeric()) + result = this.sampleFrame.getDouble(r, c) == this.sampleFrame.getDouble(c, r) * a; + else + result = this.sampleFrame.get(r, c).equals(this.sampleFrame.get(c, r)); + + return result; + } + public int getNaN() { return NaN; } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index b239a6f835d..6cb48b8e55d 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -201,4 +201,12 @@ public void test20() { sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; runGenerateReaderTest(); } + + //symmetric-lower + @Test + public void test21() { + sampleRaw = "1,0,0,0\n" + "2,2,0,0\n" + "3,4,3,0\n"+"4,5,6,4"; + sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; + runGenerateReaderTest(); + } } From 93c6e9ed20c096f64e6cd640bab3402e6f16ab65 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 9 Jun 2022 20:19:28 +0200 Subject: [PATCH 51/84] New synchronization with the paper background section --- .../runtime/iogen/ColIndexStructure.java | 86 ++ .../sysds/runtime/iogen/CustomProperties.java | 44 +- 
.../runtime/iogen/FormatIdentifying.java | 854 +++++++++++------- .../runtime/iogen/MappingProperties.java | 130 +++ .../apache/sysds/runtime/iogen/RawIndex.java | 633 ++++++------- .../sysds/runtime/iogen/ReaderMapping.java | 260 +++--- .../runtime/iogen/RowIndexStructure.java | 26 +- .../iogen/MatrixSingleRowFlatTest.java | 127 ++- 8 files changed, 1354 insertions(+), 806 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java new file mode 100644 index 00000000000..ac492107d99 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen; + +public class ColIndexStructure { + + public enum IndexProperties { + Identity, // col number of sample raw data equal to the row index of matrix/frame + CellWiseExist; // col index of every cell values are in the sample raw data + @Override + public String toString() { + return this.name().toUpperCase(); + } + } + + public ColIndexStructure() { + this.properties = null; + this.keyPattern = null; + this.colIndexBegin = "0"; + } + + private IndexProperties properties; + private KeyTrie keyPattern; + private String colIndexBegin; + + // when the index properties is CellWiseExist: + private String indexDelim; + private String valueDelim; + + public IndexProperties getProperties() { + return properties; + } + + public void setProperties(IndexProperties properties) { + this.properties = properties; + } + + public KeyTrie getKeyPattern() { + return keyPattern; + } + + public void setKeyPattern(KeyTrie keyPattern) { + this.keyPattern = keyPattern; + } + + public String getColIndexBegin() { + return colIndexBegin; + } + + public void setColIndexBegin(int colIndexBegin) { + this.colIndexBegin = colIndexBegin + ""; + } + + public String getIndexDelim() { + return indexDelim; + } + + public void setIndexDelim(String indexDelim) { + this.indexDelim = indexDelim; + } + + public String getValueDelim() { + return valueDelim; + } + + public void setValueDelim(String valueDelim) { + this.valueDelim = valueDelim; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 76e645c6b44..27f821d12f3 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -28,8 +28,46 @@ public class CustomProperties extends FileFormatProperties implements Serializable { + private MappingProperties mappingProperties; + private RowIndexStructure 
rowIndexStructure; + private ColIndexStructure colIndexStructure; + + public CustomProperties(MappingProperties mappingProperties, RowIndexStructure rowIndexStructure, ColIndexStructure colIndexStructure) { + this.mappingProperties = mappingProperties; + this.rowIndexStructure = rowIndexStructure; + this.colIndexStructure = colIndexStructure; + } + + public MappingProperties getMappingProperties() { + return mappingProperties; + } + + public void setMappingProperties(MappingProperties mappingProperties) { + this.mappingProperties = mappingProperties; + } + + public RowIndexStructure getRowIndexStructure() { + return rowIndexStructure; + } + + public void setRowIndexStructure(RowIndexStructure rowIndexStructure) { + this.rowIndexStructure = rowIndexStructure; + } + + public ColIndexStructure getColIndexStructure() { + return colIndexStructure; + } + + public void setColIndexStructure(ColIndexStructure colIndexStructure) { + this.colIndexStructure = colIndexStructure; + } + + + + //-------------------------------------- + public enum IndexProperties { - IDENTITY, EXIST,SEQSCATTER,ARRAY, + IDENTITY, EXIST, SEQSCATTER, XARRAY, YARRAY, IDENTIFY, PREFIX, KEY; @Override public String toString() { @@ -37,8 +75,8 @@ public String toString() { } } - private IndexProperties rowIndexProperties; - private IndexProperties colIndexProperties; + + private KeyTrie[] colKeyPattern; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 49e33fffce7..3c201a35951 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -35,12 +35,14 @@ public class FormatIdentifying { private int[][] mapRow; private int[][] mapCol; private int[][] mapLen; - private int NaN; + private int actualValueCount; + private MappingProperties mappingProperties; private ArrayList sampleRawIndexes; private static 
int nrows; private static int ncols; private int nlines; + private int windowSize = 20; private int suffixStringLength = 50; private ReaderMapping mappingValues; @@ -64,19 +66,19 @@ private void runIdentification() { 3. Sequential Scattered: 4. Array: - supported formats by row and column indexes: + Table 1: supported formats by row and column indexes: # | row | col | Value | example -------------------------------------- - 1 | Identity | Identity | Exist | csv, JSON/XML L - 2 | Identity | Exist | Exist | LibSVM - 3 | Identity | Exist | Not-Exist | LibSVM+Pattern - 4 | Exist | Exist | Exist | MM Coordinate General - 5 | Array | Array | Exist | MM Array - 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric - 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric - 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern - 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern - 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner + 1 | Identity | Identity | Exist | csv, JSON/XML L single-line + 2 | Identity | Exist | Exist | LibSVM single + 3 | Identity | Exist | Not-Exist | LibSVM+Pattern single + 4 | Exist | Exist | Exist | MM Coordinate General multi + 5 | Array | Array | Exist | MM Array multi + 6 | Exist | Exist | Partially-Exist | MM Coordinate Symmetric multi + 7 | Exist | Exist | Partially-Exist+Pattern | MM Coordinate Skew-Symmetric multi + 8 | Exist | Exist | Not-Exist | MM Coordinate Pattern multi + 9 | Exist | Exist | Not-Exist+Pattern | MM Coordinate Symmetric Pattern multi + 10 | SEQSCATTER| Identity | Exist | JSON/XML Multi Line, AMiner multi strategy for checking the structure of indexes and values: 1. 
map values: @@ -92,6 +94,7 @@ private void runIdentification() { mapRow = mappingValues.getMapRow(); mapCol = mappingValues.getMapCol(); mapLen = mappingValues.getMapLen(); + mappingProperties = mappingValues.getMappingProperties(); // save line by line index of string(index for Int, Long, float, Double, String, Boolean) sampleRawIndexes = mappingValues.getSampleRawIndexes(); @@ -100,222 +103,319 @@ private void runIdentification() { nrows = mappingValues.getNrows(); ncols = mappingValues.getNcols(); nlines = mappingValues.getNlines(); - NaN = (ncols * nrows) - mappingValues.getNaN(); - - // analysis mapping of values - // 1. check (exist, partially exist, not exist) - // 2. check the records represented in single/multilines - // 3. check the Symmetric, Skew-Symmetric, Pattern, and Array - + actualValueCount = mappingValues.getActualValueCount(); + // collect custom properties + // 1. properties of row-index + RowIndexStructure rowIndexStructure = getRowIndexStructure(); - // First, check the properties of row-index - boolean identity = isRowIndexIdentity(); - if(identity) { - KeyTrie[] colKeyPattern; + // 2. 
properties of column-index + ColIndexStructure colIndexStructure = getColIndexStructure(); - // TODO: change method name from buildColsKeyPatternSingleRow to buildColPatternRowIdentity - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTITY); - } + properties = new CustomProperties(mappingProperties, rowIndexStructure, colIndexStructure); - // Check the map row: - // If all cells of a row mapped to a single line of sample raw, it is a single row mapping - // If all cells of a row mapped to multiple lines of sample raw, it is a multi row mapping + // ref to Table 1: + if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { - boolean isSingleRow = false; - int missedCount = 0; - for(int r = 0; r < nrows; r++) - missedCount += ncols - mostCommonScore(mapRow[r]); - if((float) missedCount / NaN < 0.07) - isSingleRow = true; + // #1 + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { + KeyTrie[] colKeyPattern; + // TODO: change method name from buildColsKeyPatternSingleRow to buildColPatternRowIdentity + colKeyPattern = buildColsKeyPatternSingleRow(); + properties.setColKeyPattern(colKeyPattern); + } - KeyTrie[] colKeyPattern; + // #2 + else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + // find cell-index and value separators + RawIndex raw = null; + for(int c = 0; c < ncols; c++) { + if(mapCol[0][c] != -1) { + raw = sampleRawIndexes.get(mapRow[0][c]); + raw.cloneReservedPositions(); + break; + } + } + HashMap indexDelimCount = new HashMap<>(); + String valueDelim = null; + String indexDelim = null; + Long maxCount = 0L; + int begin = Integer.parseInt(colIndexStructure.getColIndexBegin()); + for(int c = 
0; c < ncols; c++) { + if(mapCol[0][c] != -1) { + Pair pair = raw.findValue(c + begin); + String tmpIndexDelim = raw.getSubString(pair.getKey() + pair.getValue(), mapCol[0][c]); + if(indexDelimCount.containsKey(tmpIndexDelim)) + indexDelimCount.put(tmpIndexDelim, indexDelimCount.get(tmpIndexDelim) + 1); + else + indexDelimCount.put(tmpIndexDelim, 1L); + if(maxCount < indexDelimCount.get(tmpIndexDelim)) { + maxCount = indexDelimCount.get(tmpIndexDelim); + indexDelim = tmpIndexDelim; + } + if(valueDelim == null) { + int nextPos = raw.getNextNumericPosition(mapCol[0][c] + mapLen[0][c]); + if(nextPos < raw.getRawLength()) { + valueDelim = raw.getSubString(mapCol[0][c] + mapLen[0][c], nextPos); + } + } + } + } + // update properties + colIndexStructure.setIndexDelim(indexDelim); + colIndexStructure.setValueDelim(valueDelim); + } - if(isSingleRow) { - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.IDENTIFY); } else { - - // Check the row index is a prefix string in sample raw - // if the row indexes are in the prefix of values, so we need to build a key pattern - // to extract row indexes - // for understanding row indexes are in sample raw we check just 3 column of data - // for build a key pattern related to row indexes we just selected a row - boolean flag; - int numberOfSelectedCols = 3; - int begin = 0; - boolean check, flagReconstruct; - int[] selectedRowIndex = new int[2]; - HashSet beginPos = new HashSet<>(); - KeyTrie rowKeyPattern = null; - - // Select two none zero row as a row index candidate - - int index = 0; - for(int r = 1; r < nrows; r++) { - for(int c = 0; c < ncols; c++) - if(mapRow[r][c] != -1) { - selectedRowIndex[index++] = r; + // # 4, 6, 7, 8, 9 + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + + // build key pattern for row index + 
int numberOfSelectedCols = 3; + int begin = Integer.parseInt(rowIndexStructure.getRowIndexBegin()); + boolean check, flagReconstruct; + int[] selectedRowIndex = new int[2]; + KeyTrie rowKeyPattern = null; + + // Select two none zero row as a row index candidate + int index = 0; + for(int r = 1; r < nrows; r++) { + for(int c = 0; c < ncols; c++) + if(mapRow[r][c] != -1) { + selectedRowIndex[index++] = r; + break; + } + if(index > 1) break; - } - if(index > 1) - break; - } + } - for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { - Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); - ArrayList prefixStrings = colPrefixString.getKey(); - ArrayList prefixStringRowIndexes = colPrefixString.getValue(); - ArrayList prefixRawIndex = new ArrayList<>(); + for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + ArrayList prefixStrings = colPrefixString.getKey(); + ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + ArrayList prefixRawIndex = new ArrayList<>(); - MappingTrie trie = new MappingTrie(); - int ri = 0; - for(String ps : prefixStrings) - trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + MappingTrie trie = new MappingTrie(); + int ri = 0; + for(String ps : prefixStrings) + trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); - do { - flag = trie.reConstruct(); - } - while(flag); - - ArrayList> keyPatterns = trie.getAllSequentialKeys(); - for(ArrayList kp : keyPatterns) { - for(String ps : prefixStrings) { - StringBuilder sb = new StringBuilder(); - int currPos = 0; - for(String k : kp) { - sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); - currPos += sb.length() + k.length(); + do { + flagReconstruct = trie.reConstruct(); + } + while(flagReconstruct); + + ArrayList> keyPatterns = trie.getAllSequentialKeys(); + for(ArrayList kp : keyPatterns) { + for(String ps : prefixStrings) { + 
StringBuilder sb = new StringBuilder(); + int currPos = 0; + for(String k : kp) { + sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + currPos += sb.length() + k.length(); + } + prefixRawIndex.add(new RawIndex(sb.toString())); } - prefixRawIndex.add(new RawIndex(sb.toString())); } - } + if(c == numberOfSelectedCols - 1) { + ArrayList rowPrefixStrings = new ArrayList<>(); + MappingTrie rowTrie = new MappingTrie(); + rowKeyPattern = new KeyTrie(); + for(int si : selectedRowIndex) { + for(int ci = 0; ci < ncols; ci++) { + int cri = mapRow[si][ci]; + if(cri != -1) { + String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(si + begin); + if(pair != null) { + String pstr = str.substring(0, pair.getKey()); + if(pstr.length() > 0) { + rowPrefixStrings.add(pstr); + rowTrie.insert(pstr, 1); + } + rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); + } + } + } + } - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - if(!flag) { - begin = 1; - flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = rowTrie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); + check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); + } + if(check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = rowTrie.reConstruct(); + if(!flagReconstruct) + break; + } + } + while(!check); + + if(keyPatterns.size() == 0) { + ArrayList> kpl = new ArrayList<>(); + ArrayList kpli = new ArrayList<>(); + kpli.add(""); + kpl.add(kpli); + keyPatterns = kpl; + } + rowKeyPattern.setPrefixKeyPattern(keyPatterns); + } } - if(!flag) { - beginPos.clear(); - break; + rowIndexStructure.setKeyPattern(rowKeyPattern); + + // build key pattern for column index + begin = 
Integer.parseInt(colIndexStructure.getColIndexBegin()); + int[] selectedColIndex = new int[2]; + KeyTrie colKeyPattern = null; + + // Select two none zero row as a row index candidate + index = 0; + for(int c = 0; c < ncols; c++) { + for(int r = 1; r < nrows; r++) + if(mapRow[r][c] != -1) { + selectedColIndex[index++] = c; + break; + } + if(index > 1) + break; } - else - beginPos.add(begin); - if(c == numberOfSelectedCols - 1) { - ArrayList rowPrefixStrings = new ArrayList<>(); - MappingTrie rowTrie = new MappingTrie(); - rowKeyPattern = new KeyTrie(); - for(int si : selectedRowIndex) { - for(int ci = 0; ci < ncols; ci++) { - int cri = mapRow[si][ci]; - if(cri != -1) { - String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); - RawIndex rawIndex = new RawIndex(str); - Pair pair = rawIndex.findValue(si + begin); - if(pair != null) { - String pstr = str.substring(0, pair.getKey()); - if(pstr.length() > 0) { - rowPrefixStrings.add(pstr); - rowTrie.insert(pstr, 1); + + for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + ArrayList prefixStrings = colPrefixString.getKey(); + ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + ArrayList prefixRawIndex = new ArrayList<>(); + + MappingTrie trie = new MappingTrie(); + int ri = 0; + for(String ps : prefixStrings) + trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + + do { + flagReconstruct = trie.reConstruct(); + } + while(flagReconstruct); + + ArrayList> keyPatterns = trie.getAllSequentialKeys(); + for(ArrayList kp : keyPatterns) { + for(String ps : prefixStrings) { + StringBuilder sb = new StringBuilder(); + int currPos = 0; + for(String k : kp) { + sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + currPos += sb.length() + k.length(); + } + prefixRawIndex.add(new RawIndex(sb.toString())); + } + } + if(c == numberOfSelectedCols - 1) { + ArrayList colPrefixStrings = new 
ArrayList<>(); + MappingTrie colTrie = new MappingTrie(); + colKeyPattern = new KeyTrie(); + for(int si : selectedColIndex) { + for(int ir = 0; ir < nrows; ir++) { + int cri = mapRow[ir][si]; + if(cri != -1) { + String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[ir][si]); + RawIndex rawIndex = new RawIndex(str); + Pair pair = rawIndex.findValue(si + begin); + if(pair != null) { + String pstr = str.substring(0, pair.getKey()); + if(pstr.length() > 0) { + colPrefixStrings.add(pstr); + colTrie.insert(pstr, 1); + } + colKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); } - rowKeyPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); } } } - } - do { - ArrayList> selectedKeyPatterns = new ArrayList<>(); - keyPatterns = rowTrie.getAllSequentialKeys(); - check = false; - for(ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); - check |= newCheck; - if(newCheck) - selectedKeyPatterns.add(keyPattern); + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = colTrie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(colPrefixStrings, keyPattern); + check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); + } + if(check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = colTrie.reConstruct(); + if(!flagReconstruct) + break; + } } - if(check) - keyPatterns = selectedKeyPatterns; - else { - flagReconstruct = rowTrie.reConstruct(); - if(!flagReconstruct) - break; + while(!check); + + if(keyPatterns.size() == 0) { + ArrayList> kpl = new ArrayList<>(); + ArrayList kpli = new ArrayList<>(); + kpli.add(""); + kpl.add(kpli); + keyPatterns = kpl; } + colKeyPattern.setPrefixKeyPattern(keyPatterns); } - while(!check); - - if(keyPatterns.size() == 0) { - ArrayList> kpl = new ArrayList<>(); - ArrayList kpli = new 
ArrayList<>(); - kpli.add(""); - kpl.add(kpli); - keyPatterns = kpl; - } - rowKeyPattern.setPrefixKeyPattern(keyPatterns); } - } - - if(beginPos.size() == 1) { - colKeyPattern = buildColsKeyPatternSingleRow(); - properties = new CustomProperties(colKeyPattern, CustomProperties.IndexProperties.PREFIX, rowKeyPattern); - Integer bpos = beginPos.iterator().next(); - if(bpos > 0) - properties.setRowIndexBegin("-" + bpos); - else - properties.setRowIndexBegin(""); - } - else { - KeyTrie rowDelimPattern = new KeyTrie(findRowDelimiters()); - colKeyPattern = buildColsKeyPatternMultiRow(); - properties = new CustomProperties(colKeyPattern, rowDelimPattern); + colIndexStructure.setKeyPattern(colKeyPattern); } } } - //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - // check row-index Identity - private boolean isRowIndexIdentity() { - boolean identity = false; - int missedCount = 0; - for(int r = 0; r < nrows; r++) - missedCount += ncols - mostCommonScore(mapRow[r]); - if((float) missedCount / NaN < 0.07) - identity = true; - return identity; - } - - // check roe-index Exist + // check row-index Exist // 1. row-index exist and can be reachable with a pattern // 2. row-index exist but there is no pattern for it // 3. row-index exist but just not for all cells! 
row-index appeared when the text broken newline="\n" - private RowIndexStructure isRowIndexExist() { - // Check the row index is a prefix string in sample raw + private RowIndexStructure getRowIndexStructure() { + // check the row index is a prefix string in sample raw, or the sample data line number equal to the sample matrix/frame row index // if the row indexes are in the prefix of values, so we need to build a key pattern to extract row indexes - // for understanding row indexes are in sample raw we check just 3 column of data - // for build a key pattern related to row indexes we just selected a row - - //public enum IndexProperties { - // IDENTITY, - // CELLWISEEXIST, - // CELLWISEEXISTPATTERNLESS, - // ROWWISEEXIST, *** - // SEQSCATTER, - // ARRAY; - // @Override - // public String toString() { - // return this.name().toUpperCase(); - // } - // } + // to understanding row indexes are in sample raw we check just 3 column of data + // to build a key pattern related to row indexes we just selected a row + // TODO: decrease the number of row/col indexes want to check(3 or 5) + RowIndexStructure rowIndexStructure = new RowIndexStructure(); + + // check row-index Identity, the identity properties available just for + // exist and partially exist mapped values + if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { + boolean identity = false; + int missedCount = 0; + for(int r = 0; r < nrows; r++) + missedCount += ncols - mostCommonScore(mapRow[r]); + + if(mappingProperties.getRepresentationProperties() == MappingProperties.RepresentationProperties.SYMMETRIC || mappingProperties.getRepresentationProperties() == MappingProperties.RepresentationProperties.SKEWSYMMETRIC) + missedCount -= (nrows - 1) * (ncols - 1); + + if((float) missedCount / actualValueCount < 0.07) + identity = true; + + if(identity) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.Identity); + return rowIndexStructure; + } + } + BitSet[] 
bitSets = new BitSet[nrows]; int[] rowCardinality = new int[nrows]; int[] rowNZ = new int[nrows]; boolean isCellWise = true; boolean isSeqScatter = true; boolean isExist = true; + for(int r = 0; r < nrows; r++) { bitSets[r] = new BitSet(nlines); rowNZ[r] = 0; @@ -327,10 +427,6 @@ private RowIndexStructure isRowIndexExist() { } rowCardinality[r] = bitSets[r].cardinality(); } - // check for Cell Wise - for(int r = 0; r < nrows && isCellWise; r++) - isCellWise = rowCardinality[r] == rowNZ[r]; - // check for Sequential: for(int r = 0; r < nrows && isSeqScatter; r++) { BitSet bitSet = bitSets[r]; @@ -339,8 +435,12 @@ private RowIndexStructure isRowIndexExist() { isSeqScatter = i == ++beginIndex; } + // check for Cell Wise + for(int r = 0; r < nrows && isCellWise; r++) + isCellWise = rowCardinality[r] == rowNZ[r]; + // check exist: - int begin; + int begin = 0; if(isCellWise) { for(int c = 0; c < ncols; c++) { begin = checkRowIndexesOnColumnRaw(c, 0); @@ -349,12 +449,17 @@ private RowIndexStructure isRowIndexExist() { break; } } + if(isExist) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CellWiseExist); + rowIndexStructure.setRowIndexBegin(begin); + return rowIndexStructure; + } } else { ArrayList list = new ArrayList<>(); for(int r = 0; r < nrows; r++) { BitSet bitSet = bitSets[r]; - for(int i = bitSet.nextSetBit(0); i != -1 && isSeqScatter; i = bitSet.nextSetBit(i + 1)) + for(int i = bitSet.nextSetBit(0); i != -1; i = bitSet.nextSetBit(i + 1)) list.add(sampleRawIndexes.get(i)); begin = checkRowIndexOnRaws(r, 0, list); if(begin == -1) { @@ -362,17 +467,27 @@ private RowIndexStructure isRowIndexExist() { break; } } + + if(isExist) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.RowWiseExist); + rowIndexStructure.setRowIndexBegin(begin); + return rowIndexStructure; + } + else if(isSeqScatter) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.SeqScatter); + return rowIndexStructure; + } } -// if(isCellWise 
&& isExist) { -// rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); -// } -// else if(!isCellWise && isExist) { -// rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.ROWWISEEXIST); -// } -// else if(isCellWise && !isExist) { -// -// } + // if(isCellWise && isExist) { + // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); + // } + // else if(!isCellWise && isExist) { + // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.ROWWISEEXIST); + // } + // else if(isCellWise && !isExist) { + // + // } // RowIndexStructure rowIndexStructure = null; // int numberOfSelectedCols = 3; @@ -392,121 +507,121 @@ private RowIndexStructure isRowIndexExist() { // // // /////////////////////////////////////////////////////////////////////// -// boolean flag; -// int numberOfSelectedCols = 3; -// int begin = 0; -// boolean check, flagReconstruct; -// int[] selectedRowIndex = new int[2]; -// HashSet beginPos = new HashSet<>(); -// KeyTrie rowPattern = null; -// -// // Select two none zero row as a row index candidate -// int index = 0; -// for(int r = 1; r < nrows; r++) { -// for(int c = 0; c < ncols; c++) -// if(mapRow[r][c] != -1) { -// selectedRowIndex[index++] = r; -// break; -// } -// if(index > 1) -// break; -// } - -// // CELLWISEEXIST: when row index exist in each cell value -// for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { -// -// Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); -// ArrayList prefixStrings = colPrefixString.getKey(); -// ArrayList prefixStringRowIndexes = colPrefixString.getValue(); -// ArrayList prefixRawIndex = new ArrayList<>(); -// -// MappingTrie trie = new MappingTrie(); -// int ri = 0; -// for(String ps : prefixStrings) -// trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); -// -// do { -// flag = trie.reConstruct(); -// } -// while(flag); -// -// ArrayList> keyPatterns = trie.getAllSequentialKeys(); 
-// for(ArrayList kp : keyPatterns) { -// for(String ps : prefixStrings) { -// StringBuilder sb = new StringBuilder(); -// int currPos = 0; -// for(String k : kp) { -// sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); -// currPos += sb.length() + k.length(); -// } -// prefixRawIndex.add(new RawIndex(sb.toString())); -// } -// } -// -// flag = checkPrefixRowIndex(c, begin, prefixRawIndex); -// if(!flag) { -// begin = 1; -// flag = checkPrefixRowIndex(c, begin, prefixRawIndex); -// } -// if(!flag) { -// beginPos.clear(); -// break; -// } -// else -// beginPos.add(begin); -// if(c == numberOfSelectedCols - 1) { -// ArrayList rowPrefixStrings = new ArrayList<>(); -// MappingTrie rowTrie = new MappingTrie(); -// rowPattern = new KeyTrie(); -// for(int si : selectedRowIndex) { -// for(int ci = 0; ci < ncols; ci++) { -// int cri = mapRow[si][ci]; -// if(cri != -1) { -// String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); -// RawIndex rawIndex = new RawIndex(str); -// Pair pair = rawIndex.findValue(si + begin); -// if(pair != null) { -// String pstr = str.substring(0, pair.getKey()); -// if(pstr.length() > 0) { -// rowPrefixStrings.add(pstr); -// rowTrie.insert(pstr, 1); -// } -// rowPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); -// } -// } -// } -// } -// -// do { -// ArrayList> selectedKeyPatterns = new ArrayList<>(); -// keyPatterns = rowTrie.getAllSequentialKeys(); -// check = false; -// for(ArrayList keyPattern : keyPatterns) { -// boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); -// check |= newCheck; -// if(newCheck) -// selectedKeyPatterns.add(keyPattern); -// } -// if(check) -// keyPatterns = selectedKeyPatterns; -// else { -// flagReconstruct = rowTrie.reConstruct(); -// if(!flagReconstruct) -// break; -// } -// } -// while(!check); -// -// if(keyPatterns.size() == 0) { -// ArrayList> kpl = new ArrayList<>(); -// ArrayList kpli = new ArrayList<>(); -// kpli.add(""); 
-// kpl.add(kpli); -// keyPatterns = kpl; -// } -// rowPattern.setPrefixKeyPattern(keyPatterns); -// } -// } + // boolean flag; + // int numberOfSelectedCols = 3; + // int begin = 0; + // boolean check, flagReconstruct; + // int[] selectedRowIndex = new int[2]; + // HashSet beginPos = new HashSet<>(); + // KeyTrie rowPattern = null; + // + // // Select two none zero row as a row index candidate + // int index = 0; + // for(int r = 1; r < nrows; r++) { + // for(int c = 0; c < ncols; c++) + // if(mapRow[r][c] != -1) { + // selectedRowIndex[index++] = r; + // break; + // } + // if(index > 1) + // break; + // } + + // // CELLWISEEXIST: when row index exist in each cell value + // for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + // + // Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); + // ArrayList prefixStrings = colPrefixString.getKey(); + // ArrayList prefixStringRowIndexes = colPrefixString.getValue(); + // ArrayList prefixRawIndex = new ArrayList<>(); + // + // MappingTrie trie = new MappingTrie(); + // int ri = 0; + // for(String ps : prefixStrings) + // trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); + // + // do { + // flag = trie.reConstruct(); + // } + // while(flag); + // + // ArrayList> keyPatterns = trie.getAllSequentialKeys(); + // for(ArrayList kp : keyPatterns) { + // for(String ps : prefixStrings) { + // StringBuilder sb = new StringBuilder(); + // int currPos = 0; + // for(String k : kp) { + // sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); + // currPos += sb.length() + k.length(); + // } + // prefixRawIndex.add(new RawIndex(sb.toString())); + // } + // } + // + // flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + // if(!flag) { + // begin = 1; + // flag = checkPrefixRowIndex(c, begin, prefixRawIndex); + // } + // if(!flag) { + // beginPos.clear(); + // break; + // } + // else + // beginPos.add(begin); + // if(c == numberOfSelectedCols - 1) { + // ArrayList 
rowPrefixStrings = new ArrayList<>(); + // MappingTrie rowTrie = new MappingTrie(); + // rowPattern = new KeyTrie(); + // for(int si : selectedRowIndex) { + // for(int ci = 0; ci < ncols; ci++) { + // int cri = mapRow[si][ci]; + // if(cri != -1) { + // String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); + // RawIndex rawIndex = new RawIndex(str); + // Pair pair = rawIndex.findValue(si + begin); + // if(pair != null) { + // String pstr = str.substring(0, pair.getKey()); + // if(pstr.length() > 0) { + // rowPrefixStrings.add(pstr); + // rowTrie.insert(pstr, 1); + // } + // rowPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); + // } + // } + // } + // } + // + // do { + // ArrayList> selectedKeyPatterns = new ArrayList<>(); + // keyPatterns = rowTrie.getAllSequentialKeys(); + // check = false; + // for(ArrayList keyPattern : keyPatterns) { + // boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); + // check |= newCheck; + // if(newCheck) + // selectedKeyPatterns.add(keyPattern); + // } + // if(check) + // keyPatterns = selectedKeyPatterns; + // else { + // flagReconstruct = rowTrie.reConstruct(); + // if(!flagReconstruct) + // break; + // } + // } + // while(!check); + // + // if(keyPatterns.size() == 0) { + // ArrayList> kpl = new ArrayList<>(); + // ArrayList kpli = new ArrayList<>(); + // kpli.add(""); + // kpl.add(kpli); + // keyPatterns = kpl; + // } + // rowPattern.setPrefixKeyPattern(keyPatterns); + // } + // } // if(beginPos.size() == 1) { // rowIndexStructure = new RowIndexStructure(); // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); @@ -518,18 +633,73 @@ private RowIndexStructure isRowIndexExist() { // rowIndexStructure.setRowIndexBegin(""); // } // return rowIndexStructure; - return null; + return rowIndexStructure; + } + + private ColIndexStructure getColIndexStructure() { + ColIndexStructure colIndexStructure = new ColIndexStructure(); + int 
begin = 0; + boolean colIndexExist = true; + if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { + // 1. check for column index are in the record + for(int r = 0; r < Math.min(10, nrows); r++) { + int rowIndex = -1; + for(int c = 0; c < ncols; c++) { + rowIndex = mapRow[r][c]; + if(rowIndex != -1) + break; + } + begin = checkColIndexesOnRowRaw(rowIndex, 0); + if(begin == -1) { + colIndexExist = false; + break; + } + } + if(colIndexExist) { + colIndexStructure.setColIndexBegin(begin); + colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); + return colIndexStructure; + } + // 2. check the column index are identity + else { + colIndexStructure.setProperties(ColIndexStructure.IndexProperties.Identity); + return colIndexStructure; + } + } + else { + for(int r = 0; r < nrows && colIndexExist; r++) { + for(int c = 0; c < Math.min(10, ncols) && colIndexExist; c++) { + if(mapRow[r][c] != -1) { + begin = checkColIndexOnRowRaw(mapRow[r][c], c, begin); + colIndexExist = begin != -1; + if(begin == -1) { + int a = 100; + } + } + } + } + + if(colIndexExist) { + colIndexStructure.setColIndexBegin(begin); + colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); + return colIndexStructure; + } + } + + return colIndexStructure; } private int checkRowIndexesOnColumnRaw(int colIndex, int beginPos) { int nne = 0; for(int r = 0; r < nrows; r++) { - RawIndex raw = sampleRawIndexes.get(mapRow[r][colIndex]); - raw.cloneReservedPositions(); - Pair pair = raw.findValue(r + beginPos); - raw.restoreReservedPositions(); - if(pair == null) - nne++; + if(mapRow[r][colIndex] != -1) { + RawIndex raw = sampleRawIndexes.get(mapRow[r][colIndex]); + raw.cloneReservedPositions(); + Pair pair = raw.findValue(r + beginPos); + raw.restoreReservedPositions(); + if(pair == null) + nne++; + } } if(nne > nrows * 0.3) { @@ -562,6 +732,44 @@ private int checkRowIndexOnRaws(int rowIndex, int beginPos, ArrayList 
return beginPos; } + private int checkColIndexesOnRowRaw(int rowIndex, int beginPos) { + int nne = 0; + RawIndex raw = sampleRawIndexes.get(rowIndex); + raw.cloneReservedPositions(); + for(int c = 0; c < ncols; c++) { + if(mapCol[rowIndex][c] != -1) { + Pair pair = raw.findValue(c + beginPos); + if(pair == null || pair.getKey() > mapCol[rowIndex][c]) + nne++; + } + } + raw.restoreReservedPositions(); + if(nne > ncols * 0.05) { + if(beginPos == 1) + return -1; + else + return checkColIndexesOnRowRaw(rowIndex, 1); + } + else + return beginPos; + } + + private int checkColIndexOnRowRaw(int rowIndex, int colIndex, int beginPos) { + RawIndex raw = sampleRawIndexes.get(rowIndex); + raw.cloneReservedPositions(); + Pair pair = raw.findValue(colIndex + beginPos); + raw.restoreReservedPositions(); + + if(pair == null) { + if(beginPos == 1) + return -1; + else + return checkColIndexOnRowRaw(rowIndex, colIndex, 1); + } + else + return beginPos; + } + //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java new file mode 100644 index 00000000000..67356cb74cb --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import org.apache.sysds.common.Types; + +public class MappingProperties { + + public enum RepresentationProperties { + TYPICAL, + SYMMETRIC, SKEWSYMMETRIC, + PATTERN, + ARRAYCOLWISE, ARRAYROWWISE; + @Override + public String toString() { + return this.name().toUpperCase(); + } + } + + public enum RecordProperties { + SINGLELINE, MULTILINE; + @Override + public String toString() { + return this.name().toUpperCase(); + } + } + + public enum DataProperties { + FULLEXIST, PARTIALLYEXIST, NOTEXIST; + @Override + public String toString() { + return this.name().toUpperCase(); + } + } + + private RepresentationProperties representationProperties; + private RecordProperties recordProperties; + private DataProperties dataProperties; + private Object patternValue; + private Types.ValueType patternValueType; + + public void setSymmetricRepresentation(){ + this.representationProperties = RepresentationProperties.SYMMETRIC; + } + + public void setSkewSymmetricRepresentation(){ + this.representationProperties = RepresentationProperties.SKEWSYMMETRIC; + } + + public void setPatternRepresentation(Types.ValueType valueType, Object value){ + this.representationProperties = RepresentationProperties.PATTERN; + this.patternValueType = valueType; + this.patternValue = value; + } + + public void setTypicalRepresentation(){ + this.representationProperties = RepresentationProperties.TYPICAL; + } + + public void setArrayColWiseRepresentation(){ + this.representationProperties = RepresentationProperties.ARRAYCOLWISE; + } + + 
public void setArrayRowWiseRepresentation(){ + this.representationProperties = RepresentationProperties.ARRAYROWWISE; + } + + public void setDataFullExist(){ + this.dataProperties = DataProperties.FULLEXIST; + } + + public void setDataNotExist(){ + this.dataProperties = DataProperties.NOTEXIST; + } + + public void setDataPartiallyExist(){ + this.dataProperties = DataProperties.PARTIALLYEXIST; + } + + + + public void setRecordSingleLine(){ + this.recordProperties = RecordProperties.SINGLELINE; + } + + public void setRecordMultiLine(){ + this.recordProperties = RecordProperties.MULTILINE; + } + + public boolean isRepresentation(){ + return this.representationProperties != null; + } + + public RepresentationProperties getRepresentationProperties() { + return representationProperties; + } + + public RecordProperties getRecordProperties() { + return recordProperties; + } + + public DataProperties getDataProperties() { + return dataProperties; + } + + public Object getPatternValue() { + return patternValue; + } + + public Types.ValueType getPatternValueType() { + return patternValueType; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index cdbf7cbb897..762f19b4a86 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -29,316 +29,333 @@ import java.util.HashMap; public class RawIndex { - private final String raw; - private final int rawLength; - private final BitSet numberBitSet; - private final BitSet dotBitSet; - private final BitSet eBitSet; - private final BitSet plusMinusBitSet; - private BitSet reservedPositions; - private BitSet backupReservedPositions; - private HashMap>> actualNumericValues; - private HashMap>> dotActualNumericValues; - private HashMap>> dotEActualNumericValues; - - - public RawIndex(String raw) { - this.raw = raw; - this.rawLength = raw.length(); - this.numberBitSet = 
new BitSet(rawLength); - this.dotBitSet = new BitSet(rawLength); - this.eBitSet = new BitSet(rawLength); - this.plusMinusBitSet = new BitSet(rawLength); - this.reservedPositions = new BitSet(rawLength); - this.backupReservedPositions = new BitSet(rawLength); - this.actualNumericValues = null; - this.dotActualNumericValues = null; - this.dotEActualNumericValues = new HashMap<>(); - - for (int i = 0; i < this.rawLength; i++) { - switch (raw.charAt(i)) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - this.numberBitSet.set(i); - break; - case '+': - case '-': - this.plusMinusBitSet.set(i); - break; - case '.': - this.dotBitSet.set(i); - break; - case 'e': - case 'E': - this.eBitSet.set(i); - break; - } - } - // Clean unnecessary sets - // Clean for "." - for (int i = dotBitSet.nextSetBit(0); i != -1; i = dotBitSet.nextSetBit(i + 1)) { - boolean flag = false; - if (i > 0) { - if (i < rawLength - 2) { - flag = !numberBitSet.get(i - 1) && - !numberBitSet.get(i + 1) && - !plusMinusBitSet.get(i + 1) && - !eBitSet.get(i + 1); - } - } else if (i == rawLength - 1) { - flag = !numberBitSet.get(i - 1); - } else if (i == 0) { - if (i < rawLength - 2) { - flag = !numberBitSet.get(i + 1) && - !plusMinusBitSet.get(i + 1) && - !eBitSet.get(i + 1); - } else if (i == rawLength - 1) { - flag = true; - } - } - - if (flag) - dotBitSet.set(i, false); - } - - // Clean for "+/-" - for (int i = plusMinusBitSet.nextSetBit(0); i != -1; i = plusMinusBitSet.nextSetBit(i + 1)) { - boolean flag; - if (i < rawLength - 1) { - flag = numberBitSet.get(i + 1); - if (!flag && i < rawLength - 2) - flag = dotBitSet.get(i + 1) && numberBitSet.get(i + 2); - } else { - flag = false; - } - if (!flag) - plusMinusBitSet.set(i, false); - } - - // Clean for "e/E" - for (int i = eBitSet.nextSetBit(0); i != -1; i = eBitSet.nextSetBit(i + 1)) { - boolean flag = false; - if ((i == 1 && !numberBitSet.get(0)) || i == 0 || i == rawLength - 1) { - 
flag = false; - } else if (i > 1 && i < rawLength - 2) { - flag = numberBitSet.get(i - 1) || (numberBitSet.get(i - 2) && dotBitSet.get(i - 1)); - if (flag) - flag = numberBitSet.get(i + 1) || (numberBitSet.get(i + 2) && plusMinusBitSet.get(i + 1)); - } else if (i == rawLength - 2) { - flag = numberBitSet.get(rawLength - 1); - } - if (!flag) - eBitSet.set(i, false); - } - if (numberBitSet.cardinality() > 0) - extractNumericDotEActualValues(); - } - - public Pair findValue(Object value, Types.ValueType valueType) { - if (valueType.isNumeric()) - return findValue(UtilFunctions.getDouble(value)); - else if (valueType == Types.ValueType.STRING) { - String os = UtilFunctions.objectToString(value); - if (os == null || os.length() == 0) - return null; - else - return findValue(os); - } -// else if(valueType == Types.ValueType.BOOLEAN) -// return findValue(UtilFunctions.objectToString()) + private final String raw; + private final int rawLength; + private final BitSet numberBitSet; + private final BitSet dotBitSet; + private final BitSet eBitSet; + private final BitSet plusMinusBitSet; + private BitSet reservedPositions; + private BitSet backupReservedPositions; + private HashMap>> actualNumericValues; + private HashMap>> dotActualNumericValues; + private HashMap>> dotEActualNumericValues; + + public RawIndex(String raw) { + this.raw = raw; + this.rawLength = raw.length(); + this.numberBitSet = new BitSet(rawLength); + this.dotBitSet = new BitSet(rawLength); + this.eBitSet = new BitSet(rawLength); + this.plusMinusBitSet = new BitSet(rawLength); + this.reservedPositions = new BitSet(rawLength); + this.backupReservedPositions = new BitSet(rawLength); + this.actualNumericValues = null; + this.dotActualNumericValues = null; + this.dotEActualNumericValues = new HashMap<>(); + + for(int i = 0; i < this.rawLength; i++) { + switch(raw.charAt(i)) { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + 
this.numberBitSet.set(i); + break; + case '+': + case '-': + this.plusMinusBitSet.set(i); + break; + case '.': + this.dotBitSet.set(i); + break; + case 'e': + case 'E': + this.eBitSet.set(i); + break; + } + } + // Clean unnecessary sets + // Clean for "." + for(int i = dotBitSet.nextSetBit(0); i != -1; i = dotBitSet.nextSetBit(i + 1)) { + boolean flag = false; + if(i > 0) { + if(i < rawLength - 2) { + flag = !numberBitSet.get(i - 1) && !numberBitSet.get(i + 1) && !plusMinusBitSet.get(i + 1) && !eBitSet.get(i + 1); + } + } + else if(i == rawLength - 1) { + flag = !numberBitSet.get(i - 1); + } + else if(i == 0) { + if(i < rawLength - 2) { + flag = !numberBitSet.get(i + 1) && !plusMinusBitSet.get(i + 1) && !eBitSet.get(i + 1); + } + else if(i == rawLength - 1) { + flag = true; + } + } + + if(flag) + dotBitSet.set(i, false); + } + + // Clean for "+/-" + for(int i = plusMinusBitSet.nextSetBit(0); i != -1; i = plusMinusBitSet.nextSetBit(i + 1)) { + boolean flag; + if(i < rawLength - 1) { + flag = numberBitSet.get(i + 1); + if(!flag && i < rawLength - 2) + flag = dotBitSet.get(i + 1) && numberBitSet.get(i + 2); + } + else { + flag = false; + } + if(!flag) + plusMinusBitSet.set(i, false); + } + + // Clean for "e/E" + for(int i = eBitSet.nextSetBit(0); i != -1; i = eBitSet.nextSetBit(i + 1)) { + boolean flag = false; + if((i == 1 && !numberBitSet.get(0)) || i == 0 || i == rawLength - 1) { + flag = false; + } + else if(i > 1 && i < rawLength - 2) { + flag = numberBitSet.get(i - 1) || (numberBitSet.get(i - 2) && dotBitSet.get(i - 1)); + if(flag) + flag = numberBitSet.get(i + 1) || (numberBitSet.get(i + 2) && plusMinusBitSet.get(i + 1)); + } + else if(i == rawLength - 2) { + flag = numberBitSet.get(rawLength - 1); + } + if(!flag) + eBitSet.set(i, false); + } + if(numberBitSet.cardinality() > 0) + extractNumericDotEActualValues(); + } + + public Pair findValue(Object value, Types.ValueType valueType) { + if(valueType.isNumeric()) + return 
findValue(UtilFunctions.getDouble(value)); + else if(valueType == Types.ValueType.STRING) { + String os = UtilFunctions.objectToString(value); + if(os == null || os.length() == 0) + return null; + else + return findValue(os); + } + // else if(valueType == Types.ValueType.BOOLEAN) + // return findValue(UtilFunctions.objectToString()) else return null; } - public Pair findValue(double value) { -// extractNumericActualValues(); -// if(actualNumericValues.containsKey(value)){ -// return getValuePositionAndLength(actualNumericValues.get(value)); -// } -// -// extractNumericDotActualValues(); -// if(dotActualNumericValues.containsKey(value)){ -// return getValuePositionAndLength(dotActualNumericValues.get(value)); -// } -// -// extractNumericDotEActualValues(); - if (dotEActualNumericValues.containsKey(value)) { - return getValuePositionAndLength(dotEActualNumericValues.get(value)); - } - return null; - } - - private Pair findValue(String value) { - int index = 0; - boolean flag; - do { - flag = true; - index = this.raw.indexOf(value, index); - if (index == -1) - return null; - - for (int i = index; i < index + value.length(); i++) - if (reservedPositions.get(i)) { - flag = false; - break; - } - if (!flag) - index += value.length(); - - } while (!flag); - - reservedPositions.set(index, index + value.length()); - return new Pair<>(index, value.length()); - - } - - private Pair getValuePositionAndLength(ArrayList> list) { - for (Pair p : list) { - if (!reservedPositions.get(p.getKey())) { - reservedPositions.set(p.getKey(), p.getKey() + p.getValue()); - return p; - } - } - return null; - } - - private void extractNumericActualValues() { - if (this.actualNumericValues == null) - this.actualNumericValues = new HashMap<>(); - else - return; - StringBuilder sb = new StringBuilder(); - BitSet nBitSet = (BitSet) numberBitSet.clone(); - nBitSet.or(plusMinusBitSet); - int pi = nBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for (int i = nBitSet.nextSetBit(pi + 1); i != -1; 
i = nBitSet.nextSetBit(i + 1)) { - if (pi + sb.length() != i) { - addActualValueToList(sb.toString(), pi, actualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } else - sb.append(raw.charAt(i)); - } - if (sb.length() > 0) - addActualValueToList(sb.toString(), pi, actualNumericValues); - } - - private void extractNumericDotActualValues() { - if (this.dotActualNumericValues == null) - this.dotActualNumericValues = new HashMap<>(); - else - return; - - BitSet numericDotBitSet = (BitSet) numberBitSet.clone(); - numericDotBitSet.or(dotBitSet); - numericDotBitSet.or(plusMinusBitSet); - StringBuilder sb = new StringBuilder(); - int pi = numericDotBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for (int i = numericDotBitSet.nextSetBit(pi + 1); i != -1; i = numericDotBitSet.nextSetBit(i + 1)) { - if (pi + sb.length() != i) { - addActualValueToList(sb.toString(), pi, dotActualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } else - sb.append(raw.charAt(i)); - } - if (sb.length() > 0) - addActualValueToList(sb.toString(), pi, dotActualNumericValues); - } - - private void extractNumericDotEActualValues() { -// if(this.dotEActualNumericValues == null) -// this.dotEActualNumericValues = new HashMap<>(); -// else -// return; - - BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); - numericDotEBitSet.or(dotBitSet); - numericDotEBitSet.or(eBitSet); - numericDotEBitSet.or(plusMinusBitSet); - - StringBuilder sb = new StringBuilder(); - int pi = numericDotEBitSet.nextSetBit(0); - sb.append(raw.charAt(pi)); - - for (int i = numericDotEBitSet.nextSetBit(pi + 1); i != -1; i = numericDotEBitSet.nextSetBit(i + 1)) { - if (pi + sb.length() != i) { - addActualValueToList(sb.toString(), pi, dotEActualNumericValues); - sb = new StringBuilder(); - sb.append(raw.charAt(i)); - pi = i; - } else - sb.append(raw.charAt(i)); - } - if (sb.length() > 0) - addActualValueToList(sb.toString(), pi, 
dotEActualNumericValues); - } - - private void addActualValueToList(String stringValue, Integer position, HashMap>> list) { - try { - double d = UtilFunctions.getDouble(stringValue); - Pair pair = new Pair(position, stringValue.length()); - if (!list.containsKey(d)) { - ArrayList> tmpList = new ArrayList<>(); - tmpList.add(pair); - list.put(d, tmpList); - } else - list.get(d).add(pair); - } catch (Exception e) { - - } - } - - public String getRemainedTexts(int endPos) { - StringBuilder sb = new StringBuilder(); - StringBuilder result = new StringBuilder(); - for (int i = 0; i < endPos; i++) { - if (!reservedPositions.get(i)) - sb.append(raw.charAt(i)); - else { - if (sb.length() > 0) { - result.append(Lop.OPERAND_DELIMITOR).append(sb); - sb = new StringBuilder(); - } - } - } - if (sb.length() > 0) - result.append(Lop.OPERAND_DELIMITOR).append(sb); - - return result.toString(); - } - - public void cloneReservedPositions() { - this.backupReservedPositions = (BitSet) this.reservedPositions.clone(); - } - - public void restoreReservedPositions() { - this.reservedPositions = this.backupReservedPositions; - } - - public String getSubString(int start, int end) { - return raw.substring(start, end); - } - - public int getRawLength() { - return rawLength; - } - - public String getRaw() { - return raw; - } + public Pair findValue(double value) { + // extractNumericActualValues(); + // if(actualNumericValues.containsKey(value)){ + // return getValuePositionAndLength(actualNumericValues.get(value)); + // } + // + // extractNumericDotActualValues(); + // if(dotActualNumericValues.containsKey(value)){ + // return getValuePositionAndLength(dotActualNumericValues.get(value)); + // } + // + // extractNumericDotEActualValues(); + if(dotEActualNumericValues.containsKey(value)) { + return getValuePositionAndLength(dotEActualNumericValues.get(value)); + } + return null; + } + + private Pair findValue(String value) { + int index = 0; + boolean flag; + do { + flag = true; + index = 
this.raw.indexOf(value, index); + if(index == -1) + return null; + + for(int i = index; i < index + value.length(); i++) + if(reservedPositions.get(i)) { + flag = false; + break; + } + if(!flag) + index += value.length(); + + } + while(!flag); + + reservedPositions.set(index, index + value.length()); + return new Pair<>(index, value.length()); + + } + + private Pair getValuePositionAndLength(ArrayList> list) { + for(Pair p : list) { + if(!reservedPositions.get(p.getKey())) { + reservedPositions.set(p.getKey(), p.getKey() + p.getValue()); + return p; + } + } + return null; + } + + private void extractNumericActualValues() { + if(this.actualNumericValues == null) + this.actualNumericValues = new HashMap<>(); + else + return; + StringBuilder sb = new StringBuilder(); + BitSet nBitSet = (BitSet) numberBitSet.clone(); + nBitSet.or(plusMinusBitSet); + int pi = nBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = nBitSet.nextSetBit(pi + 1); i != -1; i = nBitSet.nextSetBit(i + 1)) { + if(pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, actualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + sb.append(raw.charAt(i)); + } + if(sb.length() > 0) + addActualValueToList(sb.toString(), pi, actualNumericValues); + } + + private void extractNumericDotActualValues() { + if(this.dotActualNumericValues == null) + this.dotActualNumericValues = new HashMap<>(); + else + return; + + BitSet numericDotBitSet = (BitSet) numberBitSet.clone(); + numericDotBitSet.or(dotBitSet); + numericDotBitSet.or(plusMinusBitSet); + StringBuilder sb = new StringBuilder(); + int pi = numericDotBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = numericDotBitSet.nextSetBit(pi + 1); i != -1; i = numericDotBitSet.nextSetBit(i + 1)) { + if(pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + 
sb.append(raw.charAt(i)); + } + if(sb.length() > 0) + addActualValueToList(sb.toString(), pi, dotActualNumericValues); + } + + private void extractNumericDotEActualValues() { + // if(this.dotEActualNumericValues == null) + // this.dotEActualNumericValues = new HashMap<>(); + // else + // return; + + BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); + numericDotEBitSet.or(dotBitSet); + numericDotEBitSet.or(eBitSet); + numericDotEBitSet.or(plusMinusBitSet); + + StringBuilder sb = new StringBuilder(); + int pi = numericDotEBitSet.nextSetBit(0); + sb.append(raw.charAt(pi)); + + for(int i = numericDotEBitSet.nextSetBit(pi + 1); i != -1; i = numericDotEBitSet.nextSetBit(i + 1)) { + if(pi + sb.length() != i) { + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + sb = new StringBuilder(); + sb.append(raw.charAt(i)); + pi = i; + } + else + sb.append(raw.charAt(i)); + } + if(sb.length() > 0) + addActualValueToList(sb.toString(), pi, dotEActualNumericValues); + } + + private void addActualValueToList(String stringValue, Integer position, HashMap>> list) { + try { + double d = UtilFunctions.getDouble(stringValue); + Pair pair = new Pair(position, stringValue.length()); + if(!list.containsKey(d)) { + ArrayList> tmpList = new ArrayList<>(); + tmpList.add(pair); + list.put(d, tmpList); + } + else + list.get(d).add(pair); + } + catch(Exception e) { + + } + } + + public String getRemainedTexts(int endPos) { + StringBuilder sb = new StringBuilder(); + StringBuilder result = new StringBuilder(); + for(int i = 0; i < endPos; i++) { + if(!reservedPositions.get(i)) + sb.append(raw.charAt(i)); + else { + if(sb.length() > 0) { + result.append(Lop.OPERAND_DELIMITOR).append(sb); + sb = new StringBuilder(); + } + } + } + if(sb.length() > 0) + result.append(Lop.OPERAND_DELIMITOR).append(sb); + + return result.toString(); + } + + public void cloneReservedPositions() { + this.backupReservedPositions = (BitSet) this.reservedPositions.clone(); + } + + public void 
restoreReservedPositions() { + this.reservedPositions = this.backupReservedPositions; + } + + public String getSubString(int start, int end) { + return raw.substring(start, end); + } + + public int getRawLength() { + return rawLength; + } + + public String getRaw() { + return raw; + } + + public int getNextNumericPosition(int curPosition){ + int pos = this.rawLength; + for(Double d: this.dotEActualNumericValues.keySet()){ + for(Pair p: this.dotEActualNumericValues.get(d)){ + if(p.getKey() > curPosition && p.getKey() < pos) + pos = p.getKey(); + } + } + return pos; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index 11edaf8d486..0a6468476c4 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -24,7 +24,6 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.matrix.data.Pair; -import org.apache.sysds.runtime.util.UtilFunctions; import java.io.BufferedReader; import java.io.InputStream; @@ -37,20 +36,12 @@ public class ReaderMapping { private int[][] mapRow; private int[][] mapCol; private int[][] mapLen; - private boolean mapped; - private boolean fullMap; - private boolean upperTriangularMap; - private boolean lowerTriangularMap; - private boolean symmetricMap; - private boolean skewSymmetricMap; - private boolean patternMap; - private Object patternValueMap; - private Types.ValueType patternValueType; + private MappingProperties mappingProperties; private final int nrows; private final int ncols; private int nlines; - private int NaN; + private int actualValueCount; private ArrayList sampleRawIndexes; private MatrixBlock sampleMatrix; private FrameBlock sampleFrame; @@ -101,33 +92,29 @@ private void ReadRaw(String raw) throws Exception { } private void runMapping(boolean 
isIndexMapping) { - mapped = findMapping(isIndexMapping); - } - protected boolean findMapping(boolean isIndexMapping) { - mapRow = new int[nrows][ncols]; - mapCol = new int[nrows][ncols]; - mapLen = new int[nrows][ncols]; - NaN = 0; + this.mapRow = new int[nrows][ncols]; + this.mapCol = new int[nrows][ncols]; + this.mapLen = new int[nrows][ncols]; + this.mappingProperties = new MappingProperties(); // Set "-1" as default value for all defined matrix for(int r = 0; r < nrows; r++) for(int c = 0; c < ncols; c++) - mapRow[r][c] = mapCol[r][c] = mapLen[r][c] = -1; + this.mapRow[r][c] = this.mapCol[r][c] = this.mapLen[r][c] = -1; int itRow = 0; for(int r = 0; r < nrows; r++) { for(int c = 0; c < ncols; c++) { - if(isIndexMapping || ((this.isMatrix && this.sampleMatrix.getValue(r, c) != 0) || (!this.isMatrix && ((!schema[c].isNumeric() && this.sampleFrame.get(r, - c) != null) || (schema[c].isNumeric() && this.sampleFrame.getDouble(r, c) != 0))))) { + if(isIndexMapping || checkValueIsNotNullZero(r, c)) { HashSet checkedLines = new HashSet<>(); while(checkedLines.size() < nlines) { - RawIndex ri = sampleRawIndexes.get(itRow); - Pair pair = this.isMatrix ? ri.findValue(sampleMatrix.getValue(r, c)) : ri.findValue(sampleFrame.get(r, c), schema[c]); + RawIndex ri = this.sampleRawIndexes.get(itRow); + Pair pair = this.isMatrix ? ri.findValue(this.sampleMatrix.getValue(r, c)) : ri.findValue(this.sampleFrame.get(r, c), this.schema[c]); if(pair != null) { - mapRow[r][c] = itRow; - mapCol[r][c] = pair.getKey(); - mapLen[r][c] = pair.getValue(); + this.mapRow[r][c] = itRow; + this.mapCol[r][c] = pair.getKey(); + this.mapLen[r][c] = pair.getValue(); break; } else { @@ -138,110 +125,151 @@ protected boolean findMapping(boolean isIndexMapping) { } } } - else - NaN++; } } // analysis mapping of values // 1. check (exist, partially exist, not exist) - // 2. check the records represented in single/multilines - // 3. 
check the Symmetric, Skew-Symmetric, Pattern, and Array + actualValueCount = 0; + int mappedValueCount = 0; + for(int r = 0; r < nrows; r++) { + for(int c = 0; c < ncols; c++) { + if(checkValueIsNotNullZero(r, c)) { + actualValueCount++; + if(this.mapRow[r][c] != -1) + mappedValueCount++; + } + } + } + if(actualValueCount == mappedValueCount) { + this.mappingProperties.setTypicalRepresentation(); + this.mappingProperties.setDataFullExist(); + } + else if(actualValueCount > 0 && mappedValueCount == 0) + this.mappingProperties.setDataNotExist(); - int fullMap = 0; - int upperTriangular = 0; - int upperTriangularZeros = 0; - int lowerTriangular = 0; - int lowerTriangularZeros = 0; - boolean singleLineRecord = true; + else if(mappedValueCount > 0 && mappedValueCount < actualValueCount) + this.mappingProperties.setDataPartiallyExist(); - // check full map - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) + // 2. check the records represented in single/multilines + boolean singleLine = true; + + // first mapped value + int firstLineNumber = -1; + for(int r = 0; r < nrows; r++) { + int c = 0; + firstLineNumber = -1; + for(; c < ncols && firstLineNumber == -1; c++) + firstLineNumber = mapRow[r][c]; + // other mapped + for(; c < ncols && singleLine; c++) if(mapRow[r][c] != -1) - fullMap++; + singleLine = firstLineNumber == mapRow[r][c]; + } + if(singleLine) { + mappingProperties.setRecordSingleLine(); + // 3.a check for array representation + boolean allValuesInALine = true; + for(int r=0; r mapCol[r+1][c]) + t++; + } + } + if((float)t/actualValueCount <0.03) + this.mappingProperties.setArrayRowWiseRepresentation(); + else + this.mappingProperties.setArrayColWiseRepresentation(); + } + } + else { + mappingProperties.setRecordMultiLine(); + // 3.a check for array representation + // TODO: array properties for multi-line + } + + // 3.b check the Typical, Symmetric, Skew-Symmetric, Pattern, and Array // check for upper and lower triangular - if(nrows == ncols) 
{ - this.upperTriangularMap = true; - this.lowerTriangularMap = true; - this.symmetricMap = true; - this.skewSymmetricMap = true; - this.patternMap = false; - this.patternValueMap = null; - - // pattern check for Frame: in Frame the schema must be same for all columns - boolean homoSchema = true; - Types.ValueType vtc0 = null; - if(!this.isMatrix) { - vtc0 = this.sampleFrame.getSchema()[0]; - for(int c = 1; c < ncols && homoSchema; c++) - homoSchema = this.sampleFrame.getSchema()[c] == vtc0; + if(nrows == ncols && !this.mappingProperties.isRepresentation()) { + boolean symmetricMap = true; + + // Symmetric check + for(int r = 0; r < nrows && symmetricMap; r++) { + for(int c = 0; c <= r && symmetricMap; c++) + symmetricMap = this.checkSymmetricValue(r, c, 1); + } + if(symmetricMap) + mappingProperties.setSymmetricRepresentation(); + + // Skew-Symmetric check + if(!mappingProperties.isRepresentation()) { + boolean skewSymmetricMap = true; + for(int r = 0; r < nrows && skewSymmetricMap; r++) { + for(int c = 0; c <= r && skewSymmetricMap; c++) + skewSymmetricMap = this.checkSymmetricValue(r, c, -1); + } + if(skewSymmetricMap) + mappingProperties.setSkewSymmetricRepresentation(); } - for(int r = 0; r < nrows; r++) { - // upper triangular check - for(int c = r; c < ncols && this.upperTriangularMap; c++) - if(this.checkValueIsNotNullZero(r, c) && mapRow[r][c] == -1) - this.upperTriangularMap = false; - - for(int c = 0; c < r && this.upperTriangularMap; c++) - if(this.checkValueIsNotNullZero(r, c)) - this.upperTriangularMap = false; - - // lower triangular check - for(int c = 0; c <= r && this.lowerTriangularMap; c++) - if(this.checkValueIsNotNullZero(r, c) && mapRow[r][c] == -1) - this.lowerTriangularMap = false; - - for(int c = r + 1; c < ncols && this.lowerTriangularMap; c++) - if(this.checkValueIsNotNullZero(r, c)) - this.lowerTriangularMap = false; - - // Symmetric check - for(int c = 0; c <= r && this.symmetricMap; c++) - this.symmetricMap = 
this.checkSymmetricValue(r, c, 1); - - // Skew-Symmetric check - for(int c = 0; c <= r && this.skewSymmetricMap; c++) - this.skewSymmetricMap = this.checkSymmetricValue(r, c, -1); - - // pattern check for Matrix - if(this.isMatrix) { - HashSet patternValueSet = new HashSet<>(); - for(int c = 0; c < ncols; c++) - patternValueSet.add(this.sampleMatrix.getValue(r, c)); - if(patternValueSet.size() == 1) { - this.patternValueType = Types.ValueType.FP64; - this.patternMap = true; - this.patternValueMap = patternValueSet.iterator().next(); - } + // Pattern check + if(!mappingProperties.isRepresentation()) { + boolean patternMap = false; + Object patternValueMap = null; + + // pattern check for Frame: in Frame the schema must be same for all columns + boolean homoSchema = true; + Types.ValueType vtc0 = null; + if(!this.isMatrix) { + vtc0 = this.sampleFrame.getSchema()[0]; + for(int c = 1; c < ncols && homoSchema; c++) + homoSchema = this.sampleFrame.getSchema()[c] == vtc0; } - else { - if(homoSchema) { - HashSet patternValueSet = new HashSet<>(); + // pattern check for Matrix representation + for(int r = 0; r < nrows; r++) { + if(this.isMatrix) { + HashSet patternValueSet = new HashSet<>(); for(int c = 0; c < ncols; c++) - patternValueSet.add(this.sampleFrame.get(r, c)); + patternValueSet.add(this.sampleMatrix.getValue(r, c)); if(patternValueSet.size() == 1) { - this.patternValueType = vtc0; - this.patternMap = true; - this.patternValueMap = patternValueSet.iterator().next(); + vtc0 = Types.ValueType.FP64; + patternMap = true; + patternValueMap = patternValueSet.iterator().next(); + } + } + else { // pattern check for Frame representation + if(homoSchema) { + HashSet patternValueSet = new HashSet<>(); + for(int c = 0; c < ncols; c++) + patternValueSet.add(this.sampleFrame.get(r, c)); + if(patternValueSet.size() == 1) { + patternMap = true; + patternValueMap = patternValueSet.iterator().next(); + } } } } + + if(patternMap) + mappingProperties.setPatternRepresentation(vtc0, 
patternValueMap); } } - - System.out.println("upperTriangularMap=" + upperTriangularMap); - System.out.println("lowerTriangularMap=" + lowerTriangularMap); - System.out.println("symmetric=" + symmetricMap); - System.out.println("skewSymmetricMap = " + skewSymmetricMap); - System.out.println("patternMap=" + patternMap); - System.out.println("patternValueType = "+patternValueType); - System.out.println("patternValueMap=" + UtilFunctions.objectToString(patternValueType)); - - - return false; } private boolean checkValueIsNotNullZero(int r, int c) { @@ -270,10 +298,6 @@ else if(this.sampleFrame.getSchema()[c].isNumeric()) return result; } - public int getNaN() { - return NaN; - } - public int[][] getMapRow() { return mapRow; } @@ -302,7 +326,11 @@ public int getNlines() { return nlines; } - public boolean isMapped() { - return mapped; + public MappingProperties getMappingProperties() { + return mappingProperties; + } + + public int getActualValueCount() { + return actualValueCount; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java index eb586dc80b8..8136d04ac0f 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java @@ -22,21 +22,23 @@ public class RowIndexStructure { public enum IndexProperties { - Identity, - CellWiseExist, - CellWiseExistPatternLess, - RowWiseExist, - SeqScatter, - ArrayRowMajor, - ArrayColMajor, - ArrayShapeRowMajor, - ArrayShapeCol; + Identity, // line number of sample raw data equal to the row index of matrix/frame + CellWiseExist, // row index of every cell values are in the sample raw data + CellWiseExistPatternLess, // ? 
+ RowWiseExist, // index of every record in matrix/frame has an index in sample raw + SeqScatter; // the row index is not exist but the record scattered sequentially in multi lines @Override public String toString() { return this.name().toUpperCase(); } } + public RowIndexStructure() { + this.properties = null; + this.keyPattern = null; + this.rowIndexBegin = "0"; + } + private IndexProperties properties; private KeyTrie keyPattern; private String rowIndexBegin; @@ -61,7 +63,9 @@ public String getRowIndexBegin() { return rowIndexBegin; } - public void setRowIndexBegin(String rowIndexBegin) { - this.rowIndexBegin = rowIndexBegin; + public void setRowIndexBegin(int rowIndexBegin) { + this.rowIndexBegin = rowIndexBegin+""; } + + } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index 6cb48b8e55d..44a76b8f6ee 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -19,8 +19,13 @@ package org.apache.sysds.test.functions.iogen; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.Test; +import java.io.IOException; + public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { private final static String TEST_NAME = "MatrixSingleRowFlatTest"; @@ -163,50 +168,82 @@ public void test14() { //========================= - @Test - public void test15() { - sampleRaw = "0,1,2,3\n" + "10,0,20,30\n" + "100,200,0,300\n"+"1000,2000,3000,0"; - sampleMatrix = new double[][] {{0,1,2,3}, {10,0,20,30}, {100,200,300,0},{1000,2000,3000,0}}; - runGenerateReaderTest(); - } - - //upper-triangular - @Test - public void test16() { - sampleRaw = "1,2,3,4\n" + "0,20,30,40\n" + "0,0,300,400\n"+"0,0,0,4000"; - sampleMatrix = new double[][] {{1,2,3,4}, 
{0,20,30,40}, {0,0,300,400},{0,0,0,4000}}; - runGenerateReaderTest(); - } - - //lower-triangular - @Test - public void test17() { - sampleRaw = "1,0,0,0\n" + "10,20,0,0\n" + "100,200,300,0\n"+"1000,2000,3000,4000"; - sampleMatrix = new double[][] {{1,0,0,0}, {10,20,0,0}, {100,200,300,0},{1000,2000,3000,4000}}; - runGenerateReaderTest(); - } - - //symmetric - @Test - public void test19() { - sampleRaw = "1,2,3,4\n" + "2,2,4,5\n" + "3,4,3,6\n"+"4,5,6,4"; - sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; - runGenerateReaderTest(); - } - - //symmetric-upper - @Test - public void test20() { - sampleRaw = "1,2,3,4\n" + "0,2,4,5\n" + "0,0,3,6\n"+"0,0,0,4"; - sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; - runGenerateReaderTest(); - } - //symmetric-lower - @Test - public void test21() { - sampleRaw = "1,0,0,0\n" + "2,2,0,0\n" + "3,4,3,0\n"+"4,5,6,4"; - sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; - runGenerateReaderTest(); - } + @Test public void test15() { + sampleRaw = "1,2,3,4\n" + "5,6,7,8\n" + "9,10,11,12\n" + "13,14,15,16"; + sampleMatrix = new double[][] {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16}}; + runGenerateReaderTest(); + } + + @Test public void test16() { + sampleRaw = "1,2,3,0\n" + "5,0,7,8\n" + "9,0,0,12\n" + "13,14,0,0"; + sampleMatrix = new double[][] {{1,2,3,0}, {5,0,7,8}, {9,0,0,12}, {13,14,0,0}}; + runGenerateReaderTest(); + } + + @Test public void test17() { + sampleRaw = "1:10 2:20 3:30\n" + "4:40 5:50\n" + "1:60 2:70 3:80\n" + "4:90 5:100"; + sampleMatrix = new double[][] {{10,20,30,0,0}, {0,0,0,40,50}, {60,70,80,0,0}, {0,0,0,90,100}}; + runGenerateReaderTest(); + } + + @Test public void test18() { + String jsonInString = "{\"a\":1, \"b\":2}\n" + "{\"d\":1, \"e\":2}"; + try { + final ObjectMapper mapper = new ObjectMapper(); + mapper.readTree(jsonInString); + System.out.println("Yes"); + } catch (IOException e) { + System.out.println("No"); + } + } + + + +// 
@Test +// public void test15() { +// sampleRaw = "0,1,2,3\n" + "10,0,20,30\n" + "100,200,0,300\n"+"1000,2000,3000,0"; +// sampleMatrix = new double[][] {{0,1,2,3}, {10,0,20,30}, {100,200,300,0},{1000,2000,3000,0}}; +// runGenerateReaderTest(); +// } +// +// //upper-triangular +// @Test +// public void test16() { +// sampleRaw = "1,2,3,4\n" + "0,20,30,40\n" + "0,0,300,400\n"+"0,0,0,4000"; +// sampleMatrix = new double[][] {{1,2,3,4}, {0,20,30,40}, {0,0,300,400},{0,0,0,4000}}; +// runGenerateReaderTest(); +// } +// +// //lower-triangular +// @Test +// public void test17() { +// sampleRaw = "1,0,0,0\n" + "10,20,0,0\n" + "100,200,300,0\n"+"1000,2000,3000,4000"; +// sampleMatrix = new double[][] {{1,0,0,0}, {10,20,0,0}, {100,200,300,0},{1000,2000,3000,4000}}; +// runGenerateReaderTest(); +// } +// +// //symmetric +// @Test +// public void test19() { +// sampleRaw = "1,2,3,4\n" + "2,2,4,5\n" + "3,4,3,6\n"+"4,5,6,4"; +// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; +// runGenerateReaderTest(); +// } +// +// //symmetric-upper +// @Test +// public void test20() { +// sampleRaw = "1,2,3,4\n" + "0,2,4,5\n" + "0,0,3,6\n"+"0,0,0,4"; +// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; +// runGenerateReaderTest(); +// } +// +// //symmetric-lower +// @Test +// public void test21() { +// sampleRaw = "1,0,0,0\n" + "2,2,0,0\n" + "3,4,3,0\n"+"4,5,6,4"; +// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; +// runGenerateReaderTest(); +// } } From 9cdba95aee1cfbd939a7d60168297fd1f2c7a511 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 9 Jun 2022 21:41:40 +0200 Subject: [PATCH 52/84] Fix row index identity detection bug --- .../sysds/runtime/iogen/FormatIdentifying.java | 13 +++++-------- .../functions/iogen/MatrixSingleRowFlatTest.java | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java 
b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 3c201a35951..12449d52d88 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -394,12 +394,12 @@ private RowIndexStructure getRowIndexStructure() { if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { boolean identity = false; int missedCount = 0; - for(int r = 0; r < nrows; r++) - missedCount += ncols - mostCommonScore(mapRow[r]); - - if(mappingProperties.getRepresentationProperties() == MappingProperties.RepresentationProperties.SYMMETRIC || mappingProperties.getRepresentationProperties() == MappingProperties.RepresentationProperties.SKEWSYMMETRIC) - missedCount -= (nrows - 1) * (ncols - 1); + for(int r = 0; r < nrows; r++) + for(int c=0; c Date: Fri, 10 Jun 2022 03:21:53 +0200 Subject: [PATCH 53/84] Update codegen section based on the new implementation --- .../runtime/iogen/ColIndexStructure.java | 15 +- .../sysds/runtime/iogen/CustomProperties.java | 102 ++----- .../runtime/iogen/FormatIdentifying.java | 113 +++++++- .../sysds/runtime/iogen/GenerateReader.java | 2 + .../runtime/iogen/MappingProperties.java | 2 - .../runtime/iogen/RowIndexStructure.java | 16 +- .../runtime/iogen/codegen/CodeGenTrie.java | 259 +++++------------- .../iogen/codegen/CodeGenTrieNode.java | 125 ++++++--- .../runtime/iogen/codegen/FrameCodeGen.java | 6 +- .../runtime/iogen/codegen/MatrixCodeGen.java | 65 ++--- .../iogen/template/FrameGenerateReader.java | 3 +- .../iogen/template/MatrixGenerateReader.java | 3 +- .../iogen/GenerateReaderMatrixTest.java | 8 +- 13 files changed, 328 insertions(+), 391 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java index ac492107d99..f1e88275888 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.iogen; +import java.util.HashSet; + public class ColIndexStructure { public enum IndexProperties { @@ -33,17 +35,22 @@ public String toString() { public ColIndexStructure() { this.properties = null; this.keyPattern = null; - this.colIndexBegin = "0"; + this.colIndexBegin = 0; } private IndexProperties properties; private KeyTrie keyPattern; - private String colIndexBegin; + private int colIndexBegin; // when the index properties is CellWiseExist: private String indexDelim; private String valueDelim; + public HashSet endWithValueStrings() { + HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); + return endWithValueString; + } + public IndexProperties getProperties() { return properties; } @@ -60,12 +67,12 @@ public void setKeyPattern(KeyTrie keyPattern) { this.keyPattern = keyPattern; } - public String getColIndexBegin() { + public int getColIndexBegin() { return colIndexBegin; } public void setColIndexBegin(int colIndexBegin) { - this.colIndexBegin = colIndexBegin + ""; + this.colIndexBegin = colIndexBegin; } public String getIndexDelim() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 27f821d12f3..45398ef0d01 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -21,9 +21,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FileFormatProperties; - import java.io.Serializable; -import java.util.HashMap; import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { @@ -31,6 +29,9 @@ public class CustomProperties extends FileFormatProperties implements Serializab private MappingProperties mappingProperties; private RowIndexStructure rowIndexStructure; 
private ColIndexStructure colIndexStructure; + private KeyTrie[] colKeyPatterns; + private KeyTrie valueKeyPattern; + private Types.ValueType[] schema; public CustomProperties(MappingProperties mappingProperties, RowIndexStructure rowIndexStructure, ColIndexStructure colIndexStructure) { this.mappingProperties = mappingProperties; @@ -62,65 +63,12 @@ public void setColIndexStructure(ColIndexStructure colIndexStructure) { this.colIndexStructure = colIndexStructure; } - - - //-------------------------------------- - - public enum IndexProperties { - IDENTITY, EXIST, SEQSCATTER, XARRAY, YARRAY, - IDENTIFY, PREFIX, KEY; - @Override - public String toString() { - return this.name().toUpperCase(); - } - } - - - - - - private KeyTrie[] colKeyPattern; - private Types.ValueType[] schema; - private IndexProperties rowIndex; - private KeyTrie rowKeyPattern; - private String rowIndexBegin; - private HashMap colKeyPatternMap; - - public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex) { - this.colKeyPattern = colKeyPattern; - this.rowIndex = rowIndex; - } - - public CustomProperties(KeyTrie[] colKeyPattern, KeyTrie rowKeyPattern) { - this.colKeyPattern = colKeyPattern; - this.rowIndex = IndexProperties.KEY; - this.rowKeyPattern = rowKeyPattern; - } - - public CustomProperties(KeyTrie[] colKeyPattern, IndexProperties rowIndex, KeyTrie rowKeyPattern) { - this.colKeyPattern = colKeyPattern; - this.rowIndex = rowIndex; - this.rowKeyPattern = rowKeyPattern; - } - - public KeyTrie[] getColKeyPattern() { - return colKeyPattern; + public KeyTrie[] getColKeyPatterns() { + return colKeyPatterns; } - public HashSet[] endWithValueStrings() { - HashSet[] endWithValueString = new HashSet[colKeyPattern.length]; - for(int i = 0; i < colKeyPattern.length; i++) - if(colKeyPattern[i] != null) - endWithValueString[i] = colKeyPattern[i].getFirstSuffixKeyPatterns(); - return endWithValueString; - } - - public HashSet endWithValueStringsRow() { - return 
rowKeyPattern.getFirstSuffixKeyPatterns(); - } - - public void setColKeyPattern(KeyTrie[] colKeyPattern) { - this.colKeyPattern = colKeyPattern; + public void setColKeyPatterns(KeyTrie[] colKeyPatterns) { + this.colKeyPatterns = colKeyPatterns; } public Types.ValueType[] getSchema() { @@ -131,35 +79,19 @@ public void setSchema(Types.ValueType[] schema) { this.schema = schema; } - public IndexProperties getRowIndex() { - return rowIndex; - } - - public void setRowIndex(IndexProperties rowIndex) { - this.rowIndex = rowIndex; - } - - public KeyTrie getRowKeyPattern() { - return rowKeyPattern; - } - - public void setRowKeyPattern(KeyTrie rowKeyPattern) { - this.rowKeyPattern = rowKeyPattern; - } - - public String getRowIndexBegin() { - return rowIndexBegin; - } - - public void setRowIndexBegin(String rowIndexBegin) { - this.rowIndexBegin = rowIndexBegin; + public HashSet[] endWithValueStrings() { + HashSet[] endWithValueString = new HashSet[colKeyPatterns.length]; + for(int i = 0; i < colKeyPatterns.length; i++) + if(colKeyPatterns[i] != null) + endWithValueString[i] = colKeyPatterns[i].getFirstSuffixKeyPatterns(); + return endWithValueString; } - public HashMap getColKeyPatternMap() { - return colKeyPatternMap; + public KeyTrie getValueKeyPattern() { + return valueKeyPattern; } - public void setColKeyPatternMap(HashMap colKeyPatternMap) { - this.colKeyPatternMap = colKeyPatternMap; + public void setValueKeyPattern(KeyTrie valueKeyPattern) { + this.valueKeyPattern = valueKeyPattern; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 12449d52d88..77d6087b871 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -116,13 +116,11 @@ private void runIdentification() { // ref to Table 1: if(mappingProperties.getRecordProperties() == 
MappingProperties.RecordProperties.SINGLELINE) { - // #1 if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { - KeyTrie[] colKeyPattern; - // TODO: change method name from buildColsKeyPatternSingleRow to buildColPatternRowIdentity - colKeyPattern = buildColsKeyPatternSingleRow(); - properties.setColKeyPattern(colKeyPattern); + KeyTrie[] colKeyPatterns; + colKeyPatterns = buildColsKeyPatternSingleRow(); + properties.setColKeyPatterns(colKeyPatterns); } // #2 @@ -140,7 +138,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I String valueDelim = null; String indexDelim = null; Long maxCount = 0L; - int begin = Integer.parseInt(colIndexStructure.getColIndexBegin()); + int begin = colIndexStructure.getColIndexBegin(); for(int c = 0; c < ncols; c++) { if(mapCol[0][c] != -1) { Pair pair = raw.findValue(c + begin); @@ -171,9 +169,14 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I // # 4, 6, 7, 8, 9 if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { + KeyTrie valueKeyPattern = buildValueKeyPattern(); + properties.setValueKeyPattern(valueKeyPattern); + } + // build key pattern for row index int numberOfSelectedCols = 3; - int begin = Integer.parseInt(rowIndexStructure.getRowIndexBegin()); + int begin = rowIndexStructure.getRowIndexBegin(); boolean check, flagReconstruct; int[] selectedRowIndex = new int[2]; KeyTrie rowKeyPattern = null; @@ -274,7 +277,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I rowIndexStructure.setKeyPattern(rowKeyPattern); // build key pattern for column index - begin = 
Integer.parseInt(colIndexStructure.getColIndexBegin()); + begin = colIndexStructure.getColIndexBegin(); int[] selectedColIndex = new int[2]; KeyTrie colKeyPattern = null; @@ -396,8 +399,8 @@ private RowIndexStructure getRowIndexStructure() { int missedCount = 0; for(int r = 0; r < nrows; r++) - for(int c=0; c 0) { return colKeyPattens; } + private KeyTrie buildValueKeyPattern() { + int minSelectCols = Math.min(10, ncols); + ArrayList prefixStrings = new ArrayList<>(); + ArrayList rowIndexes = new ArrayList<>(); + ArrayList suffixStrings = new ArrayList<>(); + + for(int c = 0; c < minSelectCols; c++) { + Pair, ArrayList> pair = extractAllPrefixStringsOfAColSingleLine(c, false); + prefixStrings.addAll(pair.getKey()); + rowIndexes.addAll(pair.getValue()); + } + + for(int c = 0; c < minSelectCols; c++) { + for(int r = 0; r < nrows; r++) { + int rowIndex = mapRow[r][c]; + if(rowIndex == -1) + continue; + String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); + suffixStrings.add(str); + } + } + + KeyTrie valueKeyPatten = new KeyTrie(); + for(int c = 0; c < ncols; c++) { + MappingTrie trie = new MappingTrie(); + int ri = 0; + boolean check; + boolean flagReconstruct; + ArrayList> keyPatterns = null; + + int psIndex = 0; + for(String ps : prefixStrings) + trie.reverseInsert(ps, rowIndexes.get(psIndex++)); + + if(trie.getRoot().getChildren().size() == 1) { + String[] splitPattern = prefixStrings.get(0).split(Lop.OPERAND_DELIMITOR); + ArrayList reverseSplitPattern = new ArrayList<>(); + for(String ps : splitPattern) + if(ps.length() > 0) + reverseSplitPattern.add(ps); + if(reverseSplitPattern.size() == 0) + reverseSplitPattern.add(""); + + int maxPatternLength = reverseSplitPattern.size(); + check = false; + for(int sp = 0; sp < maxPatternLength; sp++) { + ArrayList shortPattern = new ArrayList<>(); + for(int spi = maxPatternLength - sp - 1; spi < maxPatternLength; spi++) { + shortPattern.add(reverseSplitPattern.get(spi)); + } + check = 
checkKeyPatternIsUnique(prefixStrings, shortPattern); + if(check) { + keyPatterns = new ArrayList<>(); + keyPatterns.add(shortPattern); + break; + } + } + } + else { + do { + ArrayList> selectedKeyPatterns = new ArrayList<>(); + keyPatterns = trie.getAllSequentialKeys(); + check = false; + for(ArrayList keyPattern : keyPatterns) { + boolean newCheck = checkKeyPatternIsUnique(prefixStrings, keyPattern); + check |= newCheck; + if(newCheck) + selectedKeyPatterns.add(keyPattern); + } + if(check) + keyPatterns = selectedKeyPatterns; + else { + flagReconstruct = trie.reConstruct(); + if(!flagReconstruct) + break; + } + } + while(!check); + } + + if(check) { + valueKeyPatten = new KeyTrie(keyPatterns); + for(String suffix : suffixStrings) { + valueKeyPatten.insertSuffixKeys(suffix.substring(0, Math.min(suffixStringLength, suffix.length())).toCharArray()); + } + } + } + return valueKeyPatten; + } + // Get all prefix strings of a column public Pair[], ArrayList[]> extractAllPrefixStringsOfColsSingleLine(boolean reverse) { ArrayList[] prefixStrings = new ArrayList[ncols]; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 3fdf0a10655..bc7fcb9d55b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -93,6 +93,8 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; + String ss = src.generateCodeJava(); + matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java index 67356cb74cb..99b9e5103ab 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/MappingProperties.java @@ -94,8 +94,6 @@ public void setDataPartiallyExist(){ this.dataProperties = DataProperties.PARTIALLYEXIST; } - - public void setRecordSingleLine(){ this.recordProperties = RecordProperties.SINGLELINE; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java index 8136d04ac0f..f59610b4218 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java @@ -19,12 +19,13 @@ package org.apache.sysds.runtime.iogen; +import java.util.HashSet; + public class RowIndexStructure { public enum IndexProperties { Identity, // line number of sample raw data equal to the row index of matrix/frame CellWiseExist, // row index of every cell values are in the sample raw data - CellWiseExistPatternLess, // ? 
RowWiseExist, // index of every record in matrix/frame has an index in sample raw SeqScatter; // the row index is not exist but the record scattered sequentially in multi lines @Override @@ -36,12 +37,17 @@ public String toString() { public RowIndexStructure() { this.properties = null; this.keyPattern = null; - this.rowIndexBegin = "0"; + this.rowIndexBegin = 0; } private IndexProperties properties; private KeyTrie keyPattern; - private String rowIndexBegin; + private int rowIndexBegin; + + public HashSet endWithValueStrings() { + HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); + return endWithValueString; + } public IndexProperties getProperties() { return properties; @@ -59,12 +65,12 @@ public void setKeyPattern(KeyTrie keyPattern) { this.keyPattern = keyPattern; } - public String getRowIndexBegin() { + public int getRowIndexBegin() { return rowIndexBegin; } public void setRowIndexBegin(int rowIndexBegin) { - this.rowIndexBegin = rowIndexBegin+""; + this.rowIndexBegin = rowIndexBegin; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 653d7f932f5..2dfcb39cae9 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -19,119 +19,70 @@ package org.apache.sysds.runtime.iogen.codegen; -import com.google.gson.Gson; import org.apache.sysds.common.Types; -import org.apache.sysds.lops.Lop; +import org.apache.sysds.runtime.iogen.ColIndexStructure; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.KeyTrie; +import org.apache.sysds.runtime.iogen.MappingProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; import java.util.ArrayList; -import java.util.HashMap; import java.util.HashSet; import java.util.Random; public class CodeGenTrie { - private final CodeGenTrieNode rootCol; - 
private final CodeGenTrieNode rootRow; private final CustomProperties properties; + private final CodeGenTrieNode ctnCol; + private final CodeGenTrieNode ctnRow; + private final CodeGenTrieNode ctnValue; + private final CodeGenTrieNode ctnIndexes; + private final String destination; - private HashMap colKeyPatternMap; - private HashSet regexSet; - private final boolean isRegexBase; private boolean isMatrix; - public CodeGenTrie(CustomProperties properties, String destination) { - this.rootCol = new CodeGenTrieNode(CodeGenTrieNode.NodeType.COL); - this.rootRow = new CodeGenTrieNode(CodeGenTrieNode.NodeType.ROW); + public CodeGenTrie(CustomProperties properties, String destination, boolean isMatrix) { this.properties = properties; this.destination = destination; - this.isMatrix = false; - - HashSet conditions = new HashSet<>(); - for(int c = 0; c < properties.getColKeyPattern().length; c++) { - KeyTrie keyTrie = properties.getColKeyPattern()[c]; - if(keyTrie != null) { - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { - conditions.add(keys.get(0)); -// Gson gson=new Gson(); -// System.out.println(c+" >> "+gson.toJson(keys)); - break; - } - } - } - - if(conditions.size() < 100) { - buildPrefixTree(); - this.isRegexBase = false; - } - else { - this.colKeyPatternMap = new HashMap<>(); - this.regexSet = new HashSet<>(); - this.isRegexBase = true; - buildPrefixTreeRegex(); - } - - } - - // Build Trie for Col and Row Key Patterns - private void buildPrefixTreeRegex() { - for(int c = 0; c < properties.getColKeyPattern().length; c++) { - KeyTrie keyTrie = properties.getColKeyPattern()[c]; - if(keyTrie != null) { - StringBuilder ksb = new StringBuilder(); - StringBuilder sbr = new StringBuilder(); - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) { - for(String ks : keys) { - ksb.append(ks).append(Lop.OPERAND_DELIMITOR); - String tmp = ks.replaceAll("\\s+", "\\\\s+"); - tmp = tmp.replaceAll("\\d+", "\\\\d+"); - 
sbr.append("(").append(tmp).append(")").append(Lop.OPERAND_DELIMITOR); - } - ksb.deleteCharAt(ksb.length() - 1); - sbr.deleteCharAt(sbr.length() - 1); - break; + this.isMatrix = isMatrix; + + this.ctnValue = new CodeGenTrieNode(CodeGenTrieNode.NodeType.VALUE); + this.ctnIndexes = new CodeGenTrieNode(CodeGenTrieNode.NodeType.INDEX); + + this.ctnCol = new CodeGenTrieNode(CodeGenTrieNode.NodeType.COL); + this.ctnRow = new CodeGenTrieNode(CodeGenTrieNode.NodeType.ROW); + + if(properties.getColKeyPatterns() != null) { + for(int c = 0; c < properties.getColKeyPatterns().length; c++) { + KeyTrie keyTrie = properties.getColKeyPatterns()[c]; + Types.ValueType vt = Types.ValueType.FP64; + if(!this.isMatrix) + vt = properties.getSchema()[c]; + if(keyTrie != null) { + for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) + this.insert(ctnValue, c + "", vt, keys); } - if(ksb.length() == 0) - colKeyPatternMap.put("", c); - else - colKeyPatternMap.put(ksb.toString(), c); - regexSet.add(sbr.toString()); - } } - - if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { - KeyTrie keyTrie = properties.getRowKeyPattern(); - Types.ValueType vt = Types.ValueType.FP32; - if(keyTrie != null) { - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) - this.insert(rootRow, -1, vt, keys); + else if(properties.getValueKeyPattern() != null) { + // TODO: same pattern for all columns but the ValueTypes are different- fix it ! + for(ArrayList keys : properties.getValueKeyPattern().getPrefixKeyPatterns()) { + this.insert(ctnValue, "col", Types.ValueType.FP64, keys); } } - } - // Build Trie for Col and Row Key Patterns - private void buildPrefixTree() { - for(int c = 0; c < properties.getColKeyPattern().length; c++) { - KeyTrie keyTrie = properties.getColKeyPattern()[c]; - Types.ValueType vt = properties.getSchema() == null ? 
Types.ValueType.FP64 : properties.getSchema()[c]; - if(keyTrie != null) { - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) - this.insert(rootCol, c, vt, keys); - } + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || properties.getRowIndexStructure() + .getProperties() == RowIndexStructure.IndexProperties.CellWiseExist) { + for(ArrayList keys : properties.getRowIndexStructure().getKeyPattern().getPrefixKeyPatterns()) + this.insert(ctnIndexes, "0", Types.ValueType.INT32, keys); } - if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) { - KeyTrie keyTrie = properties.getRowKeyPattern(); - Types.ValueType vt = Types.ValueType.FP32; - if(keyTrie != null) { - for(ArrayList keys : keyTrie.getReversePrefixKeyPatterns()) - this.insert(rootRow, -1, vt, keys); - } + if(properties.getColIndexStructure().getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + for(ArrayList keys : properties.getColIndexStructure().getKeyPattern().getPrefixKeyPatterns()) + this.insert(ctnIndexes, "1", Types.ValueType.INT32, keys); } } - private void insert(CodeGenTrieNode root, int index, Types.ValueType valueType, ArrayList keys) { + private void insert(CodeGenTrieNode root, String index, Types.ValueType valueType, ArrayList keys) { CodeGenTrieNode currentNode = root; int rci = 0; for(String key : keys) { @@ -149,9 +100,9 @@ private void insert(CodeGenTrieNode root, int index, Types.ValueType valueType, else { CodeGenTrieNode newNode; for(int i = rci; i < keys.size(); i++) { - newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), - root.getType()); - newNode.setRowIndexBeginPos(properties.getRowIndexBegin()); + newNode = new CodeGenTrieNode(i == keys.size() - 1, index, valueType, keys.get(i), new HashSet<>(), root.getType()); + newNode.setRowIndexBeginPos(properties.getRowIndexStructure().getRowIndexBegin()); + 
newNode.setColIndexBeginPos(properties.getColIndexStructure().getColIndexBegin()); currentNode.getChildren().put(keys.get(i), newNode); currentNode = newNode; } @@ -160,39 +111,27 @@ private void insert(CodeGenTrieNode root, int index, Types.ValueType valueType, public String getJavaCode() { StringBuilder src = new StringBuilder(); - switch(properties.getRowIndex()) { - case IDENTIFY: - getJavaCode(rootCol, src, "0"); - src.append("row++; \n"); - break; - case PREFIX: - getJavaCodeIndexOf(rootRow, src, "0"); - getJavaCode(rootCol, src, "0"); - break; - case KEY: - src.append("String strChunk, remainedStr = null; \n"); - src.append("int chunkSize = 2048; \n"); - src.append("int recordIndex = 0; \n"); - src.append("try { \n"); - src.append("do{ \n"); - src.append("strChunk = getStringChunkOfBufferReader(br, remainedStr, chunkSize); \n"); - src.append("if(strChunk == null || strChunk.length() == 0) break; \n"); - src.append("do { \n"); - ArrayList> kp = properties.getRowKeyPattern().getPrefixKeyPatterns(); - getJavaRowCode(src, kp, kp); - getJavaCode(rootCol, src, "0"); - src.append("row++; \n"); - src.append("}while(true); \n"); - src.append("remainedStr = strChunk.substring(recordIndex); \n"); - src.append("}while(true); \n"); - src.append("} \n"); - src.append("finally { \n"); - src.append("IOUtilFunctions.closeSilently(br); \n"); - src.append("} \n"); - break; + MappingProperties.RepresentationProperties representation = properties.getMappingProperties().getRepresentationProperties(); + MappingProperties.DataProperties data = properties.getMappingProperties().getDataProperties(); + MappingProperties.RecordProperties record = properties.getMappingProperties().getRecordProperties(); + + RowIndexStructure.IndexProperties rowIndex = properties.getRowIndexStructure().getProperties(); + ColIndexStructure.IndexProperties colIndex = properties.getColIndexStructure().getProperties(); + + if(data != MappingProperties.DataProperties.NOTEXIST && rowIndex == 
RowIndexStructure.IndexProperties.Identity && colIndex == ColIndexStructure.IndexProperties.Identity) { + getJavaCode(ctnValue, src, "0"); + src.append("row++; \n"); } + else if(rowIndex == RowIndexStructure.IndexProperties.CellWiseExist && colIndex == ColIndexStructure.IndexProperties.CellWiseExist) { + if(data != MappingProperties.DataProperties.NOTEXIST) { + src.append("/* ++++++++++++++++++++++ INDEXES +++++++++++++++++++++++++++++++++++++ */\n"); + getJavaCode(ctnIndexes, src, "0"); + src.append("/* ++++++++++++++++++++++ END INDEXES +++++++++++++++++++++++++++++++++++++ */\n"); + getJavaCode(ctnValue, src, "0"); + } + } return src.toString(); } @@ -206,68 +145,7 @@ public String getRandomName(String base) { } private void getJavaCode(CodeGenTrieNode node, StringBuilder src, String currPos) { - if(!isRegexBase) - getJavaCodeIndexOf(node, src, currPos); - else - getJavaCodeRegex(src); - } - - private void getJavaCodeRegex(StringBuilder src) { - - // TODO: for fist item start with "" - //src.append("List allMatches = new ArrayList(); \n"); - for(String s : regexSet) { - if(s.equals("()")) { - src.append("int colIndex0 = getColIndex(colKeyPatternMap, \"\"); \n"); - src.append("endPos = getEndPos(str, strLen, 0, endWithValueString[colIndex0]); \n"); - src.append("String cellStr0 = str.substring(0, endPos); \n"); - src.append("if ( cellStr0.length() > 0 ){\n"); - if(isMatrix) { - src.append("Double cellValue0; \n"); - src.append( - "try{cellValue0 = Double.parseDouble(cellStr0); } catch(Exception e){cellValue0= 0d;}\n"); - src.append("if(cellValue0!= 0) { \n"); - src.append(destination).append("(row, colIndex0 , cellValue0); \n"); - src.append("lnnz++;\n"); - src.append("} \n"); - } - else { - src.append(destination).append( - "(row, colIndex0 , UtilFunctions.stringToObject(properties.getSchema()[colIndex0], cellStr)0); \n"); - } - src.append("}\n"); - } - else { - int groupCount = s.split(Lop.OPERAND_DELIMITOR).length; - if(groupCount > 1) - break; - 
src.append("Matcher matcher = Pattern.compile(\"" + s.replace("\\", "\\\\") + "\").matcher(str); \n"); - src.append("while(matcher.find()) { \n"); - src.append("String key = ").append("matcher.group(1);\n"); - src.append("int currPos = matcher.end();\n"); - src.append("int colIndex = getColIndex(colKeyPatternMap, key); \n"); - src.append("if(colIndex!=-1) { \n"); - //src.append("Types.ValueType vt = pair.getValue();\n"); - src.append("endPos = getEndPos(str, strLen, currPos, endWithValueString[colIndex]); \n"); - src.append("String cellStr = str.substring(currPos, endPos); \n"); - src.append("if ( cellStr.length() > 0 ){\n"); - if(isMatrix) { - src.append("Double cellValue; \n"); - src.append("try{cellValue = Double.parseDouble(cellStr); } catch(Exception e){cellValue= 0d;}\n"); - src.append("if(cellValue!= 0) { \n"); - src.append(destination).append("(row, colIndex , cellValue); \n"); - src.append("lnnz++;\n"); - src.append("} \n"); - } - else { - src.append(destination).append( - "(row, colIndex , UtilFunctions.stringToObject(properties.getSchema()[colIndex], cellStr)); \n"); - } - src.append("}\n"); - src.append("}\n"); - src.append("}\n"); - } - } + getJavaCodeIndexOf(node, src, currPos); } private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, String currPos) { @@ -280,11 +158,9 @@ private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, String if(key.length() > 0) { currPosVariable = getRandomName("curPos"); if(node.getKey() == null) - src.append( - "index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\"); \n"); + src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\"); \n"); else - src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"") - .replace("\"", "\\\"") + "\", " + currPos + "); \n"); + src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); 
src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); } @@ -296,8 +172,7 @@ private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, String } } - private void getJavaRowCode(StringBuilder src, ArrayList> rowBeginPattern, - ArrayList> rowEndPattern) { + private void getJavaRowCode(StringBuilder src, ArrayList> rowBeginPattern, ArrayList> rowEndPattern) { // TODO: we have to extend it to multi patterns // now, we assumed each row can have single pattern for begin and end @@ -321,12 +196,4 @@ private void getJavaRowCode(StringBuilder src, ArrayList> rowB public void setMatrix(boolean matrix) { isMatrix = matrix; } - - public boolean isRegexBase() { - return isRegexBase; - } - - public HashMap getColKeyPatternMap() { - return colKeyPatternMap; - } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 54f029796dc..51051c7406c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -20,6 +20,7 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; + import java.util.HashMap; import java.util.HashSet; import java.util.Map; @@ -27,113 +28,139 @@ public class CodeGenTrieNode { public enum NodeType { - ROW, COL; - @Override - public String toString() { + VALUE, ROW, COL, INDEX; + + @Override public String toString() { return this.name().toUpperCase(); } } + private final Map children = new HashMap<>(); private boolean endOfCondition; - private int colIndex; + private String colIndex; private Types.ValueType valueType; private String key; private HashSet naStrings; private final NodeType type; - private String rowIndexBeginPos; + private int rowIndexBeginPos; + private int colIndexBeginPos; public CodeGenTrieNode(NodeType type) { this.endOfCondition = false; this.type = type; 
} - public CodeGenTrieNode(int colIndex, String key, NodeType type) { + public CodeGenTrieNode(String colIndex, String key, NodeType type) { this.colIndex = colIndex; this.key = key; this.type = type; } - public CodeGenTrieNode(boolean endOfCondition, int colIndex, Types.ValueType valueType, String key, HashSet naStrings, NodeType type) { + public CodeGenTrieNode(boolean endOfCondition, String colIndex, Types.ValueType valueType, String key, HashSet naStrings, NodeType type) { this.endOfCondition = endOfCondition; this.colIndex = colIndex; this.valueType = valueType; this.key = key; - if(endOfCondition){ + if(endOfCondition) { this.naStrings = naStrings; } this.type = type; } - public String geValueCode(String destination, String currPos){ - if(this.type == NodeType.ROW) - return this.getRowPrefixValueCode(currPos); + public String geValueCode(String destination, String currPos) { + if(this.type == NodeType.INDEX) + return this.getIndexCode(currPos); else return this.getColValueCode(destination, currPos); } - private String getRowPrefixValueCode(String currPos){ + private String getIndexCode(String currPos) { StringBuilder src = new StringBuilder(); String subStr; - src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueStringRow); \n"); - subStr = "str.substring("+currPos+",endPos)"; - src.append("try{ \n"); - if(rowIndexBeginPos.length() > 0) - src.append("row = ").append("Integer.parseInt(" + subStr + ") " + rowIndexBeginPos + "; \n"); + String ewvs; + if(this.colIndex.equals("0")) + ewvs = "endWithValueStringRow"; else - src.append("row = ").append("Integer.parseInt("+subStr+"); \n"); + ewvs = "endWithValueStringCol"; + + src.append("endPos = getEndPos(str, strLen, " + currPos + "," + ewvs + "); \n"); + subStr = "str.substring(" + currPos + ",endPos)"; + src.append("try{ \n"); + if(this.colIndex.equals("0")) { + if(rowIndexBeginPos > 0) + src.append("row = ").append("Integer.parseInt(" + subStr + ") - " + rowIndexBeginPos + "; \n"); + else + 
src.append("row = ").append("Integer.parseInt(" + subStr + "); \n"); + } + else { + if(colIndexBeginPos > 0) + src.append("col = ").append("Integer.parseInt(" + subStr + ") - " + colIndexBeginPos + "; \n"); + else + src.append("col = ").append("Integer.parseInt(" + subStr + "); \n"); + } src.append("} catch(Exception e){} \n"); return src.toString(); } - private String getColValueCode(String destination, String currPos){ + + private String getColValueCode(String destination, String currPos) { StringBuilder src = new StringBuilder(); + if(this.colIndex.equals("col")) + src.append("endPos = getEndPos(str, strLen, " + currPos + ", endWithValueStringVal); \n"); + else + src.append("endPos = getEndPos(str, strLen, " + currPos + ", endWithValueString[" + colIndex + "]); \n"); - src.append("endPos = getEndPos(str, strLen, "+ currPos+", endWithValueString["+colIndex+"]); \n"); - src.append("String cellStr"+colIndex+" = str.substring("+currPos+",endPos); \n"); + src.append("String cellStr" + colIndex + " = str.substring(" + currPos + ",endPos); \n"); if(valueType.isNumeric()) { - src.append("if ( cellStr"+colIndex+".length() > 0 ){\n"); - src.append(getParsCode("cellStr"+colIndex)); - src.append("if(cellValue"+colIndex+" != 0) { \n"); - src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); + src.append("if ( cellStr" + colIndex + ".length() > 0 ){\n"); + src.append(getParsCode("cellStr" + colIndex)); + src.append("if(cellValue" + colIndex + " != 0) { \n"); + src.append(destination).append("(row, " + colIndex + ", cellValue" + colIndex + "); \n"); src.append("lnnz++;\n"); src.append("}\n"); src.append("}\n"); } - else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOLEAN){ - if(naStrings !=null && naStrings.size() > 0) { + else if(valueType == Types.ValueType.STRING || valueType == Types.ValueType.BOOLEAN) { + if(naStrings != null && naStrings.size() > 0) { StringBuilder sb = new StringBuilder(); sb.append("if("); 
for(String na : naStrings) { src.append("naStrings.contains(\"" + na + "\")").append("|| \n"); } - sb.delete(sb.length()-2, sb.length()); + sb.delete(sb.length() - 2, sb.length()); sb.append("){ \n"); - sb.append("cellValue+"+colIndex+" = null;"); + sb.append("cellValue+" + colIndex + " = null;"); sb.append("}\n"); } else - src.append(getParsCode("cellStr"+colIndex)); - src.append(destination).append("(row, " + colIndex + ", cellValue"+colIndex+"); \n"); + src.append(getParsCode("cellStr" + colIndex)); + src.append(destination).append("(row, " + colIndex + ", cellValue" + colIndex + "); \n"); } return src.toString(); } private String getParsCode(String subStr) { - String cellValue = "cellValue"+colIndex; - switch(valueType ) { - case STRING: return "String "+cellValue+" = "+subStr+"; \n"; - case BOOLEAN: return "Boolean "+cellValue+"; \n try{ "+cellValue+"= Boolean.parseBoolean("+subStr+");} catch(Exception e){"+cellValue+"=false;} \n"; - case INT32: return "Integer "+cellValue+"; \n try{ "+cellValue+"= Integer.parseInt("+subStr+");} catch(Exception e){"+cellValue+" = 0;} \n"; - case INT64: return "Long "+cellValue+"; \n try{"+cellValue+"= Long.parseLong("+subStr+"); } catch(Exception e){"+cellValue+" = 0l;} \n"; - case FP64: return "Double "+cellValue+"; \n try{ "+cellValue+"= Double.parseDouble("+subStr+"); } catch(Exception e){"+cellValue+" = 0d;}\n"; - case FP32: return "Float "+cellValue+"; \n try{ "+cellValue+"= Float.parseFloat("+subStr+");} catch(Exception e){"+cellValue+" = 0f;} \n"; - default: throw new RuntimeException("Unsupported value type: "+valueType); + String cellValue = "cellValue" + colIndex; + switch(valueType) { + case STRING: + return "String " + cellValue + " = " + subStr + "; \n"; + case BOOLEAN: + return "Boolean " + cellValue + "; \n try{ " + cellValue + "= Boolean.parseBoolean(" + subStr + ");} catch(Exception e){" + cellValue + "=false;} \n"; + case INT32: + return "Integer " + cellValue + "; \n try{ " + cellValue + "= 
Integer.parseInt(" + subStr + ");} catch(Exception e){" + cellValue + " = 0;} \n"; + case INT64: + return "Long " + cellValue + "; \n try{" + cellValue + "= Long.parseLong(" + subStr + "); } catch(Exception e){" + cellValue + " = 0l;} \n"; + case FP64: + return "Double " + cellValue + "; \n try{ " + cellValue + "= Double.parseDouble(" + subStr + "); } catch(Exception e){" + cellValue + " = 0d;}\n"; + case FP32: + return "Float " + cellValue + "; \n try{ " + cellValue + "= Float.parseFloat(" + subStr + ");} catch(Exception e){" + cellValue + " = 0f;} \n"; + default: + throw new RuntimeException("Unsupported value type: " + valueType); } } - public Map getChildren() { return children; } @@ -146,11 +173,11 @@ public void setEndOfCondition(boolean endOfCondition) { this.endOfCondition = endOfCondition; } - public int getColIndex() { + public String getColIndex() { return colIndex; } - public void setColIndex(int colIndex) { + public void setColIndex(String colIndex) { this.colIndex = colIndex; } @@ -174,11 +201,19 @@ public NodeType getType() { return type; } - public String getRowIndexBeginPos() { + public int getRowIndexBeginPos() { return rowIndexBeginPos; } - public void setRowIndexBeginPos(String rowIndexBeginPos) { + public void setRowIndexBeginPos(int rowIndexBeginPos) { this.rowIndexBeginPos = rowIndexBeginPos; } + + public int getColIndexBeginPos() { + return colIndexBeginPos; + } + + public void setColIndexBeginPos(int colIndexBeginPos) { + this.colIndexBeginPos = colIndexBeginPos; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 9abf0ffc1c2..549b18382ce 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -65,15 +65,15 @@ public String generateCodeJava() { src.append("int row = rl; \n"); src.append("long lnnz = 0; \n"); 
src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); - if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) - src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); +// if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) +// src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); src.append("int index, endPos, strLen; \n"); src.append("try { \n"); src.append("while(reader.next(key, value)){ \n"); src.append("String str = value.toString(); \n"); src.append("strLen = str.length(); \n"); - CodeGenTrie trie = new CodeGenTrie(properties, "dest.set"); + CodeGenTrie trie = new CodeGenTrie(properties, "dest.set", false); src.append(trie.getJavaCode()); src.append("}} \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 11b0a1c7da1..b39ca352cfc 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -19,7 +19,9 @@ package org.apache.sysds.runtime.iogen.codegen; +import org.apache.sysds.runtime.iogen.ColIndexStructure; import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; public class MatrixCodeGen extends TemplateCodeGenBase { @@ -28,50 +30,44 @@ public MatrixCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. 
set java code template - javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + - "import java.util.HashMap;\n" + - "import java.util.HashSet;\n" + - "import java.util.regex.Matcher;\n" + - "import java.util.regex.Pattern; \n"+ - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + - "import java.io.BufferedReader;\n" + - "import java.io.IOException;\n" + - "import java.io.InputStream;\n" + - "import java.io.InputStreamReader;\n" + - "public class "+className+" extends MatrixGenerateReader {\n"+ - - " public "+className+"(CustomProperties _props) {\n"+ - " super(_props);\n"+ - " }\n"+ - - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n"+ - " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n"+ - code+ - "}}\n"; + javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + "import java.util.HashMap;\n" + "import java.util.HashSet;\n" + "import java.util.regex.Matcher;\n" + "import java.util.regex.Pattern; \n" + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + "import java.io.BufferedReader;\n" + "import java.io.IOException;\n" + "import java.io.InputStream;\n" + "import java.io.InputStreamReader;\n" + "public class " + className + " extends MatrixGenerateReader {\n" + + + " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" 
+ code + "}}\n"; // 2. set cpp code template } - @Override - public String generateCodeJava() { + @Override public String generateCodeJava() { StringBuilder src = new StringBuilder(); - CodeGenTrie trie= new CodeGenTrie(properties, "dest.appendValue"); + CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); trie.setMatrix(true); src.append("String str; \n"); src.append("int row = rowPos.intValue(); \n"); + src.append("int col = -1; \n"); src.append("long lnnz = 0; \n"); src.append("int index, endPos, strLen; \n"); - src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); - if(trie.isRegexBase()) { - properties.setColKeyPatternMap(trie.getColKeyPatternMap()); - src.append( - "HashMap colKeyPatternMap = _props.getColKeyPatternMap(); \n"); + + boolean flag1 = false; + boolean flag2 = false; + + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || properties.getRowIndexStructure() + .getProperties() == RowIndexStructure.IndexProperties.CellWiseExist) { + src.append("HashSet endWithValueStringRow = _props.getRowIndexStructure().endWithValueStrings(); \n"); + flag1 = true; } - if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) - src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); + + if(properties.getColIndexStructure().getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + src.append("HashSet endWithValueStringCol = _props.getColIndexStructure().endWithValueStrings(); \n"); + flag2 = true; + } + + if(flag1 && flag2) + src.append("HashSet endWithValueStringVal = _props.getValueKeyPattern().getFirstSuffixKeyPatterns(); \n"); + else + src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); + src.append("try { \n"); src.append("while((str = br.readLine()) != null){ \n"); src.append("strLen = str.length(); 
\n"); @@ -89,8 +85,7 @@ public String generateCodeJava() { return javaTemplate.replace(code, src.toString()); } - @Override - public String generateCodeCPP() { + @Override public String generateCodeCPP() { return null; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java index 50ab279fbf6..52063a7b905 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java @@ -30,6 +30,7 @@ import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.io.IOUtilFunctions; import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.util.InputStreamInputFormat; @@ -55,7 +56,7 @@ private int getNumRows(List files, FileSystem fs) throws IOException, DMLR BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); try { // Row Identify - if(_props.getRowIndex().equals(CustomProperties.IndexProperties.IDENTIFY)) { + if(_props.getRowIndexStructure().getProperties().equals(RowIndexStructure.IndexProperties.Identity)) { while(br.readLine() != null) rows++; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index 90c530c845f..d0da3491bf3 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -25,6 +25,7 @@ import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.iogen.CustomProperties; +import 
org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -68,7 +69,7 @@ private static int getNumRows(List files, FileSystem fs) throws IOExceptio BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); try { // Row Identify - if(_props.getRowIndex().equals(CustomProperties.IndexProperties.IDENTIFY)) { + if(_props.getRowIndexStructure().getProperties().equals(RowIndexStructure.IndexProperties.Identity)) { while(br.readLine() != null) rows++; } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index b14ae35225c..a77ef184662 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -95,10 +95,10 @@ protected void runGenerateReaderTest() { // mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); // int a = 100; -// GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); -// MatrixReader mr = gr.getReader(); -// MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); -// + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + MatrixReader mr = gr.getReader(); + MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); + // TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } From 893c43de70a43769c3287248eb52ed13ec2dd5df Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 10 Jun 2022 15:55:08 +0200 Subject: [PATCH 54/84] Update codegen section with row-index identity and col-index exist --- .../sysds/runtime/iogen/CustomProperties.java | 9 + 
.../runtime/iogen/FormatIdentifying.java | 179 +++--------------- .../runtime/iogen/codegen/CodeGenTrie.java | 68 ++++--- .../runtime/iogen/codegen/MatrixCodeGen.java | 28 ++- .../iogen/MatrixSingleRowFlatTest.java | 6 + 5 files changed, 98 insertions(+), 192 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 45398ef0d01..54a529ae17e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -32,6 +32,7 @@ public class CustomProperties extends FileFormatProperties implements Serializab private KeyTrie[] colKeyPatterns; private KeyTrie valueKeyPattern; private Types.ValueType[] schema; + private int ncols; public CustomProperties(MappingProperties mappingProperties, RowIndexStructure rowIndexStructure, ColIndexStructure colIndexStructure) { this.mappingProperties = mappingProperties; @@ -94,4 +95,12 @@ public KeyTrie getValueKeyPattern() { public void setValueKeyPattern(KeyTrie valueKeyPattern) { this.valueKeyPattern = valueKeyPattern; } + + public int getNcols() { + return ncols; + } + + public void setNcols(int ncols) { + this.ncols = ncols; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 77d6087b871..32c02b5df9a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -113,18 +113,21 @@ private void runIdentification() { ColIndexStructure colIndexStructure = getColIndexStructure(); properties = new CustomProperties(mappingProperties, rowIndexStructure, colIndexStructure); + properties.setNcols(ncols); // ref to Table 1: if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { // #1 - 
if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && + colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { KeyTrie[] colKeyPatterns; colKeyPatterns = buildColsKeyPatternSingleRow(); properties.setColKeyPatterns(colKeyPatterns); } // #2 - else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && + colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { // find cell-index and value separators RawIndex raw = null; for(int c = 0; c < ncols; c++) { @@ -392,6 +395,14 @@ private RowIndexStructure getRowIndexStructure() { RowIndexStructure rowIndexStructure = new RowIndexStructure(); + if(mappingProperties.getDataProperties() == MappingProperties.DataProperties.NOTEXIST){ + if(nlines >= this.actualValueCount) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CellWiseExist); + rowIndexStructure.setRowIndexBegin(0); + return rowIndexStructure; + } + } + // check row-index Identity, the identity properties available just for // exist and partially exist mapped values if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { @@ -481,161 +492,6 @@ else if(isSeqScatter) { return rowIndexStructure; } } - - // if(isCellWise && isExist) { - // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); - // } - // else if(!isCellWise && isExist) { - // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.ROWWISEEXIST); - // } - // else if(isCellWise && !isExist) { - // - // } - - // RowIndexStructure rowIndexStructure = null; 
- // int numberOfSelectedCols = 3; - // MatrixBlock rowIndexMB = new MatrixBlock(nrows, ncols, false); - // int scol = Math.min(ncols - numberOfSelectedCols, ncols); - // for(int r=0; r beginPos = new HashSet<>(); - // KeyTrie rowPattern = null; - // - // // Select two none zero row as a row index candidate - // int index = 0; - // for(int r = 1; r < nrows; r++) { - // for(int c = 0; c < ncols; c++) - // if(mapRow[r][c] != -1) { - // selectedRowIndex[index++] = r; - // break; - // } - // if(index > 1) - // break; - // } - - // // CELLWISEEXIST: when row index exist in each cell value - // for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { - // - // Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); - // ArrayList prefixStrings = colPrefixString.getKey(); - // ArrayList prefixStringRowIndexes = colPrefixString.getValue(); - // ArrayList prefixRawIndex = new ArrayList<>(); - // - // MappingTrie trie = new MappingTrie(); - // int ri = 0; - // for(String ps : prefixStrings) - // trie.reverseInsert(ps, prefixStringRowIndexes.get(ri++)); - // - // do { - // flag = trie.reConstruct(); - // } - // while(flag); - // - // ArrayList> keyPatterns = trie.getAllSequentialKeys(); - // for(ArrayList kp : keyPatterns) { - // for(String ps : prefixStrings) { - // StringBuilder sb = new StringBuilder(); - // int currPos = 0; - // for(String k : kp) { - // sb.append(ps.substring(currPos, ps.indexOf(k, currPos))); - // currPos += sb.length() + k.length(); - // } - // prefixRawIndex.add(new RawIndex(sb.toString())); - // } - // } - // - // flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - // if(!flag) { - // begin = 1; - // flag = checkPrefixRowIndex(c, begin, prefixRawIndex); - // } - // if(!flag) { - // beginPos.clear(); - // break; - // } - // else - // beginPos.add(begin); - // if(c == numberOfSelectedCols - 1) { - // ArrayList rowPrefixStrings = new ArrayList<>(); - // MappingTrie rowTrie = new MappingTrie(); - // rowPattern = 
new KeyTrie(); - // for(int si : selectedRowIndex) { - // for(int ci = 0; ci < ncols; ci++) { - // int cri = mapRow[si][ci]; - // if(cri != -1) { - // String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); - // RawIndex rawIndex = new RawIndex(str); - // Pair pair = rawIndex.findValue(si + begin); - // if(pair != null) { - // String pstr = str.substring(0, pair.getKey()); - // if(pstr.length() > 0) { - // rowPrefixStrings.add(pstr); - // rowTrie.insert(pstr, 1); - // } - // rowPattern.insertSuffixKeys(str.substring(pair.getKey() + pair.getValue()).toCharArray()); - // } - // } - // } - // } - // - // do { - // ArrayList> selectedKeyPatterns = new ArrayList<>(); - // keyPatterns = rowTrie.getAllSequentialKeys(); - // check = false; - // for(ArrayList keyPattern : keyPatterns) { - // boolean newCheck = checkKeyPatternIsUnique(rowPrefixStrings, keyPattern); - // check |= newCheck; - // if(newCheck) - // selectedKeyPatterns.add(keyPattern); - // } - // if(check) - // keyPatterns = selectedKeyPatterns; - // else { - // flagReconstruct = rowTrie.reConstruct(); - // if(!flagReconstruct) - // break; - // } - // } - // while(!check); - // - // if(keyPatterns.size() == 0) { - // ArrayList> kpl = new ArrayList<>(); - // ArrayList kpli = new ArrayList<>(); - // kpli.add(""); - // kpl.add(kpli); - // keyPatterns = kpl; - // } - // rowPattern.setPrefixKeyPattern(keyPatterns); - // } - // } - // if(beginPos.size() == 1) { - // rowIndexStructure = new RowIndexStructure(); - // rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CELLWISEEXIST); - // rowIndexStructure.setKeyPattern(rowPattern); - // Integer bpos = beginPos.iterator().next(); - // if(bpos > 0) - // rowIndexStructure.setRowIndexBegin("-" + bpos); - // else - // rowIndexStructure.setRowIndexBegin(""); - // } - // return rowIndexStructure; return rowIndexStructure; } @@ -643,6 +499,15 @@ private ColIndexStructure getColIndexStructure() { ColIndexStructure colIndexStructure = new 
ColIndexStructure(); int begin = 0; boolean colIndexExist = true; + + if(mappingProperties.getDataProperties() == MappingProperties.DataProperties.NOTEXIST){ + if(nlines >= this.actualValueCount) { + colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); + colIndexStructure.setColIndexBegin(0); + return colIndexStructure; + } + } + if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { // 1. check for column index are in the record for(int r = 0; r < Math.min(10, nrows); r++) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index 2dfcb39cae9..6cbda3d78be 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -70,13 +70,14 @@ else if(properties.getValueKeyPattern() != null) { } } - if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || properties.getRowIndexStructure() - .getProperties() == RowIndexStructure.IndexProperties.CellWiseExist) { + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || + properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist) { for(ArrayList keys : properties.getRowIndexStructure().getKeyPattern().getPrefixKeyPatterns()) this.insert(ctnIndexes, "0", Types.ValueType.INT32, keys); } - if(properties.getColIndexStructure().getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + if(properties.getColIndexStructure().getProperties() == ColIndexStructure.IndexProperties.CellWiseExist && + properties.getColIndexStructure().getKeyPattern() !=null) { for(ArrayList keys : properties.getColIndexStructure().getKeyPattern().getPrefixKeyPatterns()) this.insert(ctnIndexes, "1", Types.ValueType.INT32, keys); } @@ 
-111,6 +112,7 @@ private void insert(CodeGenTrieNode root, String index, Types.ValueType valueTyp public String getJavaCode() { StringBuilder src = new StringBuilder(); + int ncols = properties.getNcols(); MappingProperties.RepresentationProperties representation = properties.getMappingProperties().getRepresentationProperties(); MappingProperties.DataProperties data = properties.getMappingProperties().getDataProperties(); @@ -119,18 +121,45 @@ public String getJavaCode() { RowIndexStructure.IndexProperties rowIndex = properties.getRowIndexStructure().getProperties(); ColIndexStructure.IndexProperties colIndex = properties.getColIndexStructure().getProperties(); - if(data != MappingProperties.DataProperties.NOTEXIST && rowIndex == RowIndexStructure.IndexProperties.Identity && colIndex == ColIndexStructure.IndexProperties.Identity) { + // example: csv + if(data != MappingProperties.DataProperties.NOTEXIST && + rowIndex == RowIndexStructure.IndexProperties.Identity && + colIndex == ColIndexStructure.IndexProperties.Identity) { getJavaCode(ctnValue, src, "0"); src.append("row++; \n"); } - else if(rowIndex == RowIndexStructure.IndexProperties.CellWiseExist && colIndex == ColIndexStructure.IndexProperties.CellWiseExist) { + // example: MM + else if(rowIndex == RowIndexStructure.IndexProperties.CellWiseExist && + colIndex == ColIndexStructure.IndexProperties.CellWiseExist) { + getJavaCode(ctnIndexes, src, "0"); + src.append("if(col < " + ncols + "){ \n"); if(data != MappingProperties.DataProperties.NOTEXIST) { - src.append("/* ++++++++++++++++++++++ INDEXES +++++++++++++++++++++++++++++++++++++ */\n"); - getJavaCode(ctnIndexes, src, "0"); - src.append("/* ++++++++++++++++++++++ END INDEXES +++++++++++++++++++++++++++++++++++++ */\n"); getJavaCode(ctnValue, src, "0"); } - + else + src.append(destination).append("(row, col, cellValue); \n"); + src.append("} \n"); + } + // example: LibSVM + else if(rowIndex == RowIndexStructure.IndexProperties.Identity && + colIndex == 
ColIndexStructure.IndexProperties.CellWiseExist){ + src.append("String strValues[] = str.split(\""+ properties.getColIndexStructure().getValueDelim()+"\"); \n"); + src.append("for(String si: strValues){ \n"); + src.append("String strIndexValue[] = si.split(\""+ properties.getColIndexStructure().getIndexDelim()+"\", -1); \n"); + src.append("if(strIndexValue.length == 2){ \n"); + src.append("col = UtilFunctions.parseToInt(strIndexValue[0]); \n"); + src.append("if(col < "+ncols+"){ \n"); + if(this.isMatrix){ + src.append("try{ \n"); + src.append(destination).append("(row, col, Double.parseDouble(strIndexValue[1]); \n"); + src.append("} catch(Exception e){"+destination+".append(row, col, 0d);} \n"); + } + else { + src.append(destination).append("(row, col, UtilFunctions.stringToObject(_props.getSchema()[col], strIndexValue[1]); \n"); + } + src.append("} \n"); + src.append("} \n"); + src.append("} \n"); } return src.toString(); } @@ -172,27 +201,6 @@ private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, String } } - private void getJavaRowCode(StringBuilder src, ArrayList> rowBeginPattern, ArrayList> rowEndPattern) { - - // TODO: we have to extend it to multi patterns - // now, we assumed each row can have single pattern for begin and end - - for(ArrayList kb : rowBeginPattern) { - for(String k : kb) { - src.append("recordIndex = strChunk.indexOf(\"" + k + "\", recordIndex); \n"); - src.append("if(recordIndex == -1) break; \n"); - } - src.append("recordIndex +=" + kb.get(kb.size() - 1).length() + "; \n"); - break; - } - src.append("int recordBeginPos = recordIndex; \n"); - String endKey = rowEndPattern.get(0).get(0); - src.append("recordIndex = strChunk.indexOf(\"" + endKey + "\", recordBeginPos);"); - src.append("if(recordIndex == -1) break; \n"); - src.append("str = strChunk.substring(recordBeginPos, recordIndex); \n"); - src.append("strLen = str.length(); \n"); - } - public void setMatrix(boolean matrix) { isMatrix = matrix; } diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index b39ca352cfc..8e6526212a5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -21,6 +21,7 @@ import org.apache.sysds.runtime.iogen.ColIndexStructure; import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.MappingProperties; import org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; @@ -30,11 +31,25 @@ public MatrixCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. set java code template - javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + "import java.util.HashMap;\n" + "import java.util.HashSet;\n" + "import java.util.regex.Matcher;\n" + "import java.util.regex.Pattern; \n" + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + "import java.io.BufferedReader;\n" + "import java.io.IOException;\n" + "import java.io.InputStream;\n" + "import java.io.InputStreamReader;\n" + "public class " + className + " extends MatrixGenerateReader {\n" + - - " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + - - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" + code + "}}\n"; + javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import java.util.HashMap;\n" + + "import java.util.HashSet;\n" + + 
"import java.util.regex.Matcher;\n" + + "import java.util.regex.Pattern; \n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + + "import java.io.BufferedReader;\n" + + "import java.io.IOException;\n" + + "import java.io.InputStream;\n" + + "import java.io.InputStreamReader;\n" + + "public class " + className + " extends MatrixGenerateReader {\n" + + " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" + + code + + "}}\n"; // 2. set cpp code template } @@ -48,6 +63,9 @@ public MatrixCodeGen(CustomProperties properties, String className) { src.append("long lnnz = 0; \n"); src.append("int index, endPos, strLen; \n"); src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); + if(properties.getMappingProperties().getDataProperties() == MappingProperties.DataProperties.NOTEXIST) { + src.append("double cellValue = "+ properties.getMappingProperties().getPatternValue() +"; \n"); + } boolean flag1 = false; boolean flag2 = false; diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index 68777709663..bfc6812763e 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -199,6 +199,12 @@ public void test14() { runGenerateReaderTest(); } + @Test public void test20() { + sampleRaw = "1,1\n" + "1,2\n" + "1,3\n" + "1,4\n" + "2,2\n"+ "2,3\n"+ "2,4\n"+ "3,3\n"+ "3,4\n"+ "4,4\n"; + sampleMatrix = new double[][] {{10,10,10,10}, 
{0,10,10,10}, {0,0,10,10}, {0,0,0,10}}; + runGenerateReaderTest(); + } + @Test public void test180() { From 3d4b36c849a1a73e5fd650898ad5a4f12bfe8908 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sun, 12 Jun 2022 03:41:51 +0200 Subject: [PATCH 55/84] Initial commit of multi-line format identification --- .../runtime/iogen/FormatIdentifying.java | 114 +++++++++++++++++- .../apache/sysds/runtime/iogen/TextTrie.java | 72 +++++++++++ .../sysds/runtime/iogen/TextTrieNode.java | 67 ++++++++++ .../iogen/GenerateReaderMatrixTest.java | 2 +- 4 files changed, 249 insertions(+), 6 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 32c02b5df9a..47b1c133706 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -170,7 +170,8 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } else { // # 4, 6, 7, 8, 9 - if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && + colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { KeyTrie valueKeyPattern = buildValueKeyPattern(); @@ -379,6 +380,21 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } colIndexStructure.setKeyPattern(colKeyPattern); } + // #10 sequential scattered + if(rowIndexStructure.getProperties() == 
RowIndexStructure.IndexProperties.SeqScatter){ + ArrayList> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false); + + TextTrie textTrie = new TextTrie(); + textTrie.insert(prefixSuffixBeginEndCells.get(0).getKey(), 0); + + for(int i=1; i< prefixSuffixBeginEndCells.size(); i++){ + String prefix = prefixSuffixBeginEndCells.get(i).getKey(); + for(int j=0; j< prefix.length(); j++){ + textTrie.insert(prefix.substring(j),i); + } + } + int mm = 100; + } } } @@ -487,10 +503,11 @@ private RowIndexStructure getRowIndexStructure() { rowIndexStructure.setRowIndexBegin(begin); return rowIndexStructure; } - else if(isSeqScatter) { - rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.SeqScatter); - return rowIndexStructure; - } + + } + if(isSeqScatter) { + rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.SeqScatter); + return rowIndexStructure; } return rowIndexStructure; } @@ -635,6 +652,93 @@ private int checkColIndexOnRowRaw(int rowIndex, int colIndex, int beginPos) { return beginPos; } + // Extract prefix strings: + private ArrayList> extractPrefixSuffixBeginEndCells(boolean reverse) { + + ArrayList> result = new ArrayList<>(); + BitSet[] recordUsedLines = new BitSet[nlines]; + BitSet[] usedLines = new BitSet[nlines]; + for(int r = 0; r < nrows; r++) + recordUsedLines[r] = new BitSet(); + + for(int r = 0; r < nrows; r++) + for(int c = 0; c < ncols; c++) + if(mapRow[r][c] != -1) + recordUsedLines[r].set(mapRow[r][c]); + + for(int r = 0; r < nrows; r++) { + usedLines[r] = new BitSet(nlines); + for(int i = 0; i < nrows; i++) { + if(i != r) + usedLines[r].or(recordUsedLines[i]); + } + } + int lastLine = 0; + int lastPos = 0; + int nextLine = 0; + for(int r=0; r= 0; i--) + if(recordUsedLines[r].get(i)) { + endLine = i; + break; + } + if(r+1 < nrows) { + for(int i = 0; i < nlines; i++) + if(recordUsedLines[r+1].get(i)) { + nextLine = i; + break; + } + } + else + nextLine = nlines -1; + + endPos = 
sampleRawIndexes.get(endLine).getRawLength(); + nextPos = sampleRawIndexes.get(nextLine).getRawLength(); + for(int c=0; c(sbPrefix.toString(), sbSuffix.toString())); + } + + return result; + } + //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex) { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java new file mode 100644 index 00000000000..7624e94ae3d --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen; + +public class TextTrie { + private TextTrieNode root; + + TextTrie() { + root = new TextTrieNode(); + } + + public void reverseInsert(String word, int rowIndex){ + TextTrieNode current = root; + for(int i = word.length() -1; i>=0; i-- ) { + current = current.getChildren().computeIfAbsent(word.charAt(i), c -> new TextTrieNode()); + current.addRowIndex(rowIndex); + } + current.setEndOfWord(true); + } + + public void insert(String word, int rowIndex) { + TextTrieNode current = root; + for(char l : word.toCharArray()) { + current = current.getChildren().computeIfAbsent(l, c -> new TextTrieNode()); + current.addRowIndex(rowIndex); + } + current.setEndOfWord(true); + } + + public TextTrieNode containsString(String word) { + TextTrieNode current = root; + for(int i = 0; i < word.length(); i++) { + char ch = word.charAt(i); + TextTrieNode node = current.getChildren().get(ch); + if(node == null) { + return null; + } + current = node; + } + return current; + } + + public int containsStringAndSet(String word) { + TextTrieNode result = containsString(word); + int rowIndex = -1; + if(result != null) { + rowIndex = result.getRowIndex(); + if(rowIndex != -1) + result.setRowIndexUsed(rowIndex); + } + return rowIndex; + } + +} + diff --git a/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java new file mode 100644 index 00000000000..d52fe26651a --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.HashMap; +import java.util.Map; + +public class TextTrieNode { + private final Map children = new HashMap<>(); + private boolean endOfWord; + private ArrayList rowIndexes; + private BitSet rowIndexesBitSet; + + public TextTrieNode() { + rowIndexes = new ArrayList<>(); + rowIndexesBitSet = new BitSet(); + } + + public void addRowIndex(int rowIndex) { + rowIndexes.add(rowIndex); + } + + Map getChildren() { + return children; + } + + public boolean isEndOfWord() { + return endOfWord; + } + + public void setEndOfWord(boolean endOfWord) { + this.endOfWord = endOfWord; + } + + public int getRowIndex() { + for(int i = 0; i < rowIndexes.size(); i++) { + int index = rowIndexes.get(i); + if(!rowIndexesBitSet.get(index)) + return index; + } + return -1; + } + + public void setRowIndexUsed(int rowIndex) { + this.rowIndexesBitSet.set(rowIndex); + } +} + diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index a77ef184662..b226756af43 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -90,7 +90,7 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - FormatIdentifying 
formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); +// FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); // myTest mt = new myTest(formatIdentifying.getFormatProperties()); // mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); // int a = 100; From e1b2ff52ccbab50685b65fd23149c252e4702f35 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Mon, 13 Jun 2022 01:57:46 +0200 Subject: [PATCH 56/84] Add record delimiter in seq-scattered record formats --- .../runtime/iogen/FormatIdentifying.java | 85 +++++++++++++---- .../apache/sysds/runtime/iogen/TextTrie.java | 92 +++++++++++++++++++ .../sysds/runtime/iogen/TextTrieNode.java | 4 + .../iogen/MatrixMultiRowNestedTest.java | 33 +++++++ 4 files changed, 197 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 47b1c133706..f9612529654 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.Map; +import java.util.Set; public class FormatIdentifying { @@ -393,7 +394,54 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I textTrie.insert(prefix.substring(j),i); } } - int mm = 100; + // scoring the prefix tree + ArrayList>> keys = textTrie.getAllKeys(); + String beginString = null; + String endString = null; + if(keys.get(0).getValue().size() == nrows){ + int index = keys.get(0).getKey().indexOf("\n"); + if(index == -1) + beginString = keys.get(0).getKey(); + else + beginString = keys.get(0).getKey().substring(0, index); + + // recompute suffix strings to find end of string + ArrayList suffixes = new ArrayList<>(); + for(int i=0; i> extractPrefixSuffixBeginEndCells(boolean int lastLine = 0; int lastPos = 0; int 
nextLine = 0; - for(int r=0; r> extractPrefixSuffixBeginEndCells(boolean endLine = i; break; } - if(r+1 < nrows) { + if(r + 1 < nrows) { for(int i = 0; i < nlines; i++) - if(recordUsedLines[r+1].get(i)) { + if(recordUsedLines[r + 1].get(i)) { nextLine = i; break; } } else - nextLine = nlines -1; + nextLine = nlines - 1; endPos = sampleRawIndexes.get(endLine).getRawLength(); nextPos = sampleRawIndexes.get(nextLine).getRawLength(); - for(int c=0; c>> getAllKeys(){ + ArrayList>> result = new ArrayList<>(); + ArrayList allKeys = new ArrayList<>(); + getAllKeys(root, allKeys, new Key(new StringBuilder(), new ArrayList<>())); + + Comparator compare = Comparator.comparing(Key::getIndexSetSize).thenComparing(Key::getKeyLength).reversed(); + List sortedKeys = allKeys.stream().sorted(compare).collect(Collectors.toList()); + + for(Key k: sortedKeys){ + result.add(new Pair<>(k.getKey().toString(), k.getIndexSet())); + //k.print(); + //System.out.println("++++++++++++++"); + } + return result; + } + + private void getAllKeys(TextTrieNode node, ArrayList result, Key curKey){ + if(node.getChildren().size() == 0) + return; + else { + for(Character k: node.getChildren().keySet()){ + TextTrieNode child = node.getChildren().get(k); + ArrayList tList = new ArrayList<>(); + tList.addAll(child.getRowIndexes()); + Key key = new Key( new StringBuilder(curKey.getKey()).append(k), tList); + result.add(key); + getAllKeys(child, result, key); + } + } + } + + private class Key{ + private StringBuilder key; + private ArrayList rowIndexes; + private int keyLength; + private Set indexSet; + private int indexSetSize; + + public Key(StringBuilder key, ArrayList rowIndexes) { + this.key = key; + this.rowIndexes = rowIndexes; + this.keyLength = key.length(); + this.indexSet = new HashSet<>(); + this.indexSet.addAll(rowIndexes); + this.indexSetSize = this.indexSet.size(); + } + + public StringBuilder getKey() { + return key; + } + + + public void setKey(StringBuilder key) { + this.key = key; + } + + 
public ArrayList getRowIndexes() { + return rowIndexes; + } + + public void setRowIndexes(ArrayList rowIndexes) { + this.rowIndexes = rowIndexes; + } + + public int getKeyLength() { + return keyLength; + } + + public Set getIndexSet() { + return indexSet; + } + + public int getIndexSetSize() { + return indexSetSize; + } + + public void print(){ + Gson gson = new Gson(); + System.out.println(key.toString()+" "+gson.toJson(this.indexSet)); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java index d52fe26651a..f069d346071 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/TextTrieNode.java @@ -63,5 +63,9 @@ public int getRowIndex() { public void setRowIndexUsed(int rowIndex) { this.rowIndexesBitSet.set(rowIndex); } + + public ArrayList getRowIndexes() { + return rowIndexes; + } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java index b38fd586cca..fbfd2f619ec 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java @@ -125,4 +125,37 @@ public void test8() { sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; runGenerateReaderTest(); } + + @Test + public void test9() { + sampleRaw = "#index 1\n" + + "#* 12\n" + + "#@ 13;14;15\n" + + "#o 16;17;18\n" + + "#t 19\n" + + "#c 110\n" + + "#% 111\n" + + "#% 112\n" + + "\n" + + "#index 2\n" + + "#* 22\n" + + "#@ 23;24;25\n" + + "#o 26;27;28\n" + + "#t 29\n" + + "#c 210\n" + + "#% 211\n" + + "#% 212\n" + + "\n" + + "\n" + + "#index 3\n" + + "#* 32\n" + + "#@ 33;34;35\n" + + "#o 36;37;38\n" + + "#t 39\n" + + "#c 310\n" + + "#% 311\n" + + "#% 500"; + sampleMatrix = new double[][] 
{{1,12,13,14,15},{2,22,23,24,25},{3,32,33,34,35}}; + runGenerateReaderTest(); + } } From caf01d8d86899bff21c9874e1108638678da28cf Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 14 Jun 2022 21:29:39 +0200 Subject: [PATCH 57/84] First commit of the seq-scattered reader code gen --- .../runtime/iogen/FormatIdentifying.java | 438 +++--------------- .../apache/sysds/runtime/iogen/RawIndex.java | 25 +- .../runtime/iogen/codegen/CodeGenTrie.java | 10 +- .../iogen/MatrixMultiRowNestedTest.java | 7 +- 4 files changed, 100 insertions(+), 380 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index f9612529654..bf51a5b3f55 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -438,6 +438,10 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I else endString = beginString; + updateMapsAndExtractAllSuffixStringsOfColsMultiLine(beginString, endString); + KeyTrie[] colKeyPatterns; + colKeyPatterns = buildColsKeyPatternSingleRow(); + properties.setColKeyPatterns(colKeyPatterns); } else { // TODO: extend sequential scattered format algorithm for heterogeneous structures @@ -508,9 +512,11 @@ private RowIndexStructure getRowIndexStructure() { // check for Sequential: for(int r = 0; r < nrows && isSeqScatter; r++) { BitSet bitSet = bitSets[r]; - int beginIndex = bitSet.nextSetBit(0); - for(int i = bitSet.nextSetBit(beginIndex + 1); i != -1 && isSeqScatter; i = bitSet.nextSetBit(i + 1)) - isSeqScatter = i == ++beginIndex; + ArrayList list = new ArrayList<>(); + for(int i = bitSet.nextSetBit(0); i != -1; i = bitSet.nextSetBit(i + 1)) + list.add(i); + for(int i=0; i> extractPrefixSuffixBeginEndCells(boolean return result; } - //+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - - 
private boolean checkPrefixRowIndex(int colIndex, int beginPos, ArrayList prefixRawIndex) { - for(int r = 0; r < nrows; r++) { - int rowIndex = this.mapRow[r][colIndex]; - if(rowIndex != -1) { - boolean flag = false; - for(RawIndex ri : prefixRawIndex) { - if(ri.findValue(r + beginPos) != null) { - flag = true; - break; - } - } - if(!flag) - return false; - } - } - return true; - } - public CustomProperties getFormatProperties() { return properties; } - private Integer mostCommonScore(int[] list) { - Map map = new HashMap<>(); - int nan = 0; - for(Integer t : list) { - if(t != -1) { - Integer val = map.get(t); - map.put(t, val == null ? 1 : val + 1); - } - else - nan++; - } - if(map.size() == 0) - return nan; - - Map.Entry max = null; - for(Map.Entry e : map.entrySet()) { - if(max == null || e.getValue() > max.getValue()) - max = e; - } - return max.getValue() + nan; - } - private Integer mostCommonValue(int[] list) { Map map = new HashMap<>(); for(Integer t : list) { @@ -1127,283 +1091,87 @@ private ArrayList[] extractAllSuffixStringsOfColsSingleLine() { ///////////////////////////////////////////////////////////////////////////// // Methods For Multi Lines Mapping // //////////////////////////////////////////////////////////////////////////// - // This implementation is for nested datasets are scattered on multiple lines - // The following steps are required: - // 1. Extract all prefix strings per column - // 2. Build key pattern tree for each column - // 3. 
Build key pattern for end of values - - private ArrayList> findRowDelimiters() { - ArrayList> keyPattern = new ArrayList<>(); - Hirschberg hirschberg = new Hirschberg(); - int misMatchPenalty = 3; - int gapPenalty = 2; - - //extract all lines are in record boundary - ArrayList recordBoundaries = new ArrayList<>(); - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - int[] minList = new int[nrows]; - HashMap maxColPos = new HashMap<>(); - int[] minColPos = new int[nrows]; - for(int r = 0; r < nrows; r++) - tmpUsedLines[r] = new BitSet(); - for(int r = 0; r < nrows; r++) { - int min = nlines; - int minPos = 0; - for(int c = 0; c < ncols; c++) - if(mapRow[r][c] != -1) { - tmpUsedLines[r].set(mapRow[r][c]); - if(mapRow[r][c] <= min) { - min = mapRow[r][c]; - if(minPos != 0) - minPos = Math.min(minPos, mapCol[r][c]); - else - minPos = mapCol[r][c]; - - } - if(maxColPos.containsKey(mapRow[r][c])) - maxColPos.put(mapRow[r][c], Math.max(maxColPos.get(mapRow[r][c]), mapCol[r][c] + mapLen[r][c])); - else - maxColPos.put(mapRow[r][c], mapCol[r][c] + mapLen[r][c]); - } - minList[r] = min; - minColPos[r] = minPos; + private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginString, String endString){ + ArrayList upRawIndexes = new ArrayList<>(); + ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(beginString); + ArrayList> endIndexes; + String endToken; + if(!beginString.equals(endString)) { + endIndexes = getTokenIndexOnMultiLineRecords(endString); + endToken = endString; } - - for(int r = 0; r < nrows; r++) { - usedLines[r] = new BitSet(nlines); - for(int i = 0; i < nrows; i++) { - if(i != r) - usedLines[r].or(tmpUsedLines[i]); - } + else { + endIndexes = new ArrayList<>(); + for(int i = 1; i < beginIndexes.size(); i++) + endIndexes.add(beginIndexes.get(i)); + endIndexes.add(new Pair<>(this.sampleRawIndexes.size() - 1, this.sampleRawIndexes.get(this.sampleRawIndexes.size() - 1).getRawLength())); + endToken = ""; 
} - - for(int r = 0; r < nrows; r++) { - int beginLine = minList[r]; - for(; beginLine >= 0; beginLine--) - if(usedLines[r].get(beginLine)) + int r = 0; + int i = 0; + int j = 0; + StringBuilder sb = new StringBuilder(); + while(i < beginIndexes.size() && j < endIndexes.size()) { + Pair p1 = beginIndexes.get(i); + Pair p2 = endIndexes.get(j); + int n = 0; + while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { + n++; + i++; + if(i == beginIndexes.size()) break; - - StringBuilder sb = new StringBuilder(); - beginLine = Math.max(beginLine, 0); - - if(beginLine + 1 == nlines) - continue; - - Integer subStrPos = 0; - if(maxColPos.containsKey(beginLine)) - subStrPos = maxColPos.get(beginLine); - - String str = sampleRawIndexes.get(beginLine).getRaw().substring(subStrPos); - if(str.length() > 0) { - sb.append(str).append("\n"); + p1 = beginIndexes.get(i); } - for(int i = beginLine + 1; i < minList[r]; i++) { - str = sampleRawIndexes.get(i).getRaw(); - if(str.length() > 0) - sb.append(str).append("\n"); + j += n - 1; + sb.append(this.sampleRawIndexes.get(beginIndexes.get(i - n).getKey()).getRaw().substring(beginIndexes.get(i - n).getValue())); + for(int ri = beginIndexes.get(i - n).getKey() + 1; ri < endIndexes.get(j).getKey(); ri++) { + sb.append(this.sampleRawIndexes.get(ri).getRaw()); } + sb.append(this.sampleRawIndexes.get(endIndexes.get(j).getKey()).getRaw().substring(0, endIndexes.get(j).getValue())).append(endToken); + RawIndex rawIndex = new RawIndex(); + rawIndex.setRaw(sb.toString()); + sb = new StringBuilder(); + j++; + // update mapping + for(int c = 0; c < ncols; c++) { + if(mapRow[r][c] != -1) { + if(mapRow[r][c] != beginIndexes.get(i - n).getKey()) + this.mapCol[r][c] += this.sampleRawIndexes.get(beginIndexes.get(i - n).getKey()).getRawLength() - beginIndexes.get(i - n) + .getValue(); + else + this.mapCol[r][c] -= beginIndexes.get(i - n).getValue(); - str = sampleRawIndexes.get(minList[r]).getRaw().substring(0, 
minColPos[r]); - if(str.length() > 0) - sb.append(str); - recordBoundaries.add(sb.toString()); - } - recordBoundaries.remove(recordBoundaries.size() - 1); - - String str1 = recordBoundaries.get(0); - String str2 = recordBoundaries.get(1); - Pair, String> pattern = hirschberg.getLCS(str1, str2, misMatchPenalty, gapPenalty); - if(pattern != null) { - String intersect = pattern.getValue(); - ArrayList intersectPattern = pattern.getKey(); - for(int i = 2; i < recordBoundaries.size(); i++) { - pattern = hirschberg.getLCS(intersect, recordBoundaries.get(i), misMatchPenalty, gapPenalty); - if(pattern != null) { - intersect = pattern.getValue(); - intersectPattern = pattern.getKey(); + for(int ci = beginIndexes.get(i - n).getKey() + 1; ci < this.mapRow[r][c]; ci++) + this.mapCol[r][c] += this.sampleRawIndexes.get(ci).getRawLength(); + rawIndex.setReservedPositions(mapCol[r][c], mapLen[r][c]); + this.mapRow[r][c] = r; } - else - intersect = null; - } - if(intersect != null && intersect.length() > 0) { - keyPattern.add(intersectPattern); - return keyPattern; } + upRawIndexes.add(rawIndex); + r++; } - return null; + this.sampleRawIndexes = upRawIndexes; } - // Build key pattern tree for each column - private KeyTrie[] buildColsKeyPatternMultiRow() { - Pair[], Pair[]> prefixStrings = extractAllPrefixStringsOfColsMultiLine(true); - ArrayList[] suffixStrings = extractAllSuffixStringsOfColsMultiLine(); - - KeyTrie[] colKeyPattens = new KeyTrie[ncols]; - for(int c = 0; c < ncols; c++) { - // 1. 
Build Prefix Key Pattern - String colDelim = findStartWithIntersectOfStrings(prefixStrings.getKey()[c], prefixStrings.getValue()[c].getKey()); - - HashSet intersect = new HashSet<>(); - intersect.add(colDelim); + private ArrayList> getTokenIndexOnMultiLineRecords(String token){ + ArrayList> result = new ArrayList<>(); - KeyTrie trie = new KeyTrie(colDelim); - ArrayList, ArrayList>> remainedPrefixes = new ArrayList<>(); - boolean check; + for(int ri=0; ri< this.sampleRawIndexes.size(); ri++){ + String raw = this.sampleRawIndexes.get(ri).getRaw(); + int index; + int fromIndex = 0; do { - ArrayList> keyPatterns = trie.getPrefixKeyPatterns(); - check = false; - for(ArrayList keyPattern : keyPatterns) { - boolean newCheck = checkKeyPatternIsUnique(prefixStrings.getKey()[c], keyPattern); - check |= newCheck; - if(newCheck) { - trie.setAPrefixPath(keyPattern); - } - } - - if(!check) { - remainedPrefixes.clear(); - boolean flag = true; - for(ArrayList keyPattern : keyPatterns) { - ArrayList remainedPrefix = new ArrayList<>(); - for(String ps : prefixStrings.getKey()[c]) - remainedPrefix.add(getRemainedSubstring(ps, keyPattern)); - - intersect = findStartWithIntersectOfStrings(remainedPrefix); - if(intersect != null) { - trie.insertPrefixKeysConcurrent(intersect); - } - else { - remainedPrefixes.add(new Pair<>(keyPattern, remainedPrefix)); - flag = false; - break; - } - } - if(!flag) - break; - } - } - while(!check); - - // Suffix pattern is based on char, so we need to extract all chars of a string - for(String suffix : suffixStrings[c]) { - trie.insertSuffixKeys(suffix.toCharArray()); - } - colKeyPattens[c] = trie; - } - return colKeyPattens; - } - - // Extract prefix strings: - private Pair[], Pair[]> extractAllPrefixStringsOfColsMultiLine(boolean reverse) { - - ArrayList[] result = new ArrayList[ncols]; - Pair[] minmax = new Pair[ncols]; - BitSet[] tmpUsedLines = new BitSet[nlines]; - BitSet[] usedLines = new BitSet[nlines]; - for(int r = 0; r < nrows; r++) - 
tmpUsedLines[r] = new BitSet(); - - for(int r = 0; r < nrows; r++) - for(int c = 0; c < ncols; c++) - if(mapRow[r][c] != -1) - tmpUsedLines[r].set(mapRow[r][c]); - - for(int r = 0; r < nrows; r++) { - usedLines[r] = new BitSet(nlines); - for(int i = 0; i < nrows; i++) { - if(i != r) - usedLines[r].or(tmpUsedLines[i]); - } - } - - // extract prefix strings - for(int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - int min = 0; - int max = 0; - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][c]; - if(rowIndex == -1) - continue; - StringBuilder sb = new StringBuilder(); - int lastLine = 0; - - for(int i = rowIndex - 1; i >= 0; i--) - if(usedLines[r].get(i)) { - lastLine = i; - break; - } - for(int i = lastLine; i < rowIndex; i++) { - if(sampleRawIndexes.get(i).getRawLength() > 0) - sb.append(sampleRawIndexes.get(i).getRaw()).append("\n"); + index = raw.indexOf(token, fromIndex); + if(index !=-1){ + result.add(new Pair<>(ri, index)); + fromIndex = index+token.length(); } - String str = sampleRawIndexes.get(rowIndex).getSubString(0, mapCol[r][c]); - if(str.length() > 0 && !str.equals("\n")) - sb.append(str); - else if(lastLine < rowIndex) - sb.deleteCharAt(sb.length() - 1); - - if(reverse) - result[c].add(sb.reverse().toString()); else - result[c].add(sb.toString()); - max = Math.max(max, sb.length()); - if(sb.length() < min || min == 0) - min = sb.length(); - minmax[c] = new Pair<>(min, max); - } - } - return new Pair<>(result, minmax); - } - - private String findStartWithIntersectOfStrings(ArrayList strList, int minLength) { - StringBuilder sb = new StringBuilder(); - int i = 0; - boolean flag = true; - do { - char ch = strList.get(0).charAt(i); - for(int j = 1; j < Math.min(strList.size(), minLength); j++) { - char cch = strList.get(j).charAt(i); - if(ch != cch || ch == '\n') { - flag = false; break; - } - } - if(flag) - sb.append(ch); - i++; - } - while(flag && i < minLength); - return sb.toString(); - - } - - private HashSet 
findStartWithIntersectOfStrings(ArrayList strList) { - // 1. Extract all substrings - // 2. Find intersection of substrings - - HashSet[] substrings = new HashSet[strList.size()]; - for(int i = 0; i < strList.size(); i++) - substrings[i] = new HashSet<>(); - - for(int w = windowSize; w > 2; w--) { - for(int i = 0; i < strList.size(); i++) { - substrings[i].clear(); - substrings[i].addAll(getAllSubstringsOfAString(strList.get(i), w)); - } - - HashSet totalIntersect = new HashSet<>(substrings[0]); - for(int r = 1; r < substrings.length; r++) - totalIntersect.retainAll(substrings[r]); - - if(totalIntersect.size() > 0) - return totalIntersect; - + }while(true); } - return null; + return result; } private boolean checkKeyPatternIsUnique(ArrayList prefixStrings, ArrayList keys) { @@ -1455,66 +1223,4 @@ private Pair getIndexOfKeyPatternOnString(String str, ArrayLis return new Pair<>(-1, -1); } - private ArrayList getAllSubstringsOfAString(String str, int size) { - ArrayList result = new ArrayList<>(); - if(str == null) - return result; - for(int i = 0; i <= str.length() - size; i++) { - String s = str.substring(i, i + size); - if(!s.contains("\n")) - result.add(s); - } - return result; - } - - private String getRemainedSubstring(String str, ArrayList keys) { - boolean flag = true; - int currPos = 0; - for(String k : keys) { - int index = str.indexOf(k, currPos); - if(index != -1) - currPos = index + k.length(); - else { - flag = false; - break; - } - } - if(flag) - return str.substring(currPos); - else - return null; - } - - private ArrayList[] extractAllSuffixStringsOfColsMultiLine() { - ArrayList[] result = new ArrayList[ncols]; - for(int c = 0; c < ncols; c++) { - result[c] = new ArrayList<>(); - - for(int r = 0; r < nrows; r++) { - int rowIndex = mapRow[r][c]; - if(rowIndex == -1) - continue; - StringBuilder sb = new StringBuilder(); - String str = sampleRawIndexes.get(rowIndex).getRaw().substring(mapCol[r][c] + mapLen[r][c]); - boolean enter = false; - 
if(str.length() > 0) { - sb.append(str); - enter = true; - } - - for(int i = rowIndex + 1; i < nlines; i++) { - str = sampleRawIndexes.get(i).getRaw().substring(0, Math.min(sampleRawIndexes.get(i).getRawLength(), suffixStringLength)); - if(str.length() > 0 && !enter) { - sb.append(str); - break; - } - } - if(sb.length() > 0) - sb.deleteCharAt(sb.length() - 1); - result[c].add(sb.toString()); - } - } - return result; - } - } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 762f19b4a86..58159c754ec 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -29,12 +29,12 @@ import java.util.HashMap; public class RawIndex { - private final String raw; - private final int rawLength; - private final BitSet numberBitSet; - private final BitSet dotBitSet; - private final BitSet eBitSet; - private final BitSet plusMinusBitSet; + private String raw; + private int rawLength; + private BitSet numberBitSet; + private BitSet dotBitSet; + private BitSet eBitSet; + private BitSet plusMinusBitSet; private BitSet reservedPositions; private BitSet backupReservedPositions; private HashMap>> actualNumericValues; @@ -142,6 +142,8 @@ else if(i == rawLength - 2) { extractNumericDotEActualValues(); } + public RawIndex() {} + public Pair findValue(Object value, Types.ValueType valueType) { if(valueType.isNumeric()) return findValue(UtilFunctions.getDouble(value)); @@ -358,4 +360,15 @@ public int getNextNumericPosition(int curPosition){ } return pos; } + + public void setRaw(String raw){ + this.raw = raw; + this.rawLength = raw.length(); + this.reservedPositions = new BitSet(rawLength); + } + + public void setReservedPositions(int pos, int len){ + for(int i=pos; i1\n" + + "\n" + + "70\n" + + "85\n" + + "90\n" + + ""+ "2\n" + "3\n" + "\n" + @@ -43,7 +48,7 @@ public void test1() { "4\n" + "5\n" + "6\n" + - "\n" + + "" + "\n" 
+ "7\n" + "8\n" + From 0c27ee207d64f36ace9cd3fbafd75ca8a351514f Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 16 Jun 2022 01:24:30 +0200 Subject: [PATCH 58/84] Init commit of parallel code gen --- .../sysds/runtime/iogen/CustomProperties.java | 18 + .../sysds/runtime/iogen/EXP/GIOMatrix.java | 2 +- .../iogen/EXP/GIOMatrixIdentification.java | 2 +- .../runtime/iogen/FormatIdentifying.java | 2 + .../sysds/runtime/iogen/GenerateReader.java | 11 +- .../runtime/iogen/RowIndexStructure.java | 17 + .../runtime/iogen/codegen/FrameCodeGen.java | 5 + .../runtime/iogen/codegen/MatrixCodeGen.java | 117 ++++- .../MatrixGenerateReaderParallel.java | 446 ++++++++++++++++++ .../iogen/template/TemplateCodeGenBase.java | 2 + .../iogen/GenerateReaderMatrixTest.java | 4 +- 11 files changed, 597 insertions(+), 29 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 54a529ae17e..28df09ec60c 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -33,6 +33,8 @@ public class CustomProperties extends FileFormatProperties implements Serializab private KeyTrie valueKeyPattern; private Types.ValueType[] schema; private int ncols; + private boolean sparse; + private boolean parallel; public CustomProperties(MappingProperties mappingProperties, RowIndexStructure rowIndexStructure, ColIndexStructure colIndexStructure) { this.mappingProperties = mappingProperties; @@ -103,4 +105,20 @@ public int getNcols() { public void setNcols(int ncols) { this.ncols = ncols; } + + public boolean isSparse() { + return sparse; + } + + public void setSparse(boolean sparse) { + this.sparse = sparse; + } + + public boolean isParallel() { + return parallel; + } + + public void 
setParallel(boolean parallel) { + this.parallel = parallel; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java index 17bd0eccdbb..75f956ca1c3 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java @@ -33,7 +33,7 @@ public static void main(String[] args) throws Exception { MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, false); MatrixReader matrixReader = gr.getReader(); MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, rows, sampleMB.getNumColumns(), -1, -1); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java index 73b08fd480d..dae84e36865 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java @@ -18,7 +18,7 @@ public static void main(String[] args) throws Exception { MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, false); gr.getReader(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 
bf51a5b3f55..5d613b178ff 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -439,6 +439,8 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I endString = beginString; updateMapsAndExtractAllSuffixStringsOfColsMultiLine(beginString, endString); + rowIndexStructure.setSeqBeginString(beginString); + rowIndexStructure.setSeqEndString(endString); KeyTrie[] colKeyPatterns; colKeyPatterns = buildColsKeyPatternSingleRow(); properties.setColKeyPatterns(colKeyPatterns); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index bc7fcb9d55b..0034cf849ad 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -82,20 +82,19 @@ public GenerateReaderMatrix(SampleProperties sampleProperties) throws Exception super(sampleProperties); } - public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix) throws Exception { + public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix, boolean parallel) throws Exception { super(new SampleProperties(sampleRaw, sampleMatrix)); + properties.setParallel(parallel); } public MatrixReader getReader() throws Exception { String className = getRandomClassName(); MatrixCodeGen src = new MatrixCodeGen(properties, className); - - // constructor with arguments as CustomProperties + // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String ss = src.generateCodeJava(); - - matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); + String srcJava = properties.isParallel() ? 
src.generateCodeJavaParallel(): src.generateCodeJava(); + matrixReader = (MatrixReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java index f59610b4218..6e0ec3343e6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java @@ -44,6 +44,9 @@ public RowIndexStructure() { private KeyTrie keyPattern; private int rowIndexBegin; + private String seqBeginString; + private String seqEndString; + public HashSet endWithValueStrings() { HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); return endWithValueString; @@ -73,5 +76,19 @@ public void setRowIndexBegin(int rowIndexBegin) { this.rowIndexBegin = rowIndexBegin; } + public String getSeqBeginString() { + return seqBeginString; + } + + public void setSeqBeginString(String seqBeginString) { + this.seqBeginString = seqBeginString; + } + public String getSeqEndString() { + return seqEndString; + } + + public void setSeqEndString(String seqEndString) { + this.seqEndString = seqEndString; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 549b18382ce..23e0ea638ac 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -85,6 +85,11 @@ public String generateCodeJava() { return javaTemplate.replace(code, src.toString()); } + @Override + public String generateCodeJavaParallel() { + return null; + } + @Override public String generateCodeCPP() { return null; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java 
b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 8e6526212a5..0c8b06fa116 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -31,29 +31,53 @@ public MatrixCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. set java code template - javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + - "import java.util.HashMap;\n" + - "import java.util.HashSet;\n" + - "import java.util.regex.Matcher;\n" + - "import java.util.regex.Pattern; \n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + - "import java.io.BufferedReader;\n" + - "import java.io.IOException;\n" + - "import java.io.InputStream;\n" + - "import java.io.InputStreamReader;\n" + - "public class " + className + " extends MatrixGenerateReader {\n" + - " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + - " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" + + // 1.a: single thread code gen + if(!properties.isParallel()){ + javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + + "import java.util.HashMap;\n" + + "import java.util.HashSet;\n" + + "import java.util.regex.Matcher;\n" + + "import java.util.regex.Pattern; \n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + + "import 
java.io.BufferedReader;\n" + + "import java.io.IOException;\n" + + "import java.io.InputStream;\n" + + "import java.io.InputStreamReader;\n" + + "public class " + className + " extends MatrixGenerateReader {\n" + + " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + + " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + + " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" + + code + + "}}\n"; + } + // 1.b: multi-thread code gen + else { + javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + + "import org.apache.hadoop.io.Text;\n" + + "import org.apache.hadoop.mapred.RecordReader;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.iogen.RowIndexStructure;\n" + + "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReaderParallel;\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import java.io.IOException;\n" + + "import java.util.HashSet; \n" + + "public class "+className+" extends MatrixGenerateReaderParallel {\n" + + "\tpublic "+className+"(CustomProperties _props) {\n" + "super(_props);} \n" + + "@Override \n" + + "protected long readMatrixFromHDFS(RecordReader reader, " + + " LongWritable key, Text value, MatrixBlock dest, int row,\n" + "SplitInfo splitInfo) throws IOException { \n" + code + - "}}\n"; + "}}\n"; + + } // 2. 
set cpp code template } - @Override public String generateCodeJava() { + @Override + public String generateCodeJava() { StringBuilder src = new StringBuilder(); CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); trie.setMatrix(true); @@ -103,7 +127,60 @@ public MatrixCodeGen(CustomProperties properties, String className) { return javaTemplate.replace(code, src.toString()); } - @Override public String generateCodeCPP() { + @Override + public String generateCodeJavaParallel() { + StringBuilder src = new StringBuilder(); + CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); + trie.setMatrix(true); + src.append("String str=\"\"; \n"); + src.append("String remainStr = \"\"; \n"); + src.append("int col = -1; \n"); + src.append("long lnnz = 0; \n"); + src.append("int index, endPos, strLen; \n"); + src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); + src.append("try { \n"); + src.append("int ri = -1; \n"); + src.append("int beginPosStr, endPosStr; \n"); + src.append("StringBuilder sb = new StringBuilder(); \n"); + src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); + src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); + src.append("boolean flag = true; \n"); + src.append("while(flag) { \n"); + src.append("flag = reader.next(key, value); \n"); + src.append("if(flag) { \n"); + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + src.append("ri++; \n"); + src.append("String valStr = value.toString(); \n"); + src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); + src.append("endPosStr = ri == endIndex ? 
splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); + src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); + src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); + src.append("remainStr = valStr.substring(endPosStr); \n"); + src.append("continue; \n"); + src.append("} \n"); + src.append("else { \n"); + src.append("str = sb.toString(); \n"); + src.append("sb = new StringBuilder(); \n"); + src.append("sb.append(remainStr).append(valStr); \n"); + src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); + src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); + src.append("} \n"); + } + src.append("} \n"); + src.append("else \n"); + src.append("str = sb.toString(); \n"); + src.append("strLen = str.length(); \n"); + src.append(trie.getJavaCode()); + src.append("} \n"); + src.append("} \n"); + src.append("catch(Exception ex){ \n"); + src.append("} \n"); + src.append("return lnnz; \n"); + return javaTemplate.replace(code, src.toString()); + } + + @Override + public String generateCodeCPP() { return null; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java new file mode 100644 index 00000000000..f9592f277ff --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java @@ -0,0 +1,446 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.template; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.io.MatrixReader; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.runtime.util.CommonThreadPool; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; + +public abstract class MatrixGenerateReaderParallel extends MatrixReader { + + protected static CustomProperties _props; + protected int _numThreads = 1; + protected JobConf job; + protected SplitOffsetInfos _offsets; + protected int _rLen; + protected int _cLen; + + public 
MatrixGenerateReaderParallel(CustomProperties _props) { + _numThreads = OptimizerUtils.getParallelTextReadParallelism(); + MatrixGenerateReaderParallel._props = _props; + } + + @Override + public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) throws IOException, DMLRuntimeException { + + //prepare file access + job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + InputSplit[] splits = informat.getSplits(job, _numThreads); + splits = IOUtilFunctions.sortInputSplits(splits); + + // check existence and non-empty file + checkValidInputFile(fs, path); + + //allocate output matrix block + MatrixBlock ret = computeSizeAndCreateOutputMatrixBlock(splits, path, rlen, _props.getNcols(), blen, estnnz); + + // Second Read Pass (read, parse strings, append to matrix block) + readMatrixFromHDFS(splits, path, job, ret, rlen, clen, blen); + + return ret; + } + + private MatrixBlock computeSizeAndCreateOutputMatrixBlock(InputSplit[] splits, Path path, long rlen, long clen, int blen, long estnnz) + throws IOException, DMLRuntimeException { + _rLen = 0; + _cLen = _props.getNcols(); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + // count rows in parallel per split + try { + ExecutorService pool = CommonThreadPool.get(_numThreads); + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.Identity) { + ArrayList tasks = new ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new IOUtilFunctions.CountRowsTask(split, informat, job, false)); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new SplitOffsetInfos(tasks.size()); + int i = 0; + 
for(Future rc : pool.invokeAll(tasks)) { + int lnrow = (int) rc.get().longValue(); // incl error handling + _offsets.setOffsetPerSplit(i, _rLen); + _offsets.setLenghtPerSplit(i, lnrow); + _rLen = _rLen + lnrow; + i++; + } + pool.shutdown(); + } + else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { + ArrayList tasks = new ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new CountSeqScatteredRowsTask(split, informat, job, _props.getRowIndexStructure().getSeqBeginString(), + _props.getRowIndexStructure().getSeqEndString())); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new SplitOffsetInfos(tasks.size()); + int i = 0; + for(Future rc : pool.invokeAll(tasks)) { + SplitInfo splitInfo = rc.get(); + _offsets.setSeqOffsetPerSplit(i, splitInfo); + _offsets.setOffsetPerSplit(i, _rLen); + _rLen = _rLen + splitInfo.nrows; + i++; + } + pool.shutdown(); + } + } + catch(Exception e) { + throw new IOException("Thread pool Error " + e.getMessage(), e); + } + + // robustness for wrong dimensions which are already compiled into the plan + if(rlen != -1 && _rLen != rlen) { + String msg = "Read matrix dimensions differ from meta data: [" + _rLen + "x" + _cLen + "] vs. [" + rlen+ "x" + clen + "]."; + if(rlen < _rLen || clen < _cLen) { + // a) specified matrix dimensions too small + throw new DMLRuntimeException(msg); + } + else { + // b) specified matrix dimensions too large -> padding and warning + LOG.warn(msg); + _rLen = (int) rlen; + _cLen = (int) clen; + } + } + + // allocate target matrix block based on given size; + // need to allocate sparse as well since lock-free insert into target + long estnnz2 = (estnnz < 0) ? 
(long) _rLen * _cLen : estnnz; + return createOutputMatrixBlock(_rLen, _cLen, blen, estnnz2, !_props.isSparse(), _props.isSparse()); + } + + @Override + public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) + throws IOException, DMLRuntimeException { + + MatrixBlock ret = null; + if(rlen >= 0 && clen >= 0) //otherwise allocated on read + ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); + + return ret; + } + + private void readMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int blen) throws IOException + { + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + ExecutorService pool = CommonThreadPool.get(_numThreads); + try{ + // create read tasks for all splits + ArrayList tasks = new ArrayList<>(); + int splitCount = 0; + for (InputSplit split : splits) { + tasks.add( new ReadTask(split, informat, dest, splitCount++) ); + } + pool.invokeAll(tasks); + pool.shutdown(); + + // check return codes and aggregate nnz + long lnnz = 0; + for (ReadTask rt : tasks) + lnnz += rt.getNnz(); + dest.setNonZeros(lnnz); + } + catch (Exception e) { + throw new IOException("Threadpool issue, while parallel read.", e); + } + } + + private static class SplitOffsetInfos { + // offset & length info per split + private int[] offsetPerSplit = null; + private int[] lenghtPerSplit = null; + private SplitInfo[] seqOffsetPerSplit = null; + + public SplitOffsetInfos(int numSplits) { + lenghtPerSplit = new int[numSplits]; + offsetPerSplit = new int[numSplits]; + seqOffsetPerSplit = new SplitInfo[numSplits]; + } + + public int getLenghtPerSplit(int split) { + return lenghtPerSplit[split]; + } + + public void setLenghtPerSplit(int split, int r) { + lenghtPerSplit[split] = r; + } + + public int getOffsetPerSplit(int split) { + return offsetPerSplit[split]; + } + + public void setOffsetPerSplit(int 
split, int o) { + offsetPerSplit[split] = o; + } + + public SplitInfo getSeqOffsetPerSplit(int split) { + return seqOffsetPerSplit[split]; + } + + public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { + seqOffsetPerSplit[split] = splitInfo; + } + } + + private class ReadTask implements Callable { + + private final InputSplit _split; + private final TextInputFormat _informat; + private final MatrixBlock _dest; + private final int _splitCount; + private int _row = 0; + private long _nnz = 0; + + public ReadTask(InputSplit split, TextInputFormat informat, MatrixBlock dest, int splitCount) { + _split = split; + _informat = informat; + _dest = dest; + _splitCount = splitCount; + } + + @Override + public Long call() throws IOException { + RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + _row = _offsets.getOffsetPerSplit(_splitCount); + SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); + _nnz = readMatrixFromHDFS(reader, key, value, _dest, _row, _splitInfo); + return _nnz; + } + + public long getNnz() { + return _nnz; + } + } + + private static class CountSeqScatteredRowsTask implements Callable { + private final InputSplit _split; + private final TextInputFormat _inputFormat; + private final JobConf _jobConf; + private final String _beginString; + private final String _endString; + + public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, String beginString, String endString){ + _split = split; + _inputFormat = inputFormat; + _jobConf = jobConf; + _beginString = beginString; + _endString = endString; + } + + @Override + public SplitInfo call() throws Exception { + SplitInfo splitInfo = new SplitInfo(); + int nrows = 0; + ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); + ArrayList> endIndexes; + int tokenLength = 0; + 
if(!_beginString.equals(_endString)) { + endIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); + tokenLength = _endString.length(); + } + else { + endIndexes = new ArrayList<>(); + for(int i = 1; i < beginIndexes.size(); i++) + endIndexes.add(beginIndexes.get(i)); + } + + int i = 0; + int j = 0; + while(i < beginIndexes.size() && j < endIndexes.size()) { + Pair p1 = beginIndexes.get(i); + Pair p2 = endIndexes.get(j); + int n = 0; + while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { + n++; + i++; + if(i == beginIndexes.size()) + break; + p1 = beginIndexes.get(i); + } + j += n-1; + splitInfo.addIndexAndPosition(beginIndexes.get(i - n).getKey(), endIndexes.get(j).getKey(), beginIndexes.get(i - n).getValue(), + endIndexes.get(j).getValue()+tokenLength); + j++; + nrows++; + } + if(i == beginIndexes.size() && j < endIndexes.size()) + nrows++; + if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) + splitInfo.setRemainString(""); + else{ + RecordReader reader = _inputFormat.getRecordReader(_split, _jobConf, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + StringBuilder sb = new StringBuilder(); + for(int ri = 0; ri< beginIndexes.get(0).getKey(); ri++){ + reader.next(key, value); + String raw = value.toString(); + sb.append(raw); + } + if(beginIndexes.get(0).getValue() != 0) { + reader.next(key, value); + sb.append(value.toString().substring(0, beginIndexes.get(0).getValue())); + } + splitInfo.setRemainString(sb.toString()); + } + splitInfo.setNrows(nrows); + return splitInfo; + } + } + + protected static class SplitInfo{ + private int nrows; + private ArrayList recordIndexBegin; + private ArrayList recordIndexEnd; + private ArrayList recordPositionBegin; + private ArrayList recordPositionEnd; + private String remainString; + + public SplitInfo() { + recordIndexBegin = new ArrayList<>(); + recordIndexEnd = new ArrayList<>(); + 
recordPositionBegin = new ArrayList<>(); + recordPositionEnd = new ArrayList<>(); + } + + public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ + recordIndexBegin.add(beginIndex); + recordIndexEnd.add(endIndex); + recordPositionBegin.add(beginPos); + recordPositionEnd.add(endPos); + } + + public int getNrows() { + return nrows; + } + + public void setNrows(int nrows) { + this.nrows = nrows; + } + + public String getRemainString() { + return remainString; + } + + public void setRemainString(String remainString) { + this.remainString = remainString; + } + + public int getRecordIndexBegin(int index) { + return recordIndexBegin.get(index); + } + + public int getRecordIndexEnd(int index) { + return recordIndexEnd.get(index); + } + + public int getRecordPositionBegin(int index) { + return recordPositionBegin.get(index); + } + + public int getRecordPositionEnd(int index) { + return recordPositionEnd.get(index); + } + } + + private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, + String token) throws IOException { + RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + ArrayList> result = new ArrayList<>(); + + int ri = 0; + while (reader.next(key, value)){ + String raw = value.toString(); + int index; + int fromIndex = 0; + do { + index = raw.indexOf(token, fromIndex); + if(index !=-1){ + result.add(new Pair<>(ri, index)); + fromIndex = index+token.length(); + } + else + break; + }while(true); + ri++; + } + return result; + } + + protected abstract long readMatrixFromHDFS(RecordReader reader, LongWritable key, Text value, MatrixBlock dest, + int rowPos, SplitInfo splitInfo) throws IOException; + + + protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { + int endPos = strLen; + for(String d : endWithValueString) { + int pos = d.length()> 0 ? 
str.indexOf(d, currPos): strLen; + if(pos != -1) + endPos = Math.min(endPos, pos); + } + return endPos; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java index fd813fe96fe..7ab5d91a306 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java @@ -38,5 +38,7 @@ public TemplateCodeGenBase(CustomProperties properties, String className) { public abstract String generateCodeJava(); + public abstract String generateCodeJavaParallel(); + public abstract String generateCodeCPP(); } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index b226756af43..2b11bc7dd5c 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -95,12 +95,14 @@ protected void runGenerateReaderTest() { // mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); // int a = 100; - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, true); MatrixReader mr = gr.getReader(); MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); // TestUtils.compareMatrices(sampleMB, matrixBlock, 0); + int a = 100; + } catch(Exception exception) { exception.printStackTrace(); From cefabd04121ed4f4f183909055f670808b1a6fb6 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 16 Jun 2022 02:28:52 +0200 Subject: [PATCH 59/84] Init commit of parallel code gen for Frame --- .../runtime/iogen/codegen/FrameCodeGen.java 
| 56 ++- .../template/FrameGenerateReaderParallel.java | 458 ++++++++++++++++++ 2 files changed, 488 insertions(+), 26 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 23e0ea638ac..1132b9ed7b5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -28,36 +28,41 @@ public FrameCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. set java code template - javaTemplate = "import org.apache.hadoop.io.LongWritable; \n" + - "import org.apache.hadoop.io.Text; \n" + - "import org.apache.hadoop.mapred.InputFormat; \n" + - "import org.apache.hadoop.mapred.InputSplit; \n" + - "import org.apache.hadoop.mapred.JobConf; \n" + - "import org.apache.hadoop.mapred.RecordReader; \n" + - "import org.apache.hadoop.mapred.Reporter; \n" + - "import org.apache.sysds.common.Types; \n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions; \n" + - "import org.apache.sysds.runtime.iogen.CustomProperties; \n" + - "import org.apache.sysds.runtime.matrix.data.FrameBlock; \n" + - "import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; \n" + - "import java.io.IOException; \n" + - "import java.util.HashSet; \n" + - "public class "+className+" extends FrameGenerateReader{ \n" + - "public "+className+"(CustomProperties _props) { \n" + - " super(_props); \n" + - " } \n" + + // 1.a: single thread code gen + if(!properties.isParallel()){ + javaTemplate = "import org.apache.hadoop.io.LongWritable; \n" + + "import org.apache.hadoop.io.Text; \n" + + "import org.apache.hadoop.mapred.InputFormat; \n" + + "import org.apache.hadoop.mapred.InputSplit; \n" + + "import org.apache.hadoop.mapred.JobConf; \n" + + "import 
org.apache.hadoop.mapred.RecordReader; \n" + + "import org.apache.hadoop.mapred.Reporter; \n" + + "import org.apache.sysds.common.Types; \n" + + "import org.apache.sysds.runtime.io.IOUtilFunctions; \n" + + "import org.apache.sysds.runtime.iogen.CustomProperties; \n" + + "import org.apache.sysds.runtime.matrix.data.FrameBlock; \n" + + "import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; \n" + + "import java.io.IOException; \n" + + "import java.util.HashSet; \n" + + "public class "+className+" extends FrameGenerateReader{ \n" + + "public "+className+"(CustomProperties _props) { \n" + + " super(_props); \n" + + " } \n" + - "@Override protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, \n" + - " JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, \n" + - " boolean first) throws IOException { \n" + - code+ - "}} \n"; + "@Override protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, \n" + + " JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, \n" + + " boolean first) throws IOException { \n" + + code+ + "}} \n"; + } + else { + } } + @Override public String generateCodeJava() { - StringBuilder src = new StringBuilder(); src.append("RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); \n"); src.append("LongWritable key = new LongWritable(); \n"); @@ -65,8 +70,7 @@ public String generateCodeJava() { src.append("int row = rl; \n"); src.append("long lnnz = 0; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); -// if(properties.getRowIndex() == CustomProperties.IndexProperties.PREFIX) -// src.append("HashSet endWithValueStringRow = _props.endWithValueStringsRow(); \n"); + src.append("int index, endPos, strLen; \n"); src.append("try { \n"); src.append("while(reader.next(key, value)){ \n"); diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java new file mode 100644 index 00000000000..4546a60c6f9 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -0,0 +1,458 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.iogen.template; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.io.IOUtilFunctions; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.runtime.util.CommonThreadPool; +import org.apache.sysds.runtime.util.InputStreamInputFormat; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; + +public abstract class FrameGenerateReaderParallel extends FrameReader { + + protected CustomProperties _props; + protected int _numThreads = 1; + protected JobConf job; + protected SplitOffsetInfos _offsets; + protected int _rLen; + protected int _cLen; + + public FrameGenerateReaderParallel(CustomProperties _props) { + this._numThreads = OptimizerUtils.getParallelTextReadParallelism(); + this._props = _props; + } + + @Override + public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, + long clen) throws IOException, DMLRuntimeException { + + //prepare file access + job = new 
JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + InputSplit[] splits = informat.getSplits(job, _numThreads); + splits = IOUtilFunctions.sortInputSplits(splits); + + // check existence and non-empty file + checkValidInputFile(fs, path); + + // allocate output frame block + FrameBlock ret = computeSizeAndCreateOutputFrameBlock(schema, names, splits, path, rlen, clen); + + // core read (sequential/parallel) + readFrameFromHDFS(splits, path, job, ret); + return ret; + } + + private FrameBlock computeSizeAndCreateOutputFrameBlock(Types.ValueType[] schema, String[] names, InputSplit[] splits, Path path, long rlen, + long clen) throws IOException, DMLRuntimeException { + _rLen = 0; + _cLen = _props.getNcols(); + + Types.ValueType[] lschema = createOutputSchema(schema, _cLen); + String[] lnames = createOutputNames(names, _cLen); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + // count rows in parallel per split + try { + ExecutorService pool = CommonThreadPool.get(_numThreads); + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.Identity) { + ArrayList tasks = new ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new IOUtilFunctions.CountRowsTask(split, informat, job, false)); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new SplitOffsetInfos(tasks.size()); + int i = 0; + for(Future rc : pool.invokeAll(tasks)) { + int lnrow = (int) rc.get().longValue(); // incl error handling + _offsets.setOffsetPerSplit(i, _rLen); + _offsets.setLenghtPerSplit(i, lnrow); + _rLen = _rLen + lnrow; + i++; + } + pool.shutdown(); + } + else 
if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { + ArrayList tasks = new ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new CountSeqScatteredRowsTask(split, informat, job, _props.getRowIndexStructure().getSeqBeginString(), + _props.getRowIndexStructure().getSeqEndString())); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new SplitOffsetInfos(tasks.size()); + int i = 0; + for(Future rc : pool.invokeAll(tasks)) { + SplitInfo splitInfo = rc.get(); + _offsets.setSeqOffsetPerSplit(i, splitInfo); + _offsets.setOffsetPerSplit(i, _rLen); + _rLen = _rLen + splitInfo.nrows; + i++; + } + pool.shutdown(); + } + } + catch(Exception e) { + throw new IOException("Thread pool Error " + e.getMessage(), e); + } + + // robustness for wrong dimensions which are already compiled into the plan + if(rlen != -1 && _rLen != rlen) { + String msg = "Read frame dimensions differ from meta data: [" + _rLen + "x" + _cLen + "] vs. 
[" + rlen+ "x" + clen + "]."; + if(rlen < _rLen || clen < _cLen) { + // a) specified matrix dimensions too small + throw new DMLRuntimeException(msg); + } + else { + // b) specified matrix dimensions too large -> padding and warning + LOG.warn(msg); + _rLen = (int) rlen; + _cLen = (int) clen; + } + } + + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + return ret; + } + + @Override + public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, + long rlen, long clen) throws IOException, DMLRuntimeException { + + // allocate output frame block + InputStreamInputFormat informat = new InputStreamInputFormat(is); + InputSplit[] splits = informat.getSplits(null, 1); + FrameBlock ret = computeSizeAndCreateOutputFrameBlock(schema, names, splits,null, rlen, clen); +// +// // core read (sequential/parallel) +// +// +// ReadTask rt = new ReadTask(splits[0], informat, ret, 1) +// +// //readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); +// +// ArrayList tasks = new ArrayList<>(); +// int splitCount = 0; +// for (InputSplit split : splits) { +// tasks.add( new ReadTask(split, informat, dest, splitCount++). 
); +// } +// pool.invokeAll(tasks); +// pool.shutdown(); + // TODO: implement parallel reader for input stream + return ret; + } + + protected void readFrameFromHDFS(InputSplit[] splits, Path path, JobConf job, FrameBlock dest) throws IOException { + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + ExecutorService pool = CommonThreadPool.get(_numThreads); + try{ + // create read tasks for all splits + ArrayList tasks = new ArrayList<>(); + int splitCount = 0; + for (InputSplit split : splits) { + tasks.add( new ReadTask(split, informat, dest, splitCount++) ); + } + pool.invokeAll(tasks); + pool.shutdown(); + + } + catch (Exception e) { + throw new IOException("Threadpool issue, while parallel read.", e); + } + } + + + protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { + int endPos = strLen; + for(String d : endWithValueString) { + int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; + if(pos != -1) + endPos = Math.min(endPos, pos); + } + return endPos; + } + + private static class SplitOffsetInfos { + // offset & length info per split + private int[] offsetPerSplit = null; + private int[] lenghtPerSplit = null; + private SplitInfo[] seqOffsetPerSplit = null; + + public SplitOffsetInfos(int numSplits) { + lenghtPerSplit = new int[numSplits]; + offsetPerSplit = new int[numSplits]; + seqOffsetPerSplit = new SplitInfo[numSplits]; + } + + public int getLenghtPerSplit(int split) { + return lenghtPerSplit[split]; + } + + public void setLenghtPerSplit(int split, int r) { + lenghtPerSplit[split] = r; + } + + public int getOffsetPerSplit(int split) { + return offsetPerSplit[split]; + } + + public void setOffsetPerSplit(int split, int o) { + offsetPerSplit[split] = o; + } + + public SplitInfo getSeqOffsetPerSplit(int split) { + return seqOffsetPerSplit[split]; + } + + public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { + 
seqOffsetPerSplit[split] = splitInfo; + } + } + + private class ReadTask implements Callable { + + private final InputSplit _split; + private final TextInputFormat _informat; + private final FrameBlock _dest; + private final int _splitCount; + private int _row = 0; + + public ReadTask(InputSplit split, TextInputFormat informat, FrameBlock dest, int splitCount) { + _split = split; + _informat = informat; + _dest = dest; + _splitCount = splitCount; + } + + @Override + public Long call() throws IOException { + RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + _row = _offsets.getOffsetPerSplit(_splitCount); + SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); + reaFrameFromHDFS(reader, key, value, _dest, _row, _splitInfo); + return 0L; + } + } + + private static class CountSeqScatteredRowsTask implements Callable { + private final InputSplit _split; + private final TextInputFormat _inputFormat; + private final JobConf _jobConf; + private final String _beginString; + private final String _endString; + + public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, String beginString, String endString){ + _split = split; + _inputFormat = inputFormat; + _jobConf = jobConf; + _beginString = beginString; + _endString = endString; + } + + @Override + public SplitInfo call() throws Exception { + SplitInfo splitInfo = new SplitInfo(); + int nrows = 0; + ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); + ArrayList> endIndexes; + int tokenLength = 0; + if(!_beginString.equals(_endString)) { + endIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); + tokenLength = _endString.length(); + } + else { + endIndexes = new ArrayList<>(); + for(int i = 1; i < beginIndexes.size(); i++) + endIndexes.add(beginIndexes.get(i)); + } + + int i = 0; + int j = 0; 
+ while(i < beginIndexes.size() && j < endIndexes.size()) { + Pair p1 = beginIndexes.get(i); + Pair p2 = endIndexes.get(j); + int n = 0; + while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { + n++; + i++; + if(i == beginIndexes.size()) + break; + p1 = beginIndexes.get(i); + } + j += n-1; + splitInfo.addIndexAndPosition(beginIndexes.get(i - n).getKey(), endIndexes.get(j).getKey(), beginIndexes.get(i - n).getValue(), + endIndexes.get(j).getValue()+tokenLength); + j++; + nrows++; + } + if(i == beginIndexes.size() && j < endIndexes.size()) + nrows++; + if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) + splitInfo.setRemainString(""); + else{ + RecordReader reader = _inputFormat.getRecordReader(_split, _jobConf, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + StringBuilder sb = new StringBuilder(); + for(int ri = 0; ri< beginIndexes.get(0).getKey(); ri++){ + reader.next(key, value); + String raw = value.toString(); + sb.append(raw); + } + if(beginIndexes.get(0).getValue() != 0) { + reader.next(key, value); + sb.append(value.toString().substring(0, beginIndexes.get(0).getValue())); + } + splitInfo.setRemainString(sb.toString()); + } + splitInfo.setNrows(nrows); + return splitInfo; + } + } + + protected static class SplitInfo{ + private int nrows; + private ArrayList recordIndexBegin; + private ArrayList recordIndexEnd; + private ArrayList recordPositionBegin; + private ArrayList recordPositionEnd; + private String remainString; + + public SplitInfo() { + recordIndexBegin = new ArrayList<>(); + recordIndexEnd = new ArrayList<>(); + recordPositionBegin = new ArrayList<>(); + recordPositionEnd = new ArrayList<>(); + } + + public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ + recordIndexBegin.add(beginIndex); + recordIndexEnd.add(endIndex); + recordPositionBegin.add(beginPos); + recordPositionEnd.add(endPos); + } + + public 
int getNrows() { + return nrows; + } + + public void setNrows(int nrows) { + this.nrows = nrows; + } + + public String getRemainString() { + return remainString; + } + + public void setRemainString(String remainString) { + this.remainString = remainString; + } + + public int getRecordIndexBegin(int index) { + return recordIndexBegin.get(index); + } + + public int getRecordIndexEnd(int index) { + return recordIndexEnd.get(index); + } + + public int getRecordPositionBegin(int index) { + return recordPositionBegin.get(index); + } + + public int getRecordPositionEnd(int index) { + return recordPositionEnd.get(index); + } + } + + private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, + String token) throws IOException { + RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + ArrayList> result = new ArrayList<>(); + + int ri = 0; + while (reader.next(key, value)){ + String raw = value.toString(); + int index; + int fromIndex = 0; + do { + index = raw.indexOf(token, fromIndex); + if(index !=-1){ + result.add(new Pair<>(ri, index)); + fromIndex = index+token.length(); + } + else + break; + }while(true); + ri++; + } + return result; + } + + protected abstract void reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, + int rowPos, SplitInfo splitInfo) throws IOException; + + + +} From bbf0f70dff5ed2f2ce9b06372894563263bfea72 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 16 Jun 2022 19:08:36 +0200 Subject: [PATCH 60/84] Parallel implementation of Frame reader --- .../sysds/runtime/iogen/EXP/GIOFrame.java | 2 +- .../iogen/EXP/GIOFrameIdentification.java | 2 +- .../sysds/runtime/iogen/GenerateReader.java | 20 ++---- .../runtime/iogen/codegen/FrameCodeGen.java | 65 ++++++++++++++++++- .../iogen/FrameSingleRowNestedTest.java | 11 ++++ .../iogen/GenerateReaderFrameTest.java | 4 
+- .../Identify/MatrixGRRowColIdentifyTest.java | 2 +- 7 files changed, 85 insertions(+), 21 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index 4e5e2b9ce5a..bfa1203f993 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -42,7 +42,7 @@ public static void main(String[] args) throws Exception { String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java index 54593553a9b..20bab875452 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java @@ -25,7 +25,7 @@ public static void main(String[] args) throws Exception { FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); gr.getReader(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java 
b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 0034cf849ad..d2707cf5939 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -31,17 +31,6 @@ import java.util.Random; -/* - Generate Reader has two steps: - 1. Identify file format and extract the properties of it based on the Sample Matrix. - The ReaderMapping class tries to map the Sample Matrix on the Sample Raw Matrix. - The result of a ReaderMapping is a FileFormatProperties object. - - 2. Generate a reader based on inferred properties. - - Note. Base on this implementation, it is possible to generate a reader - base on Sample Matrix and generate a reader for a frame or vice versa. -*/ public abstract class GenerateReader { protected static final Log LOG = LogFactory.getLog(GenerateReader.class.getName()); @@ -90,7 +79,7 @@ public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix, boolean public MatrixReader getReader() throws Exception { String className = getRandomClassName(); MatrixCodeGen src = new MatrixCodeGen(properties, className); - // constructor with arguments as CustomProperties + // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; String srcJava = properties.isParallel() ? 
src.generateCodeJavaParallel(): src.generateCodeJava(); @@ -108,18 +97,19 @@ public GenerateReaderFrame(SampleProperties sampleProperties) throws Exception { super(sampleProperties); } - public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame) throws Exception { + public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame, boolean parallel) throws Exception { super(new SampleProperties(sampleRaw, sampleFrame)); + properties.setParallel(parallel); } public FrameReader getReader() throws Exception { String className = getRandomClassName(); FrameCodeGen src = new FrameCodeGen(properties, className); - // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - frameReader = (FrameReader) CodegenUtils.compileClass(className, src.generateCodeJava()).getDeclaredConstructor(cArg).newInstance(properties); + String srcJava = properties.isParallel() ? src.generateCodeJavaParallel(): src.generateCodeJava(); + frameReader = (FrameReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 1132b9ed7b5..8fa5e9ea9e0 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -20,6 +20,7 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; public class FrameCodeGen extends TemplateCodeGenBase { @@ -56,7 +57,21 @@ public FrameCodeGen(CustomProperties properties, String className) { "}} \n"; } else { - + javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + + "import 
org.apache.hadoop.io.Text;\n" + + "import org.apache.hadoop.mapred.RecordReader;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.iogen.template.FrameGenerateReaderParallel;\n" + + "import org.apache.sysds.runtime.matrix.data.FrameBlock;\n" + + "import java.io.IOException;\n" + "import java.util.HashSet;\n" + + "public class "+className+" extends FrameGenerateReaderParallel {\n" + + "public "+className+"(CustomProperties _props) {\n" + + "super(_props);} \n" + + "@Override \n" + + "protected void reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, " + + "FrameBlock dest, int row, SplitInfo splitInfo) throws IOException {\n"+ + code+ + "}} \n"; } } @@ -91,7 +106,53 @@ public String generateCodeJava() { @Override public String generateCodeJavaParallel() { - return null; + StringBuilder src = new StringBuilder(); + CodeGenTrie trie = new CodeGenTrie(properties, "dest.set", false); + trie.setMatrix(true); + src.append("String str=\"\"; \n"); + src.append("String remainStr = \"\"; \n"); + src.append("int col = -1; \n"); + src.append("long lnnz = 0; \n"); + src.append("int index, endPos, strLen; \n"); + src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); + src.append("try { \n"); + src.append("int ri = -1; \n"); + src.append("int beginPosStr, endPosStr; \n"); + src.append("StringBuilder sb = new StringBuilder(); \n"); + src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); + src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); + src.append("boolean flag = true; \n"); + src.append("while(flag) { \n"); + src.append("flag = reader.next(key, value); \n"); + src.append("if(flag) { \n"); + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + src.append("ri++; \n"); + src.append("String valStr = value.toString(); \n"); + src.append("beginPosStr = ri == beginIndex ? 
splitInfo.getRecordPositionBegin(row) : 0; \n"); + src.append("endPosStr = ri == endIndex ? splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); + src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); + src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); + src.append("remainStr = valStr.substring(endPosStr); \n"); + src.append("continue; \n"); + src.append("} \n"); + src.append("else { \n"); + src.append("str = sb.toString(); \n"); + src.append("sb = new StringBuilder(); \n"); + src.append("sb.append(remainStr).append(valStr); \n"); + src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); + src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); + src.append("} \n"); + } + src.append("} \n"); + src.append("else \n"); + src.append("str = sb.toString(); \n"); + src.append("strLen = str.length(); \n"); + src.append(trie.getJavaCode()); + src.append("} \n"); + src.append("} \n"); + src.append("catch(Exception ex){ \n"); + src.append("} \n"); + return javaTemplate.replace(code, src.toString()); } @Override diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java index b3e1bb934ab..b0907ee7123 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -88,4 +88,15 @@ public void test6() { data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; runGenerateReaderTest(); } + + @Test + public void test7() { + sampleRaw = "{\n\"a\":1,\n\"b\":2,\n\"c\":3,\n\"d\":4,\n\"e\":5\n}\n" + + "{\"a\":6,\n\"b\":7,\"c\":8,\"d\":9,\"e\":10\n}\n" + + "{\"a\":11,\"b\":12,\n\"c\":13,\"d\":14,\"e\":15\n}"; + + data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; + schema = new Types.ValueType[] {Types.ValueType.INT32, 
Types.ValueType.INT32}; + runGenerateReaderTest(); + } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index 8b113c978db..34af434013b 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -175,10 +175,12 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "frame_data.raw"; int clen = data[0].length; writeRawString(sampleRaw, dataPath); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, true); FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath, schema, data.length, clen); + + int a = 100; } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java index 2f1463de0a0..c99555d051a 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java @@ -307,7 +307,7 @@ private void generateRandomCSV(int nrows, int ncols, double min, double max, dou String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); FrameReader fr =gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, sampleSchema.length); From 
3ffecd5d82d649bb52998ade4b06abc76a70d60e Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 16 Jun 2022 19:13:03 +0200 Subject: [PATCH 61/84] minor update, code style --- .../runtime/iogen/template/FrameGenerateReaderParallel.java | 6 +----- .../sysds/runtime/iogen/template/MatrixGenerateReader.java | 1 - .../iogen/template/MatrixGenerateReaderParallel.java | 1 - 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index 4546a60c6f9..5ba5b6c6488 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -53,7 +53,7 @@ public abstract class FrameGenerateReaderParallel extends FrameReader { protected CustomProperties _props; - protected int _numThreads = 1; + protected int _numThreads; protected JobConf job; protected SplitOffsetInfos _offsets; protected int _rLen; @@ -217,7 +217,6 @@ protected void readFrameFromHDFS(InputSplit[] splits, Path path, JobConf job, Fr } } - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { int endPos = strLen; for(String d : endWithValueString) { @@ -452,7 +451,4 @@ private static ArrayList> getTokenIndexOnMultiLineRecords protected abstract void reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, SplitInfo splitInfo) throws IOException; - - - } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index d0da3491bf3..6572c46e1fd 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ 
-167,7 +167,6 @@ protected int getEndPos(String str, int strLen, int currPos, HashSet end return endPos; } - protected int getColIndex(HashMap colKeyPatternMap, String key){ if(colKeyPatternMap.containsKey(key)) return colKeyPatternMap.get(key); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java index f9592f277ff..eb10feb51ec 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java @@ -433,7 +433,6 @@ private static ArrayList> getTokenIndexOnMultiLineRecords protected abstract long readMatrixFromHDFS(RecordReader reader, LongWritable key, Text value, MatrixBlock dest, int rowPos, SplitInfo splitInfo) throws IOException; - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { int endPos = strLen; for(String d : endWithValueString) { From 283e83a3c9da0ef0f6a8a5663849b8e1f0031abd Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 16 Jun 2022 19:43:51 +0200 Subject: [PATCH 62/84] Add Gson, Jackson parallel readers, update experiment source code --- .../sysds/runtime/io/FrameReaderJSONGson.java | 4 +- .../io/FrameReaderJSONGsonParallel.java | 110 +++++++ .../runtime/io/FrameReaderJSONJackson.java | 4 +- .../io/FrameReaderJSONJacksonParallel.java | 110 +++++++ .../sysds/runtime/iogen/EXP/GIOFrame.java | 4 +- .../iogen/EXP/GIOFrameIdentification.java | 4 +- .../sysds/runtime/iogen/EXP/GIOMatrix.java | 4 +- .../iogen/EXP/GIOMatrixIdentification.java | 5 +- .../sysds/runtime/iogen/EXP/SystemDS.java | 269 +++++++++++------- .../sysds/runtime/iogen/EXP/SystemDSGson.java | 13 +- .../runtime/iogen/EXP/SystemDSJackson.java | 13 +- .../runtime/iogen/EXP/SystemDSJson4j.java | 13 +- 12 files changed, 436 insertions(+), 117 deletions(-) create mode 100644 
src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java index f569b742247..3a2560b76dc 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java @@ -56,7 +56,7 @@ public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Map< } - public void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, + protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, Types.ValueType[] schema, Map schemaMap) throws IOException { TextInputFormat inputFormat = new TextInputFormat(); @@ -70,7 +70,7 @@ public void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSy } - private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, + protected static int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) throws IOException { diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java new file mode 100644 index 00000000000..5bea3627f7c --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.io; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.io.IOUtilFunctions.CountRowsTask; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.CommonThreadPool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; + +public class FrameReaderJSONGsonParallel extends FrameReaderJSONGson +{ + @Override + protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, + FrameBlock dest, Types.ValueType[] schema, Map schemaMap) + throws IOException + { + int numThreads = OptimizerUtils.getParallelTextReadParallelism(); + + TextInputFormat inputFormat = new TextInputFormat(); + inputFormat.configure(jobConf); + InputSplit[] splits = inputFormat.getSplits(jobConf, numThreads); + splits = IOUtilFunctions.sortInputSplits(splits); + + try{ + ExecutorService executorPool = CommonThreadPool.get(Math.min(numThreads, splits.length)); + + 
//compute num rows per split + ArrayList countRowsTasks = new ArrayList<>(); + for (InputSplit split : splits){ + countRowsTasks.add(new CountRowsTask(split, inputFormat, jobConf)); + } + List> ret = executorPool.invokeAll(countRowsTasks); + + //compute row offset per split via cumsum on row counts + long offset = 0; + List offsets = new ArrayList<>(); + for( Future rc : ret ) { + offsets.add(offset); + offset += rc.get(); + } + + //read individual splits + ArrayList readRowsTasks = new ArrayList<>(); + for( int i=0; i{ + private InputSplit _split; + private TextInputFormat _inputFormat; + private JobConf _jobConf; + private FrameBlock _dest; + Map _schemaMap; + private int _offset; + + public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, + FrameBlock dest, Map schemaMap, int offset) + { + _split = split; + _inputFormat = inputFormat; + _jobConf = jobConf; + _dest = dest; + _schemaMap = schemaMap; + _offset = offset; + } + + @Override + public Object call() throws Exception { + readJSONLFrameFromInputSplit(_split, _inputFormat, _jobConf, _dest.getSchema(), _schemaMap, _dest, _offset); + return null; + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java index 5d0bf47d9f5..a86bc8d4dad 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java @@ -55,7 +55,7 @@ public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Map< } - public void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, + protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, Types.ValueType[] schema, Map schemaMap) throws IOException { TextInputFormat inputFormat = new TextInputFormat(); @@ -69,7 +69,7 @@ public void readJSONLFrameFromHDFS(Path path, 
JobConf jobConf, FileSystem fileSy } - private int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, + protected static int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) throws IOException { diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java new file mode 100644 index 00000000000..9b3d5de7841 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.io; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.io.IOUtilFunctions.CountRowsTask; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.CommonThreadPool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; + +public class FrameReaderJSONJacksonParallel extends FrameReaderJSONJackson +{ + @Override + protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, + FrameBlock dest, Types.ValueType[] schema, Map schemaMap) + throws IOException + { + int numThreads = OptimizerUtils.getParallelTextReadParallelism(); + + TextInputFormat inputFormat = new TextInputFormat(); + inputFormat.configure(jobConf); + InputSplit[] splits = inputFormat.getSplits(jobConf, numThreads); + splits = IOUtilFunctions.sortInputSplits(splits); + + try{ + ExecutorService executorPool = CommonThreadPool.get(Math.min(numThreads, splits.length)); + + //compute num rows per split + ArrayList countRowsTasks = new ArrayList<>(); + for (InputSplit split : splits){ + countRowsTasks.add(new CountRowsTask(split, inputFormat, jobConf)); + } + List> ret = executorPool.invokeAll(countRowsTasks); + + //compute row offset per split via cumsum on row counts + long offset = 0; + List offsets = new ArrayList<>(); + for( Future rc : ret ) { + offsets.add(offset); + offset += rc.get(); + } + + //read individual splits + ArrayList readRowsTasks = new ArrayList<>(); + for( int i=0; i{ + private InputSplit _split; + private TextInputFormat 
_inputFormat; + private JobConf _jobConf; + private FrameBlock _dest; + Map _schemaMap; + private int _offset; + + public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, + FrameBlock dest, Map schemaMap, int offset) + { + _split = split; + _inputFormat = inputFormat; + _jobConf = jobConf; + _dest = dest; + _schemaMap = schemaMap; + _offset = offset; + } + + @Override + public Object call() throws Exception { + readJSONLFrameFromInputSplit(_split, _inputFormat, _jobConf, _dest.getSchema(), _schemaMap, _dest, _offset); + return null; + } + } +} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index bfa1203f993..70bb582a790 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -17,6 +17,7 @@ public static void main(String[] args) throws Exception { String sampleRawDelimiter; String schemaFileName; String dataFileName; + boolean parallel; long rows = -1; sampleRawFileName = System.getProperty("sampleRawFileName"); @@ -24,6 +25,7 @@ public static void main(String[] args) throws Exception { sampleRawDelimiter = "\t"; schemaFileName = System.getProperty("schemaFileName"); dataFileName = System.getProperty("dataFileName"); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); Util util = new Util(); // read and parse mtd file @@ -42,7 +44,7 @@ public static void main(String[] args) throws Exception { String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, 
parallel); FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java index 20bab875452..0b1b8f00941 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java @@ -11,9 +11,11 @@ public static void main(String[] args) throws Exception { String sampleFrameFileName; String sampleRawDelimiter; String schemaFileName; + boolean parallel; sampleRawFileName = System.getProperty("sampleRawFileName"); sampleFrameFileName = System.getProperty("sampleFrameFileName"); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); sampleRawDelimiter = "\t"; schemaFileName = System.getProperty("schemaFileName"); @@ -25,7 +27,7 @@ public static void main(String[] args) throws Exception { FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); gr.getReader(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java index 75f956ca1c3..0aab12731e1 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java @@ -12,12 +12,14 @@ public static void main(String[] args) throws Exception { String sampleMatrixFileName; String sampleRawDelimiter; String dataFileName; + boolean parallel; long rows = -1; sampleRawFileName = 
System.getProperty("sampleRawFileName"); sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); sampleRawDelimiter = "\t"; dataFileName = System.getProperty("dataFileName"); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); Util util = new Util(); // read and parse mtd file String mtdFileName = dataFileName + ".mtd"; @@ -33,7 +35,7 @@ public static void main(String[] args) throws Exception { MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, false); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); MatrixReader matrixReader = gr.getReader(); MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, rows, sampleMB.getNumColumns(), -1, -1); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java index dae84e36865..ba56bc2a9f8 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java @@ -9,16 +9,17 @@ public static void main(String[] args) throws Exception { String sampleRawFileName; String sampleMatrixFileName; String sampleRawDelimiter; - + boolean parallel; sampleRawFileName = System.getProperty("sampleRawFileName"); sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); sampleRawDelimiter = "\t"; Util util = new Util(); MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderMatrix gr = new 
GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, false); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); gr.getReader(); } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index f9faa672ba1..67bbbe0f63b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -11,108 +11,173 @@ public class SystemDS { - public static void main(String[] args) throws IOException, JSONException { - - String schemaFileName; - String dataFileName; - String dataType = null; - String valueType; - String sep = null; - String indSep = null; - boolean header = false; - long cols = -1; - long rows = -1; - String format = null; - String config = null; - String schemaMapFileName = null; - - Util util = new Util(); - schemaFileName = System.getProperty("schemaFileName"); - dataFileName = System.getProperty("dataFileName"); - // read and parse mtd file - String mtdFileName = dataFileName + ".mtd"; - try { - String mtd = util.readEntireTextFile(mtdFileName); - mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = mtd.trim(); - JSONObject jsonObject = new JSONObject(mtd); - if (jsonObject.containsKey("data_type")) dataType = jsonObject.getString("data_type"); - - if (jsonObject.containsKey("value_type")) valueType = jsonObject.getString("value_type"); - - if (jsonObject.containsKey("format")) format = jsonObject.getString("format"); - - if (jsonObject.containsKey("cols")) cols = jsonObject.getLong("cols"); - - if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); - - if (jsonObject.containsKey("header")) header = jsonObject.getBoolean("header"); - - if (jsonObject.containsKey("schema_path")) schemaFileName = jsonObject.getString("schema_path"); - - if (jsonObject.containsKey("sep")) sep = jsonObject.getString("sep"); - - if 
(jsonObject.containsKey("indSep")) indSep = jsonObject.getString("indSep"); - - - } catch (Exception exception) { - } - - if (dataType.equalsIgnoreCase("matrix")) { - MatrixReader matrixReader = null; - switch (format) { - case "csv": - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - matrixReader = new ReaderTextCSV(propertiesCSV); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "libsvm": - FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); - matrixReader = new ReaderTextLIBSVM(propertiesLIBSVM); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "mm": - matrixReader = new ReaderTextCell(Types.FileFormat.MM, true); - break; - } - if (matrixReader == null) throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - } else { - Types.ValueType[] schema = util.getSchema(schemaFileName); - cols = schema.length; - FrameBlock frameBlock = null; - - switch (format) { - case "csv": - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - FrameReader frameReader = new FrameReaderTextCSV(propertiesCSV); - frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); - break; - case "json": - schemaMapFileName = System.getProperty("schemaMapFileName"); - Map schemaMap = util.getSchemaMap(schemaMapFileName); - config = System.getProperty("config"); - switch (config.toLowerCase()) { - case "gson": - FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); - frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - - case "jackson": - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - frameBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, 
cols); - break; - case "json4j": - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - default: - throw new IOException("JSON Config don't support!!" + config); - } - break; - } + public static void main(String[] args) throws IOException, JSONException { + + String schemaFileName; + String dataFileName; + String dataType = null; + String valueType; + String sep = null; + String indSep = null; + boolean header = false; + long cols = -1; + long rows = -1; + String format = null; + String config = null; + String schemaMapFileName = null; + boolean parallel; + + Util util = new Util(); + schemaFileName = System.getProperty("schemaFileName"); + dataFileName = System.getProperty("dataFileName"); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); + // read and parse mtd file + String mtdFileName = dataFileName + ".mtd"; + try { + String mtd = util.readEntireTextFile(mtdFileName); + mtd = mtd.replace("\n", "").replace("\r", ""); + mtd = mtd.trim(); + JSONObject jsonObject = new JSONObject(mtd); + if(jsonObject.containsKey("data_type")) + dataType = jsonObject.getString("data_type"); + + if(jsonObject.containsKey("value_type")) + valueType = jsonObject.getString("value_type"); + + if(jsonObject.containsKey("format")) + format = jsonObject.getString("format"); + + if(jsonObject.containsKey("cols")) + cols = jsonObject.getLong("cols"); + + if(jsonObject.containsKey("rows")) + rows = jsonObject.getLong("rows"); + + if(jsonObject.containsKey("header")) + header = jsonObject.getBoolean("header"); + + if(jsonObject.containsKey("schema_path")) + schemaFileName = jsonObject.getString("schema_path"); + + if(jsonObject.containsKey("sep")) + sep = jsonObject.getString("sep"); + + if(jsonObject.containsKey("indSep")) + indSep = jsonObject.getString("indSep"); + + } + catch(Exception exception) { + } + + if(dataType.equalsIgnoreCase("matrix")) { + 
MatrixReader matrixReader = null; + if(!parallel) { + switch(format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + matrixReader = new ReaderTextCSV(propertiesCSV); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + break; + case "libsvm": + FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); + matrixReader = new ReaderTextLIBSVM(propertiesLIBSVM); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + break; + case "mm": + matrixReader = new ReaderTextCell(Types.FileFormat.MM, true); + break; + } + } + else { + switch(format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + matrixReader = new ReaderTextCSVParallel(propertiesCSV); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + break; + case "libsvm": + FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); + matrixReader = new ReaderTextLIBSVMParallel(propertiesLIBSVM); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + break; + case "mm": + matrixReader = new ReaderTextCellParallel(Types.FileFormat.MM); + break; + } + } + if(matrixReader == null) + throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); + matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); + } + else { + Types.ValueType[] schema = util.getSchema(schemaFileName); + cols = schema.length; + FrameBlock frameBlock = null; + + if(!parallel) { + switch(format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + FrameReader frameReader = new FrameReaderTextCSV(propertiesCSV); + frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); + break; + case "json": + schemaMapFileName = System.getProperty("schemaMapFileName"); + Map schemaMap = 
util.getSchemaMap(schemaMapFileName); + config = System.getProperty("config"); + switch(config.toLowerCase()) { + case "gson": + FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); + frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + + case "jackson": + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + frameBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + case "json4j": + FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + default: + throw new IOException("JSON Config don't support!!" + config); + } + break; + } + } + else { + switch(format) { + case "csv": + FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); + FrameReader frameReader = new FrameReaderTextCSVParallel(propertiesCSV); + frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); + break; + case "json": + schemaMapFileName = System.getProperty("schemaMapFileName"); + Map schemaMap = util.getSchemaMap(schemaMapFileName); + config = System.getProperty("config"); + switch(config.toLowerCase()) { + case "gson": + FrameReaderJSONGsonParallel frameReaderJSONGson = new FrameReaderJSONGsonParallel(); + frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + + case "jackson": + FrameReaderJSONJacksonParallel frameReaderJSONJackson = new FrameReaderJSONJacksonParallel(); + frameBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + case "json4j": + FrameReaderJSONLParallel frameReaderJSONL = new FrameReaderJSONLParallel(); + frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); + break; + default: + throw new IOException("JSON Config don't 
support!!" + config); + } + break; + } + } - } + } - } + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java index 9b5cdf327c7..206f6003992 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java @@ -2,6 +2,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReaderJSONGson; +import org.apache.sysds.runtime.io.FrameReaderJSONGsonParallel; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.wink.json4j.JSONException; @@ -16,19 +17,27 @@ public static void main(String[] args) throws IOException, JSONException { String schemaMapFileName; String dataFileName; long nrows; + boolean parallel; schemaFileName = System.getProperty("schemaFileName"); schemaMapFileName = System.getProperty("schemaMapFileName"); dataFileName = System.getProperty("dataFileName"); nrows = Long.parseLong(System.getProperty("nrows")); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); Util util = new Util(); Types.ValueType[] schema = util.getSchema(schemaFileName); int ncols = schema.length; Map schemaMap = util.getSchemaMap(schemaMapFileName); - FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); - FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + if(!parallel) { + FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); + FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } + else { + FrameReaderJSONGsonParallel frameReaderJSONGson = new FrameReaderJSONGsonParallel(); + FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java 
b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java index 6850264315c..23a4fea97cb 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java @@ -2,6 +2,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReaderJSONJackson; +import org.apache.sysds.runtime.io.FrameReaderJSONJacksonParallel; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.wink.json4j.JSONException; @@ -16,19 +17,27 @@ public static void main(String[] args) throws IOException, JSONException { String schemaMapFileName; String dataFileName; long nrows; + boolean parallel; schemaFileName = System.getProperty("schemaFileName"); schemaMapFileName = System.getProperty("schemaMapFileName"); dataFileName = System.getProperty("dataFileName"); nrows = Long.parseLong(System.getProperty("nrows")); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); Util util = new Util(); Types.ValueType[] schema = util.getSchema(schemaFileName); int ncols = schema.length; Map schemaMap = util.getSchemaMap(schemaMapFileName); - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + if(!parallel) { + FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); + FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } + else { + FrameReaderJSONJacksonParallel frameReaderJSONJackson = new FrameReaderJSONJacksonParallel(); + FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java index e19ac5eedff..09832b844d6 100644 
--- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java @@ -2,6 +2,7 @@ import org.apache.sysds.common.Types; import org.apache.sysds.runtime.io.FrameReaderJSONL; +import org.apache.sysds.runtime.io.FrameReaderJSONLParallel; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.wink.json4j.JSONException; @@ -16,18 +17,26 @@ public static void main(String[] args) throws IOException, JSONException { String schemaMapFileName; String dataFileName; long nrows; + boolean parallel; schemaFileName = System.getProperty("schemaFileName"); schemaMapFileName = System.getProperty("schemaMapFileName"); dataFileName = System.getProperty("dataFileName"); nrows = Long.parseLong(System.getProperty("nrows")); + parallel = Boolean.parseBoolean(System.getProperty("parallel")); Util util = new Util(); Types.ValueType[] schema = util.getSchema(schemaFileName); int ncols = schema.length; Map schemaMap = util.getSchemaMap(schemaMapFileName); - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + if(!parallel) { + FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); + FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } + else { + FrameReaderJSONLParallel frameReaderJSONL = new FrameReaderJSONLParallel(); + FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); + } } } From 1091058317a8710a9a2bb7ea3e63c7e7fa8518bd Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 17 Jun 2022 11:25:04 +0200 Subject: [PATCH 63/84] Minor update, clean-up --- .../sysds/runtime/iogen/EXP/GIOFrame.java | 3 -- .../apache/sysds/runtime/iogen/EXP/Util.java | 3 -- .../iogen/FrameSingleRowNestedTest.java | 43 +++++++++++++++++++ 3 files changed, 43 insertions(+), 6
deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java index 70bb582a790..84504a596e1 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java @@ -6,9 +6,6 @@ import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.wink.json4j.JSONObject; -import java.util.ArrayList; -import java.util.HashSet; - public class GIOFrame { public static void main(String[] args) throws Exception { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java index 97989a0ea98..f364fa8efeb 100755 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java @@ -12,11 +12,8 @@ import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; import java.nio.charset.StandardCharsets; import java.nio.file.Files; -import java.nio.file.Path; import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java index b0907ee7123..df4007bb656 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -20,8 +20,15 @@ package org.apache.sysds.test.functions.iogen; import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.io.FrameReader; +import org.apache.sysds.runtime.iogen.EXP.Util; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.wink.json4j.JSONObject; import 
org.junit.Test; +import java.io.IOException; + public class FrameSingleRowNestedTest extends GenerateReaderFrameTest { private final static String TEST_NAME = "FrameSingleRowNestedTest"; @@ -99,4 +106,40 @@ public void test7() { schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; runGenerateReaderTest(); } + + @Test + public void test8() throws Exception { + //java -Xms15g -Xmx15g -Dparallel=true -cp ./lib/*:./SystemDS.jar org.apache.sysds.runtime.iogen.EXP.GIOFrame + String dpath = "/home/sfathollahzadeh/Documents/GitHub/papers/2022-vldb-GIO/Experiments/"; + String sampleRawFileName = dpath+"data/aminer-author-json/Q4/sample-aminer-author-json200.raw"; + String sampleFrameFileName = dpath+"data/aminer-author-json/Q4/sample-aminer-author-json200.frame"; + String sampleRawDelimiter = "\t"; + String schemaFileName = dpath+"data/aminer-author-json/Q4/aminer-author-json.schema"; + String dataFileName = dpath+"data/aminer-author-json.dat"; + boolean parallel = false; + long rows = -1; + Util util = new Util(); + + // read and parse mtd file + String mtdFileName = dataFileName + ".mtd"; + try { + String mtd = util.readEntireTextFile(mtdFileName); + mtd = mtd.replace("\n", "").replace("\r", ""); + mtd = mtd.toLowerCase().trim(); + JSONObject jsonObject = new JSONObject(mtd); + if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); + } catch (Exception exception) {} + + Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); + int ncols = sampleSchema.length; + + String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); + FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); + String sampleRaw = util.readEntireTextFile(sampleRawFileName); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); + FrameReader fr = gr.getReader(); + FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, 
rows, sampleSchema.length); + + int a = 100; + } } From bd845921e3c1936a2a70e2f91add58cb6e53bf09 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 17 Jun 2022 17:32:38 +0200 Subject: [PATCH 64/84] Fix nrow calc in parallel frame reader --- .../runtime/iogen/template/FrameGenerateReaderParallel.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index 5ba5b6c6488..86aa19d0d41 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -163,7 +163,7 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index } } - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + FrameBlock ret = createOutputFrameBlock(lschema, lnames, _rLen); return ret; } From 014a98744ae4cbf084943614eeb9dd7412de109f Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 21 Jun 2022 02:07:00 +0200 Subject: [PATCH 65/84] Fix multi-line reader bug --- .../runtime/iogen/FormatIdentifying.java | 62 +++++++++---------- .../apache/sysds/runtime/iogen/TextTrie.java | 2 - .../runtime/iogen/codegen/CodeGenTrie.java | 11 +++- .../iogen/FrameSingleRowNestedTest.java | 43 +++++++++---- 4 files changed, 70 insertions(+), 48 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 5d613b178ff..1dc5e7d5555 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -411,7 +411,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I String str = prefixSuffixBeginEndCells.get(i).getValue(); int 
indexBeginString = str.indexOf(beginString); if(indexBeginString != -1) - suffixes.add(str.substring(0, indexBeginString)); + suffixes.add(str.substring(0, indexBeginString+beginString.length())); else suffixes.add(str); } @@ -427,6 +427,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } } keys = textTrieEnd.getAllKeys(); + if(keys.get(0).getValue().size() == nrows){ index = keys.get(0).getKey().indexOf(Lop.OPERAND_DELIMITOR); if(index == -1) @@ -731,12 +732,11 @@ private ArrayList> extractPrefixSuffixBeginEndCells(boolean } int lastLine = 0; int lastPos = 0; - int nextLine = 0; for(int r = 0; r < nrows; r++) { int beginLine = 0; int endLine = 0; int beginPos = 0; - int endPos, nextPos; + int endPos; for(int i = 0; i < nlines; i++) if(recordUsedLines[r].get(i)) { beginLine = i; @@ -747,36 +747,17 @@ private ArrayList> extractPrefixSuffixBeginEndCells(boolean endLine = i; break; } - if(r + 1 < nrows) { - for(int i = 0; i < nlines; i++) - if(recordUsedLines[r + 1].get(i)) { - nextLine = i; - break; - } - } - else - nextLine = nlines - 1; - endPos = sampleRawIndexes.get(endLine).getRawLength(); - nextPos = sampleRawIndexes.get(nextLine).getRawLength(); + endPos = 0; beginPos = sampleRawIndexes.get(beginLine).getRawLength(); for(int c = 0; c < ncols; c++) { if(mapRow[r][c] == beginLine) beginPos = Math.min(beginPos, mapCol[r][c]); if(mapRow[r][c] == endLine) - endPos = Math.min(endPos, mapCol[r][c] + mapLen[r][c]); - - if(r + 1 < nrows) { - if(mapRow[r + 1][c] == nextLine) - nextPos = Math.min(nextPos, mapCol[r + 1][c]); - } - else - nextPos = sampleRawIndexes.get(sampleRawIndexes.size() - 1).getRawLength(); + endPos = Math.max(endPos, mapCol[r][c] + mapLen[r][c]); } StringBuilder sbPrefix = new StringBuilder(); - StringBuilder sbSuffix = new StringBuilder(); - if(lastLine != beginLine) sbPrefix.append(sampleRawIndexes.get(lastLine).getRaw().substring(lastPos)).append("\n"); @@ -784,16 +765,35 @@ private ArrayList> 
extractPrefixSuffixBeginEndCells(boolean sbPrefix.append(sampleRawIndexes.get(i).getRaw()).append("\n"); sbPrefix.append(sampleRawIndexes.get(beginLine).getRaw().substring(0, beginPos)); - sbSuffix.append(sampleRawIndexes.get(endLine).getRaw().substring(endPos)).append("\n"); - for(int i = endLine + 1; i < nextLine; i++) - sbSuffix.append(sampleRawIndexes.get(i).getRaw()).append("\n"); - - sbSuffix.append(sampleRawIndexes.get(nextLine).getRaw().substring(0, nextPos)); lastLine = endLine; lastPos = endPos; - result.add(new Pair<>(sbPrefix.toString(), sbSuffix.toString())); + result.add(new Pair<>(sbPrefix.toString(), null)); + } + + // set suffix + for(int r = 0; r p1 = beginIndexes.get(i); Pair p2 = endIndexes.get(j); int n = 0; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java index c231f191921..8b38064e679 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java @@ -89,8 +89,6 @@ public ArrayList>> getAllKeys(){ for(Key k: sortedKeys){ result.add(new Pair<>(k.getKey().toString(), k.getIndexSet())); - //k.print(); - //System.out.println("++++++++++++++"); } return result; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index f336e222a0f..fe7cf08e57a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -20,6 +20,7 @@ package org.apache.sysds.runtime.iogen.codegen; import org.apache.sysds.common.Types; +import org.apache.sysds.lops.Lop; import org.apache.sysds.runtime.iogen.ColIndexStructure; import org.apache.sysds.runtime.iogen.CustomProperties; import org.apache.sysds.runtime.iogen.KeyTrie; @@ -182,10 +183,14 @@ private void getJavaCodeIndexOf(CodeGenTrieNode node, StringBuilder src, 
String for(String key : node.getChildren().keySet()) { if(key.length() > 0) { currPosVariable = getRandomName("curPos"); - if(node.getKey() == null) - src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\"); \n"); + String mKey = key.replace("\\\"", Lop.OPERAND_DELIMITOR); + mKey = mKey.replace("\\", "\\\\"); + mKey = mKey.replace(Lop.OPERAND_DELIMITOR,"\\\""); + if(node.getKey() == null) { + src.append("index = str.indexOf(\"" + mKey.replace("\\\"", "\"").replace("\"", "\\\"") + "\"); \n"); + } else - src.append("index = str.indexOf(\"" + key.replace("\\\"", "\"").replace("\"", "\\\"") + "\", " + currPos + "); \n"); + src.append("index = str.indexOf(\"" + mKey.replace("\\\"", "\"").replace("\"", "\\\"") + "\", " + currPos + "); \n"); src.append("if(index != -1) { \n"); src.append("int " + currPosVariable + " = index + " + key.length() + "; \n"); } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java index df4007bb656..3a6197454a1 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -20,6 +20,7 @@ package org.apache.sysds.test.functions.iogen; import org.apache.sysds.common.Types; +import org.apache.sysds.lops.Lop; import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.EXP.Util; import org.apache.sysds.runtime.iogen.GenerateReader; @@ -98,25 +99,38 @@ public void test6() { @Test public void test7() { - sampleRaw = "{\n\"a\":1,\n\"b\":2,\n\"c\":3,\n\"d\":4,\n\"e\":5\n}\n" + - "{\"a\":6,\n\"b\":7,\"c\":8,\"d\":9,\"e\":10\n}\n" + - "{\"a\":11,\"b\":12,\n\"c\":13,\"d\":14,\"e\":15\n}"; - - data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - 
runGenerateReaderTest(); + String key = "\\aaa\""; + key = key.replace("\\\"", Lop.OPERAND_DELIMITOR); + key = key.replace("\\", "\\\\"); + key = key.replace(Lop.OPERAND_DELIMITOR,"\\\""); + //System.out.println(key.length()); + + StringBuilder src = new StringBuilder(); + src.append("index = str.indexOf(\"" + + key.replace("\\\"", "\"").replace("\"", "\\\"") + + + "\"); \n"); + + System.out.println(src); +// sampleRaw = "{\n\"a\":1,\n\"b\":2,\n\"c\":3,\n\"d\":4,\n\"e\":5\n}\n" + +// "{\"a\":6,\n\"b\":7,\"c\":8,\"d\":9,\"e\":10\n}\n" + +// "{\"a\":11,\"b\":12,\n\"c\":13,\"d\":14,\"e\":15\n}"; +// +// data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; +// schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; +// runGenerateReaderTest(); } @Test public void test8() throws Exception { //java -Xms15g -Xmx15g -Dparallel=true -cp ./lib/*:./SystemDS.jar org.apache.sysds.runtime.iogen.EXP.GIOFrame String dpath = "/home/sfathollahzadeh/Documents/GitHub/papers/2022-vldb-GIO/Experiments/"; - String sampleRawFileName = dpath+"data/aminer-author-json/Q4/sample-aminer-author-json200.raw"; - String sampleFrameFileName = dpath+"data/aminer-author-json/Q4/sample-aminer-author-json200.frame"; + String sampleRawFileName = dpath+"data/message-hl7/F173/sample-message-hl7200.raw"; + String sampleFrameFileName = dpath+"data/message-hl7/F173/sample-message-hl7200.frame"; String sampleRawDelimiter = "\t"; - String schemaFileName = dpath+"data/aminer-author-json/Q4/aminer-author-json.schema"; - String dataFileName = dpath+"data/aminer-author-json.dat"; - boolean parallel = false; + String schemaFileName = dpath+"data/message-hl7/F173/message-hl7.schema"; + String dataFileName = dpath+"data/message-hl7.dat"; + boolean parallel = true; long rows = -1; Util util = new Util(); @@ -139,6 +153,11 @@ public void test8() throws Exception { GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); FrameReader 
fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); + for(int r=0; r< 10; r++) { + for(int c = 0; c < frameBlock.getNumColumns(); c++) + System.out.print(c+":"+frameBlock.get(r,c)+" "); + System.out.println(); + } int a = 100; } From 9afaf2a7135e1955fe0beec1030e9cce599f5e23 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Tue, 21 Jun 2022 18:19:10 +0200 Subject: [PATCH 66/84] Add AMiner Dataset Reader to SystemDS --- .../io/FileFormatPropertiesAMiner.java | 45 ++ .../runtime/io/FrameReaderTextAMiner.java | 498 ++++++++++++++++++ 2 files changed, 543 insertions(+) create mode 100644 src/main/java/org/apache/sysds/runtime/io/FileFormatPropertiesAMiner.java create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FileFormatPropertiesAMiner.java b/src/main/java/org/apache/sysds/runtime/io/FileFormatPropertiesAMiner.java new file mode 100644 index 00000000000..ffc5496bda6 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FileFormatPropertiesAMiner.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.io; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.io.Serializable; + +public class FileFormatPropertiesAMiner extends FileFormatProperties implements Serializable +{ + protected static final Log LOG = LogFactory.getLog(FileFormatPropertiesAMiner.class.getName()); + private static final long serialVersionUID = -2870393360885401604L; + + private String type; + + public FileFormatPropertiesAMiner(String type) { + this.type = type; + } + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } +} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java new file mode 100644 index 00000000000..49ca9ccd1ae --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java @@ -0,0 +1,498 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.runtime.io; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.InputStreamInputFormat; +import org.apache.sysds.runtime.util.UtilFunctions; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; + +public class FrameReaderTextAMiner extends FrameReader { + protected final FileFormatPropertiesAMiner _props; + private DatasetMetaDataPaper paperMetaData; + private DatasetMetaDataAuthor authorMetaData; + private ArrayList[] rowIndexs; + private ArrayList[] colBeginIndexs; + + public FrameReaderTextAMiner(FileFormatPropertiesAMiner props) { + //if unspecified use default properties for robustness + _props = props; + } + + @Override public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException { + LOG.debug("readFrameFromHDFS AMiner"); + // prepare file access + JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + FileInputFormat.addInputPath(job, path); + + // check existence and non-empty file + checkValidInputFile(fs, path); + + ValueType[] lschema = null; + String[] lnames = null; + if(_props.getType().equals("paper")) { + paperMetaData = 
computeAMinerSizePaper(job); + rlen = paperMetaData.nrow; + lschema = paperMetaData.schema; + lnames = paperMetaData.names; + } + else { + authorMetaData = computeAMinerSizeAuthor(job); + rlen = authorMetaData.nrow; + lschema = authorMetaData.schema; + lnames = authorMetaData.names; + } + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + // core read (sequential/parallel) + readAMinerFrameFromHDFS(job, ret, lschema); + + return ret; + } + + @Override public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException { + LOG.debug("readFrameFromInputStream AMiner"); + ValueType[] lschema = null; + String[] lnames = null; + if(_props.getType().equals("paper")) { + paperMetaData = computeAMinerSizePaper(null); + rlen = paperMetaData.nrow; + lschema = paperMetaData.schema; + lnames = paperMetaData.names; + } + else { + authorMetaData = computeAMinerSizeAuthor(null); + rlen = authorMetaData.nrow; + lschema = authorMetaData.schema; + lnames = authorMetaData.names; + } + FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); + + // core read (sequential/parallel) + InputStreamInputFormat informat = new InputStreamInputFormat(is); + InputSplit split = informat.getSplits(null, 1)[0]; + if(_props.getType().equals("paper")) { + readAMinerPaperFrameFromInputSplit(split, rowIndexs[0], colBeginIndexs[0], informat, null, ret, lschema); + } + else { + readAMinerAuthorFrameFromInputSplit(split, rowIndexs[0], informat, null, ret, lschema); + } + + return ret; + } + + protected void readAMinerFrameFromHDFS(JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { + LOG.debug("readAMinerFrameFromHDFS csv"); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + if(_props.getType().equals("paper")) { + for(int i = 0; i < splits.length;
i++) + readAMinerPaperFrameFromInputSplit(splits[i], rowIndexs[i], colBeginIndexs[i], informat, job, dest, schema); + } + else { + for(int i = 0; i < splits.length; i++) + readAMinerAuthorFrameFromInputSplit(splits[i], rowIndexs[i], informat, job, dest, schema); + } + } + + // #index ---- index id of this paper + // #* ---- paper title + // #@ ---- authors (separated by semicolons) + // #o ---- affiliations (separated by semicolons, and each affiliaiton corresponds to an author in order) + // #t ---- year + // #c ---- publication venue + // #% ---- the id of references of this paper (there are multiple lines, with each indicating a reference) + // #! ---- abstract + protected final void readAMinerPaperFrameFromInputSplit(InputSplit split, ArrayList rowIndex, ArrayList colBeginIndex, + InputFormat informat, JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { + + // create record reader + RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + int row, col; + int colBegin = 0; + int index = -1; + String valStr; + // Read the data + try { + while(reader.next(key, value)) // foreach line + { + index++; + String rowStr = value.toString().trim(); + if(rowStr.length() == 0) + continue; + row = rowIndex.get(index); + colBegin = colBeginIndex.get(index); + + if(rowStr.startsWith("#index ")) { + col = colBegin; + valStr = rowStr.substring("#index ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#* ")) { + col = colBegin + 1; + valStr = rowStr.substring("#* ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#@ ")) { + col = colBegin + paperMetaData.authorStartCol; + valStr = rowStr.substring("#@ ".length()).trim(); + String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); + for(int i = 0; i < valList.length; i++) + 
dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); + } + else if(rowStr.startsWith("#o ")) { + col = colBegin + paperMetaData.getAffiliationStartCol(); + valStr = rowStr.substring("#o ".length()).trim(); + String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); + for(int i = 0; i < valList.length; i++) + dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); + } + else if(rowStr.startsWith("#t ")) { + col = colBegin + 2; + valStr = rowStr.substring("#t ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#c ")) { + col = colBegin + 3; + valStr = rowStr.substring("#c ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#! ")) { + col = colBegin + 4; + valStr = rowStr.substring("#! ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#% ")) { + col = colBegin + paperMetaData.referenceStartCol; + valStr = rowStr.substring("#%
".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + } + + // #index ---- index id of this author + // #n ---- name (separated by semicolons) + // #a ---- affiliations (separated by semicolons) + // #pc ---- the count of published papers of this author + // #cn ---- the total number of citations of this author + // #hi ---- the H-index of this author + // #pi ---- the P-index with equal A-index of this author + // #upi ---- the P-index with unequal A-index of this author + // #t ---- research interests of this author (separated by semicolons) + protected final void readAMinerAuthorFrameFromInputSplit(InputSplit split, ArrayList rowIndex, InputFormat informat, + JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { + + // create record reader + RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + int row, col; + int index = -1; + String valStr; + try { + while(reader.next(key, value)) // foreach line + { + index++; + String rowStr = value.toString().trim(); + if(rowStr.length() == 0) + continue; + row = rowIndex.get(index); + + if(rowStr.startsWith("#index ")) { + col = 0; + valStr = rowStr.substring("#index ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#n ")) { + col = 1; + valStr = rowStr.substring("#n ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#a ")) { + col = authorMetaData.getAffiliationStartCol(); + valStr = rowStr.substring("#a ".length()).trim(); + String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); + for(int i = 0; i < valList.length; i++) + dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); + } + else if(rowStr.startsWith("#t ")) { + col = 
authorMetaData.getResearchInterestStartCol(); + valStr = rowStr.substring("#t ".length()).trim(); + String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); + for(int i = 0; i < valList.length; i++) + dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); + } + + else if(rowStr.startsWith("#pc ")) { + col = 2; + valStr = rowStr.substring("#pc ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#cn ")) { + col = 3; + valStr = rowStr.substring("#cn ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#hi ")) { + col = 4; + valStr = rowStr.substring("#hi ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#pi ")) { + col = 5; + valStr = rowStr.substring("#pi ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + else if(rowStr.startsWith("#upi ")) { + col = 6; + valStr = rowStr.substring("#upi ".length()).trim(); + dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); + } + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + } + + protected DatasetMetaDataPaper computeAMinerSizePaper(JobConf job) throws IOException { + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + this.rowIndexs = new ArrayList[splits.length]; + this.colBeginIndexs = new ArrayList[splits.length]; + + LongWritable key = new LongWritable(); + Text value = new Text(); + int ncol = 5; + int maxAuthors = 0; + int maxAffiliations = 0; + int maxReferences = 0; + int row = -1; + int lastRefCount = 0; + + for(int i = 0; i < splits.length; i++) { + RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); + int refCount = 0; + boolean 
splitRowFlag = false; + this.rowIndexs[i] = new ArrayList<>(); + this.colBeginIndexs[i] = new ArrayList<>(); + while(reader.next(key, value)) { + String raw = value.toString().trim(); + if(raw.startsWith("#index ")) { + row++; + if(splitRowFlag) + maxReferences = Math.max(maxReferences, refCount); + else + maxReferences = Math.max(maxReferences, refCount + lastRefCount); + + splitRowFlag = true; + lastRefCount = refCount; + refCount = 0; + this.colBeginIndexs[i].add(0); + } + else if(raw.startsWith("#@")) { //authors (separated by semicolons) + maxAuthors = Math.max(maxAuthors, IOUtilFunctions.countTokensCSV(raw, ";")); + this.colBeginIndexs[i].add(0); + } + else if(raw.startsWith("#o")) { //(separated by semicolons, and each affiliaiton corresponds to an author in order) + maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); + this.colBeginIndexs[i].add(0); + } + else if(raw.startsWith("#%")) { // the id of references of this paper (there are multiple lines, with each indicating a reference) + + if(!splitRowFlag) + this.colBeginIndexs[i].add(refCount + lastRefCount); + else + this.colBeginIndexs[i].add(refCount); + refCount++; + } + else + this.colBeginIndexs[i].add(0); + + this.rowIndexs[i].add(row); + } + } + ncol += maxAuthors + maxAffiliations + maxReferences; + + DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations); + return datasetMetaDataPaper; + } + + protected DatasetMetaDataAuthor computeAMinerSizeAuthor(JobConf job) throws IOException { + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + this.rowIndexs = new ArrayList[splits.length]; + this.colBeginIndexs = new ArrayList[splits.length]; + + LongWritable key = new LongWritable(); + Text value = new Text(); + int ncol = 7; + int maxAffiliations = 0; + int maxResearchInterest = 0; + 
int row = -1; + + for(int i = 0; i < splits.length; i++) { + RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); + this.rowIndexs[i] = new ArrayList<>(); + this.colBeginIndexs[i] = new ArrayList<>(); + while(reader.next(key, value)) { + String raw = value.toString().trim(); + if(raw.startsWith("#index ")) + row++; + else if(raw.startsWith("#a")) //affiliations (separated by semicolons) + maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); + + else if(raw.startsWith("#t")) // research interests of this author (separated by semicolons) + maxResearchInterest = Math.max(maxResearchInterest, IOUtilFunctions.countTokensCSV(raw, ";")); + + this.rowIndexs[i].add(row); + } + } + ncol += maxAffiliations + maxResearchInterest; + + DatasetMetaDataAuthor datasetMetaDataAuthor = new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); + return datasetMetaDataAuthor; + } + + protected static abstract class DatasetMetaData { + protected final int ncol; + protected final int nrow; + protected ValueType[] schema; + protected String[] names; + private int affiliationStartCol; + + public DatasetMetaData(int ncol, int nrow, int affiliationStartCol) { + this.ncol = ncol; + this.nrow = nrow; + this.names = new String[ncol]; + this.affiliationStartCol = affiliationStartCol; + for(int i = 0; i < ncol; i++) + this.names[i] = "col_" + i; + } + + public String[] getNames() { + return names; + } + + public ValueType[] getSchema() { + return schema; + } + + public int getAffiliationStartCol() { + return affiliationStartCol; + } + + public int getNcol() { + return ncol; + } + + public int getNrow() { + return nrow; + } + } + + protected static class DatasetMetaDataPaper extends DatasetMetaData { + private final int authorStartCol; + private final int referenceStartCol; + + public DatasetMetaDataPaper(int ncol, int nrow, int maxAuthor, int maxAffiliation) { + super(ncol, nrow, 5 + maxAuthor); + this.schema = new 
ValueType[ncol]; + this.schema[0] = ValueType.INT64; // index id of this paper + this.schema[1] = ValueType.STRING; //paper title + this.schema[2] = ValueType.INT32; //year + this.schema[3] = ValueType.STRING; //publication venue + this.schema[4] = ValueType.STRING; // abstract + + for(int i = 5; i < maxAuthor + maxAffiliation + 5; i++) + this.schema[i] = ValueType.STRING; + + for(int i = maxAuthor + maxAffiliation + 5; i < ncol; i++) + this.schema[i] = ValueType.FP64; + + this.authorStartCol = 5; + this.referenceStartCol = maxAuthor + maxAffiliation + 5; + } + + public int getAuthorStartCol() { + return authorStartCol; + } + + public int getReferenceStartCol() { + return referenceStartCol; + } + } + + protected static class DatasetMetaDataAuthor extends DatasetMetaData { + private final int researchInterestStartCol; + + public DatasetMetaDataAuthor(int ncol, int nrow, int maxAffiliation, int maxResearchInterest) { + super(ncol, nrow, 7); + this.schema = new ValueType[ncol]; + this.schema[0] = ValueType.INT64; // index id of this author + this.schema[1] = ValueType.STRING; // name (separated by semicolons) + this.schema[2] = ValueType.INT32; // the count of published papers of this author + this.schema[3] = ValueType.INT32; // the total number of citations of this author + this.schema[4] = ValueType.FP32; // the H-index of this author + this.schema[5] = ValueType.FP32; // the P-index with equal A-index of this author + this.schema[6] = ValueType.FP32; // the P-index with unequal A-index of this author + + for(int i = 7; i < maxAffiliation + maxResearchInterest + 7; i++) + this.schema[i] = ValueType.STRING; + this.researchInterestStartCol = 7 + maxAffiliation; + } + + public int getResearchInterestStartCol() { + return researchInterestStartCol; + } + } +} From 31fe3647de369cee054ebe67f4c5baf6778cb9db Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 22 Jun 2022 01:20:37 +0200 Subject: [PATCH 67/84] Add AMiner Dataset Parallel Reader to SystemDS --- 
.../runtime/io/FrameReaderTextAMiner.java | 69 +++- .../io/FrameReaderTextAMinerParallel.java | 316 ++++++++++++++++++ 2 files changed, 375 insertions(+), 10 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java index 49ca9ccd1ae..1e136c7d338 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java @@ -43,17 +43,17 @@ public class FrameReaderTextAMiner extends FrameReader { protected final FileFormatPropertiesAMiner _props; - private DatasetMetaDataPaper paperMetaData; - private DatasetMetaDataAuthor authorMetaData; - private ArrayList[] rowIndexs; - private ArrayList[] colBeginIndexs; + protected DatasetMetaDataPaper paperMetaData; + protected DatasetMetaDataAuthor authorMetaData; + protected ArrayList[] rowIndexs; + protected ArrayList[] colBeginIndexs; public FrameReaderTextAMiner(FileFormatPropertiesAMiner props) { //if unspecified use default properties for robustness _props = props; } - @Override public final FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) + @Override public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException { LOG.debug("readFrameFromHDFS AMiner"); // prepare file access @@ -361,7 +361,7 @@ else if(raw.startsWith("#%")) { // the id of references of this paper (there are } ncol += maxAuthors + maxAffiliations + maxReferences; - DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations); + DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations, maxReferences); return datasetMetaDataPaper; } @@ 
-399,8 +399,7 @@ else if(raw.startsWith("#t")) // research interests of this author (separated b } ncol += maxAffiliations + maxResearchInterest; - DatasetMetaDataAuthor datasetMetaDataAuthor = new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); - return datasetMetaDataAuthor; + return new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); } protected static abstract class DatasetMetaData { @@ -408,7 +407,12 @@ protected static abstract class DatasetMetaData { protected final int nrow; protected ValueType[] schema; protected String[] names; - private int affiliationStartCol; + private final int affiliationStartCol; + protected int index; + protected int maxAffiliation = 0; + protected int maxResearchInterest = 0; + protected int maxReference = 0; + protected int maxAuthor = 0; public DatasetMetaData(int ncol, int nrow, int affiliationStartCol) { this.ncol = ncol; @@ -438,13 +442,53 @@ public int getNcol() { public int getNrow() { return nrow; } + + public int getIndex() { + return index; + } + + public void setIndex(int index) { + this.index = index; + } + + public int getMaxAffiliation() { + return maxAffiliation; + } + + public void setMaxAffiliation(int maxAffiliation) { + this.maxAffiliation = maxAffiliation; + } + + public int getMaxResearchInterest() { + return maxResearchInterest; + } + + public void setMaxResearchInterest(int maxResearchInterest) { + this.maxResearchInterest = maxResearchInterest; + } + + public int getMaxReference() { + return maxReference; + } + + public void setMaxReference(int maxReference) { + this.maxReference = maxReference; + } + + public int getMaxAuthor() { + return maxAuthor; + } + + public void setMaxAuthor(int maxAuthor) { + this.maxAuthor = maxAuthor; + } } protected static class DatasetMetaDataPaper extends DatasetMetaData { private final int authorStartCol; private final int referenceStartCol; - public DatasetMetaDataPaper(int ncol, int nrow, int maxAuthor, int 
maxAffiliation) { + public DatasetMetaDataPaper(int ncol, int nrow, int maxAuthor, int maxAffiliation, int maxReference) { super(ncol, nrow, 5 + maxAuthor); this.schema = new ValueType[ncol]; this.schema[0] = ValueType.INT64; // index id of this paper @@ -452,6 +496,9 @@ public DatasetMetaDataPaper(int ncol, int nrow, int maxAuthor, int maxAffiliatio this.schema[2] = ValueType.INT32; //year this.schema[3] = ValueType.STRING; //publication venue this.schema[4] = ValueType.STRING; // abstract + this.maxAffiliation = maxAffiliation; + this.maxAuthor = maxAuthor; + this.maxReference = maxReference; for(int i = 5; i < maxAuthor + maxAffiliation + 5; i++) this.schema[i] = ValueType.STRING; @@ -485,6 +532,8 @@ public DatasetMetaDataAuthor(int ncol, int nrow, int maxAffiliation, int maxRese this.schema[4] = ValueType.FP32; // the H-index of this author this.schema[5] = ValueType.FP32; // the P-index with equal A-index of this author this.schema[6] = ValueType.FP32; // the P-index with unequal A-index of this author + this.maxAffiliation = maxAffiliation; + this.maxResearchInterest = maxResearchInterest; for(int i = 7; i < maxAffiliation + maxResearchInterest + 7; i++) this.schema[i] = ValueType.STRING; diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java new file mode 100644 index 00000000000..8135c514129 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java @@ -0,0 +1,316 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.io; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.common.Types; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.conf.ConfigurationManager; +import org.apache.sysds.hops.OptimizerUtils; +import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.Pair; +import org.apache.sysds.runtime.transform.TfUtils; +import org.apache.sysds.runtime.util.CommonThreadPool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; + +/** + * Multi-threaded frame text AMiner reader. 
+ */ +public class FrameReaderTextAMinerParallel extends FrameReaderTextAMiner { + protected int _numThreads; + + public FrameReaderTextAMinerParallel(FileFormatPropertiesAMiner props) { + super(props); + this._numThreads = OptimizerUtils.getParallelTextReadParallelism(); + } + + @Override public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException { + + //prepare file access + JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); + Path path = new Path(fname); + FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + + InputSplit[] splits = informat.getSplits(job, _numThreads); + splits = IOUtilFunctions.sortInputSplits(splits); + + // check existence and non-empty file + checkValidInputFile(fs, path); + + FrameBlock ret = readAMinerHDFS(splits, informat, job); + + return ret; + } + + protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informat, JobConf job) throws IOException { + try { + ExecutorService pool = CommonThreadPool.get(Math.min(_numThreads, splits.length)); + this.rowIndexs = new ArrayList[splits.length]; + this.colBeginIndexs = new ArrayList[splits.length]; + + //compute num rows per split + ArrayList tasks = new ArrayList<>(); + for(int i = 0; i < splits.length; i++) { + rowIndexs[i] = new ArrayList<>(); + if(_props.getType().equals("author")) + tasks.add(new CountRowsColsTaskAuthor(splits[i], informat, job, rowIndexs[i], i)); + else { + colBeginIndexs[i] = new ArrayList<>(); + tasks.add(new CountRowsColsTaskPaper(splits[i], informat, job, rowIndexs[i], colBeginIndexs[i], i)); + } + } + List> cret = pool.invokeAll(tasks); + + for(Future count : cret) + while(!count.isDone()) + ; + + //compute row offset per split via cumsum on row counts + int offset = 0; + int maxAffiliation = 0; + int 
maxResearchInterest = 0; + int maxReference = 0; + int maxAuthor = 0; + int ncol; + ValueType[] lschema = null; + String[] lnames = null; + + for(Future count : cret) { + DatasetMetaData metaData = count.get(); + ArrayList ri = rowIndexs[metaData.getIndex()]; + if(_props.getType().equals("author")) { + maxResearchInterest = Math.max(maxResearchInterest, metaData.maxResearchInterest); + } + else { + int negativeBeginPos = -1; + int negativeEndPos = -1; + for(int i = 0; i < ri.size(); i++) { + int valIndex = ri.get(i); + if(valIndex == -1) { + if(negativeBeginPos == -1) { + negativeBeginPos = i; + } + negativeEndPos = i; + } + } + if(negativeBeginPos != -1) { + int bcIndex = colBeginIndexs[metaData.getIndex() - 1].get(colBeginIndexs[metaData.getIndex() - 1].size() - 1); + for(int i = negativeBeginPos; i <= negativeEndPos; i++) { + colBeginIndexs[metaData.getIndex()].set(i, i - negativeBeginPos + bcIndex + 1); + } + int tMax = Math.max(bcIndex + negativeEndPos - negativeBeginPos + 1, metaData.maxReference); + metaData.setMaxReference(tMax); + } + maxReference = Math.max(maxReference, metaData.maxReference); + maxAuthor = Math.max(maxAuthor, metaData.maxAuthor); + } + + for(int i = 0; i < ri.size(); i++) + ri.set(i, ri.get(i) + offset); + + maxAffiliation = Math.max(maxAffiliation, metaData.maxAffiliation); + offset += metaData.getNrow(); + } + if(_props.getType().equals("paper")) { + ncol = 5 + maxAuthor + maxAffiliation + maxReference; + this.paperMetaData = new DatasetMetaDataPaper(ncol, offset, maxAuthor, maxAffiliation, maxReference); + lschema = this.paperMetaData.schema; + lnames = this.paperMetaData.names; + } + else { + ncol = 7 + maxAffiliation + maxResearchInterest; + this.authorMetaData = new DatasetMetaDataAuthor(ncol, offset, maxAffiliation, maxResearchInterest); + lschema = this.authorMetaData.schema; + lnames = this.authorMetaData.names; + } + FrameBlock ret = createOutputFrameBlock(lschema, lnames, offset + 1); + //read individual splits + ArrayList 
tasks2 = new ArrayList<>(); + for(int i = 0; i < splits.length; i++) + tasks2.add(new ReadRowsTask(splits[i], rowIndexs[i], colBeginIndexs[i], informat, job, ret, lschema)); + CommonThreadPool.invokeAndShutdown(pool, tasks2); + return ret; + } + catch(Exception e) { + throw new IOException("Failed parallel read of text AMiner input.", e); + } + } + + private static abstract class CountRowsColsTask implements Callable { + protected InputSplit _split = null; + protected Integer _splitIndex; + protected TextInputFormat _informat = null; + protected JobConf _job = null; + protected ArrayList _rowIndex; + protected ArrayList _colBeginIndex; + + public CountRowsColsTask(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, + ArrayList colBeginIndex, int splitIndex) { + _split = split; + _informat = informat; + _job = job; + _rowIndex = rowIndex; + _colBeginIndex = colBeginIndex; + _splitIndex = splitIndex; + } + + @Override public DatasetMetaData call() throws Exception { + return null; + } + } + + private static class CountRowsColsTaskAuthor extends CountRowsColsTask { + + public CountRowsColsTaskAuthor(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, int splitIndex) { + super(split, informat, job, rowIndex, null, splitIndex); + } + + @Override public DatasetMetaDataAuthor call() throws Exception { + RecordReader reader = _informat.getRecordReader(_split, _job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + int ncol = 7; + int maxAffiliations = 0; + int maxResearchInterest = 0; + int row = -1; + + while(reader.next(key, value)) { + String raw = value.toString().trim(); + if(raw.startsWith("#index ")) + row++; + else if(raw.startsWith("#a")) //affiliations (separated by semicolons) + maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); + + else if(raw.startsWith("#t")) // research interests of this author (separated by semicolons) + 
maxResearchInterest = Math.max(maxResearchInterest, IOUtilFunctions.countTokensCSV(raw, ";")); + + this._rowIndex.add(row); + } + + ncol += maxAffiliations + maxResearchInterest; + + DatasetMetaDataAuthor datasetMetaDataAuthor = new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); + datasetMetaDataAuthor.setIndex(_splitIndex); + return datasetMetaDataAuthor; + } + } + + private static class CountRowsColsTaskPaper extends CountRowsColsTask { + + public CountRowsColsTaskPaper(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, + ArrayList colBeginIndex, int splitIndex) { + super(split, informat, job, rowIndex, colBeginIndex, splitIndex); + } + + @Override public DatasetMetaDataPaper call() throws Exception { + RecordReader reader = _informat.getRecordReader(_split, _job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + int ncol = 5; + int maxAuthors = 0; + int maxAffiliations = 0; + int maxReferences = 0; + int row = -1; + int refCount = 0; + + while(reader.next(key, value)) { + String raw = value.toString().trim(); + if(raw.startsWith("#index ")) { + row++; + maxReferences = Math.max(maxReferences, refCount); + refCount = 0; + this._colBeginIndex.add(0); + } + else if(raw.startsWith("#@")) { //authors (separated by semicolons) + maxAuthors = Math.max(maxAuthors, IOUtilFunctions.countTokensCSV(raw, ";")); + this._colBeginIndex.add(0); + } + else if(raw.startsWith("#o")) { //(separated by semicolons, and each affiliaiton corresponds to an author in order) + maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); + this._colBeginIndex.add(0); + } + else if(raw.startsWith("#%")) { // the id of references of this paper (there are multiple lines, with each indicating a reference) + this._colBeginIndex.add(refCount); + refCount++; + } + else + this._colBeginIndex.add(0); + + this._rowIndex.add(row); + } + + ncol += maxAuthors + maxAffiliations + 
maxReferences; + DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations, maxReferences); + datasetMetaDataPaper.setIndex(_splitIndex); + return datasetMetaDataPaper; + } + } + + private class ReadRowsTask implements Callable { + private final InputSplit _split; + private final TextInputFormat _informat; + private final JobConf _job; + private final FrameBlock _dest; + private final ArrayList _rowIndex; + private final ArrayList _colBeginIndex; + private final ValueType[] _schema; + + public ReadRowsTask(InputSplit split, ArrayList rowIndex, ArrayList colBeginIndex, TextInputFormat informat, JobConf job, + FrameBlock dest, ValueType[] schema) { + _split = split; + _informat = informat; + _job = job; + _dest = dest; + _rowIndex = rowIndex; + _colBeginIndex = colBeginIndex; + _schema = schema; + } + + @Override public Object call() throws Exception { + if(_props.getType().equals("paper")) + readAMinerPaperFrameFromInputSplit(_split, _rowIndex, _colBeginIndex, _informat, _job, _dest, _schema); + else + readAMinerAuthorFrameFromInputSplit(_split, _rowIndex, _informat, _job, _dest, _schema); + return null; + } + } +} From 0443417bc01d7460ca51969e9d60d1e15e5f6682 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 22 Jun 2022 03:02:24 +0200 Subject: [PATCH 68/84] Update experiment code --- .../sysds/runtime/iogen/EXP/SystemDS.java | 35 +++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java index 67bbbe0f63b..5a0be36d1ff 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java @@ -26,6 +26,7 @@ public static void main(String[] args) throws IOException, JSONException { String config = null; String schemaMapFileName = null; boolean parallel; + Types.ValueType[] schema; 
Util util = new Util(); schemaFileName = System.getProperty("schemaFileName"); @@ -67,6 +68,7 @@ public static void main(String[] args) throws IOException, JSONException { } catch(Exception exception) { + } if(dataType.equalsIgnoreCase("matrix")) { @@ -110,18 +112,20 @@ public static void main(String[] args) throws IOException, JSONException { matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); } else { - Types.ValueType[] schema = util.getSchema(schemaFileName); - cols = schema.length; - FrameBlock frameBlock = null; + FrameBlock frameBlock = null; if(!parallel) { switch(format) { case "csv": + schema = util.getSchema(schemaFileName); + cols = schema.length; FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); FrameReader frameReader = new FrameReaderTextCSV(propertiesCSV); frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); break; case "json": + schema = util.getSchema(schemaFileName); + cols = schema.length; schemaMapFileName = System.getProperty("schemaMapFileName"); Map schemaMap = util.getSchemaMap(schemaMapFileName); config = System.getProperty("config"); @@ -143,16 +147,31 @@ public static void main(String[] args) throws IOException, JSONException { throw new IOException("JSON Config don't support!!" 
+ config); } break; + + case "aminer-author": + FileFormatPropertiesAMiner propertiesAMinerAuthor = new FileFormatPropertiesAMiner("author"); + FrameReader frAuthor = new FrameReaderTextAMiner(propertiesAMinerAuthor); + frameBlock = frAuthor.readFrameFromHDFS(dataFileName, null, null, -1,-1); + break; + case "aminer-paper": + FileFormatPropertiesAMiner propertiesAMinerPaper = new FileFormatPropertiesAMiner("paper"); + FrameReader frPaper = new FrameReaderTextAMiner(propertiesAMinerPaper); + frameBlock = frPaper.readFrameFromHDFS(dataFileName, null, null, -1,-1); + break; } } else { switch(format) { case "csv": + schema = util.getSchema(schemaFileName); + cols = schema.length; FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); FrameReader frameReader = new FrameReaderTextCSVParallel(propertiesCSV); frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); break; case "json": + schema = util.getSchema(schemaFileName); + cols = schema.length; schemaMapFileName = System.getProperty("schemaMapFileName"); Map schemaMap = util.getSchemaMap(schemaMapFileName); config = System.getProperty("config"); @@ -174,6 +193,16 @@ public static void main(String[] args) throws IOException, JSONException { throw new IOException("JSON Config don't support!!" 
+ config); } break; + case "aminer-author": + FileFormatPropertiesAMiner propertiesAMinerAuthor = new FileFormatPropertiesAMiner("author"); + FrameReader frAuthor = new FrameReaderTextAMinerParallel(propertiesAMinerAuthor); + frameBlock = frAuthor.readFrameFromHDFS(dataFileName, null, null, -1,-1); + break; + case "aminer-paper": + FileFormatPropertiesAMiner propertiesAMinerPaper = new FileFormatPropertiesAMiner("paper"); + FrameReader frPaper = new FrameReaderTextAMinerParallel(propertiesAMinerPaper); + frameBlock = frPaper.readFrameFromHDFS(dataFileName, null, null, -1,-1); + break; } } From 95d8bcebc72bf2886da9fc48f50cb1b2d3dfd774 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 22 Jun 2022 14:52:33 +0200 Subject: [PATCH 69/84] Fix Aminer parallel reader bug --- .../runtime/io/FrameReaderTextAMiner.java | 50 +++++++++---------- .../io/FrameReaderTextAMinerParallel.java | 30 +++++++---- 2 files changed, 45 insertions(+), 35 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java index 1e136c7d338..a76d420cb5b 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java @@ -65,40 +65,52 @@ public FrameReaderTextAMiner(FileFormatPropertiesAMiner props) { // check existence and non-empty file checkValidInputFile(fs, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + ValueType[] lschema = null; String[] lnames = null; if(_props.getType().equals("paper")) { - paperMetaData = computeAMinerSizePaper(job); + paperMetaData = computeAMinerSizePaper(informat,job, splits); rlen = paperMetaData.nrow; lschema = paperMetaData.schema; lnames = paperMetaData.names; } else { - authorMetaData = computeAMinerSizeAuthor(job); + 
authorMetaData = computeAMinerSizeAuthor(informat,job, splits); rlen = authorMetaData.nrow; lschema = authorMetaData.schema; lnames = authorMetaData.names; } FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - // core read (sequential/parallel) - readAMinerFrameFromHDFS(job, ret, lschema); + // core read (sequential/parallel) + readAMinerFrameFromHDFS(informat,job, splits, ret, lschema); return ret; } @Override public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException { + + // TODO: fix stream reader. incomplete LOG.debug("readFrameFromInputStream csv"); ValueType[] lschema = null; String[] lnames = null; + + InputStreamInputFormat informat = new InputStreamInputFormat(is); + InputSplit[] splits = informat.getSplits(null, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + if(_props.getType().equals("paper")) { - paperMetaData = computeAMinerSizePaper(null); + paperMetaData = computeAMinerSizePaper(null,null, splits); rlen = paperMetaData.nrow; lschema = paperMetaData.schema; lnames = paperMetaData.names; } else { - authorMetaData = computeAMinerSizeAuthor(null); + authorMetaData = computeAMinerSizeAuthor(null,null, splits); rlen = authorMetaData.nrow; lschema = authorMetaData.schema; lnames = authorMetaData.names; @@ -106,24 +118,18 @@ public FrameReaderTextAMiner(FileFormatPropertiesAMiner props) { FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); // core read (sequential/parallel) - InputStreamInputFormat informat = new InputStreamInputFormat(is); - InputSplit split = informat.getSplits(null, 1)[0]; if(_props.getType().equals("paper")) { - readAMinerPaperFrameFromInputSplit(split, rowIndexs[0], colBeginIndexs[0], informat, null, ret, schema); + readAMinerPaperFrameFromInputSplit(splits[0], rowIndexs[0], colBeginIndexs[0], informat, null, ret, schema); } else { - readAMinerAuthorFrameFromInputSplit(split, rowIndexs[0], informat, 
null, ret, schema); + readAMinerAuthorFrameFromInputSplit(splits[0], rowIndexs[0], informat, null, ret, schema); } - return ret; + } - protected void readAMinerFrameFromHDFS(JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { + protected void readAMinerFrameFromHDFS(TextInputFormat informat, JobConf job, InputSplit[] splits, FrameBlock dest, ValueType[] schema) throws IOException { LOG.debug("readAMinerFrameFromHDFS csv"); - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); if(_props.getType().equals("paper")) { for(int i = 0; i < splits.length; i++) readAMinerPaperFrameFromInputSplit(splits[i], rowIndexs[i], colBeginIndexs[i], informat, job, dest, schema); @@ -300,11 +306,7 @@ else if(rowStr.startsWith("#upi ")) { } } - protected DatasetMetaDataPaper computeAMinerSizePaper(JobConf job) throws IOException { - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); + protected DatasetMetaDataPaper computeAMinerSizePaper(TextInputFormat informat, JobConf job, InputSplit[] splits) throws IOException { this.rowIndexs = new ArrayList[splits.length]; this.colBeginIndexs = new ArrayList[splits.length]; @@ -365,11 +367,7 @@ else if(raw.startsWith("#%")) { // the id of references of this paper (there are return datasetMetaDataPaper; } - protected DatasetMetaDataAuthor computeAMinerSizeAuthor(JobConf job) throws IOException { - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); + protected DatasetMetaDataAuthor computeAMinerSizeAuthor(TextInputFormat informat, JobConf job, InputSplit[] splits) throws IOException { this.rowIndexs = new ArrayList[splits.length]; 
this.colBeginIndexs = new ArrayList[splits.length]; diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java index 8135c514129..d26960a12d0 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java @@ -41,6 +41,7 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.BitSet; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; @@ -51,6 +52,7 @@ */ public class FrameReaderTextAMinerParallel extends FrameReaderTextAMiner { protected int _numThreads; + protected BitSet[] bitSets; public FrameReaderTextAMinerParallel(FileFormatPropertiesAMiner props) { super(props); @@ -85,6 +87,7 @@ protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informa ExecutorService pool = CommonThreadPool.get(Math.min(_numThreads, splits.length)); this.rowIndexs = new ArrayList[splits.length]; this.colBeginIndexs = new ArrayList[splits.length]; + this.bitSets = new BitSet[splits.length]; //compute num rows per split ArrayList tasks = new ArrayList<>(); @@ -94,7 +97,8 @@ protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informa tasks.add(new CountRowsColsTaskAuthor(splits[i], informat, job, rowIndexs[i], i)); else { colBeginIndexs[i] = new ArrayList<>(); - tasks.add(new CountRowsColsTaskPaper(splits[i], informat, job, rowIndexs[i], colBeginIndexs[i], i)); + bitSets[i] = new BitSet(); + tasks.add(new CountRowsColsTaskPaper(splits[i], informat, job, rowIndexs[i], colBeginIndexs[i], bitSets[i], i)); } } List> cret = pool.invokeAll(tasks); @@ -124,7 +128,8 @@ protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informa int negativeEndPos = -1; for(int i = 0; i < ri.size(); i++) { int valIndex = ri.get(i); - if(valIndex == -1) { + 
if(valIndex == -1 && ((i == 0 && bitSets[metaData.getIndex()].get(i)) || (i > 0 && bitSets[metaData.getIndex()].get( + i) && !bitSets[metaData.getIndex()].get(i - 1)))) { if(negativeBeginPos == -1) { negativeBeginPos = i; } @@ -132,11 +137,13 @@ protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informa } } if(negativeBeginPos != -1) { - int bcIndex = colBeginIndexs[metaData.getIndex() - 1].get(colBeginIndexs[metaData.getIndex() - 1].size() - 1); + int bcIndex = colBeginIndexs[metaData.getIndex() - 1].get(colBeginIndexs[metaData.getIndex() - 1].size() - 1) + 1; + int counter = 0; for(int i = negativeBeginPos; i <= negativeEndPos; i++) { - colBeginIndexs[metaData.getIndex()].set(i, i - negativeBeginPos + bcIndex + 1); + colBeginIndexs[metaData.getIndex()].set(i, counter + bcIndex); + counter++; } - int tMax = Math.max(bcIndex + negativeEndPos - negativeBeginPos + 1, metaData.maxReference); + int tMax = Math.max(bcIndex + counter, metaData.maxReference); metaData.setMaxReference(tMax); } maxReference = Math.max(maxReference, metaData.maxReference); @@ -181,15 +188,17 @@ private static abstract class CountRowsColsTask implements Callable _rowIndex; protected ArrayList _colBeginIndex; + protected BitSet _bitSet; public CountRowsColsTask(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, - ArrayList colBeginIndex, int splitIndex) { + ArrayList colBeginIndex, BitSet bitSet, int splitIndex) { _split = split; _informat = informat; _job = job; _rowIndex = rowIndex; _colBeginIndex = colBeginIndex; _splitIndex = splitIndex; + _bitSet = bitSet; } @Override public DatasetMetaData call() throws Exception { @@ -200,7 +209,7 @@ public CountRowsColsTask(InputSplit split, TextInputFormat informat, JobConf job private static class CountRowsColsTaskAuthor extends CountRowsColsTask { public CountRowsColsTaskAuthor(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, int splitIndex) { - super(split, informat, job, 
rowIndex, null, splitIndex); + super(split, informat, job, rowIndex, null, null, splitIndex); } @Override public DatasetMetaDataAuthor call() throws Exception { @@ -237,8 +246,8 @@ else if(raw.startsWith("#t")) // research interests of this author (separated b private static class CountRowsColsTaskPaper extends CountRowsColsTask { public CountRowsColsTaskPaper(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, - ArrayList colBeginIndex, int splitIndex) { - super(split, informat, job, rowIndex, colBeginIndex, splitIndex); + ArrayList colBeginIndex, BitSet bitSet, int splitIndex) { + super(split, informat, job, rowIndex, colBeginIndex, bitSet, splitIndex); } @Override public DatasetMetaDataPaper call() throws Exception { @@ -251,9 +260,11 @@ public CountRowsColsTaskPaper(InputSplit split, TextInputFormat informat, JobCon int maxReferences = 0; int row = -1; int refCount = 0; + int bIndex = 0; while(reader.next(key, value)) { String raw = value.toString().trim(); + bIndex++; if(raw.startsWith("#index ")) { row++; maxReferences = Math.max(maxReferences, refCount); @@ -270,6 +281,7 @@ else if(raw.startsWith("#o")) { //(separated by semicolons, and each affiliaiton } else if(raw.startsWith("#%")) { // the id of references of this paper (there are multiple lines, with each indicating a reference) this._colBeginIndex.add(refCount); + this._bitSet.set(bIndex); refCount++; } else From f39ef6fbe4ccb1b9ac0c4def5de670d91775b39f Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 23 Jun 2022 11:56:30 +0200 Subject: [PATCH 70/84] Fix Frame code gen bug --- .../runtime/iogen/FormatIdentifying.java | 5 ++++ .../runtime/iogen/codegen/FrameCodeGen.java | 28 +++++++++++-------- .../runtime/iogen/codegen/MatrixCodeGen.java | 27 ++++++++++-------- 3 files changed, 36 insertions(+), 24 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java 
index 1dc5e7d5555..6e167a41e16 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -451,6 +451,11 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } } } + + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist|| + colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist){ + properties.setSparse(true); + } } // check row-index Exist diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 8fa5e9ea9e0..74666310578 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -116,16 +116,16 @@ public String generateCodeJavaParallel() { src.append("int index, endPos, strLen; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("try { \n"); - src.append("int ri = -1; \n"); - src.append("int beginPosStr, endPosStr; \n"); - src.append("StringBuilder sb = new StringBuilder(); \n"); - src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); - src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); - src.append("boolean flag = true; \n"); - src.append("while(flag) { \n"); - src.append("flag = reader.next(key, value); \n"); - src.append("if(flag) { \n"); if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + src.append("int ri = -1; \n"); + src.append("int beginPosStr, endPosStr; \n"); + src.append("StringBuilder sb = new StringBuilder(); \n"); + src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); + src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); + src.append("boolean flag = true; \n"); + src.append("while(flag) { \n"); + 
src.append("flag = reader.next(key, value); \n"); + src.append("if(flag) { \n"); src.append("ri++; \n"); src.append("String valStr = value.toString(); \n"); src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); @@ -142,10 +142,14 @@ public String generateCodeJavaParallel() { src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); src.append("} \n"); + src.append("} \n"); + src.append("else \n"); + src.append("str = sb.toString(); \n"); + } + else { + src.append("while(reader.next(key, value)) { \n"); + src.append("str = value.toString(); \n"); } - src.append("} \n"); - src.append("else \n"); - src.append("str = sb.toString(); \n"); src.append("strLen = str.length(); \n"); src.append(trie.getJavaCode()); src.append("} \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index 0c8b06fa116..c6fe9b06f4e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -64,6 +64,7 @@ public MatrixCodeGen(CustomProperties properties, String className) { "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + "import java.io.IOException;\n" + "import java.util.HashSet; \n" + + "import org.apache.sysds.runtime.util.UtilFunctions; \n"+ "public class "+className+" extends MatrixGenerateReaderParallel {\n" + "\tpublic "+className+"(CustomProperties _props) {\n" + "super(_props);} \n" + "@Override \n" + @@ -139,16 +140,17 @@ public String generateCodeJavaParallel() { src.append("int index, endPos, strLen; \n"); src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("try { \n"); - src.append("int ri = -1; \n"); - src.append("int beginPosStr, endPosStr; \n"); - src.append("StringBuilder sb = new StringBuilder(); 
\n"); - src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); - src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); - src.append("boolean flag = true; \n"); - src.append("while(flag) { \n"); - src.append("flag = reader.next(key, value); \n"); - src.append("if(flag) { \n"); + // seq-scattered if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + src.append("int ri = -1; \n"); + src.append("int beginPosStr, endPosStr; \n"); + src.append("StringBuilder sb = new StringBuilder(); \n"); + src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); + src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); + src.append("boolean flag = true; \n"); + src.append("while(flag) { \n"); + src.append("flag = reader.next(key, value); \n"); + src.append("if(flag) { \n"); src.append("ri++; \n"); src.append("String valStr = value.toString(); \n"); src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); @@ -165,10 +167,11 @@ public String generateCodeJavaParallel() { src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); src.append("} \n"); + //src.append("} \n"); + src.append("else \n"); + src.append("str = sb.toString(); \n"); } - src.append("} \n"); - src.append("else \n"); - src.append("str = sb.toString(); \n"); + src.append("str = value.toString();\n"); src.append("strLen = str.length(); \n"); src.append(trie.getJavaCode()); src.append("} \n"); From c1253a91430b2293221de226ff1104ff4bf36089 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Thu, 23 Jun 2022 15:19:22 +0200 Subject: [PATCH 71/84] Fix frame single thread reader --- .../sysds/runtime/iogen/GenerateReader.java | 4 +- .../runtime/iogen/codegen/FrameCodeGen.java | 83 +---- .../runtime/iogen/codegen/MatrixCodeGen.java | 106 +++---- .../iogen/template/FrameGenerateReader.java | 299 +++++++++++++++--- 
.../template/FrameGenerateReaderParallel.java | 2 +- .../iogen/template/TemplateCodeGenBase.java | 2 - 6 files changed, 325 insertions(+), 171 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index d2707cf5939..5c88438425a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -82,7 +82,7 @@ public MatrixReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String srcJava = properties.isParallel() ? src.generateCodeJavaParallel(): src.generateCodeJava(); + String srcJava = src.generateCodeJava(); matrixReader = (MatrixReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); return matrixReader; } @@ -108,7 +108,7 @@ public FrameReader getReader() throws Exception { // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String srcJava = properties.isParallel() ? src.generateCodeJavaParallel(): src.generateCodeJava(); + String srcJava = src.generateCodeJava(); frameReader = (FrameReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); return frameReader; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 74666310578..39ecc1657f2 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -30,82 +30,28 @@ public FrameCodeGen(CustomProperties properties, String className) { // 1. 
set java code template // 1.a: single thread code gen - if(!properties.isParallel()){ - javaTemplate = "import org.apache.hadoop.io.LongWritable; \n" + - "import org.apache.hadoop.io.Text; \n" + - "import org.apache.hadoop.mapred.InputFormat; \n" + - "import org.apache.hadoop.mapred.InputSplit; \n" + - "import org.apache.hadoop.mapred.JobConf; \n" + - "import org.apache.hadoop.mapred.RecordReader; \n" + - "import org.apache.hadoop.mapred.Reporter; \n" + - "import org.apache.sysds.common.Types; \n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions; \n" + - "import org.apache.sysds.runtime.iogen.CustomProperties; \n" + - "import org.apache.sysds.runtime.matrix.data.FrameBlock; \n" + - "import org.apache.sysds.runtime.iogen.template.FrameGenerateReader; \n" + - "import java.io.IOException; \n" + - "import java.util.HashSet; \n" + - "public class "+className+" extends FrameGenerateReader{ \n" + - "public "+className+"(CustomProperties _props) { \n" + - " super(_props); \n" + - " } \n" + + String javaBaseClass = !properties.isParallel() ? 
"FrameGenerateReader" : "FrameGenerateReaderParallel"; - "@Override protected int readFrameFromInputSplit(InputSplit split, InputFormat informat, \n" + - " JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, \n" + - " boolean first) throws IOException { \n" + + javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + + "import org.apache.hadoop.io.Text;\n" + + "import org.apache.hadoop.mapred.RecordReader;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.iogen.template."+javaBaseClass+";\n" + + "import org.apache.sysds.runtime.matrix.data.FrameBlock;\n" + + "import java.io.IOException;\n" + "import java.util.HashSet;\n" + + "public class "+className+" extends "+javaBaseClass+" {\n" + + "public "+className+"(CustomProperties _props) {\n" + + "super(_props);} \n" + + "@Override \n" + + "protected int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, " + + "FrameBlock dest, int row, SplitInfo splitInfo) throws IOException {\n"+ code+ "}} \n"; - } - else { - javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + - "import org.apache.hadoop.io.Text;\n" + - "import org.apache.hadoop.mapred.RecordReader;\n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.iogen.template.FrameGenerateReaderParallel;\n" + - "import org.apache.sysds.runtime.matrix.data.FrameBlock;\n" + - "import java.io.IOException;\n" + "import java.util.HashSet;\n" + - "public class "+className+" extends FrameGenerateReaderParallel {\n" + - "public "+className+"(CustomProperties _props) {\n" + - "super(_props);} \n" + - "@Override \n" + - "protected void reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, " + - "FrameBlock dest, int row, SplitInfo splitInfo) throws IOException {\n"+ - code+ - "}} \n"; - } } @Override public String generateCodeJava() { - StringBuilder src = new StringBuilder(); - 
src.append("RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); \n"); - src.append("LongWritable key = new LongWritable(); \n"); - src.append("Text value = new Text(); \n"); - src.append("int row = rl; \n"); - src.append("long lnnz = 0; \n"); - src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); - - src.append("int index, endPos, strLen; \n"); - src.append("try { \n"); - src.append("while(reader.next(key, value)){ \n"); - src.append("String str = value.toString(); \n"); - src.append("strLen = str.length(); \n"); - - CodeGenTrie trie = new CodeGenTrie(properties, "dest.set", false); - src.append(trie.getJavaCode()); - - src.append("}} \n"); - src.append("finally { \n"); - src.append("IOUtilFunctions.closeSilently(reader); \n"); - src.append("} \n"); - src.append("return row; \n"); - - return javaTemplate.replace(code, src.toString()); - } - - @Override - public String generateCodeJavaParallel() { StringBuilder src = new StringBuilder(); CodeGenTrie trie = new CodeGenTrie(properties, "dest.set", false); trie.setMatrix(true); @@ -156,6 +102,7 @@ public String generateCodeJavaParallel() { src.append("} \n"); src.append("catch(Exception ex){ \n"); src.append("} \n"); + src.append("return row; \n"); return javaTemplate.replace(code, src.toString()); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index c6fe9b06f4e..bb786e8a7a6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -128,59 +128,59 @@ public String generateCodeJava() { return javaTemplate.replace(code, src.toString()); } - @Override - public String generateCodeJavaParallel() { - StringBuilder src = new StringBuilder(); - CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); - trie.setMatrix(true); - src.append("String 
str=\"\"; \n"); - src.append("String remainStr = \"\"; \n"); - src.append("int col = -1; \n"); - src.append("long lnnz = 0; \n"); - src.append("int index, endPos, strLen; \n"); - src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); - src.append("try { \n"); - // seq-scattered - if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ - src.append("int ri = -1; \n"); - src.append("int beginPosStr, endPosStr; \n"); - src.append("StringBuilder sb = new StringBuilder(); \n"); - src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); - src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); - src.append("boolean flag = true; \n"); - src.append("while(flag) { \n"); - src.append("flag = reader.next(key, value); \n"); - src.append("if(flag) { \n"); - src.append("ri++; \n"); - src.append("String valStr = value.toString(); \n"); - src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); - src.append("endPosStr = ri == endIndex ? 
splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); - src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); - src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); - src.append("remainStr = valStr.substring(endPosStr); \n"); - src.append("continue; \n"); - src.append("} \n"); - src.append("else { \n"); - src.append("str = sb.toString(); \n"); - src.append("sb = new StringBuilder(); \n"); - src.append("sb.append(remainStr).append(valStr); \n"); - src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); - src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); - src.append("} \n"); - //src.append("} \n"); - src.append("else \n"); - src.append("str = sb.toString(); \n"); - } - src.append("str = value.toString();\n"); - src.append("strLen = str.length(); \n"); - src.append(trie.getJavaCode()); - src.append("} \n"); - src.append("} \n"); - src.append("catch(Exception ex){ \n"); - src.append("} \n"); - src.append("return lnnz; \n"); - return javaTemplate.replace(code, src.toString()); - } +// @Override +// public String generateCodeJavaParallel() { +// StringBuilder src = new StringBuilder(); +// CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); +// trie.setMatrix(true); +// src.append("String str=\"\"; \n"); +// src.append("String remainStr = \"\"; \n"); +// src.append("int col = -1; \n"); +// src.append("long lnnz = 0; \n"); +// src.append("int index, endPos, strLen; \n"); +// src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); +// src.append("try { \n"); +// // seq-scattered +// if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ +// src.append("int ri = -1; \n"); +// src.append("int beginPosStr, endPosStr; \n"); +// src.append("StringBuilder sb = new StringBuilder(); \n"); +// src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); +// src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); +// 
src.append("boolean flag = true; \n"); +// src.append("while(flag) { \n"); +// src.append("flag = reader.next(key, value); \n"); +// src.append("if(flag) { \n"); +// src.append("ri++; \n"); +// src.append("String valStr = value.toString(); \n"); +// src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); +// src.append("endPosStr = ri == endIndex ? splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); +// src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); +// src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); +// src.append("remainStr = valStr.substring(endPosStr); \n"); +// src.append("continue; \n"); +// src.append("} \n"); +// src.append("else { \n"); +// src.append("str = sb.toString(); \n"); +// src.append("sb = new StringBuilder(); \n"); +// src.append("sb.append(remainStr).append(valStr); \n"); +// src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); +// src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); +// src.append("} \n"); +// //src.append("} \n"); +// src.append("else \n"); +// src.append("str = sb.toString(); \n"); +// } +// src.append("str = value.toString();\n"); +// src.append("strLen = str.length(); \n"); +// src.append(trie.getJavaCode()); +// src.append("} \n"); +// src.append("} \n"); +// src.append("catch(Exception ex){ \n"); +// src.append("} \n"); +// src.append("return lnnz; \n"); +// return javaTemplate.replace(code, src.toString()); +// } @Override public String generateCodeCPP() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java index 52063a7b905..6df945d78be 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java @@ -32,45 +32,25 @@ import org.apache.sysds.runtime.iogen.CustomProperties; import 
org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.Pair; import org.apache.sysds.runtime.util.InputStreamInputFormat; -import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashSet; -import java.util.List; public abstract class FrameGenerateReader extends FrameReader { protected CustomProperties _props; + protected SplitOffsetInfos _offsets; public FrameGenerateReader(CustomProperties _props) { this._props = _props; } - private int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { - int rows = 0; - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); - try { - // Row Identify - if(_props.getRowIndexStructure().getProperties().equals(RowIndexStructure.IndexProperties.Identity)) { - while(br.readLine() != null) - rows++; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - } - return rows; - } - @Override - public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, - long clen) throws IOException, DMLRuntimeException { + public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException { // prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); @@ -81,25 +61,131 @@ public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Stri // check existence and non-empty file checkValidInputFile(fs, path); - // compute size if necessary - if(rlen <= 0) { - ArrayList paths = new ArrayList<>(); - paths.add(path); - rlen = getNumRows(paths, fs); - } + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = 
informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); // allocate output frame block Types.ValueType[] lschema = createOutputSchema(schema, clen); String[] lnames = createOutputNames(names, clen); - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - // core read (sequential/parallel) - readFrameFromHDFS(path, job, fs, ret, lschema, lnames, rlen, clen); + FrameBlock ret; + if(rlen <= 0 || _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) + ret = computeSizeAndCreateOutputFrameBlock(job, schema, names, splits, path); + else + ret = createOutputFrameBlock(lschema, lnames, rlen); + readFrameFromHDFS(informat, splits, job, ret); return ret; } + private FrameBlock computeSizeAndCreateOutputFrameBlock(JobConf job, Types.ValueType[] schema, String[] names, InputSplit[] splits, Path path) + throws IOException, DMLRuntimeException { + + //Types.ValueType[] lschema = createOutputSchema(schema, clen); + //String[] lnames = createOutputNames(names, clen); + + FileInputFormat.addInputPath(job, path); + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + int row = 0; + // count rows in parallel per split + try { + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.Identity) { + // compute number of rows + for(InputSplit inputSplit : splits) { + RecordReader reader = informat.getRecordReader(inputSplit, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + try { + // count remaining number of rows, ignore meta data + while(reader.next(key, value)) { + row++; + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + } + } + else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { + _offsets = new SplitOffsetInfos(splits.length); + int splitIndex = 0; + for(InputSplit inputSplit : splits) { + int nrows = 0; + SplitInfo splitInfo = new 
SplitInfo(); + ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(inputSplit, informat, job, + _props.getRowIndexStructure().getSeqBeginString()); + + ArrayList> endIndexes; + int tokenLength = 0; + if(!_props.getRowIndexStructure().getSeqBeginString().equals(_props.getRowIndexStructure().getSeqEndString())) { + endIndexes = getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); + tokenLength = _props.getRowIndexStructure().getSeqEndString().length(); + } + else { + endIndexes = new ArrayList<>(); + for(int i = 1; i < beginIndexes.size(); i++) + endIndexes.add(beginIndexes.get(i)); + } + + int i = 0; + int j = 0; + while(i < beginIndexes.size() && j < endIndexes.size()) { + Pair p1 = beginIndexes.get(i); + Pair p2 = endIndexes.get(j); + int n = 0; + while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { + n++; + i++; + if(i == beginIndexes.size()) + break; + p1 = beginIndexes.get(i); + } + j += n - 1; + splitInfo.addIndexAndPosition(beginIndexes.get(i - n).getKey(), endIndexes.get(j).getKey(), beginIndexes.get(i - n).getValue(), + endIndexes.get(j).getValue() + tokenLength); + j++; + nrows++; + } + if(i == beginIndexes.size() && j < endIndexes.size()) + nrows++; + if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) + splitInfo.setRemainString(""); + else { + RecordReader reader = informat.getRecordReader(inputSplit, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + + StringBuilder sb = new StringBuilder(); + for(int ri = 0; ri < beginIndexes.get(0).getKey(); ri++) { + reader.next(key, value); + String raw = value.toString(); + sb.append(raw); + } + if(beginIndexes.get(0).getValue() != 0) { + reader.next(key, value); + sb.append(value.toString().substring(0, beginIndexes.get(0).getValue())); + } + splitInfo.setRemainString(sb.toString()); + } + splitInfo.setNrows(nrows); + 
_offsets.setSeqOffsetPerSplit(splitIndex, splitInfo); + _offsets.setOffsetPerSplit(splitIndex, row); + row += nrows; + splitIndex++; + } + } + } + catch(Exception e) { + throw new IOException("Thread pool Error " + e.getMessage(), e); + } + FrameBlock ret = createOutputFrameBlock(schema, names, row); + return ret; + } + @Override public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, long rlen, long clen) throws IOException, DMLRuntimeException { @@ -112,25 +198,28 @@ public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] sch // core read (sequential/parallel) InputStreamInputFormat informat = new InputStreamInputFormat(is); InputSplit split = informat.getSplits(null, 1)[0]; - readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); + //readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); return ret; } - protected void readFrameFromHDFS(Path path, JobConf job, FileSystem fs, FrameBlock dest, Types.ValueType[] schema, - String[] names, long rlen, long clen) throws IOException { - - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - for(int i = 0, rpos = 0; i < splits.length; i++) - rpos = readFrameFromInputSplit(splits[i], informat, job, dest, schema, names, rlen, clen, rpos, i == 0); + protected void readFrameFromHDFS(TextInputFormat informat, InputSplit[] splits, JobConf job, FrameBlock dest) throws IOException { + int rpos = 0; + for(int i = 0; i < splits.length; i++) { + RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + SplitInfo splitInfo = null; + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + splitInfo = _offsets.getSeqOffsetPerSplit(i); + rpos = 
_offsets.getOffsetPerSplit(i); + } + reaFrameFromHDFS(reader, key, value, dest, rpos, splitInfo); + } } - protected abstract int readFrameFromInputSplit(InputSplit split, InputFormat informat, - JobConf job, FrameBlock dest, Types.ValueType[] schema, String[] names, long rlen, long clen, int rl, - boolean first) throws IOException; + protected abstract int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, + int rowPos, SplitInfo splitInfo) throws IOException; protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { int endPos = strLen; @@ -142,4 +231,124 @@ protected int getEndPos(String str, int strLen, int currPos, HashSet end return endPos; } + private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, + String token) throws IOException { + RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + ArrayList> result = new ArrayList<>(); + + int ri = 0; + while (reader.next(key, value)){ + String raw = value.toString(); + int index; + int fromIndex = 0; + do { + index = raw.indexOf(token, fromIndex); + if(index !=-1){ + result.add(new Pair<>(ri, index)); + fromIndex = index+token.length(); + } + else + break; + }while(true); + ri++; + } + return result; + } + + private static class SplitOffsetInfos { + // offset & length info per split + private int[] offsetPerSplit = null; + private int[] lenghtPerSplit = null; + private SplitInfo[] seqOffsetPerSplit = null; + + public SplitOffsetInfos(int numSplits) { + lenghtPerSplit = new int[numSplits]; + offsetPerSplit = new int[numSplits]; + seqOffsetPerSplit = new SplitInfo[numSplits]; + } + + public int getLenghtPerSplit(int split) { + return lenghtPerSplit[split]; + } + + public void setLenghtPerSplit(int split, int r) { + lenghtPerSplit[split] = r; + } + + public int getOffsetPerSplit(int split) { + return 
offsetPerSplit[split]; + } + + public void setOffsetPerSplit(int split, int o) { + offsetPerSplit[split] = o; + } + + public SplitInfo getSeqOffsetPerSplit(int split) { + return seqOffsetPerSplit[split]; + } + + public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { + seqOffsetPerSplit[split] = splitInfo; + } + } + + protected static class SplitInfo{ + private int nrows; + private ArrayList recordIndexBegin; + private ArrayList recordIndexEnd; + private ArrayList recordPositionBegin; + private ArrayList recordPositionEnd; + private String remainString; + + public SplitInfo() { + recordIndexBegin = new ArrayList<>(); + recordIndexEnd = new ArrayList<>(); + recordPositionBegin = new ArrayList<>(); + recordPositionEnd = new ArrayList<>(); + } + + public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ + recordIndexBegin.add(beginIndex); + recordIndexEnd.add(endIndex); + recordPositionBegin.add(beginPos); + recordPositionEnd.add(endPos); + } + + public int getNrows() { + return nrows; + } + + public void setNrows(int nrows) { + this.nrows = nrows; + } + + public String getRemainString() { + return remainString; + } + + public void setRemainString(String remainString) { + this.remainString = remainString; + } + + public int getRecordIndexBegin(int index) { + return recordIndexBegin.get(index); + } + + public int getRecordIndexEnd(int index) { + return recordIndexEnd.get(index); + } + + public int getRecordPositionBegin(int index) { + return recordPositionBegin.get(index); + } + + public int getRecordPositionEnd(int index) { + return recordPositionEnd.get(index); + } + } + + + } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index 86aa19d0d41..61347167316 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ 
b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -449,6 +449,6 @@ private static ArrayList> getTokenIndexOnMultiLineRecords return result; } - protected abstract void reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, + protected abstract int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, SplitInfo splitInfo) throws IOException; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java index 7ab5d91a306..fd813fe96fe 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateCodeGenBase.java @@ -38,7 +38,5 @@ public TemplateCodeGenBase(CustomProperties properties, String className) { public abstract String generateCodeJava(); - public abstract String generateCodeJavaParallel(); - public abstract String generateCodeCPP(); } From 36b1c988377bfff4e843e6a7a82c2941886793d0 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 24 Jun 2022 02:03:31 +0200 Subject: [PATCH 72/84] Fix some bugs --- .../runtime/iogen/ColIndexStructure.java | 8 +- .../sysds/runtime/iogen/CustomProperties.java | 14 +- .../runtime/iogen/FormatIdentifying.java | 27 +- .../runtime/iogen/RowIndexStructure.java | 8 +- .../runtime/iogen/codegen/CodeGenTrie.java | 7 +- .../iogen/codegen/CodeGenTrieNode.java | 6 +- .../runtime/iogen/codegen/FrameCodeGen.java | 10 +- .../runtime/iogen/codegen/MatrixCodeGen.java | 171 +++++------- .../iogen/template/FrameGenerateReader.java | 158 +---------- .../template/FrameGenerateReaderParallel.java | 195 ++++--------- .../iogen/template/MatrixGenerateReader.java | 257 ++++++++++-------- .../MatrixGenerateReaderParallel.java | 176 ++---------- .../runtime/iogen/template/TemplateUtil.java | 198 ++++++++++++++ 13 files 
changed, 533 insertions(+), 702 deletions(-) create mode 100644 src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java index f1e88275888..25521c44ac6 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ColIndexStructure.java @@ -47,8 +47,12 @@ public ColIndexStructure() { private String valueDelim; public HashSet endWithValueStrings() { - HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); - return endWithValueString; + if(keyPattern!=null) { + HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); + return endWithValueString; + } + else + return null; } public IndexProperties getProperties() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 28df09ec60c..3c0b2712c6a 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -83,11 +83,15 @@ public void setSchema(Types.ValueType[] schema) { } public HashSet[] endWithValueStrings() { - HashSet[] endWithValueString = new HashSet[colKeyPatterns.length]; - for(int i = 0; i < colKeyPatterns.length; i++) - if(colKeyPatterns[i] != null) - endWithValueString[i] = colKeyPatterns[i].getFirstSuffixKeyPatterns(); - return endWithValueString; + if(colKeyPatterns !=null) { + HashSet[] endWithValueString = new HashSet[colKeyPatterns.length]; + for(int i = 0; i < colKeyPatterns.length; i++) + if(colKeyPatterns[i] != null) + endWithValueString[i] = colKeyPatterns[i].getFirstSuffixKeyPatterns(); + return endWithValueString; + } + else + return null; } public KeyTrie getValueKeyPattern() { diff --git 
a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 6e167a41e16..d2e08e20682 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -180,25 +180,26 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } // build key pattern for row index - int numberOfSelectedCols = 3; + int numberOfSelectedCols = (int) (ncols * 0.1); + int numberOfSelectedRows = (int) (nrows * 0.1); int begin = rowIndexStructure.getRowIndexBegin(); boolean check, flagReconstruct; - int[] selectedRowIndex = new int[2]; + int[] selectedRowIndex = new int[numberOfSelectedRows]; KeyTrie rowKeyPattern = null; // Select two none zero row as a row index candidate int index = 0; - for(int r = 1; r < nrows; r++) { + for(int r = 1; r 1) + if(index >= numberOfSelectedRows) break; } - for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + for(int c = ncols -1; c >= Math.max(ncols - numberOfSelectedCols, 0); c--) { Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); ArrayList prefixStringRowIndexes = colPrefixString.getValue(); @@ -226,12 +227,12 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I prefixRawIndex.add(new RawIndex(sb.toString())); } } - if(c == numberOfSelectedCols - 1) { + if(c == ncols - 1) { ArrayList rowPrefixStrings = new ArrayList<>(); MappingTrie rowTrie = new MappingTrie(); rowKeyPattern = new KeyTrie(); for(int si : selectedRowIndex) { - for(int ci = 0; ci < ncols; ci++) { + for(int ci = ncols - 1; ci >=0; ci--) { int cri = mapRow[si][ci]; if(cri != -1) { String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); @@ -283,22 +284,22 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I // build 
key pattern for column index begin = colIndexStructure.getColIndexBegin(); - int[] selectedColIndex = new int[2]; + int[] selectedColIndex = new int[numberOfSelectedCols]; KeyTrie colKeyPattern = null; // Select two none zero row as a row index candidate index = 0; - for(int c = 0; c < ncols; c++) { + for(int c = ncols - 1; c>=0; c--) { for(int r = 1; r < nrows; r++) if(mapRow[r][c] != -1) { selectedColIndex[index++] = c; break; } - if(index > 1) + if(index >= numberOfSelectedCols) break; } - for(int c = 0; c < Math.min(numberOfSelectedCols, ncols); c++) { + for(int c = ncols -1; c >= Math.max(ncols - numberOfSelectedCols, 0); c--) { Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); ArrayList prefixStringRowIndexes = colPrefixString.getValue(); @@ -326,7 +327,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I prefixRawIndex.add(new RawIndex(sb.toString())); } } - if(c == numberOfSelectedCols - 1) { + if(c == ncols - 1) { ArrayList colPrefixStrings = new ArrayList<>(); MappingTrie colTrie = new MappingTrie(); colKeyPattern = new KeyTrie(); @@ -646,7 +647,7 @@ private int checkRowIndexesOnColumnRaw(int colIndex, int beginPos) { } } - if(nne > nrows * 0.3) { + if(nne > 0) { if(beginPos == 1) return -1; else diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java index 6e0ec3343e6..55211a0b257 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RowIndexStructure.java @@ -48,8 +48,12 @@ public RowIndexStructure() { private String seqEndString; public HashSet endWithValueStrings() { - HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); - return endWithValueString; + if(keyPattern!=null) { + HashSet endWithValueString = keyPattern.getFirstSuffixKeyPatterns(); + 
return endWithValueString; + } + else + return null; } public IndexProperties getProperties() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index fe7cf08e57a..f3a6bce692b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -145,11 +145,11 @@ else if(rowIndex == RowIndexStructure.IndexProperties.Identity && src.append("String strIndexValue[] = si.split(\""+ properties.getColIndexStructure().getIndexDelim()+"\", -1); \n"); src.append("if(strIndexValue.length == 2){ \n"); src.append("col = UtilFunctions.parseToInt(strIndexValue[0]); \n"); - src.append("if(col < "+ncols+"){ \n"); + src.append("if(col <= "+ncols+"){ \n"); if(this.isMatrix){ src.append("try{ \n"); - src.append(destination).append("(row, col, Double.parseDouble(strIndexValue[1]); \n"); - src.append("} catch(Exception e){"+destination+".append(row, col, 0d);} \n"); + src.append(destination).append("(row, col, Double.parseDouble(strIndexValue[1])); \n"); + src.append("} catch(Exception e){"+destination+"(row, col, 0d);} \n"); } else { src.append(destination).append("(row, col, UtilFunctions.stringToObject(_props.getSchema()[col], strIndexValue[1]); \n"); @@ -157,6 +157,7 @@ else if(rowIndex == RowIndexStructure.IndexProperties.Identity && src.append("} \n"); src.append("} \n"); src.append("} \n"); + src.append("row++; \n"); } return src.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java index 51051c7406c..1079cc130e5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrieNode.java @@ -83,7 +83,7 @@ private String getIndexCode(String currPos) { else ewvs = 
"endWithValueStringCol"; - src.append("endPos = getEndPos(str, strLen, " + currPos + "," + ewvs + "); \n"); + src.append("endPos = TemplateUtil.getEndPos(str, strLen, " + currPos + "," + ewvs + "); \n"); subStr = "str.substring(" + currPos + ",endPos)"; src.append("try{ \n"); if(this.colIndex.equals("0")) { @@ -107,9 +107,9 @@ private String getColValueCode(String destination, String currPos) { StringBuilder src = new StringBuilder(); if(this.colIndex.equals("col")) - src.append("endPos = getEndPos(str, strLen, " + currPos + ", endWithValueStringVal); \n"); + src.append("endPos = TemplateUtil.getEndPos(str, strLen, " + currPos + ", endWithValueStringVal); \n"); else - src.append("endPos = getEndPos(str, strLen, " + currPos + ", endWithValueString[" + colIndex + "]); \n"); + src.append("endPos = TemplateUtil.getEndPos(str, strLen, " + currPos + ", endWithValueString[" + colIndex + "]); \n"); src.append("String cellStr" + colIndex + " = str.substring(" + currPos + ",endPos); \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 39ecc1657f2..7e1a5b18422 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -29,7 +29,6 @@ public FrameCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. set java code template - // 1.a: single thread code gen String javaBaseClass = !properties.isParallel() ? 
"FrameGenerateReader" : "FrameGenerateReaderParallel"; javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + @@ -38,13 +37,16 @@ public FrameCodeGen(CustomProperties properties, String className) { "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + "import org.apache.sysds.runtime.iogen.template."+javaBaseClass+";\n" + "import org.apache.sysds.runtime.matrix.data.FrameBlock;\n" + - "import java.io.IOException;\n" + "import java.util.HashSet;\n" + + "import java.io.IOException;\n" + + "import java.util.HashSet;\n" + + "import org.apache.sysds.runtime.iogen.template.TemplateUtil; \n" + + "import org.apache.sysds.runtime.util.UtilFunctions; \n" + "public class "+className+" extends "+javaBaseClass+" {\n" + "public "+className+"(CustomProperties _props) {\n" + "super(_props);} \n" + "@Override \n" + - "protected int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, " + - "FrameBlock dest, int row, SplitInfo splitInfo) throws IOException {\n"+ + "protected int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, " + + "FrameBlock dest, int row, TemplateUtil.SplitInfo splitInfo) throws IOException {\n"+ code+ "}} \n"; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index bb786e8a7a6..edd5966c821 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -21,7 +21,6 @@ import org.apache.sysds.runtime.iogen.ColIndexStructure; import org.apache.sysds.runtime.iogen.CustomProperties; -import org.apache.sysds.runtime.iogen.MappingProperties; import org.apache.sysds.runtime.iogen.RowIndexStructure; import org.apache.sysds.runtime.iogen.template.TemplateCodeGenBase; @@ -31,66 +30,42 @@ public MatrixCodeGen(CustomProperties properties, String className) { super(properties, className); // 1. 
set java code template - // 1.a: single thread code gen - if(!properties.isParallel()){ - javaTemplate = "import org.apache.commons.lang.mutable.MutableInt;\n" + - "import org.apache.sysds.runtime.io.IOUtilFunctions;\n" + - "import java.util.HashMap;\n" + - "import java.util.HashSet;\n" + - "import java.util.regex.Matcher;\n" + - "import java.util.regex.Pattern; \n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReader; \n" + - "import java.io.BufferedReader;\n" + - "import java.io.IOException;\n" + - "import java.io.InputStream;\n" + - "import java.io.InputStreamReader;\n" + - "public class " + className + " extends MatrixGenerateReader {\n" + - " public " + className + "(CustomProperties _props) {\n" + " super(_props);\n" + " }\n" + - " @Override protected long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest,\n" + - " MutableInt rowPos, long rlen, long clen, int blen) throws IOException {\n" + - code + - "}}\n"; - } - // 1.b: multi-thread code gen - else { - javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + - "import org.apache.hadoop.io.Text;\n" + - "import org.apache.hadoop.mapred.RecordReader;\n" + - "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + - "import org.apache.sysds.runtime.iogen.RowIndexStructure;\n" + - "import org.apache.sysds.runtime.iogen.template.MatrixGenerateReaderParallel;\n" + - "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + - "import java.io.IOException;\n" + - "import java.util.HashSet; \n" + - "import org.apache.sysds.runtime.util.UtilFunctions; \n"+ - "public class "+className+" extends MatrixGenerateReaderParallel {\n" + - "\tpublic "+className+"(CustomProperties _props) {\n" + "super(_props);} \n" + - "@Override \n" + - "protected long readMatrixFromHDFS(RecordReader reader, " + - " LongWritable key, Text value, 
MatrixBlock dest, int row,\n" + "SplitInfo splitInfo) throws IOException { \n" + - code + - "}}\n"; + String javaBaseClass = !properties.isParallel() ? "MatrixGenerateReader" : "MatrixGenerateReaderParallel"; + javaTemplate = "import org.apache.hadoop.io.LongWritable;\n" + + "import org.apache.hadoop.io.Text;\n" + + "import org.apache.hadoop.mapred.RecordReader;\n" + + "import org.apache.sysds.runtime.iogen.CustomProperties;\n" + + "import org.apache.sysds.runtime.iogen.template."+javaBaseClass+";\n" + + "import org.apache.sysds.runtime.matrix.data.MatrixBlock;\n" + + "import java.io.IOException;\n" + + "import java.util.HashSet;\n" + + "import org.apache.sysds.runtime.util.UtilFunctions; \n" + + "import org.apache.commons.lang.mutable.MutableInt; \n" + + "import org.apache.sysds.runtime.iogen.template.TemplateUtil; \n" + + "public class "+className+" extends "+javaBaseClass+" {\n" + + "public "+className+"(CustomProperties _props) {\n" + + "super(_props);} \n" + + "@Override \n" + + "protected long readMatrixFromHDFS(RecordReader reader, LongWritable key, Text value, " + + "MatrixBlock dest, MutableInt rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException {\n"+ + code+ + "}} \n"; - } // 2. 
set cpp code template } @Override public String generateCodeJava() { + StringBuilder src = new StringBuilder(); CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); - trie.setMatrix(true); - src.append("String str; \n"); - src.append("int row = rowPos.intValue(); \n"); + + src.append("String str=\"\"; \n"); + src.append("String remainStr = \"\"; \n"); src.append("int col = -1; \n"); + src.append("int row = rowPos.intValue(); \n"); src.append("long lnnz = 0; \n"); src.append("int index, endPos, strLen; \n"); - src.append("BufferedReader br = new BufferedReader(new InputStreamReader(is)); \n"); - if(properties.getMappingProperties().getDataProperties() == MappingProperties.DataProperties.NOTEXIST) { - src.append("double cellValue = "+ properties.getMappingProperties().getPatternValue() +"; \n"); - } boolean flag1 = false; boolean flag2 = false; @@ -112,75 +87,51 @@ public String generateCodeJava() { src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); src.append("try { \n"); - src.append("while((str = br.readLine()) != null){ \n"); + if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + src.append("int ri = -1; \n"); + src.append("int beginPosStr, endPosStr; \n"); + src.append("StringBuilder sb = new StringBuilder(); \n"); + src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); + src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); + src.append("boolean flag = true; \n"); + src.append("while(flag) { \n"); + src.append("flag = reader.next(key, value); \n"); + src.append("if(flag) { \n"); + src.append("ri++; \n"); + src.append("String valStr = value.toString(); \n"); + src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); + src.append("endPosStr = ri == endIndex ? 
splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); + src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); + src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); + src.append("remainStr = valStr.substring(endPosStr); \n"); + src.append("continue; \n"); + src.append("} \n"); + src.append("else { \n"); + src.append("str = sb.toString(); \n"); + src.append("sb = new StringBuilder(); \n"); + src.append("sb.append(remainStr).append(valStr); \n"); + src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); + src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); + src.append("} \n"); + src.append("} \n"); + src.append("else \n"); + src.append("str = sb.toString(); \n"); + } + else { + src.append("while(reader.next(key, value)) { \n"); + src.append("str = value.toString(); \n"); + } src.append("strLen = str.length(); \n"); - src.append(trie.getJavaCode()); - src.append("} \n"); src.append("} \n"); - src.append("finally { \n"); - src.append("IOUtilFunctions.closeSilently(br); \n"); - src.append("}"); + src.append("catch(Exception ex){ \n"); + src.append("} \n"); src.append("rowPos.setValue(row); \n"); src.append("return lnnz; \n"); - return javaTemplate.replace(code, src.toString()); } -// @Override -// public String generateCodeJavaParallel() { -// StringBuilder src = new StringBuilder(); -// CodeGenTrie trie = new CodeGenTrie(properties, "dest.appendValue", true); -// trie.setMatrix(true); -// src.append("String str=\"\"; \n"); -// src.append("String remainStr = \"\"; \n"); -// src.append("int col = -1; \n"); -// src.append("long lnnz = 0; \n"); -// src.append("int index, endPos, strLen; \n"); -// src.append("HashSet[] endWithValueString = _props.endWithValueStrings(); \n"); -// src.append("try { \n"); -// // seq-scattered -// if(properties.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ -// src.append("int ri = -1; \n"); -// src.append("int beginPosStr, endPosStr; \n"); -// 
src.append("StringBuilder sb = new StringBuilder(); \n"); -// src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); -// src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); -// src.append("boolean flag = true; \n"); -// src.append("while(flag) { \n"); -// src.append("flag = reader.next(key, value); \n"); -// src.append("if(flag) { \n"); -// src.append("ri++; \n"); -// src.append("String valStr = value.toString(); \n"); -// src.append("beginPosStr = ri == beginIndex ? splitInfo.getRecordPositionBegin(row) : 0; \n"); -// src.append("endPosStr = ri == endIndex ? splitInfo.getRecordPositionEnd(row): valStr.length(); \n"); -// src.append("if(ri >= beginIndex && ri <= endIndex){ \n"); -// src.append("sb.append(valStr.substring(beginPosStr, endPosStr)); \n"); -// src.append("remainStr = valStr.substring(endPosStr); \n"); -// src.append("continue; \n"); -// src.append("} \n"); -// src.append("else { \n"); -// src.append("str = sb.toString(); \n"); -// src.append("sb = new StringBuilder(); \n"); -// src.append("sb.append(remainStr).append(valStr); \n"); -// src.append("beginIndex = splitInfo.getRecordIndexBegin(row+1); \n"); -// src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); -// src.append("} \n"); -// //src.append("} \n"); -// src.append("else \n"); -// src.append("str = sb.toString(); \n"); -// } -// src.append("str = value.toString();\n"); -// src.append("strLen = str.length(); \n"); -// src.append(trie.getJavaCode()); -// src.append("} \n"); -// src.append("} \n"); -// src.append("catch(Exception ex){ \n"); -// src.append("} \n"); -// src.append("return lnnz; \n"); -// return javaTemplate.replace(code, src.toString()); -// } @Override public String generateCodeCPP() { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java index 6df945d78be..8afb6217491 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java @@ -38,12 +38,11 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashSet; public abstract class FrameGenerateReader extends FrameReader { protected CustomProperties _props; - protected SplitOffsetInfos _offsets; + protected TemplateUtil.SplitOffsetInfos _offsets; public FrameGenerateReader(CustomProperties _props) { this._props = _props; @@ -72,7 +71,7 @@ public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Stri FrameBlock ret; if(rlen <= 0 || _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) - ret = computeSizeAndCreateOutputFrameBlock(job, schema, names, splits, path); + ret = computeSizeAndCreateOutputFrameBlock(informat,job, schema, names, splits, path); else ret = createOutputFrameBlock(lschema, lnames, rlen); @@ -81,15 +80,10 @@ public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Stri } - private FrameBlock computeSizeAndCreateOutputFrameBlock(JobConf job, Types.ValueType[] schema, String[] names, InputSplit[] splits, Path path) + private FrameBlock computeSizeAndCreateOutputFrameBlock(TextInputFormat informat, JobConf job, Types.ValueType[] schema, String[] names, + InputSplit[] splits, Path path) throws IOException, DMLRuntimeException { - //Types.ValueType[] lschema = createOutputSchema(schema, clen); - //String[] lnames = createOutputNames(names, clen); - - FileInputFormat.addInputPath(job, path); - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); int row = 0; // count rows in parallel per split try { @@ -111,18 +105,18 @@ private FrameBlock computeSizeAndCreateOutputFrameBlock(JobConf job, Types.Value } } else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { - 
_offsets = new SplitOffsetInfos(splits.length); + _offsets = new TemplateUtil.SplitOffsetInfos(splits.length); int splitIndex = 0; for(InputSplit inputSplit : splits) { int nrows = 0; - SplitInfo splitInfo = new SplitInfo(); - ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(inputSplit, informat, job, + TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); + ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqBeginString()); ArrayList> endIndexes; int tokenLength = 0; if(!_props.getRowIndexStructure().getSeqBeginString().equals(_props.getRowIndexStructure().getSeqEndString())) { - endIndexes = getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); + endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); tokenLength = _props.getRowIndexStructure().getSeqEndString().length(); } else { @@ -209,145 +203,17 @@ protected void readFrameFromHDFS(TextInputFormat informat, InputSplit[] splits, RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); - SplitInfo splitInfo = null; + TemplateUtil.SplitInfo splitInfo = null; if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ splitInfo = _offsets.getSeqOffsetPerSplit(i); rpos = _offsets.getOffsetPerSplit(i); } - reaFrameFromHDFS(reader, key, value, dest, rpos, splitInfo); - } - } - - protected abstract int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, - int rowPos, SplitInfo splitInfo) throws IOException; - - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { - int endPos = strLen; - for(String d : endWithValueString) { - int pos = d.length()> 0 ? 
str.indexOf(d, currPos): strLen; - if(pos != -1) - endPos = Math.min(endPos, pos); - } - return endPos; - } - - private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, - String token) throws IOException { - RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - ArrayList> result = new ArrayList<>(); - - int ri = 0; - while (reader.next(key, value)){ - String raw = value.toString(); - int index; - int fromIndex = 0; - do { - index = raw.indexOf(token, fromIndex); - if(index !=-1){ - result.add(new Pair<>(ri, index)); - fromIndex = index+token.length(); - } - else - break; - }while(true); - ri++; - } - return result; - } - - private static class SplitOffsetInfos { - // offset & length info per split - private int[] offsetPerSplit = null; - private int[] lenghtPerSplit = null; - private SplitInfo[] seqOffsetPerSplit = null; - - public SplitOffsetInfos(int numSplits) { - lenghtPerSplit = new int[numSplits]; - offsetPerSplit = new int[numSplits]; - seqOffsetPerSplit = new SplitInfo[numSplits]; - } - - public int getLenghtPerSplit(int split) { - return lenghtPerSplit[split]; - } - - public void setLenghtPerSplit(int split, int r) { - lenghtPerSplit[split] = r; - } - - public int getOffsetPerSplit(int split) { - return offsetPerSplit[split]; - } - - public void setOffsetPerSplit(int split, int o) { - offsetPerSplit[split] = o; - } - - public SplitInfo getSeqOffsetPerSplit(int split) { - return seqOffsetPerSplit[split]; - } - - public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { - seqOffsetPerSplit[split] = splitInfo; + readFrameFromHDFS(reader, key, value, dest, rpos, splitInfo); } } - protected static class SplitInfo{ - private int nrows; - private ArrayList recordIndexBegin; - private ArrayList recordIndexEnd; - private ArrayList recordPositionBegin; - private ArrayList recordPositionEnd; - private 
String remainString; - - public SplitInfo() { - recordIndexBegin = new ArrayList<>(); - recordIndexEnd = new ArrayList<>(); - recordPositionBegin = new ArrayList<>(); - recordPositionEnd = new ArrayList<>(); - } - - public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ - recordIndexBegin.add(beginIndex); - recordIndexEnd.add(endIndex); - recordPositionBegin.add(beginPos); - recordPositionEnd.add(endPos); - } - - public int getNrows() { - return nrows; - } - - public void setNrows(int nrows) { - this.nrows = nrows; - } - - public String getRemainString() { - return remainString; - } - - public void setRemainString(String remainString) { - this.remainString = remainString; - } - - public int getRecordIndexBegin(int index) { - return recordIndexBegin.get(index); - } - - public int getRecordIndexEnd(int index) { - return recordIndexEnd.get(index); - } - - public int getRecordPositionBegin(int index) { - return recordPositionBegin.get(index); - } - - public int getRecordPositionEnd(int index) { - return recordPositionEnd.get(index); - } - } + protected abstract int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, + int rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index 61347167316..a9678475ddd 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -55,7 +55,7 @@ public abstract class FrameGenerateReaderParallel extends FrameReader { protected CustomProperties _props; protected int _numThreads; protected JobConf job; - protected SplitOffsetInfos _offsets; + protected TemplateUtil.SplitOffsetInfos _offsets; protected int _rLen; protected int _cLen; @@ -64,9 
+64,8 @@ public FrameGenerateReaderParallel(CustomProperties _props) { this._props = _props; } - @Override - public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, - long clen) throws IOException, DMLRuntimeException { + @Override public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException { //prepare file access job = new JobConf(ConfigurationManager.getCachedJobConf()); @@ -113,7 +112,7 @@ private FrameBlock computeSizeAndCreateOutputFrameBlock(Types.ValueType[] schema // collect row counts for offset computation // early error notify in case not all tasks successful - _offsets = new SplitOffsetInfos(tasks.size()); + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); int i = 0; for(Future rc : pool.invokeAll(tasks)) { int lnrow = (int) rc.get().longValue(); // incl error handling @@ -132,13 +131,13 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index // collect row counts for offset computation // early error notify in case not all tasks successful - _offsets = new SplitOffsetInfos(tasks.size()); + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); int i = 0; - for(Future rc : pool.invokeAll(tasks)) { - SplitInfo splitInfo = rc.get(); + for(Future rc : pool.invokeAll(tasks)) { + TemplateUtil.SplitInfo splitInfo = rc.get(); _offsets.setSeqOffsetPerSplit(i, splitInfo); _offsets.setOffsetPerSplit(i, _rLen); - _rLen = _rLen + splitInfo.nrows; + _rLen = _rLen + splitInfo.getNrows(); i++; } pool.shutdown(); @@ -150,7 +149,7 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index // robustness for wrong dimensions which are already compiled into the plan if(rlen != -1 && _rLen != rlen) { - String msg = "Read frame dimensions differ from meta data: [" + _rLen + "x" + _cLen + "] vs. 
[" + rlen+ "x" + clen + "]."; + String msg = "Read frame dimensions differ from meta data: [" + _rLen + "x" + _cLen + "] vs. [" + rlen + "x" + clen + "]."; if(rlen < _rLen || clen < _cLen) { // a) specified matrix dimensions too small throw new DMLRuntimeException(msg); @@ -167,29 +166,28 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index return ret; } - @Override - public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, - long rlen, long clen) throws IOException, DMLRuntimeException { + @Override public FrameBlock readFrameFromInputStream(InputStream is, Types.ValueType[] schema, String[] names, long rlen, long clen) + throws IOException, DMLRuntimeException { // allocate output frame block InputStreamInputFormat informat = new InputStreamInputFormat(is); InputSplit[] splits = informat.getSplits(null, 1); - FrameBlock ret = computeSizeAndCreateOutputFrameBlock(schema, names, splits,null, rlen, clen); -// -// // core read (sequential/parallel) -// -// -// ReadTask rt = new ReadTask(splits[0], informat, ret, 1) -// -// //readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); -// -// ArrayList tasks = new ArrayList<>(); -// int splitCount = 0; -// for (InputSplit split : splits) { -// tasks.add( new ReadTask(split, informat, dest, splitCount++). ); -// } -// pool.invokeAll(tasks); -// pool.shutdown(); + FrameBlock ret = computeSizeAndCreateOutputFrameBlock(schema, names, splits, null, rlen, clen); + // + // // core read (sequential/parallel) + // + // + // ReadTask rt = new ReadTask(splits[0], informat, ret, 1) + // + // //readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); + // + // ArrayList tasks = new ArrayList<>(); + // int splitCount = 0; + // for (InputSplit split : splits) { + // tasks.add( new ReadTask(split, informat, dest, splitCount++). 
); + // } + // pool.invokeAll(tasks); + // pool.shutdown(); // TODO: implement parallel reader for input stream return ret; } @@ -201,18 +199,18 @@ protected void readFrameFromHDFS(InputSplit[] splits, Path path, JobConf job, Fr informat.configure(job); ExecutorService pool = CommonThreadPool.get(_numThreads); - try{ + try { // create read tasks for all splits ArrayList tasks = new ArrayList<>(); int splitCount = 0; - for (InputSplit split : splits) { - tasks.add( new ReadTask(split, informat, dest, splitCount++) ); + for(InputSplit split : splits) { + tasks.add(new ReadTask(split, informat, dest, splitCount++)); } pool.invokeAll(tasks); pool.shutdown(); } - catch (Exception e) { + catch(Exception e) { throw new IOException("Threadpool issue, while parallel read.", e); } } @@ -220,50 +218,13 @@ protected void readFrameFromHDFS(InputSplit[] splits, Path path, JobConf job, Fr protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { int endPos = strLen; for(String d : endWithValueString) { - int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; + int pos = d.length() > 0 ? 
str.indexOf(d, currPos) : strLen; if(pos != -1) endPos = Math.min(endPos, pos); } return endPos; } - private static class SplitOffsetInfos { - // offset & length info per split - private int[] offsetPerSplit = null; - private int[] lenghtPerSplit = null; - private SplitInfo[] seqOffsetPerSplit = null; - - public SplitOffsetInfos(int numSplits) { - lenghtPerSplit = new int[numSplits]; - offsetPerSplit = new int[numSplits]; - seqOffsetPerSplit = new SplitInfo[numSplits]; - } - - public int getLenghtPerSplit(int split) { - return lenghtPerSplit[split]; - } - - public void setLenghtPerSplit(int split, int r) { - lenghtPerSplit[split] = r; - } - - public int getOffsetPerSplit(int split) { - return offsetPerSplit[split]; - } - - public void setOffsetPerSplit(int split, int o) { - offsetPerSplit[split] = o; - } - - public SplitInfo getSeqOffsetPerSplit(int split) { - return seqOffsetPerSplit[split]; - } - - public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { - seqOffsetPerSplit[split] = splitInfo; - } - } - private class ReadTask implements Callable { private final InputSplit _split; @@ -279,26 +240,25 @@ public ReadTask(InputSplit split, TextInputFormat informat, FrameBlock dest, int _splitCount = splitCount; } - @Override - public Long call() throws IOException { + @Override public Long call() throws IOException { RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); _row = _offsets.getOffsetPerSplit(_splitCount); - SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); - reaFrameFromHDFS(reader, key, value, _dest, _row, _splitInfo); + TemplateUtil.SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); + readFrameFromHDFS(reader, key, value, _dest, _row, _splitInfo); return 0L; } } - private static class CountSeqScatteredRowsTask implements Callable { + private static class CountSeqScatteredRowsTask implements Callable { private final InputSplit 
_split; private final TextInputFormat _inputFormat; private final JobConf _jobConf; private final String _beginString; private final String _endString; - public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, String beginString, String endString){ + public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, String beginString, String endString) { _split = split; _inputFormat = inputFormat; _jobConf = jobConf; @@ -306,9 +266,8 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, _endString = endString; } - @Override - public SplitInfo call() throws Exception { - SplitInfo splitInfo = new SplitInfo(); + @Override public TemplateUtil.SplitInfo call() throws Exception { + TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); int nrows = 0; ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); ArrayList> endIndexes; @@ -336,9 +295,9 @@ public SplitInfo call() throws Exception { break; p1 = beginIndexes.get(i); } - j += n-1; + j += n - 1; splitInfo.addIndexAndPosition(beginIndexes.get(i - n).getKey(), endIndexes.get(j).getKey(), beginIndexes.get(i - n).getValue(), - endIndexes.get(j).getValue()+tokenLength); + endIndexes.get(j).getValue() + tokenLength); j++; nrows++; } @@ -346,13 +305,13 @@ public SplitInfo call() throws Exception { nrows++; if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) splitInfo.setRemainString(""); - else{ + else { RecordReader reader = _inputFormat.getRecordReader(_split, _jobConf, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); StringBuilder sb = new StringBuilder(); - for(int ri = 0; ri< beginIndexes.get(0).getKey(); ri++){ + for(int ri = 0; ri < beginIndexes.get(0).getKey(); ri++) { reader.next(key, value); String raw = value.toString(); sb.append(raw); @@ -368,61 +327,6 @@ public SplitInfo call() throws Exception { 
} } - protected static class SplitInfo{ - private int nrows; - private ArrayList recordIndexBegin; - private ArrayList recordIndexEnd; - private ArrayList recordPositionBegin; - private ArrayList recordPositionEnd; - private String remainString; - - public SplitInfo() { - recordIndexBegin = new ArrayList<>(); - recordIndexEnd = new ArrayList<>(); - recordPositionBegin = new ArrayList<>(); - recordPositionEnd = new ArrayList<>(); - } - - public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ - recordIndexBegin.add(beginIndex); - recordIndexEnd.add(endIndex); - recordPositionBegin.add(beginPos); - recordPositionEnd.add(endPos); - } - - public int getNrows() { - return nrows; - } - - public void setNrows(int nrows) { - this.nrows = nrows; - } - - public String getRemainString() { - return remainString; - } - - public void setRemainString(String remainString) { - this.remainString = remainString; - } - - public int getRecordIndexBegin(int index) { - return recordIndexBegin.get(index); - } - - public int getRecordIndexEnd(int index) { - return recordIndexEnd.get(index); - } - - public int getRecordPositionBegin(int index) { - return recordPositionBegin.get(index); - } - - public int getRecordPositionEnd(int index) { - return recordPositionEnd.get(index); - } - } - private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, String token) throws IOException { RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); @@ -431,24 +335,25 @@ private static ArrayList> getTokenIndexOnMultiLineRecords ArrayList> result = new ArrayList<>(); int ri = 0; - while (reader.next(key, value)){ + while(reader.next(key, value)) { String raw = value.toString(); int index; int fromIndex = 0; do { index = raw.indexOf(token, fromIndex); - if(index !=-1){ + if(index != -1) { result.add(new Pair<>(ri, index)); - fromIndex = index+token.length(); + fromIndex = index + token.length(); } 
else break; - }while(true); + } + while(true); ri++; } return result; } - protected abstract int reaFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, - int rowPos, SplitInfo splitInfo) throws IOException; + protected abstract int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, + TemplateUtil.SplitInfo splitInfo) throws IOException; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index 6572c46e1fd..b9d81a55205 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -20,8 +20,14 @@ package org.apache.sysds.runtime.iogen.template; import org.apache.commons.lang.mutable.MutableInt; -import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.FileInputFormat; +import org.apache.hadoop.mapred.InputSplit; import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.iogen.CustomProperties; @@ -33,73 +39,55 @@ import org.apache.sysds.runtime.io.MatrixReader; import org.apache.sysds.runtime.matrix.data.Pair; -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.Writer; import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import 
java.util.regex.Matcher; -import java.util.regex.Pattern; public abstract class MatrixGenerateReader extends MatrixReader { protected static CustomProperties _props; + protected TemplateUtil.SplitOffsetInfos _offsets; public MatrixGenerateReader(CustomProperties _props) { MatrixGenerateReader._props = _props; } - protected MatrixBlock computeSize(List files, FileSystem fs, long rlen, long clen) - throws IOException, DMLRuntimeException { - // allocate target matrix block based on given size; - return new MatrixBlock(getNumRows(files, fs), (int) clen, rlen * clen); - } - - private static int getNumRows(List files, FileSystem fs) throws IOException, DMLRuntimeException { - int rows = 0; - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo)))); - try { - // Row Identify - if(_props.getRowIndexStructure().getProperties().equals(RowIndexStructure.IndexProperties.Identity)) { - while(br.readLine() != null) - rows++; - } - } - finally { - IOUtilFunctions.closeSilently(br); - } - } - return rows; - } - @Override public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int blen, long estnnz) throws IOException, DMLRuntimeException { - MatrixBlock ret = null; - if(rlen >= 0 && clen >= 0) //otherwise allocated on read - ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, true, false); - //prepare file access JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); Path path = new Path(fname); FileSystem fs = IOUtilFunctions.getFileSystem(path, job); + FileInputFormat.addInputPath(job, path); + + checkValidInputFile(fs, path); + + TextInputFormat informat = new TextInputFormat(); + informat.configure(job); + InputSplit[] splits = informat.getSplits(job, 1); + splits = IOUtilFunctions.sortInputSplits(splits); + + MatrixBlock ret; + if(rlen >= 0 && clen >= 0 && _props.getRowIndexStructure().getProperties() != 
RowIndexStructure.IndexProperties.SeqScatter) { + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || + _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ){ + clen++; + rlen ++; + } + ret = createOutputMatrixBlock(rlen, clen, (int) rlen, estnnz, !_props.isSparse(), _props.isSparse()); + } + else + ret = computeSizeAndCreateOutputMatrixBlock(informat,job, splits, estnnz); //core read - ret = readMatrixFromHDFS(path, job, fs, ret, rlen, clen, blen); + readMatrixFromHDFS(informat, splits, job, ret); return ret; } + @Override public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long clen, int blen, long estnnz) throws IOException, DMLRuntimeException { @@ -111,89 +99,122 @@ public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long cle return ret; } - @SuppressWarnings("unchecked") - private MatrixBlock readMatrixFromHDFS(Path path, JobConf job, FileSystem fs, - MatrixBlock dest, long rlen, long clen, int blen) throws IOException, DMLRuntimeException { - - //prepare file paths in alphanumeric order - ArrayList files = new ArrayList<>(); - if(fs.getFileStatus(path).isDirectory()) { - for(FileStatus stat : fs.listStatus(path, IOUtilFunctions.hiddenFileFilter)) - files.add(stat.getPath()); - Collections.sort(files); + private MatrixBlock computeSizeAndCreateOutputMatrixBlock(TextInputFormat informat, JobConf job, InputSplit[] splits, long estnnz) throws IOException, DMLRuntimeException { + int row = 0; + // count rows in parallel per split + try { + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.Identity) { + // compute number of rows + for(InputSplit inputSplit : splits) { + RecordReader reader = informat.getRecordReader(inputSplit, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + try { + // count remaining number of rows, ignore meta data + 
while(reader.next(key, value)) { + row++; + } + } + finally { + IOUtilFunctions.closeSilently(reader); + } + } + } + else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { + _offsets = new TemplateUtil.SplitOffsetInfos(splits.length); + int splitIndex = 0; + for(InputSplit inputSplit : splits) { + int nrows = 0; + TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); + ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, + _props.getRowIndexStructure().getSeqBeginString()); + + ArrayList> endIndexes; + int tokenLength = 0; + if(!_props.getRowIndexStructure().getSeqBeginString().equals(_props.getRowIndexStructure().getSeqEndString())) { + endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); + tokenLength = _props.getRowIndexStructure().getSeqEndString().length(); + } + else { + endIndexes = new ArrayList<>(); + for(int i = 1; i < beginIndexes.size(); i++) + endIndexes.add(beginIndexes.get(i)); + } + + int i = 0; + int j = 0; + while(i < beginIndexes.size() && j < endIndexes.size()) { + Pair p1 = beginIndexes.get(i); + Pair p2 = endIndexes.get(j); + int n = 0; + while(p1.getKey() < p2.getKey() || (p1.getKey() == p2.getKey() && p1.getValue() < p2.getValue())) { + n++; + i++; + if(i == beginIndexes.size()) + break; + p1 = beginIndexes.get(i); + } + j += n - 1; + splitInfo.addIndexAndPosition(beginIndexes.get(i - n).getKey(), endIndexes.get(j).getKey(), beginIndexes.get(i - n).getValue(), + endIndexes.get(j).getValue() + tokenLength); + j++; + nrows++; + } + if(i == beginIndexes.size() && j < endIndexes.size()) + nrows++; + if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) + splitInfo.setRemainString(""); + else { + RecordReader reader = informat.getRecordReader(inputSplit, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new 
Text(); + + StringBuilder sb = new StringBuilder(); + for(int ri = 0; ri < beginIndexes.get(0).getKey(); ri++) { + reader.next(key, value); + String raw = value.toString(); + sb.append(raw); + } + if(beginIndexes.get(0).getValue() != 0) { + reader.next(key, value); + sb.append(value.toString().substring(0, beginIndexes.get(0).getValue())); + } + splitInfo.setRemainString(sb.toString()); + } + splitInfo.setNrows(nrows); + _offsets.setSeqOffsetPerSplit(splitIndex, splitInfo); + _offsets.setOffsetPerSplit(splitIndex, row); + row += nrows; + splitIndex++; + } + } } - else - files.add(path); - - //determine matrix size via additional pass if required - if(dest == null) { - dest = computeSize(files, fs, rlen, clen); - rlen = dest.getNumRows(); - //clen = _props.getColumnIdentifyProperties().length; + catch(Exception e) { + throw new IOException("Thread pool Error " + e.getMessage(), e); } + MatrixBlock ret = createOutputMatrixBlock(row, _props.getNcols(), (int) row, estnnz, !_props.isSparse(), _props.isSparse()); + return ret; + } - //actual read of individual files - long lnnz = 0; + @SuppressWarnings("unchecked") + protected void readMatrixFromHDFS(TextInputFormat informat, InputSplit[] splits, JobConf job, MatrixBlock dest) throws IOException { MutableInt row = new MutableInt(0); - for(int fileNo = 0; fileNo < files.size(); fileNo++) { - lnnz += readMatrixFromInputStream(fs.open(files.get(fileNo)), path.toString(), dest, row, rlen, clen, blen); + long lnnz = 0; + for(int i = 0; i < splits.length; i++) { + RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + TemplateUtil.SplitInfo splitInfo = null; + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + splitInfo = _offsets.getSeqOffsetPerSplit(i); + row.setValue(_offsets.getOffsetPerSplit(i)); + } + lnnz += readMatrixFromHDFS(reader, key, value, dest, row, splitInfo); } - 
//post processing dest.setNonZeros(lnnz); - - return dest; } - protected abstract long readMatrixFromInputStream(InputStream is, String srcInfo, MatrixBlock dest, - MutableInt rowPos, long rlen, long clen, int blen) throws IOException; - - protected void saveCode(String fileName, String code) { - try(Writer writer = new BufferedWriter( - new OutputStreamWriter(new FileOutputStream(fileName, false), "utf-8"))) { - writer.write(code); - } - catch(Exception ex) { - } - } - - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { - int endPos = strLen; - for(String d : endWithValueString) { - int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; - if(pos != -1) - endPos = Math.min(endPos, pos); - } - return endPos; - } - - protected int getColIndex(HashMap colKeyPatternMap, String key){ - if(colKeyPatternMap.containsKey(key)) - return colKeyPatternMap.get(key); - else - return -1; - } - - protected String getStringChunkOfBufferReader(BufferedReader br, String remainedStr,int chunkSize){ - StringBuilder sb = new StringBuilder(); - String str; - int readSize = 0; - try { - while((str = br.readLine()) != null && readSize0) { - if(remainedStr!=null && remainedStr.length() >0) - return remainedStr + sb; - else - return sb.toString(); - } - else - return null; - } + protected abstract long readMatrixFromHDFS(RecordReader reader, LongWritable key, Text value, MatrixBlock dest, + MutableInt rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java index eb10feb51ec..552f8684d4e 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.iogen.template; +import 
org.apache.commons.lang.mutable.MutableInt; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; @@ -43,7 +44,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashSet; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -53,7 +53,7 @@ public abstract class MatrixGenerateReaderParallel extends MatrixReader { protected static CustomProperties _props; protected int _numThreads = 1; protected JobConf job; - protected SplitOffsetInfos _offsets; + protected TemplateUtil.SplitOffsetInfos _offsets; protected int _rLen; protected int _cLen; @@ -84,7 +84,7 @@ public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int bl MatrixBlock ret = computeSizeAndCreateOutputMatrixBlock(splits, path, rlen, _props.getNcols(), blen, estnnz); // Second Read Pass (read, parse strings, append to matrix block) - readMatrixFromHDFS(splits, path, job, ret, rlen, clen, blen); + readMatrixFromHDFS(informat,splits, path, job, ret, rlen, clen, blen); return ret; } @@ -108,7 +108,7 @@ private MatrixBlock computeSizeAndCreateOutputMatrixBlock(InputSplit[] splits, P // collect row counts for offset computation // early error notify in case not all tasks successful - _offsets = new SplitOffsetInfos(tasks.size()); + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); int i = 0; for(Future rc : pool.invokeAll(tasks)) { int lnrow = (int) rc.get().longValue(); // incl error handling @@ -127,13 +127,13 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index // collect row counts for offset computation // early error notify in case not all tasks successful - _offsets = new SplitOffsetInfos(tasks.size()); + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); int i = 0; - for(Future rc : pool.invokeAll(tasks)) { - SplitInfo splitInfo = rc.get(); + for(Future rc 
: pool.invokeAll(tasks)) { + TemplateUtil.SplitInfo splitInfo = rc.get(); _offsets.setSeqOffsetPerSplit(i, splitInfo); _offsets.setOffsetPerSplit(i, _rLen); - _rLen = _rLen + splitInfo.nrows; + _rLen = _rLen + splitInfo.getNrows(); i++; } pool.shutdown(); @@ -160,6 +160,11 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index // allocate target matrix block based on given size; // need to allocate sparse as well since lock-free insert into target + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || + _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ){ + _rLen++; + _cLen++; + } long estnnz2 = (estnnz < 0) ? (long) _rLen * _cLen : estnnz; return createOutputMatrixBlock(_rLen, _cLen, blen, estnnz2, !_props.isSparse(), _props.isSparse()); } @@ -175,12 +180,9 @@ public MatrixBlock readMatrixFromInputStream(InputStream is, long rlen, long cle return ret; } - private void readMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, int blen) throws IOException + private void readMatrixFromHDFS(TextInputFormat informat, InputSplit[] splits, Path path, JobConf job, MatrixBlock dest, long rlen, long clen, + int blen) throws IOException { - FileInputFormat.addInputPath(job, path); - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - ExecutorService pool = CommonThreadPool.get(_numThreads); try{ // create read tasks for all splits @@ -203,50 +205,12 @@ private void readMatrixFromHDFS(InputSplit[] splits, Path path, JobConf job, Mat } } - private static class SplitOffsetInfos { - // offset & length info per split - private int[] offsetPerSplit = null; - private int[] lenghtPerSplit = null; - private SplitInfo[] seqOffsetPerSplit = null; - - public SplitOffsetInfos(int numSplits) { - lenghtPerSplit = new int[numSplits]; - offsetPerSplit = new int[numSplits]; - seqOffsetPerSplit = 
new SplitInfo[numSplits]; - } - - public int getLenghtPerSplit(int split) { - return lenghtPerSplit[split]; - } - - public void setLenghtPerSplit(int split, int r) { - lenghtPerSplit[split] = r; - } - - public int getOffsetPerSplit(int split) { - return offsetPerSplit[split]; - } - - public void setOffsetPerSplit(int split, int o) { - offsetPerSplit[split] = o; - } - - public SplitInfo getSeqOffsetPerSplit(int split) { - return seqOffsetPerSplit[split]; - } - - public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { - seqOffsetPerSplit[split] = splitInfo; - } - } - private class ReadTask implements Callable { private final InputSplit _split; private final TextInputFormat _informat; private final MatrixBlock _dest; private final int _splitCount; - private int _row = 0; private long _nnz = 0; public ReadTask(InputSplit split, TextInputFormat informat, MatrixBlock dest, int splitCount) { @@ -261,9 +225,9 @@ public Long call() throws IOException { RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); - _row = _offsets.getOffsetPerSplit(_splitCount); - SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); - _nnz = readMatrixFromHDFS(reader, key, value, _dest, _row, _splitInfo); + MutableInt rowPos = new MutableInt(_offsets.getOffsetPerSplit(_splitCount)); + TemplateUtil.SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); + _nnz = readMatrixFromHDFS(reader, key, value, _dest, rowPos, _splitInfo); return _nnz; } @@ -272,7 +236,7 @@ public long getNnz() { } } - private static class CountSeqScatteredRowsTask implements Callable { + private static class CountSeqScatteredRowsTask implements Callable { private final InputSplit _split; private final TextInputFormat _inputFormat; private final JobConf _jobConf; @@ -288,14 +252,14 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, } @Override - public SplitInfo call() throws 
Exception { - SplitInfo splitInfo = new SplitInfo(); + public TemplateUtil.SplitInfo call() throws Exception { + TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); int nrows = 0; - ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); + ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); ArrayList> endIndexes; int tokenLength = 0; if(!_beginString.equals(_endString)) { - endIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); + endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); tokenLength = _endString.length(); } else { @@ -349,97 +313,7 @@ public SplitInfo call() throws Exception { } } - protected static class SplitInfo{ - private int nrows; - private ArrayList recordIndexBegin; - private ArrayList recordIndexEnd; - private ArrayList recordPositionBegin; - private ArrayList recordPositionEnd; - private String remainString; - - public SplitInfo() { - recordIndexBegin = new ArrayList<>(); - recordIndexEnd = new ArrayList<>(); - recordPositionBegin = new ArrayList<>(); - recordPositionEnd = new ArrayList<>(); - } - - public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos){ - recordIndexBegin.add(beginIndex); - recordIndexEnd.add(endIndex); - recordPositionBegin.add(beginPos); - recordPositionEnd.add(endPos); - } - - public int getNrows() { - return nrows; - } - - public void setNrows(int nrows) { - this.nrows = nrows; - } - - public String getRemainString() { - return remainString; - } - - public void setRemainString(String remainString) { - this.remainString = remainString; - } - - public int getRecordIndexBegin(int index) { - return recordIndexBegin.get(index); - } - - public int getRecordIndexEnd(int index) { - return recordIndexEnd.get(index); - } - - public int getRecordPositionBegin(int index) { - return 
recordPositionBegin.get(index); - } - - public int getRecordPositionEnd(int index) { - return recordPositionEnd.get(index); - } - } - - private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, - String token) throws IOException { - RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - ArrayList> result = new ArrayList<>(); - - int ri = 0; - while (reader.next(key, value)){ - String raw = value.toString(); - int index; - int fromIndex = 0; - do { - index = raw.indexOf(token, fromIndex); - if(index !=-1){ - result.add(new Pair<>(ri, index)); - fromIndex = index+token.length(); - } - else - break; - }while(true); - ri++; - } - return result; - } - protected abstract long readMatrixFromHDFS(RecordReader reader, LongWritable key, Text value, MatrixBlock dest, - int rowPos, SplitInfo splitInfo) throws IOException; - - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { - int endPos = strLen; - for(String d : endWithValueString) { - int pos = d.length()> 0 ? str.indexOf(d, currPos): strLen; - if(pos != -1) - endPos = Math.min(endPos, pos); - } - return endPos; - } + MutableInt rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; + } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java new file mode 100644 index 00000000000..0d9a0d7b0a8 --- /dev/null +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.runtime.iogen.template; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.InputSplit; +import org.apache.hadoop.mapred.JobConf; +import org.apache.hadoop.mapred.RecordReader; +import org.apache.hadoop.mapred.Reporter; +import org.apache.hadoop.mapred.TextInputFormat; +import org.apache.sysds.runtime.matrix.data.Pair; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; + +public class TemplateUtil { + + public static class SplitOffsetInfos { + // offset & length info per split + private int[] offsetPerSplit = null; + private int[] lenghtPerSplit = null; + private SplitInfo[] seqOffsetPerSplit = null; + + public SplitOffsetInfos(int numSplits) { + lenghtPerSplit = new int[numSplits]; + offsetPerSplit = new int[numSplits]; + seqOffsetPerSplit = new SplitInfo[numSplits]; + } + + public int getLenghtPerSplit(int split) { + return lenghtPerSplit[split]; + } + + public void setLenghtPerSplit(int split, int r) { + lenghtPerSplit[split] = r; + } + + public int getOffsetPerSplit(int split) { + return offsetPerSplit[split]; + } + + public void setOffsetPerSplit(int split, int o) { + offsetPerSplit[split] = o; + } + + public SplitInfo getSeqOffsetPerSplit(int split) { + return seqOffsetPerSplit[split]; + } + + 
public void setSeqOffsetPerSplit(int split, SplitInfo splitInfo) { + seqOffsetPerSplit[split] = splitInfo; + } + } + + public static class SplitInfo { + private int nrows; + private ArrayList recordIndexBegin; + private ArrayList recordIndexEnd; + private ArrayList recordPositionBegin; + private ArrayList recordPositionEnd; + private String remainString; + + public SplitInfo() { + recordIndexBegin = new ArrayList<>(); + recordIndexEnd = new ArrayList<>(); + recordPositionBegin = new ArrayList<>(); + recordPositionEnd = new ArrayList<>(); + } + + public void addIndexAndPosition(int beginIndex, int endIndex, int beginPos, int endPos) { + recordIndexBegin.add(beginIndex); + recordIndexEnd.add(endIndex); + recordPositionBegin.add(beginPos); + recordPositionEnd.add(endPos); + } + + public int getNrows() { + return nrows; + } + + public void setNrows(int nrows) { + this.nrows = nrows; + } + + public String getRemainString() { + return remainString; + } + + public void setRemainString(String remainString) { + this.remainString = remainString; + } + + public int getRecordIndexBegin(int index) { + return recordIndexBegin.get(index); + } + + public int getRecordIndexEnd(int index) { + return recordIndexEnd.get(index); + } + + public int getRecordPositionBegin(int index) { + return recordPositionBegin.get(index); + } + + public int getRecordPositionEnd(int index) { + return recordPositionEnd.get(index); + } + } + + public static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat inputFormat, JobConf job, + String token) throws IOException { + RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); + LongWritable key = new LongWritable(); + Text value = new Text(); + ArrayList> result = new ArrayList<>(); + + int ri = 0; + while(reader.next(key, value)) { + String raw = value.toString(); + int index; + int fromIndex = 0; + do { + index = raw.indexOf(token, fromIndex); + if(index != -1) { + result.add(new Pair<>(ri, index)); + 
fromIndex = index + token.length(); + } + else + break; + } + while(true); + ri++; + } + return result; + } + + public static int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { + int endPos = strLen; + for(String d : endWithValueString) { + int pos = d.length() > 0 ? str.indexOf(d, currPos) : strLen; + if(pos != -1) + endPos = Math.min(endPos, pos); + } + return endPos; + } + + + public static String getStringChunkOfBufferReader(BufferedReader br, String remainedStr,int chunkSize){ + StringBuilder sb = new StringBuilder(); + String str; + int readSize = 0; + try { + while((str = br.readLine()) != null && readSize0) { + if(remainedStr!=null && remainedStr.length() >0) + return remainedStr + sb; + else + return sb.toString(); + } + else + return null; + } + + + protected int getColIndex(HashMap colKeyPatternMap, String key){ + return colKeyPatternMap.getOrDefault(key, -1); + } +} + + From 2489647cb8223d8f1b4631637dc95c4a3af8321a Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 24 Jun 2022 22:20:24 +0200 Subject: [PATCH 73/84] Minor --- .../template/FrameGenerateReaderParallel.java | 46 ++----------------- 1 file changed, 5 insertions(+), 41 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index a9678475ddd..6e4a5ea6954 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -215,23 +215,12 @@ protected void readFrameFromHDFS(InputSplit[] splits, Path path, JobConf job, Fr } } - protected int getEndPos(String str, int strLen, int currPos, HashSet endWithValueString) { - int endPos = strLen; - for(String d : endWithValueString) { - int pos = d.length() > 0 ? 
str.indexOf(d, currPos) : strLen; - if(pos != -1) - endPos = Math.min(endPos, pos); - } - return endPos; - } - private class ReadTask implements Callable { private final InputSplit _split; private final TextInputFormat _informat; private final FrameBlock _dest; private final int _splitCount; - private int _row = 0; public ReadTask(InputSplit split, TextInputFormat informat, FrameBlock dest, int splitCount) { _split = split; @@ -244,9 +233,9 @@ public ReadTask(InputSplit split, TextInputFormat informat, FrameBlock dest, int RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); - _row = _offsets.getOffsetPerSplit(_splitCount); + int row = _offsets.getOffsetPerSplit(_splitCount); TemplateUtil.SplitInfo _splitInfo = _offsets.getSeqOffsetPerSplit(_splitCount); - readFrameFromHDFS(reader, key, value, _dest, _row, _splitInfo); + readFrameFromHDFS(reader, key, value, _dest, row, _splitInfo); return 0L; } } @@ -269,11 +258,12 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, @Override public TemplateUtil.SplitInfo call() throws Exception { TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); int nrows = 0; - ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); + ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, + _beginString); ArrayList> endIndexes; int tokenLength = 0; if(!_beginString.equals(_endString)) { - endIndexes = getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); + endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); tokenLength = _endString.length(); } else { @@ -327,32 +317,6 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, } } - private static ArrayList> getTokenIndexOnMultiLineRecords(InputSplit split, TextInputFormat 
inputFormat, JobConf job, - String token) throws IOException { - RecordReader reader = inputFormat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - ArrayList> result = new ArrayList<>(); - - int ri = 0; - while(reader.next(key, value)) { - String raw = value.toString(); - int index; - int fromIndex = 0; - do { - index = raw.indexOf(token, fromIndex); - if(index != -1) { - result.add(new Pair<>(ri, index)); - fromIndex = index + token.length(); - } - else - break; - } - while(true); - ri++; - } - return result; - } protected abstract int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; From 9d98d58a37edc09c2e2383dd6d732c317edd09d5 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Sat, 25 Jun 2022 15:28:05 +0200 Subject: [PATCH 74/84] Improve performance and fix multi-line detection mapping --- .../runtime/iogen/FormatIdentifying.java | 41 +++++++++++++++---- .../sysds/runtime/iogen/ReaderMapping.java | 16 +++++--- 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index d2e08e20682..945ca96d055 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -386,17 +386,31 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ ArrayList> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false); + ArrayList>> keys; TextTrie textTrie = new TextTrie(); textTrie.insert(prefixSuffixBeginEndCells.get(0).getKey(), 0); + char startChar = prefixSuffixBeginEndCells.get(0).getKey().charAt(0); + int minSubStringLength = Math.min(80, 
prefixSuffixBeginEndCells.get(0).getKey().length()); for(int i=1; i< prefixSuffixBeginEndCells.size(); i++){ String prefix = prefixSuffixBeginEndCells.get(i).getKey(); for(int j=0; j< prefix.length(); j++){ - textTrie.insert(prefix.substring(j),i); + if(startChar == prefix.charAt(j)) + textTrie.insert(prefix.substring(j, j+Math.min(minSubStringLength, prefix.length() - j)),i); + } + if(i % 10 == 0){ + keys = textTrie.getAllKeys(); + String upIntersect; + int index = keys.get(0).getKey().indexOf("\n"); + if(index == -1) + upIntersect = keys.get(0).getKey(); + else + upIntersect = keys.get(0).getKey().substring(0, index); + minSubStringLength = upIntersect.length(); } } // scoring the prefix tree - ArrayList>> keys = textTrie.getAllKeys(); + keys = textTrie.getAllKeys(); String beginString = null; String endString = null; if(keys.get(0).getValue().size() == nrows){ @@ -417,14 +431,28 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I suffixes.add(str); } + String str = new StringBuilder(suffixes.get(0).replace("\n", "")).reverse().toString(); + minSubStringLength = Math.min(80, str.length()); TextTrie textTrieEnd = new TextTrie(); - textTrieEnd.insert(new StringBuilder(suffixes.get(0).replace("\n", "")).reverse().toString(), 0); - + textTrieEnd.insert(str, 0); + startChar = str.charAt(0); for(int i=1; i< suffixes.size(); i++){ StringBuilder sb = new StringBuilder(suffixes.get(i).replace("\n", Lop.OPERAND_DELIMITOR)).reverse(); - String str = sb.toString(); + str = sb.toString(); for(int j=0; j< str.length(); j++){ - textTrieEnd.insert(str.substring(j),i); + if(startChar == str.charAt(j)) + textTrieEnd.insert(str.substring(j, j+Math.min(minSubStringLength, str.length() - j)),i); + } + if(i % 10 == 0){ + keys = textTrieEnd.getAllKeys(); + index = keys.get(0).getKey().indexOf(Lop.OPERAND_DELIMITOR); + String upIntersect; + if(index == -1) + upIntersect = new StringBuilder(keys.get(0).getKey()).reverse().toString(); + else + upIntersect = 
new StringBuilder(keys.get(0).getKey().substring(0, index)).reverse().toString(); + + minSubStringLength = upIntersect.length(); } } keys = textTrieEnd.getAllKeys(); @@ -1230,5 +1258,4 @@ private Pair getIndexOfKeyPatternOnString(String str, ArrayLis else return new Pair<>(-1, -1); } - } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java index 0a6468476c4..4b02e7fa6ce 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/ReaderMapping.java @@ -155,24 +155,28 @@ else if(mappedValueCount > 0 && mappedValueCount < actualValueCount) boolean singleLine = true; // first mapped value - int firstLineNumber = -1; + int[] firstLineNumbers = new int[nrows]; for(int r = 0; r < nrows; r++) { int c = 0; - firstLineNumber = -1; - for(; c < ncols && firstLineNumber == -1; c++) - firstLineNumber = mapRow[r][c]; + firstLineNumbers[r] = -1; + for(; c < ncols && firstLineNumbers[r] == -1; c++) + firstLineNumbers[r] = mapRow[r][c]; // other mapped for(; c < ncols && singleLine; c++) if(mapRow[r][c] != -1) - singleLine = firstLineNumber == mapRow[r][c]; + singleLine = firstLineNumbers[r] == mapRow[r][c]; + } + for(int r=0; r Date: Sat, 25 Jun 2022 20:22:34 +0200 Subject: [PATCH 75/84] Improve performance and fix multi-line detection mapping --- .../runtime/iogen/FormatIdentifying.java | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 945ca96d055..90b3f7150c8 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -398,16 +398,6 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I if(startChar == prefix.charAt(j)) 
textTrie.insert(prefix.substring(j, j+Math.min(minSubStringLength, prefix.length() - j)),i); } - if(i % 10 == 0){ - keys = textTrie.getAllKeys(); - String upIntersect; - int index = keys.get(0).getKey().indexOf("\n"); - if(index == -1) - upIntersect = keys.get(0).getKey(); - else - upIntersect = keys.get(0).getKey().substring(0, index); - minSubStringLength = upIntersect.length(); - } } // scoring the prefix tree keys = textTrie.getAllKeys(); @@ -443,17 +433,6 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I if(startChar == str.charAt(j)) textTrieEnd.insert(str.substring(j, j+Math.min(minSubStringLength, str.length() - j)),i); } - if(i % 10 == 0){ - keys = textTrieEnd.getAllKeys(); - index = keys.get(0).getKey().indexOf(Lop.OPERAND_DELIMITOR); - String upIntersect; - if(index == -1) - upIntersect = new StringBuilder(keys.get(0).getKey()).reverse().toString(); - else - upIntersect = new StringBuilder(keys.get(0).getKey().substring(0, index)).reverse().toString(); - - minSubStringLength = upIntersect.length(); - } } keys = textTrieEnd.getAllKeys(); From cd9b54a80c24fc577ef20e55f937dffc17091a52 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 27 Jul 2022 03:22:51 +0200 Subject: [PATCH 76/84] Cleanup, fixed bugs in codegen, update tests, remove unnecessary tests --- .../runtime/iogen/FormatIdentifying.java | 188 ++++++---- .../sysds/runtime/iogen/GenerateReader.java | 3 + .../sysds/runtime/iogen/Hirschberg.java | 307 --------------- .../apache/sysds/runtime/iogen/RawIndex.java | 5 - .../apache/sysds/runtime/iogen/TextTrie.java | 1 - .../runtime/iogen/codegen/CodeGenTrie.java | 5 +- .../runtime/iogen/codegen/FrameCodeGen.java | 6 +- .../runtime/iogen/codegen/MatrixCodeGen.java | 6 +- .../iogen/template/FrameGenerateReader.java | 9 +- .../template/FrameGenerateReaderParallel.java | 47 +-- .../iogen/template/MatrixGenerateReader.java | 16 +- .../MatrixGenerateReaderParallel.java | 34 +- 
.../runtime/iogen/template/TemplateUtil.java | 3 +- .../iogen/FrameSingleRowFlatTest.java | 82 +--- .../iogen/FrameSingleRowNestedTest.java | 92 +---- .../functions/iogen/GenerateRandomFrame.java | 1 - .../iogen/GenerateReaderFrameTest.java | 14 +- .../iogen/GenerateReaderMatrixTest.java | 15 +- .../Identify/FrameGenerateReaderCSVTest.java | 121 ------ .../Identify/MatrixGRRowColIdentifyTest.java | 349 ------------------ .../iogen/MatrixMultiRowNestedTest.java | 39 +- .../iogen/MatrixSingleRowFlatTest.java | 235 ++++-------- .../iogen/MatrixSingleRowNestedTest.java | 16 +- 23 files changed, 301 insertions(+), 1293 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java index 90b3f7150c8..fabb1875ae7 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java @@ -119,16 +119,14 @@ private void runIdentification() { // ref to Table 1: if(mappingProperties.getRecordProperties() == MappingProperties.RecordProperties.SINGLELINE) { // #1 - if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && - colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.Identity) { KeyTrie[] colKeyPatterns; colKeyPatterns = buildColsKeyPatternSingleRow(); properties.setColKeyPatterns(colKeyPatterns); } // #2 - else if(rowIndexStructure.getProperties() == 
RowIndexStructure.IndexProperties.Identity && - colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.Identity && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { // find cell-index and value separators RawIndex raw = null; for(int c = 0; c < ncols; c++) { @@ -171,8 +169,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I } else { // # 4, 6, 7, 8, 9 - if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && - colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.CellWiseExist && colIndexStructure.getProperties() == ColIndexStructure.IndexProperties.CellWiseExist) { if(mappingProperties.getDataProperties() != MappingProperties.DataProperties.NOTEXIST) { KeyTrie valueKeyPattern = buildValueKeyPattern(); @@ -182,6 +179,8 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I // build key pattern for row index int numberOfSelectedCols = (int) (ncols * 0.1); int numberOfSelectedRows = (int) (nrows * 0.1); + numberOfSelectedRows = numberOfSelectedRows == 0 ? nrows - 1 : numberOfSelectedRows; + numberOfSelectedCols = numberOfSelectedCols == 0 ? 
ncols - 1 : numberOfSelectedCols; int begin = rowIndexStructure.getRowIndexBegin(); boolean check, flagReconstruct; int[] selectedRowIndex = new int[numberOfSelectedRows]; @@ -189,7 +188,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I // Select two none zero row as a row index candidate int index = 0; - for(int r = 1; r= Math.max(ncols - numberOfSelectedCols, 0); c--) { + for(int c = ncols - 1; c >= Math.max(ncols - numberOfSelectedCols, 0); c--) { Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); ArrayList prefixStringRowIndexes = colPrefixString.getValue(); @@ -232,7 +231,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I MappingTrie rowTrie = new MappingTrie(); rowKeyPattern = new KeyTrie(); for(int si : selectedRowIndex) { - for(int ci = ncols - 1; ci >=0; ci--) { + for(int ci = ncols - 1; ci >= 0; ci--) { int cri = mapRow[si][ci]; if(cri != -1) { String str = sampleRawIndexes.get(cri).getSubString(0, mapCol[si][ci]); @@ -289,7 +288,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I // Select two none zero row as a row index candidate index = 0; - for(int c = ncols - 1; c>=0; c--) { + for(int c = ncols - 1; c >= 0; c--) { for(int r = 1; r < nrows; r++) if(mapRow[r][c] != -1) { selectedColIndex[index++] = c; @@ -299,7 +298,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I break; } - for(int c = ncols -1; c >= Math.max(ncols - numberOfSelectedCols, 0); c--) { + for(int c = ncols - 1; c >= Math.max(ncols - numberOfSelectedCols, 0); c--) { Pair, ArrayList> colPrefixString = extractAllPrefixStringsOfAColSingleLine(c, false); ArrayList prefixStrings = colPrefixString.getKey(); ArrayList prefixStringRowIndexes = colPrefixString.getValue(); @@ -383,7 +382,7 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I 
colIndexStructure.setKeyPattern(colKeyPattern); } // #10 sequential scattered - if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter){ + if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { ArrayList> prefixSuffixBeginEndCells = extractPrefixSuffixBeginEndCells(false); ArrayList>> keys; @@ -392,18 +391,18 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I char startChar = prefixSuffixBeginEndCells.get(0).getKey().charAt(0); int minSubStringLength = Math.min(80, prefixSuffixBeginEndCells.get(0).getKey().length()); - for(int i=1; i< prefixSuffixBeginEndCells.size(); i++){ + for(int i = 1; i < prefixSuffixBeginEndCells.size(); i++) { String prefix = prefixSuffixBeginEndCells.get(i).getKey(); - for(int j=0; j< prefix.length(); j++){ + for(int j = 0; j < prefix.length(); j++) { if(startChar == prefix.charAt(j)) - textTrie.insert(prefix.substring(j, j+Math.min(minSubStringLength, prefix.length() - j)),i); + textTrie.insert(prefix.substring(j, j + Math.min(minSubStringLength, prefix.length() - j)), i); } } // scoring the prefix tree keys = textTrie.getAllKeys(); String beginString = null; String endString = null; - if(keys.get(0).getValue().size() == nrows){ + if(keys.get(0).getValue().size() == nrows) { int index = keys.get(0).getKey().indexOf("\n"); if(index == -1) beginString = keys.get(0).getKey(); @@ -411,42 +410,46 @@ else if(rowIndexStructure.getProperties() == RowIndexStructure.IndexProperties.I beginString = keys.get(0).getKey().substring(0, index); // recompute suffix strings to find end of string + int minSuffixStringLength = prefixSuffixBeginEndCells.get(0).getValue().length(); + String reverseBeginString = new StringBuilder(beginString).reverse().toString(); ArrayList suffixes = new ArrayList<>(); - for(int i=0; i= this.actualValueCount) { rowIndexStructure.setProperties(RowIndexStructure.IndexProperties.CellWiseExist); rowIndexStructure.setRowIndexBegin(0); @@ 
-531,8 +533,8 @@ private RowIndexStructure getRowIndexStructure() { ArrayList list = new ArrayList<>(); for(int i = bitSet.nextSetBit(0); i != -1; i = bitSet.nextSetBit(i + 1)) list.add(i); - for(int i=0; i= this.actualValueCount) { colIndexStructure.setProperties(ColIndexStructure.IndexProperties.CellWiseExist); colIndexStructure.setColIndexBegin(0); @@ -785,29 +787,10 @@ private ArrayList> extractPrefixSuffixBeginEndCells(boolean } // set suffix - for(int r = 0; r[] extractAllSuffixStringsOfColsSingleLine() { // Methods For Multi Lines Mapping // //////////////////////////////////////////////////////////////////////////// - private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginString, String endString){ + private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginString, String endString) { ArrayList upRawIndexes = new ArrayList<>(); ArrayList> beginIndexes = getTokenIndexOnMultiLineRecords(beginString); ArrayList> endIndexes; @@ -1127,7 +1110,7 @@ private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginStr int i = 0; int j = 0; StringBuilder sb = new StringBuilder(); - while(i < beginIndexes.size() && j < endIndexes.size() && r p1 = beginIndexes.get(i); Pair p2 = endIndexes.get(j); int n = 0; @@ -1169,22 +1152,69 @@ private void updateMapsAndExtractAllSuffixStringsOfColsMultiLine(String beginStr this.sampleRawIndexes = upRawIndexes; } - private ArrayList> getTokenIndexOnMultiLineRecords(String token){ + private ArrayList> getTokenIndexOnMultiLineRecords(String token) { ArrayList> result = new ArrayList<>(); - for(int ri=0; ri< this.sampleRawIndexes.size(); ri++){ + for(int ri = 0; ri < this.sampleRawIndexes.size(); ri++) { String raw = this.sampleRawIndexes.get(ri).getRaw(); int index; int fromIndex = 0; do { index = raw.indexOf(token, fromIndex); - if(index !=-1){ + if(index != -1) { result.add(new Pair<>(ri, index)); - fromIndex = index+token.length(); + fromIndex = index + token.length(); } else 
break; - }while(true); + } + while(true); + } + return result; + } + + private ArrayList> getTokenIndexOnMultiLineRecords(String beginToken, String endToken) { + ArrayList> result = new ArrayList<>(); + + for(int ri = 0; ri < this.sampleRawIndexes.size(); ) { + String raw = this.sampleRawIndexes.get(ri).getRaw(); + int index; + int fromIndex = 0; + do { + index = raw.indexOf(endToken, fromIndex); + if(index != -1) { + if(index + endToken.length() + beginToken.length() <= raw.length()) { + boolean flag = true; + for(int i = index + endToken.length(), j = 0; i < index + endToken.length() + beginToken.length() && flag; i++, j++) { + flag = raw.charAt(i) == beginToken.charAt(j); + } + if(flag) { + result.add(new Pair<>(ri, index)); + fromIndex = index + beginToken.length() + endToken.length(); + } + else + fromIndex++; + } + else { + if(ri+1 == this.sampleRawIndexes.size()) + break; + // skip empty rows + do { + raw = this.sampleRawIndexes.get(++ri).getRaw(); + } + while(raw.length() == 0); + + if(raw.startsWith(beginToken)) { + result.add(new Pair<>(ri, 0)); + fromIndex = 1; + } + } + } + else + break; + } + while(true); + ri++; } return result; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 5c88438425a..deb7bd23c26 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -61,6 +61,9 @@ public String getRandomClassName() { return "GIOReader_" + result; } + public CustomProperties getProperties() { + return properties; + } // Generate Reader for Matrix public static class GenerateReaderMatrix extends GenerateReader { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java b/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java deleted file mode 100644 index e28c678a0c0..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/Hirschberg.java 
+++ /dev/null @@ -1,307 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.iogen; - -import org.apache.sysds.runtime.matrix.data.Pair; - -import java.util.ArrayList; -import java.util.Arrays; - -public class Hirschberg { - - public Pair, String> getLCS(String x, String y, int pxy, int pgap) { - int i, j; // initialising variables - int m = x.length(); // length of gene1 - int n = y.length(); // length of gene2 - - // table for storing optimal substructure answers - int dp[][] = new int[n + m + 1][n + m + 1]; - - for(int[] x1 : dp) - Arrays.fill(x1, 0); - - // initialising the table - for(i = 0; i <= (n + m); i++) { - dp[i][0] = i * pgap; - dp[0][i] = i * pgap; - } - - // calculating the minimum penalty - for(i = 1; i <= m; i++) { - for(j = 1; j <= n; j++) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; - } - else { - dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1] + pxy, dp[i - 1][j] + pgap), dp[i][j - 1] + pgap); - } - } - } - - // Reconstructing the solution - int l = n + m; // maximum possible length - i = m; - j = n; - int xpos = l; - int ypos = l; - - // Final answers for the respective strings - int xans[] = new int[l + 1]; - int yans[] 
= new int[l + 1]; - - while(!(i == 0 || j == 0)) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j - 1] + pxy == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j] + pgap == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) '_'; - i--; - } - else if(dp[i][j - 1] + pgap == dp[i][j]) { - xans[xpos--] = (int) '_'; - yans[ypos--] = (int) y.charAt(j - 1); - j--; - } - } - while(xpos > 0) { - if(i > 0) - xans[xpos--] = (int) x.charAt(--i); - else - xans[xpos--] = (int) '_'; - } - while(ypos > 0) { - if(j > 0) - yans[ypos--] = (int) y.charAt(--j); - else - yans[ypos--] = (int) '_'; - } - // Since we have assumed the answer to be n+m long, we need to remove the extra - // gaps in the starting id represents the index from which the arrays xans, yans are useful - int id = 1; - for(i = l; i >= 1; i--) { - if((char) yans[i] == '_' && (char) xans[i] == '_') { - id = i + 1; - break; - } - } - - StringBuilder sb = new StringBuilder(); - ArrayList pattern = new ArrayList<>(); - for(i = id; i <= l; i++) { - if(xans[i] == yans[i]) - sb.append((char) xans[i]); - else { - if(sb.length() > 0) - pattern.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if(sb.length() > 0) - pattern.add(sb.toString()); - - // System.out.println(""); - // for(i = id; i <= l; i++) - // System.out.print((char) yans[i]); - // - sb = new StringBuilder(); - for(int bi = id; bi <= l; bi++) { - if(xans[bi] == yans[bi]) { - sb.append((char) xans[bi]); - //System.out.print((char) xans[bi]); - } -// else -// System.out.print("*"); - } - //System.out.println(); - if(sb.length() > 0) { - // StringBuilder stringBuilder = new StringBuilder(); - // for (String s: pattern){ - // stringBuilder.append(s).append("_"); - // } - // if (stringBuilder.length()>0) - // 
stringBuilder.deleteCharAt(stringBuilder.length()-1); - return new Pair<>(pattern, sb.toString()); - } - else - return null; - } - - public Pair, String> getLCS(String x, String y) { - int i, j; // initialising variables - int m = x.length(); // length of gene1 - int n = y.length(); // length of gene2 - - // table for storing optimal substructure answers - int dp[][] = new int[n + m + 1][n + m + 1]; - - for(int[] x1 : dp) - Arrays.fill(x1, 0); - - // initialising the table - for(i = 0; i <= (n + m); i++) { - dp[i][0] = i; - dp[0][i] = i; - } - - // calculating the minimum penalty - for(i = 1; i <= m; i++) { - for(j = 1; j <= n; j++) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - dp[i][j] = dp[i - 1][j - 1]; - } - else { - dp[i][j] = Math.min(Math.min(dp[i - 1][j - 1], dp[i - 1][j]), dp[i][j - 1]); - } - } - } - - // Reconstructing the solution - int l = n + m; // maximum possible length - i = m; - j = n; - int xpos = l; - int ypos = l; - - // Final answers for the respective strings - int xans[] = new int[l + 1]; - int yans[] = new int[l + 1]; - - while(!(i == 0 || j == 0)) { - if(x.charAt(i - 1) == y.charAt(j - 1)) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j - 1] == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) y.charAt(j - 1); - i--; - j--; - } - else if(dp[i - 1][j] == dp[i][j]) { - xans[xpos--] = (int) x.charAt(i - 1); - yans[ypos--] = (int) '_'; - i--; - } - else if(dp[i][j - 1] == dp[i][j]) { - xans[xpos--] = (int) '_'; - yans[ypos--] = (int) y.charAt(j - 1); - j--; - } - } - while(xpos > 0) { - if(i > 0) - xans[xpos--] = (int) x.charAt(--i); - else - xans[xpos--] = (int) '_'; - } - while(ypos > 0) { - if(j > 0) - yans[ypos--] = (int) y.charAt(--j); - else - yans[ypos--] = (int) '_'; - } - // Since we have assumed the answer to be n+m long, we need to remove the extra - // gaps in the starting id represents the index from which the arrays xans, yans are 
useful - int id = 1; - for(i = l; i >= 1; i--) { - if((char) yans[i] == '_' && (char) xans[i] == '_') { - id = i + 1; - break; - } - } - - StringBuilder sb = new StringBuilder(); - ArrayList pattern = new ArrayList<>(); - for(i = id; i <= l; i++) { - if(xans[i] == yans[i]) - sb.append((char) xans[i]); - else { - if(sb.length() > 0) - pattern.add(sb.toString()); - sb = new StringBuilder(); - } - } - - if(sb.length() > 0) - pattern.add(sb.toString()); - - // System.out.println(""); - // for(i = id; i <= l; i++) - // System.out.print((char) yans[i]); - // - sb = new StringBuilder(); - for(int bi = id; bi <= l; bi++) { - if(xans[bi] == yans[bi]) { - sb.append((char) xans[bi]); - //System.out.print((char) xans[bi]); - } -// else -// System.out.print("*"); - } - //System.out.println(); - if(sb.length() > 0) { - return new Pair<>(pattern, sb.toString()); - } - else - return null; - } - - public ArrayList getLCS(ArrayList list, int pxy, int pgap) { - - if(list.size() < 2) - return null; - - String str1 = list.get(0); - String str2 = list.get(1); - - Pair, String> pattern = getLCS(str1, str2); - if(pattern != null) { - String intersect = pattern.getValue(); - ArrayList intersectPattern = pattern.getKey(); - for(int i = 2; i < list.size(); i++) { - pattern = getLCS(intersect, list.get(i)); - if(pattern != null) { - intersect = pattern.getValue(); - intersectPattern = pattern.getKey(); - } - else - intersect = null; - } - if(intersect != null) - return intersectPattern; - } - return null; - } -} - - diff --git a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java index 58159c754ec..80e17cad806 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/RawIndex.java @@ -266,11 +266,6 @@ private void extractNumericDotActualValues() { } private void extractNumericDotEActualValues() { - // if(this.dotEActualNumericValues == null) - // 
this.dotEActualNumericValues = new HashMap<>(); - // else - // return; - BitSet numericDotEBitSet = (BitSet) numberBitSet.clone(); numericDotEBitSet.or(dotBitSet); numericDotEBitSet.or(eBitSet); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java index 8b38064e679..a228993e229 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/TextTrie.java @@ -20,7 +20,6 @@ package org.apache.sysds.runtime.iogen; import com.google.gson.Gson; -import org.apache.sysds.runtime.instructions.spark.AppendRSPInstruction; import org.apache.sysds.runtime.matrix.data.Pair; import java.util.ArrayList; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java index f3a6bce692b..a3d0f18c3df 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/CodeGenTrie.java @@ -109,11 +109,7 @@ private void insert(CodeGenTrieNode root, String index, Types.ValueType valueTyp public String getJavaCode() { StringBuilder src = new StringBuilder(); int ncols = properties.getNcols(); - - MappingProperties.RepresentationProperties representation = properties.getMappingProperties().getRepresentationProperties(); MappingProperties.DataProperties data = properties.getMappingProperties().getDataProperties(); - MappingProperties.RecordProperties record = properties.getMappingProperties().getRecordProperties(); - RowIndexStructure.IndexProperties rowIndex = properties.getRowIndexStructure().getProperties(); ColIndexStructure.IndexProperties colIndex = properties.getColIndexStructure().getProperties(); @@ -149,6 +145,7 @@ else if(rowIndex == RowIndexStructure.IndexProperties.Identity && if(this.isMatrix){ src.append("try{ \n"); src.append(destination).append("(row, col, 
Double.parseDouble(strIndexValue[1])); \n"); + src.append("lnnz++;\n"); src.append("} catch(Exception e){"+destination+"(row, col, 0d);} \n"); } else { diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java index 7e1a5b18422..ca441c27be2 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/FrameCodeGen.java @@ -71,7 +71,7 @@ public String generateCodeJava() { src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); src.append("boolean flag = true; \n"); - src.append("while(flag) { \n"); + src.append("while(flag || sb.length() > 0) { \n"); src.append("flag = reader.next(key, value); \n"); src.append("if(flag) { \n"); src.append("ri++; \n"); @@ -91,8 +91,10 @@ public String generateCodeJava() { src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); src.append("} \n"); src.append("} \n"); - src.append("else \n"); + src.append("else {\n"); src.append("str = sb.toString(); \n"); + src.append("sb = new StringBuilder();\n"); + src.append("}"); } else { src.append("while(reader.next(key, value)) { \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java index edd5966c821..752fe781b74 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/codegen/MatrixCodeGen.java @@ -94,7 +94,7 @@ public String generateCodeJava() { src.append("int beginIndex = splitInfo.getRecordIndexBegin(0); \n"); src.append("int endIndex = splitInfo.getRecordIndexEnd(0); \n"); src.append("boolean flag = true; \n"); - src.append("while(flag) { \n"); + src.append("while(flag || sb.length() > 0) { \n"); src.append("flag = reader.next(key, value); 
\n"); src.append("if(flag) { \n"); src.append("ri++; \n"); @@ -114,8 +114,10 @@ public String generateCodeJava() { src.append("endIndex = splitInfo.getRecordIndexEnd(row+1); \n"); src.append("} \n"); src.append("} \n"); - src.append("else \n"); + src.append("else {\n"); src.append("str = sb.toString(); \n"); + src.append("sb = new StringBuilder();\n"); + src.append("}"); } else { src.append("while(reader.next(key, value)) { \n"); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java index 8afb6217491..5a3a53f8b73 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReader.java @@ -115,16 +115,18 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index ArrayList> endIndexes; int tokenLength = 0; + boolean diffBeginEndToken = false; if(!_props.getRowIndexStructure().getSeqBeginString().equals(_props.getRowIndexStructure().getSeqEndString())) { endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); tokenLength = _props.getRowIndexStructure().getSeqEndString().length(); + diffBeginEndToken = true; } else { endIndexes = new ArrayList<>(); for(int i = 1; i < beginIndexes.size(); i++) endIndexes.add(beginIndexes.get(i)); } - + beginIndexes.remove(beginIndexes.size()-1); int i = 0; int j = 0; while(i < beginIndexes.size() && j < endIndexes.size()) { @@ -144,7 +146,7 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index j++; nrows++; } - if(i == beginIndexes.size() && j < endIndexes.size()) + if(!diffBeginEndToken && i == beginIndexes.size() && j < endIndexes.size()) nrows++; if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) splitInfo.setRemainString(""); @@ -214,7 +216,4 @@ protected 
void readFrameFromHDFS(TextInputFormat informat, InputSplit[] splits, protected abstract int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; - - - } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java index 6e4a5ea6954..2f2dce62313 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/FrameGenerateReaderParallel.java @@ -45,7 +45,6 @@ import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; -import java.util.HashSet; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Future; @@ -123,6 +122,24 @@ private FrameBlock computeSizeAndCreateOutputFrameBlock(Types.ValueType[] schema } pool.shutdown(); } + if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist || + _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist) { + ArrayList tasks = new ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new IOUtilFunctions.CountRowsTask(split, informat, job, false)); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); + int i = 0; + for(Future rc : pool.invokeAll(tasks)) { + int lnrow = (int) rc.get().longValue(); // incl error handling + _offsets.setOffsetPerSplit(i, _rLen); + _offsets.setLenghtPerSplit(i, lnrow); + i++; + } + pool.shutdown(); + } else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { ArrayList tasks = new ArrayList<>(); for(InputSplit split : splits) @@ -173,21 +190,6 @@ else 
if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index InputStreamInputFormat informat = new InputStreamInputFormat(is); InputSplit[] splits = informat.getSplits(null, 1); FrameBlock ret = computeSizeAndCreateOutputFrameBlock(schema, names, splits, null, rlen, clen); - // - // // core read (sequential/parallel) - // - // - // ReadTask rt = new ReadTask(splits[0], informat, ret, 1) - // - // //readFrameFromInputSplit(split, informat, null, ret, schema, names, rlen, clen, 0, true); - // - // ArrayList tasks = new ArrayList<>(); - // int splitCount = 0; - // for (InputSplit split : splits) { - // tasks.add( new ReadTask(split, informat, dest, splitCount++). ); - // } - // pool.invokeAll(tasks); - // pool.shutdown(); // TODO: implement parallel reader for input stream return ret; } @@ -229,7 +231,8 @@ public ReadTask(InputSplit split, TextInputFormat informat, FrameBlock dest, int _splitCount = splitCount; } - @Override public Long call() throws IOException { + @Override + public Long call() throws IOException { RecordReader reader = _informat.getRecordReader(_split, job, Reporter.NULL); LongWritable key = new LongWritable(); Text value = new Text(); @@ -255,23 +258,26 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, _endString = endString; } - @Override public TemplateUtil.SplitInfo call() throws Exception { + @Override + public TemplateUtil.SplitInfo call() throws Exception { TemplateUtil.SplitInfo splitInfo = new TemplateUtil.SplitInfo(); int nrows = 0; ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); ArrayList> endIndexes; int tokenLength = 0; + boolean diffBeginEndToken = false; if(!_beginString.equals(_endString)) { endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); tokenLength = _endString.length(); + diffBeginEndToken = true; } else { endIndexes = new ArrayList<>(); for(int i = 1; i < 
beginIndexes.size(); i++) endIndexes.add(beginIndexes.get(i)); } - + beginIndexes.remove(beginIndexes.size()-1); int i = 0; int j = 0; while(i < beginIndexes.size() && j < endIndexes.size()) { @@ -291,7 +297,7 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, j++; nrows++; } - if(i == beginIndexes.size() && j < endIndexes.size()) + if(!diffBeginEndToken && i == beginIndexes.size() && j < endIndexes.size()) nrows++; if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) splitInfo.setRemainString(""); @@ -317,7 +323,6 @@ public CountSeqScatteredRowsTask(InputSplit split, TextInputFormat inputFormat, } } - protected abstract int readFrameFromHDFS(RecordReader reader, LongWritable key, Text value, FrameBlock dest, int rowPos, TemplateUtil.SplitInfo splitInfo) throws IOException; } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java index b9d81a55205..2db51736f93 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReader.java @@ -71,11 +71,11 @@ public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int bl MatrixBlock ret; if(rlen >= 0 && clen >= 0 && _props.getRowIndexStructure().getProperties() != RowIndexStructure.IndexProperties.SeqScatter) { - if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || - _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ){ - clen++; - rlen ++; - } +// if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || +// _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ){ +// //clen++; +// //rlen ++; +// } ret = createOutputMatrixBlock(rlen, clen, (int) 
rlen, estnnz, !_props.isSparse(), _props.isSparse()); } else @@ -131,16 +131,18 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index ArrayList> endIndexes; int tokenLength = 0; + boolean diffBeginEndToken = false; if(!_props.getRowIndexStructure().getSeqBeginString().equals(_props.getRowIndexStructure().getSeqEndString())) { endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(inputSplit, informat, job, _props.getRowIndexStructure().getSeqEndString()); tokenLength = _props.getRowIndexStructure().getSeqEndString().length(); + diffBeginEndToken = true; } else { endIndexes = new ArrayList<>(); for(int i = 1; i < beginIndexes.size(); i++) endIndexes.add(beginIndexes.get(i)); } - + beginIndexes.remove(beginIndexes.size()-1); int i = 0; int j = 0; while(i < beginIndexes.size() && j < endIndexes.size()) { @@ -160,7 +162,7 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index j++; nrows++; } - if(i == beginIndexes.size() && j < endIndexes.size()) + if(!diffBeginEndToken && i == beginIndexes.size() && j < endIndexes.size()) nrows++; if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) splitInfo.setRemainString(""); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java index 552f8684d4e..848b30ed054 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/MatrixGenerateReaderParallel.java @@ -119,6 +119,24 @@ private MatrixBlock computeSizeAndCreateOutputMatrixBlock(InputSplit[] splits, P } pool.shutdown(); } + else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist || + _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist) { + ArrayList tasks = new 
ArrayList<>(); + for(InputSplit split : splits) + tasks.add(new IOUtilFunctions.CountRowsTask(split, informat, job, false)); + + // collect row counts for offset computation + // early error notify in case not all tasks successful + _offsets = new TemplateUtil.SplitOffsetInfos(tasks.size()); + int i = 0; + for(Future rc : pool.invokeAll(tasks)) { + int lnrow = (int) rc.get().longValue(); // incl error handling + _offsets.setOffsetPerSplit(i, _rLen); + _offsets.setLenghtPerSplit(i, lnrow); + i++; + } + pool.shutdown(); + } else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.SeqScatter) { ArrayList tasks = new ArrayList<>(); for(InputSplit split : splits) @@ -157,14 +175,6 @@ else if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.Index _cLen = (int) clen; } } - - // allocate target matrix block based on given size; - // need to allocate sparse as well since lock-free insert into target - if(_props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.RowWiseExist || - _props.getRowIndexStructure().getProperties() == RowIndexStructure.IndexProperties.CellWiseExist ){ - _rLen++; - _cLen++; - } long estnnz2 = (estnnz < 0) ? 
(long) _rLen * _cLen : estnnz; return createOutputMatrixBlock(_rLen, _cLen, blen, estnnz2, !_props.isSparse(), _props.isSparse()); } @@ -191,8 +201,7 @@ private void readMatrixFromHDFS(TextInputFormat informat, InputSplit[] splits, P for (InputSplit split : splits) { tasks.add( new ReadTask(split, informat, dest, splitCount++) ); } - pool.invokeAll(tasks); - pool.shutdown(); + CommonThreadPool.invokeAndShutdown(pool, tasks); // check return codes and aggregate nnz long lnnz = 0; @@ -258,15 +267,18 @@ public TemplateUtil.SplitInfo call() throws Exception { ArrayList> beginIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _beginString); ArrayList> endIndexes; int tokenLength = 0; + boolean diffBeginEndToken = false; if(!_beginString.equals(_endString)) { endIndexes = TemplateUtil.getTokenIndexOnMultiLineRecords(_split, _inputFormat, _jobConf, _endString); tokenLength = _endString.length(); + diffBeginEndToken = true; } else { endIndexes = new ArrayList<>(); for(int i = 1; i < beginIndexes.size(); i++) endIndexes.add(beginIndexes.get(i)); } + beginIndexes.remove(beginIndexes.size()-1); int i = 0; int j = 0; @@ -287,7 +299,7 @@ public TemplateUtil.SplitInfo call() throws Exception { j++; nrows++; } - if(i == beginIndexes.size() && j < endIndexes.size()) + if(!diffBeginEndToken && i == beginIndexes.size() && j < endIndexes.size()) nrows++; if(beginIndexes.get(0).getKey() == 0 && beginIndexes.get(0).getValue() == 0) splitInfo.setRemainString(""); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java index 0d9a0d7b0a8..9ccd8657fd5 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/template/TemplateUtil.java @@ -152,6 +152,7 @@ public static ArrayList> getTokenIndexOnMultiLineRecords( while(true); ri++; } + result.add(new Pair<>(ri, 0)); return 
result; } @@ -188,8 +189,6 @@ public static String getStringChunkOfBufferReader(BufferedReader br, String rema else return null; } - - protected int getColIndex(HashMap colKeyPatternMap, String key){ return colKeyPatternMap.getOrDefault(key, -1); } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java index 71cc092b0e5..637762ea534 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java @@ -39,7 +39,7 @@ public void test1() { sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // 2. dataset contain different value types @@ -48,7 +48,7 @@ public void test2() { sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } @Test @@ -56,7 +56,7 @@ public void test3() { sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } @Test @@ -64,7 +64,7 @@ public void test4() { sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } @Test @@ -72,7 +72,7 @@ public void 
test5() { sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.FP64, Types.ValueType.STRING}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // CSV with empty values @@ -81,76 +81,6 @@ public void test6() { sampleRaw = "1,2,a,,c\n" + "6,,aa,bb,cc\n" + ",12,13,14,15"; data = new String[][] {{"1", "2", ""}, {"6", "0", "bb"}, {"0", "12", "14"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } - - // LibSVM - // with in-order col indexes and numeric col indexes - @Test - public void test7() { - sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + - "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; - - data = new String[][] {{"1", "10", "20", "30", "0", "", ""}, - {"-1", "0", "0", "0", "40", "50", "60"}, - {"1", "101", "201", "0", "0", "", ""}, - {"-1", "0", "0", "0", "0", "", "601"}, - {"-1", "0", "0", "0", "0", "501", ""}, - {"1", "0", "0", "301", "0", "", ""}}; - - schema = new Types.ValueType[] {Types.ValueType.FP32, Types.ValueType.INT32, Types.ValueType.INT64, - Types.ValueType.FP32, Types.ValueType.FP64, Types.ValueType.STRING, Types.ValueType.STRING}; - runGenerateReaderTest(); - } - - @Test - public void test8() { - sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:a 6:b\n" + "+1 1:101 2:201 \n" + - "-1 6:c \n" + "-1 5:d\n" + "+1 3:301"; - - data = new String[][] {{"1", "10", "20", "30", "0", "", ""}, - {"-1", "0", "0", "0", "40", "a", "b"}, - {"1", "101", "201", "0", "0", "", ""}, - {"-1", "0", "0", "0", "0", "", "c"}, - {"-1", "0", "0", "0", "0", "d", ""}, - {"1", "0", "0", "301", "0", "", ""}}; - - schema = new Types.ValueType[] {Types.ValueType.FP32, Types.ValueType.INT32, Types.ValueType.INT64, - Types.ValueType.FP32, Types.ValueType.FP64, 
Types.ValueType.STRING, Types.ValueType.STRING}; - runGenerateReaderTest(); - } - - // MatrixMarket(MM) - //MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) - @Test - public void test9() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n"+ "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + - "4,5,504\n" + "5,3,305"; - data = new String[][] {{"10","20","30"}, - {"101","201",""}, - {"0","0",""}, - {"104", "0", ""}, - {"0", "0", "305"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.FP64, Types.ValueType.STRING}; - runGenerateReaderTest(); - } - -// @Test -// public void test10() { -// sampleRaw = "30,\"Stationary wave solutions of a system of reaction-diffusion equations derived from the Fitzhugh-Nagumo equations\",1984,\"SIAM Journal on Applied Mathematics\",\"\",Gene A. Klaasen\",\"William C. Troy\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. of Tennessee, Knoxville\",\"Univ. 
of Pittsburgh, Pittsburgh, PA\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + -// "31,\"Up and running: the small business computer implementation cookbook\",1984,\"Up and running: the small business computer implementation cookbook\",\"\",Jess W. Curry, Jr.\",\"David M. 
Bonner\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Arthur Young\",\"Arthur Young\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + -// "32,\"Proc. 
IFIP working conference on Programming Languages and System Design\",1983,\"Proc. IFIP working conference on Programming Languages and System Design\",\"\",J Bormann\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Technical Univ. of Dresden, East Germany\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + -// "33,\"Fast automatic liveness analysis of hierarchical parallel systems\",1983,\"Proc. IFIP working conference on Programming Languages and System Design\",\"\",Johannes Rohrich\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. Karlsruhe, Karlsruhe, W. Germany\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,\n" + -// "34,\"Concatenable type declarations: their applications and implementaion\",1983,\"Proc. IFIP working conference on Programming Languages and System Design\",\"\",A Kreczmar\",\"A Salwicki\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"Univ. of Warsaw, Poland\",\"Univ. 
of Warsaw, Poland\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",\"\",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"; -// data = new String[][] {{"30","Stationary wave solutions of a system of reaction-diffusion equations derived from the Fitzhugh-Nagumo equations","1984", "SIAM Journal on Applied Mathematics",""}, -// {"31","Up and running: the small business computer implementation cookbook","1984", "Up and running: the small business computer implementation cookbook", ""}, -// {"32","Proc. IFIP working conference on Programming Languages and System Design","1983", "Proc. 
IFIP working conference on Programming Languages and System Design",""}, -// {"33","Fast automatic liveness analysis of hierarchical parallel systems","1983", "Proc. IFIP working conference on Programming Languages and System Design",""}, -// {"34","Concatenable type declarations: their applications and implementaion","1983", "Proc. IFIP working conference on Programming Languages and System Design",""} -// }; -// schema = new Types.ValueType[] {Types.ValueType.INT64, Types.ValueType.STRING, Types.ValueType.FP32, Types.ValueType.STRING, Types.ValueType.STRING}; -// runGenerateReaderTest(); -// } - - } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java index 3a6197454a1..787801b6f2d 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java @@ -20,16 +20,8 @@ package org.apache.sysds.test.functions.iogen; import org.apache.sysds.common.Types; -import org.apache.sysds.lops.Lop; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.EXP.Util; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONObject; import org.junit.Test; -import java.io.IOException; - public class FrameSingleRowNestedTest extends GenerateReaderFrameTest { private final static String TEST_NAME = "FrameSingleRowNestedTest"; @@ -49,7 +41,7 @@ public void test1() { data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //2. 
flat object, out-of-order values, contain different value types @@ -61,7 +53,7 @@ public void test2() { data = new String[][] {{"1", "string"}, {"6", "string2"}, {"11", "string3"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //3. nested object with unique attribute names @Test @@ -71,7 +63,7 @@ public void test3() { "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; data = new String[][] {{"1", "2", "5"}, {"6", "7", "10"}, {"11", "12", "15"}}; schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //5. nested object with repeated attribute names, out-of-order @@ -83,82 +75,6 @@ public void test5() { schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64, Types.ValueType.FP32, Types.ValueType.INT64}; data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; - runGenerateReaderTest(); - } - - @Test - public void test6() { - sampleRaw = "{\"index\":207,\"name\":\"Nuno Guimarães\",\"affiliations\":[\"ISCTEUniversity Institute of Lisbon, Lisbon, Portugal\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"mental state\",\"mental workload\",\"higher mental workload\",\"mental load\",\"mental workload evaluation\",\"mental workload pattern\",\"ecological reading situation\",\"reading condition\",\"visual user interface\",\"EEG signal\"]}\n"+ - "{\"index\":208,\"name\":\" Nguyen Minh Nhut\",\"affiliations\":[\"Data Mining Department, Institute for Infocomm Research (I2R), 1 Fusionopolis Way, Connexis (South Tower), Singapore 138632\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"system health monitoring\",\"sensor node\",\"adaptive classification system architecture\",\"effective health monitoring 
system\",\"proposed system\",\"real-time adaptive classification system\",\"adaptive sampling frequency\",\"different sampling\",\"different sampling rate\",\"individual sensor\"]}\n\n"+ - "{\"index\":209,\"name\":\"Louis Janus\",\"affiliations\":[\"\"],\"paperCount\":1,\"citationNumber\":0,\"hIndex\":0.0,\"researchInterests\":[\"language instruction\"]}"; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64, - Types.ValueType.FP32, Types.ValueType.INT64}; - data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; - runGenerateReaderTest(); - } - - @Test - public void test7() { - String key = "\\aaa\""; - key = key.replace("\\\"", Lop.OPERAND_DELIMITOR); - key = key.replace("\\", "\\\\"); - key = key.replace(Lop.OPERAND_DELIMITOR,"\\\""); - //System.out.println(key.length()); - - StringBuilder src = new StringBuilder(); - src.append("index = str.indexOf(\"" + - key.replace("\\\"", "\"").replace("\"", "\\\"") - - + "\"); \n"); - - System.out.println(src); -// sampleRaw = "{\n\"a\":1,\n\"b\":2,\n\"c\":3,\n\"d\":4,\n\"e\":5\n}\n" + -// "{\"a\":6,\n\"b\":7,\"c\":8,\"d\":9,\"e\":10\n}\n" + -// "{\"a\":11,\"b\":12,\n\"c\":13,\"d\":14,\"e\":15\n}"; -// -// data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; -// schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; -// runGenerateReaderTest(); - } - - @Test - public void test8() throws Exception { - //java -Xms15g -Xmx15g -Dparallel=true -cp ./lib/*:./SystemDS.jar org.apache.sysds.runtime.iogen.EXP.GIOFrame - String dpath = "/home/sfathollahzadeh/Documents/GitHub/papers/2022-vldb-GIO/Experiments/"; - String sampleRawFileName = dpath+"data/message-hl7/F173/sample-message-hl7200.raw"; - String sampleFrameFileName = dpath+"data/message-hl7/F173/sample-message-hl7200.frame"; - String sampleRawDelimiter = "\t"; - String schemaFileName = dpath+"data/message-hl7/F173/message-hl7.schema"; - 
String dataFileName = dpath+"data/message-hl7.dat"; - boolean parallel = true; - long rows = -1; - Util util = new Util(); - - // read and parse mtd file - String mtdFileName = dataFileName + ".mtd"; - try { - String mtd = util.readEntireTextFile(mtdFileName); - mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = mtd.toLowerCase().trim(); - JSONObject jsonObject = new JSONObject(mtd); - if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); - } catch (Exception exception) {} - - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); - FrameReader fr = gr.getReader(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); - for(int r=0; r< 10; r++) { - for(int c = 0; c < frameBlock.getNumColumns(); c++) - System.out.print(c+":"+frameBlock.get(r,c)+" "); - System.out.println(); - } - - int a = 100; + runGenerateReaderTest(false); } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java index 9b711057223..ebe0cfe1e64 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java @@ -57,7 +57,6 @@ protected String[][] generateRandomData(Types.ValueType[] types, int nrows, int } protected String getRandomString(int length) { - //String alphabet1 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890"; String alphabet = 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; StringBuilder salt = new StringBuilder(); Random rnd = new Random(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java index 34af434013b..e785eb9bb90 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java @@ -25,6 +25,7 @@ import org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.FrameBlock; +import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.runtime.util.UtilFunctions; import org.apache.sysds.test.AutomatedTestBase; import org.apache.sysds.test.TestConfiguration; @@ -49,8 +50,7 @@ public abstract class GenerateReaderFrameTest extends AutomatedTestBase { Types.ValueType.INT32, Types.ValueType.INT64, Types.ValueType.FP32, - Types.ValueType.FP64//, - // Types.ValueType.BOOLEAN + Types.ValueType.FP64 }; protected abstract String getTestName(); @@ -151,7 +151,7 @@ else if(types[rnt].isNumeric() || types[rnt] == Types.ValueType.BOOLEAN) } } @SuppressWarnings("unused") - protected void runGenerateReaderTest() { + protected void runGenerateReaderTest(boolean parallel) { Types.ExecMode oldPlatform = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; @@ -175,12 +175,16 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "frame_data.raw"; int clen = data[0].length; writeRawString(sampleRaw, dataPath); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, true); + GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); FrameReader fr = gr.getReader(); FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath, schema, data.length, clen); - 
int a = 100; + String[][] expected = DataConverter.convertToStringFrame(sampleFrame); + String[][] actual = DataConverter.convertToStringFrame(frameBlock); + + TestUtils.compareFrames(expected, actual, sampleFrame.getNumRows(), sampleFrame.getNumColumns()); + } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index 2b11bc7dd5c..b3f4b1b7bc6 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -23,7 +23,6 @@ import org.apache.sysds.common.Types; import org.apache.sysds.conf.CompilerConfig; import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.iogen.FormatIdentifying; import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.MatrixBlock; import org.apache.sysds.runtime.util.DataConverter; @@ -66,7 +65,7 @@ protected void generateRandomSymmetric(int size, double min, double max, double } @SuppressWarnings("unused") - protected void runGenerateReaderTest() { + protected void runGenerateReaderTest(boolean parallel) { Types.ExecMode oldPlatform = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; @@ -90,19 +89,13 @@ protected void runGenerateReaderTest() { String dataPath = HOME + "matrix_data.raw"; int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); -// FormatIdentifying formatIdentifying = new FormatIdentifying(sampleRaw, sampleMB); -// myTest mt = new myTest(formatIdentifying.getFormatProperties()); -// mt.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); -// int a = 100; GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, true); MatrixReader mr = gr.getReader(); + //mmm m3 = new 
mmm(gr.getProperties()); + //MatrixBlock matrixBlock = m3.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); - -// TestUtils.compareMatrices(sampleMB, matrixBlock, 0); - - int a = 100; - + TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } catch(Exception exception) { exception.printStackTrace(); diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java deleted file mode 100644 index 3bc13086f66..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/FrameGenerateReaderCSVTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen.Identify; - -import org.apache.sysds.test.functions.iogen.GenerateReaderFrameTest; -import org.junit.Test; - -public class FrameGenerateReaderCSVTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameGenerateReaderCSVTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - private void extractSampleRawCSV(String separator) { - int nrows = data.length; - int ncols = data[0].length; - StringBuilder sb = new StringBuilder(); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - sb.append(data[r][c]); - if(c != ncols - 1) - sb.append(separator); - } - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test - public void test1() { - String[] naStrings = {}; - String separator = ","; - generateRandomData(10, 5, 1, 100, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test2() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test3() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "****"; - generateRandomData(100, 500, -10, 10, 1, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test4() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ","; - generateRandomData(10, 10, -10, 10, 0.7, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test5() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = ",,,,"; - generateRandomData(10, 10, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test6() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - 
generateRandomData(1000, 100, -10, 10, 0.4, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test7() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(1000, 100, -10, 10, 0.8, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } - - @Test - public void test8() { - String[] naStrings = {"NULL", "inf", "NaN"}; - String separator = "**"; - generateRandomData(10000, 100, -10, 10, 0.5, naStrings); - extractSampleRawCSV(separator); - runGenerateReaderTest(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java deleted file mode 100644 index c99555d051a..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/Identify/MatrixGRRowColIdentifyTest.java +++ /dev/null @@ -1,349 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen.Identify; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.io.FrameReaderJSONJackson; -import org.apache.sysds.runtime.io.FrameReaderJSONL; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.iogen.EXP.Util; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.test.functions.iogen.GenerateReaderMatrixTest; -import org.junit.Test; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Random; - -public class MatrixGRRowColIdentifyTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixGenerateReaderCSVTest"; - - @Override protected String getTestName() { - return TEST_NAME; - } - - private void generateRandomCSV(int nrows, int ncols, double min, double max, double sparsity, String separator, - String[] naString) { - - sampleMatrix = getRandomMatrix(nrows, ncols, min, max, sparsity, 714); - StringBuilder sb = new StringBuilder(); - - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - if(sampleMatrix[r][c] != 0) { - row.append(sampleMatrix[r][c]).append(separator); - } - else { - Random rn = new Random(); - int rni = rn.nextInt(naString.length); - row.append(naString[rni]).append(separator); - } - } - - sb.append(row.substring(0, row.length() - separator.length())); - if(r != nrows - 1) - sb.append("\n"); - } - sampleRaw = sb.toString(); - } - - @Test public void test1() { - sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(); - } - - @Test public void test2() { - sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; - sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - 
runGenerateReaderTest(); - } - - @Test public void test3() { - sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; - sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - runGenerateReaderTest(); - } - - @Test public void test4() { - String[] naString = {"NaN"}; - generateRandomCSV(20, 20, -10, 10, 1, ",", naString); - runGenerateReaderTest(); - } - - @Test public void test5() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"name\":3,\"password\":4}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; - runGenerateReaderTest(); - } - - @Test public void test6() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"occupation\":19, \"user\":{\"name\":20,\"password\":21}}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; - runGenerateReaderTest(); - } - - @Test public void test7() { - sampleRaw = "{\"name\":1, \"occupation\":2, \"user\":{\"password\":4, \"name\":3}}\n" + "{\"name\":6, \"occupation\":7, \"user\":{\"name\":8,\"password\":9}}\n" + "{\"name\":10, \"occupation\":11, \"user\":{\"name\":12,\"password\":13}}\n" + "{\"name\":14, \"occupation\":15, \"user\":{\"name\":16,\"password\":17}}\n" + "{\"name\":18, \"user\":{\"name\":20,\"password\":21}, \"occupation\":19}"; - sampleMatrix = new double[][] {{2, 3}, {7, 8}, {11, 12}, {15, 16}, {19, 20}}; - 
runGenerateReaderTest(); - } - - @Test public void test8() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "2,1,40\n" + "2,2,50\n" + "2,3,60\n" + "3,1,70\n" + "3,2,80\n" + "3,3,90\n"; - - sampleMatrix = new double[][] {{10, 20, 30}, {40, 50, 60}, {70, 80, 90}}; - runGenerateReaderTest(); - } - - @Test public void test9() { - sampleRaw = "
\n" + // 0 - "1\n" + //1 - "2\n" + // 2 - "3\n" + // 3 - "1980\n" + // 4 - "GIO\n" + // 5 - "
\n" + // 6 - "
\n" + // 7 - "10\n" + // 8 - "20\n" + // 9 - "30\n" + // 10 - "2000\n" + // 11 - "GIO2\n" + // 12 - "
\n" + // 13 - "
\n" + // 14 - "2010\n" + // 15 - "100\n" + // 16 - "200\n" + // 17 - "300\n" + // 18 - "800\n" + // 18 - "GIO3\n" + // 19 - "
\n" + // 20 - "
\n" + // 21 - "1000\n" + // 22 - "2000\n" + // 23 - "3000\n" + // 24 - "2222\n" + // 25 - "GIO4\n" + // 26 - "
"; // 27 - - sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 20, 30, 2000}, {100, 200, 300, 2010}, - {1000, 2000, 3000, 2222}}; - runGenerateReaderTest(); - } - - @Test public void test10() { - sampleRaw = "
\n" + "1980 \n" + "1 \n" + "2 \n" + "3 \n" + "GIO \n" + "
\n" + " \n" + "10 \n" + "21 \n" + "30 \n" + "2000 \n" + "GIO2 \n" + "\n" + " \n" + "100 \n" + "300 \n" + "210 \n" + "GIO3 \n" + "200 \n" + "\n" + "
\n" + "2222 \n" + "1000 \n" + "2000 \n" + "3000 \n" + "GIO4 \n" + "
"; - - sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 21, 30, 2000}, {100, 200, 300, 2010}, - {1000, 2000, 3000, 2222}}; - runGenerateReaderTest(); - } - - @Test public void test11() { - sampleRaw = "#index 1\n" + "#t 2,3\n" + "#s 1980\n" + "#index 10\n\n" + "#t 21,30\n" + "#s 2000\n\n" + "#index 100\n" + "#t 200,300\n" + "#s 2222"; - - sampleMatrix = new double[][] {{1, 2, 3, 1980}, {10, 21, 30, 2000}, {100, 200, 300, 2010}, - {1000, 2000, 3000, 2222}}; - runGenerateReaderTest(); - } - - @Test public void test12() { - // sampleRaw = "#index 1\n" + - // "#t 2,3\n" + - // "#s 1980\n"+ - // "#index 10\n\n" + - // "#t 21,30\n" + - // "#s 2000\n\n"+ - // "#index 100\n" + - // "#t 200,300\n" + - // "#s 2222"; - // - // sampleMatrix = new double[][] {{1,2,3}, {10,21,30}, {100,200,300},{1000,2000,3000}}; - // runGenerateReaderTest(); - - StringBuilder sb = new StringBuilder( - " ,)R2I( hcraeseR mmocofnI rof etutitsnI ,tnemtrapeD gniniM ataD\"[:\"snoitailiffa\",\"tuhN hniM neyugN \":\"eman\",802:\"xedni\"{"); - System.out.println(sb.reverse()); - } - - // @Test - // public void test13() throws Exception { - // String sampleRawFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.raw"; - // String sampleFrameFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/sample_aminer_author1000_5.frame"; - // Integer sampleNRows = 1000; - // String delimiter = "\\t"; - // String schemaFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/Q2/aminer_author_5.schema"; - // String dataFileName = "/home/saeed/Documents/Dataset/GIODataset/aminer/csv/aminer_author.data"; - // - // Float percent = 7f;//Float.parseFloat(args[6]); - // String datasetName = "aminer_paper";//args[7]; - // String LOG_HOME ="/home/saeed/Documents/ExpLog";//args[8]; - // - // if(delimiter.equals("\\t")) - // delimiter = "\t"; - // - // Util util = new Util(); - // Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - // int ncols = 
sampleSchema.length; - // - // ArrayList newSampleSchema = new ArrayList<>(); - // ArrayList> newSampleFrame = new ArrayList<>(); - // - // String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleNRows, ncols, delimiter); - // - // for(int c = 0; c < sampleFrameStrings[0].length; c++) { - // HashSet valueSet = new HashSet<>(); - // for(int r=0; r0){ - // ArrayList tempList = new ArrayList<>(); - // for(int r=0; r newSampleSchema = new ArrayList<>(); - ArrayList> newSampleFrame = new ArrayList<>(); - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, delimiter,ncols); - - for(int c = 0; c < sampleFrameStrings[0].length; c++) { - HashSet valueSet = new HashSet<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) - valueSet.add(sampleFrameStrings[r][c]); - if(valueSet.size() > 1) { - ArrayList tempList = new ArrayList<>(); - for(int r = 0; r < sampleFrameStrings.length; r++) { - tempList.add(sampleFrameStrings[r][c]); - } - newSampleFrame.add(tempList); - newSampleSchema.add(sampleSchema[c]); - } - } - - sampleFrameStrings = new String[newSampleFrame.get(0).size()][newSampleFrame.size()]; - - for(int row = 0; row < sampleFrameStrings.length; row++) { - for(int col = 0; col < sampleFrameStrings[0].length; col++) { - sampleFrameStrings[row][col] = newSampleFrame.get(col).get(row); - } - } - - sampleSchema = new Types.ValueType[newSampleSchema.size()]; - for(int i = 0; i < newSampleSchema.size(); i++) - sampleSchema[i] = newSampleSchema.get(i); - - //String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, ncols, delimiter); - - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, false); - FrameReader fr =gr.getReader(); - - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, -1, 
sampleSchema.length); - - } - } - - @Test public void test14() throws Exception { -// FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); -// -// String FILENAME_SINGLE = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/tpch-json/Q3/sample-tpch-json200.raw"; -// Types.ValueType[] schema = {Types.ValueType.STRING,Types.ValueType.STRING,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64,Types.ValueType.FP64}; -// -// Map schemaMap = new HashMap<>(); -// schemaMap.put("/returnFlag",0); -// schemaMap.put("/lineStatus",1); -// schemaMap.put("/quantity",2); -// schemaMap.put("/extendedPrice",3); -// schemaMap.put("/discount",4); -// schemaMap.put("/tax",5); -// // Read FrameBlock -// FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(FILENAME_SINGLE, schema, schemaMap, -1, schema.length); -// -// int a = 100; - - String schemaFileName ="/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schema"; - String schemaMapFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/F10/twitter-json.schemaMap"; - String dataFileName = "/home/saeed/Documents/Github/papers/2022-vldb-GIO/Experiments/data/twitter-json/twitter-json.data"; - long nrows = 1000; - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - Map schemaMap = util.getSchemaMap(schemaMapFileName); - - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java index 339d08c9aec..b56a803cf04 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java +++ 
b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java @@ -30,17 +30,12 @@ protected String getTestName() { return TEST_NAME; } - // JSON Dataset + // XML Dataset //1. flat object, in-order values @Test public void test1() { sampleRaw = "\n" + "1\n" + - "\n" + - "70\n" + - "85\n" + - "90\n" + - ""+ "2\n" + "3\n" + "\n" + @@ -55,7 +50,7 @@ public void test1() { "9\n" + ""; sampleMatrix = new double[][] {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //2. flat object, out-of-order values @@ -65,7 +60,7 @@ public void test2() { "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //3. nested object with unique attribute names @Test @@ -74,7 +69,7 @@ public void test3() { "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //4. nested object with unique attribute names, out-of-order @@ -84,7 +79,7 @@ public void test4() { "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //5. nested object with repeated attribute names, out-of-order @@ -94,7 +89,7 @@ public void test5() { "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // XML @@ -106,7 +101,7 @@ public void test6() { "
678910
\n" + "
1112131415
"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //6. nested object with unique attribute names, in-order @@ -117,7 +112,7 @@ public void test7() { "678910\n" + "1112131415"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //7. nested object with unique attribute names, in-order @@ -128,7 +123,7 @@ public void test8() { "671980DB910\n" + "11122012CEP1415\n"; sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } @Test @@ -159,8 +154,18 @@ public void test9() { "#t 39\n" + "#c 310\n" + "#% 311\n" + - "#% 500"; - sampleMatrix = new double[][] {{1,12,13,14,15},{2,22,23,24,25},{3,32,33,34,35}}; - runGenerateReaderTest(); + "#% 500\n"+ + "\n" + + "#index 4\n" + + "#* 42\n" + + "#@ 43;44;45\n" + + "#o 46;47;48\n" + + "#t 49\n" + + "#c 410\n" + + "#% 411\n" + + "#% 600"; + + sampleMatrix = new double[][] {{1,12,13,14,15},{2,22,23,24,25},{3,32,33,34,35}, {4,42,43,44,45}}; + runGenerateReaderTest(false); } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java index bfc6812763e..6514d74a9dc 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java @@ -19,251 +19,144 @@ package org.apache.sysds.test.functions.iogen; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.Test; -import java.io.IOException; - public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { private final static String TEST_NAME = "MatrixSingleRowFlatTest"; - @Override - protected String 
getTestName() { + @Override protected String getTestName() { return TEST_NAME; } // CSV Dataset // 1. matrix and dataset are dense and "," is delim - @Test - public void test1() { + @Test public void test1() { sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // 2. matrix and dataset are dense and ",a" is delim - @Test - public void test2() { + @Test public void test2() { sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //3. matrix and dataset are dense and ",," is delim - @Test - public void test3() { + @Test public void test3() { sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; - sampleMatrix = new double[][] {{1, 3, 5}, {6, 8, 10}, {11, 13,15}}; - runGenerateReaderTest(); + sampleMatrix = new double[][] {{1, 3, 5}, {6, 8, 10}, {11, 13, 15}}; + runGenerateReaderTest(false); } //4. matrix and dataset contain empty/0 values and "," is delim - @Test - public void test4() { + @Test public void test4() { sampleRaw = "1,2,,4,5\n" + ",7,8,9,10\n" + "11,12,,,\n" + "13,14,,,16"; sampleMatrix = new double[][] {{1, 2, 5}, {0, 7, 10}, {11, 12, 0}, {13, 14, 16}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // LibSVM //5. 
LibSVM with in-order col indexes and numeric col indexes - @Test - public void test5() { - sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + - "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, - {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(); + @Test public void test5() { + sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, + {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(false); } //6. LibSVM with out-of-order col indexes and numeric col indexes - @Test - public void test6() { - sampleRaw = "+1 3:30 1:10 2:20\n" + "-1 5:50 6:60 4:40\n" + "+1 1:101 2:201 \n" + - "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, - {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(); + @Test public void test6() { + sampleRaw = "+1 3:30 1:10 2:20\n" + "-1 5:50 6:60 4:40\n" + "+1 1:101 2:201 \n" + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, + {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(false); } //7. 
Special LibSVM with in-order col indexes and none-numeric col indexes // a -> 1, b->2, c->3, d->4, e->5, f->6 - @Test - public void test7() { - sampleRaw = "+1 a:10 b:20 c:30\n" + "-1 d:40 e:50 f:60\n" + "+1 a:101 b:201 \n" + - "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, - {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(); + @Test public void test7() { + sampleRaw = "+1 a:10 b:20 c:30\n" + "-1 d:40 e:50 f:60\n" + "+1 a:101 b:201 \n" + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, + {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(false); } //8. Special LibSVM with out-of-order col indexes and none-numeric col indexes // a -> 1, b->2, c->3, d->4, e->5, f->6 - @Test - public void test8() { - sampleRaw = "+1 c:30 a:10 b:20\n" + "-1 e:50 f:60 d:40\n" + "+1 a:101 b:201 \n" + - "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, - {-1, 0, 0, 0, 0, 0, 601}, {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(); + @Test public void test8() { + sampleRaw = "+1 c:30 a:10 b:20\n" + "-1 e:50 f:60 d:40\n" + "+1 a:101 b:201 \n" + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; + sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, + {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; + runGenerateReaderTest(false); } // MatrixMarket(MM) //9. MM with inorder dataset, (RowIndex,Col Index,Value). 
Row & Col begin index: (1,1) - @Test - public void test9() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n"+ "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + - "4,5,504\n" + "5,3,305"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test9() { + sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + "4,5,504\n" + "5,3,305"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } //10. MM with inorder dataset, (RowIndex,Col Index,Value). Row begin index: Row & Col begin index: (0,1) - @Test - public void test10() { - sampleRaw = "0,1,10\n" + "0,2,20\n" + "0,3,30\n"+ "0,5,50\n" + "1,1,101\n" + "1,2,201\n" + "3,1,104\n" + - "3,5,504\n" + "4,3,305"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test10() { + sampleRaw = "0,1,10\n" + "0,2,20\n" + "0,3,30\n" + "0,5,50\n" + "1,1,101\n" + "1,2,201\n" + "3,1,104\n" + "3,5,504\n" + "4,3,305"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } //11. MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,0) - @Test - public void test11() { - sampleRaw = "1,0,10\n" + "1,1,20\n" + "1,2,30\n"+ "1,4,50\n" + "2,0,101\n" + "2,1,201\n" + "4,0,104\n" + - "4,4,504\n" + "5,2,305"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test11() { + sampleRaw = "1,0,10\n" + "1,1,20\n" + "1,2,30\n" + "1,4,50\n" + "2,0,101\n" + "2,1,201\n" + "4,0,104\n" + "4,4,504\n" + "5,2,305"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } //12. 
MM with inorder dataset, (RowIndex,Col Index,Value). Row begin index: Row & Col begin index: (0,0) - @Test - public void test12() { - sampleRaw = "0,0,10\n" + "0,1,20\n" + "0,2,30\n"+ "0,4,50\n" + "1,0,101\n" + "1,1,201\n" + "3,0,104\n" + - "3,4,504\n" + "4,2,305"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test12() { + sampleRaw = "0,0,10\n" + "0,1,20\n" + "0,2,30\n" + "0,4,50\n" + "1,0,101\n" + "1,1,201\n" + "2,0,104\n" + "2,4,504\n" + "3,2,305"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } //13. MM with out-of-order dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) - @Test - public void test13() { - sampleRaw = "4,5,504\n" + "1,2,20\n" + "1,1,10\n" + "2,1,101\n" + "1,3,30\n"+ "1,5,50\n" + "2,2,201\n" + "4,1,104\n" + - "5,3,305"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test13() { + sampleRaw = "4,5,504\n" + "1,2,20\n" + "1,1,10\n" + "2,1,101\n" + "1,3,30\n" + "1,5,50\n" + "2,2,201\n" + "4,1,104\n" + "5,3,305"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } //14. MM with out-of-order dataset, (ColIndex,Row Index, Value). 
Row & Col begin index: (1,1) - @Test - public void test14() { - sampleRaw = "5,4,504\n" + "2,1,20\n" + "1,1,10\n" + "1,2,101\n" + "3,1,30\n"+ "5,1,50\n" + "2,2,201\n" + "1,4,104\n" + - "3,5,305\n"+"2,4,204"; - sampleMatrix = new double[][] {{10,20,30}, {101,201,0}, {0,0,0},{104, 204, 0}, {0, 0, 305}}; - runGenerateReaderTest(); + @Test public void test14() { + sampleRaw = "5,4,504\n" + "2,1,20\n" + "1,1,10\n" + "1,2,101\n" + "3,1,30\n" + "5,1,50\n" + "2,2,201\n" + "1,4,104\n" + "3,5,305\n" + "2,4,204"; + sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 204, 0}, {0, 0, 305}}; + runGenerateReaderTest(false); } - - //========================= - @Test public void test15() { sampleRaw = "1,2,3,4\n" + "5,6,7,8\n" + "9,10,11,12\n" + "13,14,15,16"; - sampleMatrix = new double[][] {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16}}; - runGenerateReaderTest(); + sampleMatrix = new double[][] {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; + runGenerateReaderTest(false); } @Test public void test16() { sampleRaw = "1,2,3,0\n" + "5,0,7,8\n" + "9,0,0,12\n" + "13,14,0,0"; - sampleMatrix = new double[][] {{1,2,3,0}, {5,0,7,8}, {9,0,0,12}, {13,14,0,0}}; - runGenerateReaderTest(); + sampleMatrix = new double[][] {{1, 2, 3, 0}, {5, 0, 7, 8}, {9, 0, 0, 12}, {13, 14, 0, 0}}; + runGenerateReaderTest(false); } @Test public void test17() { - sampleRaw = "1:10 2:20 3:30\n" + "4:40 5:50\n" + "1:60 2:70 3:80\n" + "4:90 5:100"; - sampleMatrix = new double[][] {{10,20,30,0,0}, {0,0,0,40,50}, {60,70,80,0,0}, {0,0,0,90,100}}; - runGenerateReaderTest(); + sampleRaw = "0:10 1:20 2:30\n" + "3:40 4:50\n" + "0:60 1:70 2:80\n" + "3:90 4:100"; + sampleMatrix = new double[][] {{10, 20, 30, 0, 0}, {0, 0, 0, 40, 50}, {60, 70, 80, 0, 0}, {0, 0, 0, 90, 100}}; + runGenerateReaderTest(false); } @Test public void test18() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,4,40\n" + "2,2,20\n"+ "2,3,30\n"+ "2,4,40\n"+ "3,3,30\n"+ "3,4,40\n"+ "4,4,40\n"; - 
sampleMatrix = new double[][] {{10,20,30,40}, {0,20,30,40}, {0,0,30,40}, {0,0,0,40}}; - runGenerateReaderTest(); - } - - @Test public void test19() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,4,40\n" + "2,2,50\n"+ "2,3,60\n"+ "2,4,70\n"+ "3,3,80\n"+ "3,4,90\n"+ "4,4,100\n"; - sampleMatrix = new double[][] {{10,20,30,40}, {20,50,60,70}, {30,60,80,90}, {40,70,90,10}}; - runGenerateReaderTest(); - } - - @Test public void test20() { - sampleRaw = "1,1\n" + "1,2\n" + "1,3\n" + "1,4\n" + "2,2\n"+ "2,3\n"+ "2,4\n"+ "3,3\n"+ "3,4\n"+ "4,4\n"; - sampleMatrix = new double[][] {{10,10,10,10}, {0,10,10,10}, {0,0,10,10}, {0,0,0,10}}; - runGenerateReaderTest(); + sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,4,40\n" + "2,2,20\n" + "2,3,30\n" + "2,4,40\n" + "3,3,30\n" + "3,4,40\n" + "4,4,40\n"; + sampleMatrix = new double[][] {{10, 20, 30, 40}, {0, 20, 30, 40}, {0, 0, 30, 40}, {0, 0, 0, 40}}; + runGenerateReaderTest(false); } - - - - @Test public void test180() { - String jsonInString = "{\"a\":1, \"b\":2}\n" + "{\"d\":1, \"e\":2}"; - try { - final ObjectMapper mapper = new ObjectMapper(); - mapper.readTree(jsonInString); - System.out.println("Yes"); - } catch (IOException e) { - System.out.println("No"); - } - } - - - -// @Test -// public void test15() { -// sampleRaw = "0,1,2,3\n" + "10,0,20,30\n" + "100,200,0,300\n"+"1000,2000,3000,0"; -// sampleMatrix = new double[][] {{0,1,2,3}, {10,0,20,30}, {100,200,300,0},{1000,2000,3000,0}}; -// runGenerateReaderTest(); -// } -// -// //upper-triangular -// @Test -// public void test16() { -// sampleRaw = "1,2,3,4\n" + "0,20,30,40\n" + "0,0,300,400\n"+"0,0,0,4000"; -// sampleMatrix = new double[][] {{1,2,3,4}, {0,20,30,40}, {0,0,300,400},{0,0,0,4000}}; -// runGenerateReaderTest(); -// } -// -// //lower-triangular -// @Test -// public void test17() { -// sampleRaw = "1,0,0,0\n" + "10,20,0,0\n" + "100,200,300,0\n"+"1000,2000,3000,4000"; -// sampleMatrix = new double[][] {{1,0,0,0}, {10,20,0,0}, 
{100,200,300,0},{1000,2000,3000,4000}}; -// runGenerateReaderTest(); -// } -// -// //symmetric -// @Test -// public void test19() { -// sampleRaw = "1,2,3,4\n" + "2,2,4,5\n" + "3,4,3,6\n"+"4,5,6,4"; -// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; -// runGenerateReaderTest(); -// } -// -// //symmetric-upper -// @Test -// public void test20() { -// sampleRaw = "1,2,3,4\n" + "0,2,4,5\n" + "0,0,3,6\n"+"0,0,0,4"; -// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; -// runGenerateReaderTest(); -// } -// -// //symmetric-lower -// @Test -// public void test21() { -// sampleRaw = "1,0,0,0\n" + "2,2,0,0\n" + "3,4,3,0\n"+"4,5,6,4"; -// sampleMatrix = new double[][] {{1,2,3,4}, {2,2,4,5}, {3,4,3,6},{4,5,6,4}}; -// runGenerateReaderTest(); -// } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java index 856d626c835..42b25fe1b7a 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java @@ -38,7 +38,7 @@ public void test1() { "{\"a\":6,\"b\":7,\"c\":8,\"d\":9,\"e\":10}\n" + "{\"a\":11,\"b\":12,\"c\":13,\"d\":14,\"e\":15}"; sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //2. flat object, out-of-order values @@ -48,7 +48,7 @@ public void test2() { "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //3. 
nested object with unique attribute names @Test @@ -57,7 +57,7 @@ public void test3() { "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //4. nested object with unique attribute names, out-of-order @@ -67,7 +67,7 @@ public void test4() { "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //5. nested object with repeated attribute names, out-of-order @@ -77,7 +77,7 @@ public void test5() { "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } // XML @@ -89,7 +89,7 @@ public void test6() { "
678910
\n" + "
1112131415
"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //6. nested object with unique attribute names, in-order @@ -100,7 +100,7 @@ public void test7() { "678910\n" + "1112131415"; sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } //7. nested object with unique attribute names, in-order @@ -111,6 +111,6 @@ public void test8() { "671980DB910\n" + "11122012CEP1415\n"; sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; - runGenerateReaderTest(); + runGenerateReaderTest(false); } } From 5ee9d77da0e156900969d3b8b8fb5f481d6e7dac Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 27 Jul 2022 03:23:54 +0200 Subject: [PATCH 77/84] minor --- .../sysds/test/functions/iogen/GenerateReaderMatrixTest.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java index b3f4b1b7bc6..a77baf1de88 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java @@ -90,10 +90,8 @@ protected void runGenerateReaderTest(boolean parallel) { int clen = sampleMatrix[0].length; writeRawString(sampleRaw, dataPath); - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, true); + GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); MatrixReader mr = gr.getReader(); - //mmm m3 = new mmm(gr.getProperties()); - //MatrixBlock matrixBlock = m3.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, 
sampleMB.getNumRows(), clen, -1, -1); TestUtils.compareMatrices(sampleMB, matrixBlock, 0); } From 46f3c1bb282e8ade93aa021a419431fcc49de977 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 27 Jul 2022 03:32:15 +0200 Subject: [PATCH 78/84] Remove benchmark codes from GIO PR --- .../sysds/runtime/io/FrameReaderJSONGson.java | 140 ------------ .../io/FrameReaderJSONGsonParallel.java | 110 --------- .../runtime/io/FrameReaderJSONJackson.java | 142 ------------ .../io/FrameReaderJSONJacksonParallel.java | 110 --------- .../sysds/runtime/iogen/EXP/GIOFrame.java | 49 ---- .../iogen/EXP/GIOFrameIdentification.java | 33 --- .../sysds/runtime/iogen/EXP/GIOMatrix.java | 42 ---- .../iogen/EXP/GIOMatrixIdentification.java | 25 --- .../sysds/runtime/iogen/EXP/SystemDS.java | 212 ------------------ .../sysds/runtime/iogen/EXP/SystemDSGson.java | 43 ---- .../runtime/iogen/EXP/SystemDSJackson.java | 43 ---- .../runtime/iogen/EXP/SystemDSJson4j.java | 42 ---- .../apache/sysds/runtime/iogen/EXP/Util.java | 112 --------- 13 files changed, 1103 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java delete mode 100644 
src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java delete mode 100644 src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java delete mode 100755 src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java deleted file mode 100644 index 3a2560b76dc..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGson.java +++ /dev/null @@ -1,140 +0,0 @@ -package org.apache.sysds.runtime.io; - -import com.google.gson.JsonArray; -import com.google.gson.JsonElement; -import com.google.gson.JsonObject; -import com.google.gson.JsonParser; -import com.google.gson.JsonPrimitive; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static org.apache.sysds.runtime.io.FrameReader.checkValidInputFile; -import static org.apache.sysds.runtime.io.FrameReader.createOutputFrameBlock; -import static org.apache.sysds.runtime.io.FrameReader.createOutputSchema; - -public class FrameReaderJSONGson -{ - public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] 
schema, Map schemaMap, - long rlen, long clen) throws IOException, DMLRuntimeException - { - //prepare file access - JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fileSystem = IOUtilFunctions.getFileSystem(path, jobConf); - FileInputFormat.addInputPath(jobConf, path); - - //check existence and non-empty file - checkValidInputFile(fileSystem, path); - - Types.ValueType[] lschema = createOutputSchema(schema, clen); - String[] lnames = createOutputNamesFromSchemaMap(schemaMap); - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - readJSONLFrameFromHDFS(path, jobConf, fileSystem, ret, schema, schemaMap); - return ret; - } - - - protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, - Types.ValueType[] schema, Map schemaMap) throws IOException - { - TextInputFormat inputFormat = new TextInputFormat(); - inputFormat.configure(jobConf); - InputSplit[] splits = inputFormat.getSplits(jobConf, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - - for (int i = 0, rowPos = 0; i < splits.length; i++) { - rowPos = readJSONLFrameFromInputSplit(splits[i], inputFormat, jobConf, schema, schemaMap, dest, rowPos); - } - } - - - protected static int readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, - JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) - throws IOException - { - RecordReader reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - - int row = currentRow; - try { - while (reader.next(key, value)) { - JsonParser jsonParser = new JsonParser(); - JsonElement root= jsonParser.parse(value.toString()); - Map map = new HashMap<>(); - addKeys("", root, map, new ArrayList<>()); - for (Map.Entry entry : schemaMap.entrySet()) { - String strCellValue = map.get(entry.getKey()); - 
if(strCellValue!=null){ - dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); - } - } - row++; - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - } - - private static void addKeys(String currentPath, JsonElement jsonNode, Map map, List suffix) { - - if (jsonNode.isJsonObject()) { - JsonObject jsonObject = (JsonObject) jsonNode; - Set> iter = jsonObject.entrySet(); - String pathPrefix = currentPath.isEmpty() ? "" : currentPath + "/"; - for(Map.Entry entry: iter){ - addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, suffix); - } - } else if (jsonNode.isJsonArray()) { - JsonArray arrayNode = (JsonArray) jsonNode; - for (int i = 0; i < arrayNode.size(); i++) { - suffix.add(i + 1); - addKeys(currentPath+"-"+i, arrayNode.get(i), map, suffix); - if (i + 1 (); - } - JsonPrimitive valueNode = (JsonPrimitive) jsonNode; - map.put(currentPath, valueNode.getAsString()); - } - } - - - private String[] createOutputNamesFromSchemaMap(Map schemaMap) { - String[] names = new String[schemaMap.size()]; - schemaMap.forEach((key, value) -> names[value] = key); - return names; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java deleted file mode 100644 index 5bea3627f7c..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONGsonParallel.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.io; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types; -import org.apache.sysds.hops.OptimizerUtils; -import org.apache.sysds.runtime.io.IOUtilFunctions.CountRowsTask; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.CommonThreadPool; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; - -public class FrameReaderJSONGsonParallel extends FrameReaderJSONGson -{ - @Override - protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, - FrameBlock dest, Types.ValueType[] schema, Map schemaMap) - throws IOException - { - int numThreads = OptimizerUtils.getParallelTextReadParallelism(); - - TextInputFormat inputFormat = new TextInputFormat(); - inputFormat.configure(jobConf); - InputSplit[] splits = inputFormat.getSplits(jobConf, numThreads); - splits = IOUtilFunctions.sortInputSplits(splits); - - try{ - ExecutorService executorPool = CommonThreadPool.get(Math.min(numThreads, splits.length)); - - //compute num rows per split - ArrayList countRowsTasks = new ArrayList<>(); - for (InputSplit split : splits){ - countRowsTasks.add(new CountRowsTask(split, inputFormat, 
jobConf)); - } - List> ret = executorPool.invokeAll(countRowsTasks); - - //compute row offset per split via cumsum on row counts - long offset = 0; - List offsets = new ArrayList<>(); - for( Future rc : ret ) { - offsets.add(offset); - offset += rc.get(); - } - - //read individual splits - ArrayList readRowsTasks = new ArrayList<>(); - for( int i=0; i{ - private InputSplit _split; - private TextInputFormat _inputFormat; - private JobConf _jobConf; - private FrameBlock _dest; - Map _schemaMap; - private int _offset; - - public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, - FrameBlock dest, Map schemaMap, int offset) - { - _split = split; - _inputFormat = inputFormat; - _jobConf = jobConf; - _dest = dest; - _schemaMap = schemaMap; - _offset = offset; - } - - @Override - public Object call() throws Exception { - readJSONLFrameFromInputSplit(_split, _inputFormat, _jobConf, _dest.getSchema(), _schemaMap, _dest, _offset); - return null; - } - } -} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java deleted file mode 100644 index a86bc8d4dad..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJackson.java +++ /dev/null @@ -1,142 +0,0 @@ -package org.apache.sysds.runtime.io; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.node.ArrayNode; -import com.fasterxml.jackson.databind.node.ObjectNode; -import com.fasterxml.jackson.databind.node.ValueNode; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; 
-import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import static org.apache.sysds.runtime.io.FrameReader.*; - - -public class FrameReaderJSONJackson -{ - public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, Map schemaMap, - long rlen, long clen) throws IOException, DMLRuntimeException - { - //prepare file access - JobConf jobConf = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fileSystem = IOUtilFunctions.getFileSystem(path, jobConf); - FileInputFormat.addInputPath(jobConf, path); - - //check existence and non-empty file - checkValidInputFile(fileSystem, path); - - Types.ValueType[] lschema = createOutputSchema(schema, clen); - String[] lnames = createOutputNamesFromSchemaMap(schemaMap); - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - readJSONLFrameFromHDFS(path, jobConf, fileSystem, ret, schema, schemaMap); - return ret; - } - - - protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, FrameBlock dest, - Types.ValueType[] schema, Map schemaMap) throws IOException - { - TextInputFormat inputFormat = new TextInputFormat(); - inputFormat.configure(jobConf); - InputSplit[] splits = inputFormat.getSplits(jobConf, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - - for (int i = 0, rowPos = 0; i < splits.length; i++) { - rowPos = readJSONLFrameFromInputSplit(splits[i], inputFormat, jobConf, schema, schemaMap, dest, rowPos); - } - } - - - protected static int 
readJSONLFrameFromInputSplit(InputSplit split, InputFormat inputFormat, - JobConf jobConf, Types.ValueType[] schema, Map schemaMap, FrameBlock dest, int currentRow) - throws IOException - { - RecordReader reader = inputFormat.getRecordReader(split, jobConf, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - - int row = currentRow; - try { - while (reader.next(key, value)) { - ObjectMapper mapper = new ObjectMapper(); - JsonNode root = mapper.readTree(value.toString()); - Map map = new HashMap<>(); - addKeys("", root, map, new ArrayList<>()); - for (Map.Entry entry : schemaMap.entrySet()) { - String strCellValue = map.get(entry.getKey()); - if(strCellValue!=null){ - try { - dest.set(row, entry.getValue(), UtilFunctions.stringToObject(schema[entry.getValue()], strCellValue)); - } - catch(Exception e){} - - } - } - row++; - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - return row; - } - private static void addKeys(String currentPath, JsonNode jsonNode, Map map, List suffix) { - if (jsonNode.isObject()) { - ObjectNode objectNode = (ObjectNode) jsonNode; - Iterator> iter = objectNode.fields(); - String pathPrefix = currentPath.isEmpty() ? 
"" : currentPath + "/"; - - while (iter.hasNext()) { - Map.Entry entry = iter.next(); - addKeys(pathPrefix + entry.getKey(), entry.getValue(), map, suffix); - } - } else if (jsonNode.isArray()) { - ArrayNode arrayNode = (ArrayNode) jsonNode; - for (int i = 0; i < arrayNode.size(); i++) { - suffix.add(i + 1); - addKeys(currentPath+"-"+i, arrayNode.get(i), map, suffix); - if (i + 1 (); - } - ValueNode valueNode = (ValueNode) jsonNode; - map.put("/"+currentPath, valueNode.asText()); - } - } - - private String[] createOutputNamesFromSchemaMap(Map schemaMap) { - String[] names = new String[schemaMap.size()]; - schemaMap.forEach((key, value) -> names[value] = key); - return names; - } -} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java deleted file mode 100644 index 9b3d5de7841..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONJacksonParallel.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.runtime.io; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types; -import org.apache.sysds.hops.OptimizerUtils; -import org.apache.sysds.runtime.io.IOUtilFunctions.CountRowsTask; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.CommonThreadPool; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; - -public class FrameReaderJSONJacksonParallel extends FrameReaderJSONJackson -{ - @Override - protected void readJSONLFrameFromHDFS(Path path, JobConf jobConf, FileSystem fileSystem, - FrameBlock dest, Types.ValueType[] schema, Map schemaMap) - throws IOException - { - int numThreads = OptimizerUtils.getParallelTextReadParallelism(); - - TextInputFormat inputFormat = new TextInputFormat(); - inputFormat.configure(jobConf); - InputSplit[] splits = inputFormat.getSplits(jobConf, numThreads); - splits = IOUtilFunctions.sortInputSplits(splits); - - try{ - ExecutorService executorPool = CommonThreadPool.get(Math.min(numThreads, splits.length)); - - //compute num rows per split - ArrayList countRowsTasks = new ArrayList<>(); - for (InputSplit split : splits){ - countRowsTasks.add(new CountRowsTask(split, inputFormat, jobConf)); - } - List> ret = executorPool.invokeAll(countRowsTasks); - - //compute row offset per split via cumsum on row counts - long offset = 0; - List offsets = new ArrayList<>(); - for( Future rc : ret ) { - offsets.add(offset); - offset += rc.get(); - } - - //read individual splits - ArrayList readRowsTasks = new ArrayList<>(); - for( int i=0; i{ - private InputSplit _split; - private TextInputFormat 
_inputFormat; - private JobConf _jobConf; - private FrameBlock _dest; - Map _schemaMap; - private int _offset; - - public ReadRowsTask(InputSplit split, TextInputFormat inputFormat, JobConf jobConf, - FrameBlock dest, Map schemaMap, int offset) - { - _split = split; - _inputFormat = inputFormat; - _jobConf = jobConf; - _dest = dest; - _schemaMap = schemaMap; - _offset = offset; - } - - @Override - public Object call() throws Exception { - readJSONLFrameFromInputSplit(_split, _inputFormat, _jobConf, _dest.getSchema(), _schemaMap, _dest, _offset); - return null; - } - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java deleted file mode 100644 index 84504a596e1..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrame.java +++ /dev/null @@ -1,49 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONObject; - -public class GIOFrame { - - public static void main(String[] args) throws Exception { - String sampleRawFileName; - String sampleFrameFileName; - String sampleRawDelimiter; - String schemaFileName; - String dataFileName; - boolean parallel; - long rows = -1; - - sampleRawFileName = System.getProperty("sampleRawFileName"); - sampleFrameFileName = System.getProperty("sampleFrameFileName"); - sampleRawDelimiter = "\t"; - schemaFileName = System.getProperty("schemaFileName"); - dataFileName = System.getProperty("dataFileName"); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - Util util = new Util(); - - // read and parse mtd file - String mtdFileName = dataFileName + ".mtd"; - try { - String mtd = util.readEntireTextFile(mtdFileName); - mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = 
mtd.toLowerCase().trim(); - JSONObject jsonObject = new JSONObject(mtd); - if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); - } catch (Exception exception) {} - - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - String[][] sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); - FrameReader fr = gr.getReader(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataFileName, sampleSchema, rows, sampleSchema.length); - - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java deleted file mode 100644 index 0b1b8f00941..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOFrameIdentification.java +++ /dev/null @@ -1,33 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; - -public class GIOFrameIdentification { - - public static void main(String[] args) throws Exception { - String sampleRawFileName; - String sampleFrameFileName; - String sampleRawDelimiter; - String schemaFileName; - boolean parallel; - - sampleRawFileName = System.getProperty("sampleRawFileName"); - sampleFrameFileName = System.getProperty("sampleFrameFileName"); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - sampleRawDelimiter = "\t"; - - schemaFileName = System.getProperty("schemaFileName"); - Util util = new Util(); - Types.ValueType[] sampleSchema = util.getSchema(schemaFileName); - int ncols = sampleSchema.length; - - String[][] 
sampleFrameStrings = util.loadFrameData(sampleFrameFileName, sampleRawDelimiter, ncols); - FrameBlock sampleFrame = new FrameBlock(sampleSchema, sampleFrameStrings); - - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); - gr.getReader(); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java deleted file mode 100644 index 0aab12731e1..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrix.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.wink.json4j.JSONObject; - -public class GIOMatrix { - - public static void main(String[] args) throws Exception { - String sampleRawFileName; - String sampleMatrixFileName; - String sampleRawDelimiter; - String dataFileName; - boolean parallel; - long rows = -1; - - sampleRawFileName = System.getProperty("sampleRawFileName"); - sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); - sampleRawDelimiter = "\t"; - dataFileName = System.getProperty("dataFileName"); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - Util util = new Util(); - // read and parse mtd file - String mtdFileName = dataFileName + ".mtd"; - try { - String mtd = util.readEntireTextFile(mtdFileName); - mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = mtd.toLowerCase().trim(); - JSONObject jsonObject = new JSONObject(mtd); - if (jsonObject.containsKey("rows")) rows = jsonObject.getLong("rows"); - } catch (Exception exception) {} - - - MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); - String sampleRaw = 
util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); - MatrixReader matrixReader = gr.getReader(); - MatrixBlock matrixBlock = matrixReader.readMatrixFromHDFS(dataFileName, rows, sampleMB.getNumColumns(), -1, -1); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java deleted file mode 100644 index ba56bc2a9f8..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/GIOMatrixIdentification.java +++ /dev/null @@ -1,25 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; - -public class GIOMatrixIdentification { - - public static void main(String[] args) throws Exception { - String sampleRawFileName; - String sampleMatrixFileName; - String sampleRawDelimiter; - boolean parallel; - sampleRawFileName = System.getProperty("sampleRawFileName"); - sampleMatrixFileName = System.getProperty("sampleMatrixFileName"); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - sampleRawDelimiter = "\t"; - - Util util = new Util(); - MatrixBlock sampleMB = util.loadMatrixData(sampleMatrixFileName, sampleRawDelimiter); - String sampleRaw = util.readEntireTextFile(sampleRawFileName); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); - gr.getReader(); - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java deleted file mode 100644 index 5a0be36d1ff..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDS.java +++ /dev/null @@ -1,212 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import 
org.apache.sysds.runtime.io.*; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONException; -import org.apache.wink.json4j.JSONObject; - -import java.io.IOException; -import java.util.Map; - -public class SystemDS { - - public static void main(String[] args) throws IOException, JSONException { - - String schemaFileName; - String dataFileName; - String dataType = null; - String valueType; - String sep = null; - String indSep = null; - boolean header = false; - long cols = -1; - long rows = -1; - String format = null; - String config = null; - String schemaMapFileName = null; - boolean parallel; - Types.ValueType[] schema; - - Util util = new Util(); - schemaFileName = System.getProperty("schemaFileName"); - dataFileName = System.getProperty("dataFileName"); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - // read and parse mtd file - String mtdFileName = dataFileName + ".mtd"; - try { - String mtd = util.readEntireTextFile(mtdFileName); - mtd = mtd.replace("\n", "").replace("\r", ""); - mtd = mtd.trim(); - JSONObject jsonObject = new JSONObject(mtd); - if(jsonObject.containsKey("data_type")) - dataType = jsonObject.getString("data_type"); - - if(jsonObject.containsKey("value_type")) - valueType = jsonObject.getString("value_type"); - - if(jsonObject.containsKey("format")) - format = jsonObject.getString("format"); - - if(jsonObject.containsKey("cols")) - cols = jsonObject.getLong("cols"); - - if(jsonObject.containsKey("rows")) - rows = jsonObject.getLong("rows"); - - if(jsonObject.containsKey("header")) - header = jsonObject.getBoolean("header"); - - if(jsonObject.containsKey("schema_path")) - schemaFileName = jsonObject.getString("schema_path"); - - if(jsonObject.containsKey("sep")) - sep = jsonObject.getString("sep"); - - if(jsonObject.containsKey("indSep")) - indSep = jsonObject.getString("indSep"); - - } - catch(Exception exception) { - - } - - if(dataType.equalsIgnoreCase("matrix")) { - MatrixReader 
matrixReader = null; - if(!parallel) { - switch(format) { - case "csv": - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - matrixReader = new ReaderTextCSV(propertiesCSV); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "libsvm": - FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); - matrixReader = new ReaderTextLIBSVM(propertiesLIBSVM); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "mm": - matrixReader = new ReaderTextCell(Types.FileFormat.MM, true); - break; - } - } - else { - switch(format) { - case "csv": - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - matrixReader = new ReaderTextCSVParallel(propertiesCSV); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "libsvm": - FileFormatPropertiesLIBSVM propertiesLIBSVM = new FileFormatPropertiesLIBSVM(sep, indSep, false); - matrixReader = new ReaderTextLIBSVMParallel(propertiesLIBSVM); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - break; - case "mm": - matrixReader = new ReaderTextCellParallel(Types.FileFormat.MM); - break; - } - } - if(matrixReader == null) - throw new IOException("The Matrix Reader is NULL: " + dataFileName + ", format: " + format); - matrixReader.readMatrixFromHDFS(dataFileName, rows, cols, -1, -1); - } - else { - - FrameBlock frameBlock = null; - if(!parallel) { - switch(format) { - case "csv": - schema = util.getSchema(schemaFileName); - cols = schema.length; - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - FrameReader frameReader = new FrameReaderTextCSV(propertiesCSV); - frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); - break; - case "json": - schema = util.getSchema(schemaFileName); - cols = schema.length; - schemaMapFileName = 
System.getProperty("schemaMapFileName"); - Map schemaMap = util.getSchemaMap(schemaMapFileName); - config = System.getProperty("config"); - switch(config.toLowerCase()) { - case "gson": - FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); - frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - - case "jackson": - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - frameBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - case "json4j": - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - default: - throw new IOException("JSON Config don't support!!" + config); - } - break; - - case "aminer-author": - FileFormatPropertiesAMiner propertiesAMinerAuthor = new FileFormatPropertiesAMiner("author"); - FrameReader frAuthor = new FrameReaderTextAMiner(propertiesAMinerAuthor); - frameBlock = frAuthor.readFrameFromHDFS(dataFileName, null, null, -1,-1); - break; - case "aminer-paper": - FileFormatPropertiesAMiner propertiesAMinerPaper = new FileFormatPropertiesAMiner("paper"); - FrameReader frPaper = new FrameReaderTextAMiner(propertiesAMinerPaper); - frameBlock = frPaper.readFrameFromHDFS(dataFileName, null, null, -1,-1); - break; - } - } - else { - switch(format) { - case "csv": - schema = util.getSchema(schemaFileName); - cols = schema.length; - FileFormatPropertiesCSV propertiesCSV = new FileFormatPropertiesCSV(header, sep, false); - FrameReader frameReader = new FrameReaderTextCSVParallel(propertiesCSV); - frameBlock = frameReader.readFrameFromHDFS(dataFileName, schema, rows, cols); - break; - case "json": - schema = util.getSchema(schemaFileName); - cols = schema.length; - schemaMapFileName = System.getProperty("schemaMapFileName"); - Map schemaMap = util.getSchemaMap(schemaMapFileName); - config = 
System.getProperty("config"); - switch(config.toLowerCase()) { - case "gson": - FrameReaderJSONGsonParallel frameReaderJSONGson = new FrameReaderJSONGsonParallel(); - frameBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - - case "jackson": - FrameReaderJSONJacksonParallel frameReaderJSONJackson = new FrameReaderJSONJacksonParallel(); - frameBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - case "json4j": - FrameReaderJSONLParallel frameReaderJSONL = new FrameReaderJSONLParallel(); - frameBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, rows, cols); - break; - default: - throw new IOException("JSON Config don't support!!" + config); - } - break; - case "aminer-author": - FileFormatPropertiesAMiner propertiesAMinerAuthor = new FileFormatPropertiesAMiner("author"); - FrameReader frAuthor = new FrameReaderTextAMinerParallel(propertiesAMinerAuthor); - frameBlock = frAuthor.readFrameFromHDFS(dataFileName, null, null, -1,-1); - break; - case "aminer-paper": - FileFormatPropertiesAMiner propertiesAMinerPaper = new FileFormatPropertiesAMiner("paper"); - FrameReader frPaper = new FrameReaderTextAMinerParallel(propertiesAMinerPaper); - frameBlock = frPaper.readFrameFromHDFS(dataFileName, null, null, -1,-1); - break; - } - } - - } - - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java deleted file mode 100644 index 206f6003992..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSGson.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReaderJSONGson; -import org.apache.sysds.runtime.io.FrameReaderJSONGsonParallel; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import 
org.apache.wink.json4j.JSONException; - -import java.io.IOException; -import java.util.Map; - -public class SystemDSGson { - - public static void main(String[] args) throws IOException, JSONException { - - String schemaFileName; - String schemaMapFileName; - String dataFileName; - long nrows; - boolean parallel; - - schemaFileName = System.getProperty("schemaFileName"); - schemaMapFileName = System.getProperty("schemaMapFileName"); - dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - Map schemaMap = util.getSchemaMap(schemaMapFileName); - - if(!parallel) { - FrameReaderJSONGson frameReaderJSONGson = new FrameReaderJSONGson(); - FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - else { - FrameReaderJSONGsonParallel frameReaderJSONGson = new FrameReaderJSONGsonParallel(); - FrameBlock readBlock = frameReaderJSONGson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java deleted file mode 100644 index 23a4fea97cb..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJackson.java +++ /dev/null @@ -1,43 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReaderJSONJackson; -import org.apache.sysds.runtime.io.FrameReaderJSONJacksonParallel; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONException; - -import java.io.IOException; -import java.util.Map; - -public class SystemDSJackson { - - public static void main(String[] args) throws IOException, 
JSONException { - - String schemaFileName; - String schemaMapFileName; - String dataFileName; - long nrows; - boolean parallel; - - schemaFileName = System.getProperty("schemaFileName"); - schemaMapFileName = System.getProperty("schemaMapFileName"); - dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - Map schemaMap = util.getSchemaMap(schemaMapFileName); - - if(!parallel) { - FrameReaderJSONJackson frameReaderJSONJackson = new FrameReaderJSONJackson(); - FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - else { - FrameReaderJSONJacksonParallel frameReaderJSONJackson = new FrameReaderJSONJacksonParallel(); - FrameBlock readBlock = frameReaderJSONJackson.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java deleted file mode 100644 index 09832b844d6..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/SystemDSJson4j.java +++ /dev/null @@ -1,42 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.io.FrameReaderJSONL; -import org.apache.sysds.runtime.io.FrameReaderJSONLParallel; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.wink.json4j.JSONException; - -import java.io.IOException; -import java.util.Map; - -public class SystemDSJson4j { - - public static void main(String[] args) throws IOException, JSONException { - - String schemaFileName; - String schemaMapFileName; - String dataFileName; - long nrows; - boolean parallel; - - schemaFileName = 
System.getProperty("schemaFileName"); - schemaMapFileName = System.getProperty("schemaMapFileName"); - dataFileName = System.getProperty("dataFileName"); - nrows = Long.parseLong(System.getProperty("nrows")); - parallel = Boolean.parseBoolean(System.getProperty("parallel")); - - Util util = new Util(); - Types.ValueType[] schema = util.getSchema(schemaFileName); - int ncols = schema.length; - Map schemaMap = util.getSchemaMap(schemaMapFileName); - - if(parallel) { - FrameReaderJSONL frameReaderJSONL = new FrameReaderJSONL(); - FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - else { - FrameReaderJSONLParallel frameReaderJSONL = new FrameReaderJSONLParallel(); - FrameBlock readBlock = frameReaderJSONL.readFrameFromHDFS(dataFileName, schema, schemaMap, nrows, ncols); - } - } -} diff --git a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java b/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java deleted file mode 100755 index f364fa8efeb..00000000000 --- a/src/main/java/org/apache/sysds/runtime/iogen/EXP/Util.java +++ /dev/null @@ -1,112 +0,0 @@ -package org.apache.sysds.runtime.iogen.EXP; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.util.DataConverter; - -import java.io.BufferedReader; -import java.io.BufferedWriter; -import java.io.FileOutputStream; -import java.io.FileReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; - -public class Util { - - public String readEntireTextFile(String fileName) throws IOException { - String text = Files.readString(Paths.get(fileName)); - return text; - } - - public void createLog(String fileName, String text) throws 
IOException { - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName)); - writer.write(text); - writer.write("\n"); - writer.close(); - } - - public void addLog(String fileName, String log) { - try(Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName, true), "utf-8"))) { - writer.write(log); - writer.write("\n"); - } - catch(Exception ex) { - } - } - - public Types.ValueType[] getSchema(String fileName) throws IOException { - String[] sschema = readEntireTextFile(fileName).trim().split(","); - Types.ValueType[] result = new Types.ValueType[sschema.length]; - for(int i = 0; i < sschema.length; i++) - result[i] = Types.ValueType.valueOf(sschema[i]); - return result; - } - - public Map getSchemaMap(String fileName) throws IOException { - Map schemaMap = new HashMap<>(); - try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { - String line; - while((line = br.readLine()) != null) { - String[] colSchema = line.split(","); - schemaMap.put(colSchema[0], Integer.parseInt(colSchema[1])); - } - } - return schemaMap; - } - - public String[][] loadFrameData(String fileName,String delimiter, int ncols) - throws IOException { - ArrayList sampleRawLines = new ArrayList<>(); - try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { - String line; - while((line = br.readLine()) != null) { - String[] data = line.split(delimiter); - String[] colsData = new String[ncols]; - for(int i = 0; i < data.length; i++) { - String[] value = data[i].split("::"); - if(value.length ==2) { - int col = Integer.parseInt(value[0]); - colsData[col] = value[1]; - } - } - sampleRawLines.add(colsData); - } - } - - int nrows = sampleRawLines.size(); - String[][] result = new String[nrows][ncols]; - for(int i=0; i< nrows; i++) - result[i] = sampleRawLines.get(i); - - return result; - } - - public MatrixBlock loadMatrixData(String fileName, String delimiter) throws IOException { - 
int ncols = 0; - try(BufferedReader br = new BufferedReader(new FileReader(fileName,StandardCharsets.UTF_8))) { - String line; - while((line = br.readLine()) != null) { - String[] data = line.split(delimiter); - ncols = Math.max(ncols, Integer.parseInt( data[data.length-1].split("::")[0])); - } - } - String[][] dataString = loadFrameData(fileName,delimiter, ncols+1); - double[][] data = new double[dataString.length][dataString[0].length]; - for(int i=0;i Date: Wed, 27 Jul 2022 10:51:24 +0200 Subject: [PATCH 79/84] Minor Cleanup --- .../runtime/io/FrameReaderTextAMiner.java | 545 ------------------ .../io/FrameReaderTextAMinerParallel.java | 328 ----------- 2 files changed, 873 deletions(-) delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java delete mode 100644 src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java deleted file mode 100644 index a76d420cb5b..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMiner.java +++ /dev/null @@ -1,545 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.io; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types.ValueType; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.InputStreamInputFormat; -import org.apache.sysds.runtime.util.UtilFunctions; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; - -public class FrameReaderTextAMiner extends FrameReader { - protected final FileFormatPropertiesAMiner _props; - protected DatasetMetaDataPaper paperMetaData; - protected DatasetMetaDataAuthor authorMetaData; - protected ArrayList[] rowIndexs; - protected ArrayList[] colBeginIndexs; - - public FrameReaderTextAMiner(FileFormatPropertiesAMiner props) { - //if unspecified use default properties for robustness - _props = props; - } - - @Override public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen) - throws IOException, DMLRuntimeException { - LOG.debug("readFrameFromHDFS AMiner"); - // prepare file access - JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fs = IOUtilFunctions.getFileSystem(path, job); - FileInputFormat.addInputPath(job, path); - - // check existence and non-empty file - checkValidInputFile(fs, path); - - 
TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - InputSplit[] splits = informat.getSplits(job, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - - ValueType[] lschema = null; - String[] lnames = null; - if(_props.getType().equals("paper")) { - paperMetaData = computeAMinerSizePaper(informat,job, splits); - rlen = paperMetaData.nrow; - lschema = paperMetaData.schema; - lnames = paperMetaData.names; - } - else { - authorMetaData = computeAMinerSizeAuthor(informat,job, splits); - rlen = authorMetaData.nrow; - lschema = authorMetaData.schema; - lnames = authorMetaData.names; - } - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - // core read (sequential/parallel) - readAMinerFrameFromHDFS(informat,job, splits, ret, lschema); - return ret; - } - - @Override public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen, long clen) - throws IOException, DMLRuntimeException { - - // TODO: fix stream reader. 
incomplete - LOG.debug("readFrameFromInputStream csv"); - ValueType[] lschema = null; - String[] lnames = null; - - InputStreamInputFormat informat = new InputStreamInputFormat(is); - InputSplit[] splits = informat.getSplits(null, 1); - splits = IOUtilFunctions.sortInputSplits(splits); - - if(_props.getType().equals("paper")) { - paperMetaData = computeAMinerSizePaper(null,null, splits); - rlen = paperMetaData.nrow; - lschema = paperMetaData.schema; - lnames = paperMetaData.names; - } - else { - authorMetaData = computeAMinerSizeAuthor(null,null, splits); - rlen = authorMetaData.nrow; - lschema = authorMetaData.schema; - lnames = authorMetaData.names; - } - FrameBlock ret = createOutputFrameBlock(lschema, lnames, rlen); - - // core read (sequential/parallel) - if(_props.getType().equals("paper")) { - readAMinerPaperFrameFromInputSplit(splits[0], rowIndexs[0], colBeginIndexs[0], informat, null, ret, schema); - } - else { - readAMinerAuthorFrameFromInputSplit(splits[0], rowIndexs[0], informat, null, ret, schema); - } - return ret; - - } - - protected void readAMinerFrameFromHDFS(TextInputFormat informat, JobConf job, InputSplit[] splits, FrameBlock dest, ValueType[] schema) throws IOException { - LOG.debug("readAMinerFrameFromHDFS csv"); - if(_props.getType().equals("paper")) { - for(int i = 0; i < splits.length; i++) - readAMinerPaperFrameFromInputSplit(splits[i], rowIndexs[i], colBeginIndexs[i], informat, job, dest, schema); - } - else { - for(int i = 0; i < splits.length; i++) - readAMinerAuthorFrameFromInputSplit(splits[i], rowIndexs[i], informat, job, dest, schema); - } - } - - // #index ---- index id of this paper - // #* ---- paper title - // #@ ---- authors (separated by semicolons) - // #o ---- affiliations (separated by semicolons, and each affiliaiton corresponds to an author in order) - // #t ---- year - // #c ---- publication venue - // #% ---- the id of references of this paper (there are multiple lines, with each indicating a reference) - // #! 
---- abstract - protected final void readAMinerPaperFrameFromInputSplit(InputSplit split, ArrayList rowIndex, ArrayList colBeginIndex, - InputFormat informat, JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { - - // create record reader - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row, col; - int colBegin = 0; - int index = -1; - String valStr; - // Read the data - try { - while(reader.next(key, value)) // foreach line - { - index++; - String rowStr = value.toString().trim(); - if(rowStr.length() == 0) - continue; - row = rowIndex.get(index); - colBegin = colBeginIndex.get(index); - - if(rowStr.startsWith("#index ")) { - col = colBegin; - valStr = rowStr.substring("#index ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#* ")) { - col = colBegin + 1; - valStr = rowStr.substring("#* ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#@ ")) { - col = colBegin + paperMetaData.authorStartCol; - valStr = rowStr.substring("#@ ".length()).trim(); - String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); - for(int i = 0; i < valList.length; i++) - dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); - } - else if(rowStr.startsWith("#o ")) { - col = colBegin + paperMetaData.getAffiliationStartCol(); - valStr = rowStr.substring("#o ".length()).trim(); - String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); - for(int i = 0; i < valList.length; i++) - dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); - } - else if(rowStr.startsWith("#t ")) { - col = colBegin + 2; - valStr = rowStr.substring("#t ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#c ")) { - col = colBegin 
+ 3; - valStr = rowStr.substring("#c ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#! ")) { - col = colBegin + 4; - valStr = rowStr.substring("#! ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#% ")) { - col = colBegin + paperMetaData.referenceStartCol; - valStr = rowStr.substring("#! ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - } - - // #index ---- index id of this author - // #n ---- name (separated by semicolons) - // #a ---- affiliations (separated by semicolons) - // #pc ---- the count of published papers of this author - // #cn ---- the total number of citations of this author - // #hi ---- the H-index of this author - // #pi ---- the P-index with equal A-index of this author - // #upi ---- the P-index with unequal A-index of this author - // #t ---- research interests of this author (separated by semicolons) - protected final void readAMinerAuthorFrameFromInputSplit(InputSplit split, ArrayList rowIndex, InputFormat informat, - JobConf job, FrameBlock dest, ValueType[] schema) throws IOException { - - // create record reader - RecordReader reader = informat.getRecordReader(split, job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int row, col; - int index = -1; - String valStr; - try { - while(reader.next(key, value)) // foreach line - { - index++; - String rowStr = value.toString().trim(); - if(rowStr.length() == 0) - continue; - row = rowIndex.get(index); - - if(rowStr.startsWith("#index ")) { - col = 0; - valStr = rowStr.substring("#index ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#n ")) { - col = 1; - valStr = rowStr.substring("#n ".length()).trim(); - 
dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#a ")) { - col = authorMetaData.getAffiliationStartCol(); - valStr = rowStr.substring("#a ".length()).trim(); - String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); - for(int i = 0; i < valList.length; i++) - dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); - } - else if(rowStr.startsWith("#t ")) { - col = authorMetaData.getResearchInterestStartCol(); - valStr = rowStr.substring("#t ".length()).trim(); - String[] valList = IOUtilFunctions.splitCSV(valStr, ";"); - for(int i = 0; i < valList.length; i++) - dest.set(row, col + i, UtilFunctions.stringToObject(schema[col], valList[i])); - } - - else if(rowStr.startsWith("#pc ")) { - col = 2; - valStr = rowStr.substring("#pc ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#cn ")) { - col = 3; - valStr = rowStr.substring("#cn ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#hi ")) { - col = 4; - valStr = rowStr.substring("#hi ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#pi ")) { - col = 5; - valStr = rowStr.substring("#pi ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - else if(rowStr.startsWith("#upi ")) { - col = 6; - valStr = rowStr.substring("#upi ".length()).trim(); - dest.set(row, col, UtilFunctions.stringToObject(schema[col], valStr)); - } - } - } - finally { - IOUtilFunctions.closeSilently(reader); - } - } - - protected DatasetMetaDataPaper computeAMinerSizePaper(TextInputFormat informat, JobConf job, InputSplit[] splits) throws IOException { - this.rowIndexs = new ArrayList[splits.length]; - this.colBeginIndexs = new ArrayList[splits.length]; - - LongWritable key = new LongWritable(); - Text 
value = new Text(); - int ncol = 5; - int maxAuthors = 0; - int maxAffiliations = 0; - int maxReferences = 0; - int row = -1; - int lastRefCount = 0; - - for(int i = 0; i < splits.length; i++) { - RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); - int refCount = 0; - boolean splitRowFlag = false; - this.rowIndexs[i] = new ArrayList<>(); - this.colBeginIndexs[i] = new ArrayList<>(); - while(reader.next(key, value)) { - String raw = value.toString().trim(); - if(raw.startsWith("#index ")) { - row++; - if(splitRowFlag) - maxReferences = Math.max(maxReferences, refCount); - else - maxReferences = Math.max(maxReferences, refCount + lastRefCount); - - splitRowFlag = true; - lastRefCount = refCount; - refCount = 0; - this.colBeginIndexs[i].add(0); - } - else if(raw.startsWith("#@")) { //authors (separated by semicolons) - maxAuthors = Math.max(maxAuthors, IOUtilFunctions.countTokensCSV(raw, ";")); - this.colBeginIndexs[i].add(0); - } - else if(raw.startsWith("#o")) { //(separated by semicolons, and each affiliaiton corresponds to an author in order) - maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); - this.colBeginIndexs[i].add(0); - } - else if(raw.startsWith("#%")) { // the id of references of this paper (there are multiple lines, with each indicating a reference) - - if(!splitRowFlag) - this.colBeginIndexs[i].add(refCount + lastRefCount); - else - this.colBeginIndexs[i].add(refCount); - refCount++; - } - else - this.colBeginIndexs[i].add(0); - - this.rowIndexs[i].add(row); - } - } - ncol += maxAuthors + maxAffiliations + maxReferences; - - DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations, maxReferences); - return datasetMetaDataPaper; - } - - protected DatasetMetaDataAuthor computeAMinerSizeAuthor(TextInputFormat informat, JobConf job, InputSplit[] splits) throws IOException { - this.rowIndexs = new ArrayList[splits.length]; - 
this.colBeginIndexs = new ArrayList[splits.length]; - - LongWritable key = new LongWritable(); - Text value = new Text(); - int ncol = 7; - int maxAffiliations = 0; - int maxResearchInterest = 0; - int row = -1; - - for(int i = 0; i < splits.length; i++) { - RecordReader reader = informat.getRecordReader(splits[i], job, Reporter.NULL); - this.rowIndexs[i] = new ArrayList<>(); - this.colBeginIndexs[i] = new ArrayList<>(); - while(reader.next(key, value)) { - String raw = value.toString().trim(); - if(raw.startsWith("#index ")) - row++; - else if(raw.startsWith("#a")) //affiliations (separated by semicolons) - maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); - - else if(raw.startsWith("#t")) // research interests of this author (separated by semicolons) - maxResearchInterest = Math.max(maxResearchInterest, IOUtilFunctions.countTokensCSV(raw, ";")); - - this.rowIndexs[i].add(row); - } - } - ncol += maxAffiliations + maxResearchInterest; - - return new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); - } - - protected static abstract class DatasetMetaData { - protected final int ncol; - protected final int nrow; - protected ValueType[] schema; - protected String[] names; - private final int affiliationStartCol; - protected int index; - protected int maxAffiliation = 0; - protected int maxResearchInterest = 0; - protected int maxReference = 0; - protected int maxAuthor = 0; - - public DatasetMetaData(int ncol, int nrow, int affiliationStartCol) { - this.ncol = ncol; - this.nrow = nrow; - this.names = new String[ncol]; - this.affiliationStartCol = affiliationStartCol; - for(int i = 0; i < ncol; i++) - this.names[i] = "col_" + i; - } - - public String[] getNames() { - return names; - } - - public ValueType[] getSchema() { - return schema; - } - - public int getAffiliationStartCol() { - return affiliationStartCol; - } - - public int getNcol() { - return ncol; - } - - public int getNrow() { - return nrow; - } 
- - public int getIndex() { - return index; - } - - public void setIndex(int index) { - this.index = index; - } - - public int getMaxAffiliation() { - return maxAffiliation; - } - - public void setMaxAffiliation(int maxAffiliation) { - this.maxAffiliation = maxAffiliation; - } - - public int getMaxResearchInterest() { - return maxResearchInterest; - } - - public void setMaxResearchInterest(int maxResearchInterest) { - this.maxResearchInterest = maxResearchInterest; - } - - public int getMaxReference() { - return maxReference; - } - - public void setMaxReference(int maxReference) { - this.maxReference = maxReference; - } - - public int getMaxAuthor() { - return maxAuthor; - } - - public void setMaxAuthor(int maxAuthor) { - this.maxAuthor = maxAuthor; - } - } - - protected static class DatasetMetaDataPaper extends DatasetMetaData { - private final int authorStartCol; - private final int referenceStartCol; - - public DatasetMetaDataPaper(int ncol, int nrow, int maxAuthor, int maxAffiliation, int maxReference) { - super(ncol, nrow, 5 + maxAuthor); - this.schema = new ValueType[ncol]; - this.schema[0] = ValueType.INT64; // index id of this paper - this.schema[1] = ValueType.STRING; //paper title - this.schema[2] = ValueType.INT32; //year - this.schema[3] = ValueType.STRING; //publication venue - this.schema[4] = ValueType.STRING; // abstract - this.maxAffiliation = maxAffiliation; - this.maxAuthor = maxAuthor; - this.maxReference = maxReference; - - for(int i = 5; i < maxAuthor + maxAffiliation + 5; i++) - this.schema[i] = ValueType.STRING; - - for(int i = maxAuthor + maxAffiliation + 5; i < ncol; i++) - this.schema[i] = ValueType.FP64; - - this.authorStartCol = 5; - this.referenceStartCol = maxAuthor + maxAffiliation + 5; - } - - public int getAuthorStartCol() { - return authorStartCol; - } - - public int getReferenceStartCol() { - return referenceStartCol; - } - } - - protected static class DatasetMetaDataAuthor extends DatasetMetaData { - private final int 
researchInterestStartCol; - - public DatasetMetaDataAuthor(int ncol, int nrow, int maxAffiliation, int maxResearchInterest) { - super(ncol, nrow, 7); - this.schema = new ValueType[ncol]; - this.schema[0] = ValueType.INT64; // index id of this author - this.schema[1] = ValueType.STRING; // name (separated by semicolons) - this.schema[2] = ValueType.INT32; // the count of published papers of this author - this.schema[3] = ValueType.INT32; // the total number of citations of this author - this.schema[4] = ValueType.FP32; // the H-index of this author - this.schema[5] = ValueType.FP32; // the P-index with equal A-index of this author - this.schema[6] = ValueType.FP32; // the P-index with unequal A-index of this author - this.maxAffiliation = maxAffiliation; - this.maxResearchInterest = maxResearchInterest; - - for(int i = 7; i < maxAffiliation + maxResearchInterest + 7; i++) - this.schema[i] = ValueType.STRING; - this.researchInterestStartCol = 7 + maxAffiliation; - } - - public int getResearchInterestStartCol() { - return researchInterestStartCol; - } - } -} diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java deleted file mode 100644 index d26960a12d0..00000000000 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderTextAMinerParallel.java +++ /dev/null @@ -1,328 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.runtime.io; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileInputFormat; -import org.apache.hadoop.mapred.InputSplit; -import org.apache.hadoop.mapred.JobConf; -import org.apache.hadoop.mapred.RecordReader; -import org.apache.hadoop.mapred.Reporter; -import org.apache.hadoop.mapred.TextInputFormat; -import org.apache.sysds.common.Types; -import org.apache.sysds.common.Types.ValueType; -import org.apache.sysds.conf.ConfigurationManager; -import org.apache.sysds.hops.OptimizerUtils; -import org.apache.sysds.runtime.DMLRuntimeException; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.matrix.data.Pair; -import org.apache.sysds.runtime.transform.TfUtils; -import org.apache.sysds.runtime.util.CommonThreadPool; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.List; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Future; - -/** - * Multi-threaded frame text AMiner reader. 
- */ -public class FrameReaderTextAMinerParallel extends FrameReaderTextAMiner { - protected int _numThreads; - protected BitSet[] bitSets; - - public FrameReaderTextAMinerParallel(FileFormatPropertiesAMiner props) { - super(props); - this._numThreads = OptimizerUtils.getParallelTextReadParallelism(); - } - - @Override public FrameBlock readFrameFromHDFS(String fname, Types.ValueType[] schema, String[] names, long rlen, long clen) - throws IOException, DMLRuntimeException { - - //prepare file access - JobConf job = new JobConf(ConfigurationManager.getCachedJobConf()); - Path path = new Path(fname); - FileSystem fs = IOUtilFunctions.getFileSystem(path, job); - - FileInputFormat.addInputPath(job, path); - TextInputFormat informat = new TextInputFormat(); - informat.configure(job); - - InputSplit[] splits = informat.getSplits(job, _numThreads); - splits = IOUtilFunctions.sortInputSplits(splits); - - // check existence and non-empty file - checkValidInputFile(fs, path); - - FrameBlock ret = readAMinerHDFS(splits, informat, job); - - return ret; - } - - protected FrameBlock readAMinerHDFS(InputSplit[] splits, TextInputFormat informat, JobConf job) throws IOException { - try { - ExecutorService pool = CommonThreadPool.get(Math.min(_numThreads, splits.length)); - this.rowIndexs = new ArrayList[splits.length]; - this.colBeginIndexs = new ArrayList[splits.length]; - this.bitSets = new BitSet[splits.length]; - - //compute num rows per split - ArrayList tasks = new ArrayList<>(); - for(int i = 0; i < splits.length; i++) { - rowIndexs[i] = new ArrayList<>(); - if(_props.getType().equals("author")) - tasks.add(new CountRowsColsTaskAuthor(splits[i], informat, job, rowIndexs[i], i)); - else { - colBeginIndexs[i] = new ArrayList<>(); - bitSets[i] = new BitSet(); - tasks.add(new CountRowsColsTaskPaper(splits[i], informat, job, rowIndexs[i], colBeginIndexs[i], bitSets[i], i)); - } - } - List> cret = pool.invokeAll(tasks); - - for(Future count : cret) - while(!count.isDone()) - ; - - 
//compute row offset per split via cumsum on row counts - int offset = 0; - int maxAffiliation = 0; - int maxResearchInterest = 0; - int maxReference = 0; - int maxAuthor = 0; - int ncol; - ValueType[] lschema = null; - String[] lnames = null; - - for(Future count : cret) { - DatasetMetaData metaData = count.get(); - ArrayList ri = rowIndexs[metaData.getIndex()]; - if(_props.getType().equals("author")) { - maxResearchInterest = Math.max(maxResearchInterest, metaData.maxResearchInterest); - } - else { - int negativeBeginPos = -1; - int negativeEndPos = -1; - for(int i = 0; i < ri.size(); i++) { - int valIndex = ri.get(i); - if(valIndex == -1 && ((i == 0 && bitSets[metaData.getIndex()].get(i)) || (i > 0 && bitSets[metaData.getIndex()].get( - i) && !bitSets[metaData.getIndex()].get(i - 1)))) { - if(negativeBeginPos == -1) { - negativeBeginPos = i; - } - negativeEndPos = i; - } - } - if(negativeBeginPos != -1) { - int bcIndex = colBeginIndexs[metaData.getIndex() - 1].get(colBeginIndexs[metaData.getIndex() - 1].size() - 1) + 1; - int counter = 0; - for(int i = negativeBeginPos; i <= negativeEndPos; i++) { - colBeginIndexs[metaData.getIndex()].set(i, counter + bcIndex); - counter++; - } - int tMax = Math.max(bcIndex + counter, metaData.maxReference); - metaData.setMaxReference(tMax); - } - maxReference = Math.max(maxReference, metaData.maxReference); - maxAuthor = Math.max(maxAuthor, metaData.maxAuthor); - } - - for(int i = 0; i < ri.size(); i++) - ri.set(i, ri.get(i) + offset); - - maxAffiliation = Math.max(maxAffiliation, metaData.maxAffiliation); - offset += metaData.getNrow(); - } - if(_props.getType().equals("paper")) { - ncol = 5 + maxAuthor + maxAffiliation + maxReference; - this.paperMetaData = new DatasetMetaDataPaper(ncol, offset, maxAuthor, maxAffiliation, maxReference); - lschema = this.paperMetaData.schema; - lnames = this.paperMetaData.names; - } - else { - ncol = 7 + maxAffiliation + maxResearchInterest; - this.authorMetaData = new 
DatasetMetaDataAuthor(ncol, offset, maxAffiliation, maxResearchInterest); - lschema = this.authorMetaData.schema; - lnames = this.authorMetaData.names; - } - FrameBlock ret = createOutputFrameBlock(lschema, lnames, offset + 1); - //read individual splits - ArrayList tasks2 = new ArrayList<>(); - for(int i = 0; i < splits.length; i++) - tasks2.add(new ReadRowsTask(splits[i], rowIndexs[i], colBeginIndexs[i], informat, job, ret, lschema)); - CommonThreadPool.invokeAndShutdown(pool, tasks2); - return ret; - } - catch(Exception e) { - throw new IOException("Failed parallel read of text AMiner input.", e); - } - } - - private static abstract class CountRowsColsTask implements Callable { - protected InputSplit _split = null; - protected Integer _splitIndex; - protected TextInputFormat _informat = null; - protected JobConf _job = null; - protected ArrayList _rowIndex; - protected ArrayList _colBeginIndex; - protected BitSet _bitSet; - - public CountRowsColsTask(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, - ArrayList colBeginIndex, BitSet bitSet, int splitIndex) { - _split = split; - _informat = informat; - _job = job; - _rowIndex = rowIndex; - _colBeginIndex = colBeginIndex; - _splitIndex = splitIndex; - _bitSet = bitSet; - } - - @Override public DatasetMetaData call() throws Exception { - return null; - } - } - - private static class CountRowsColsTaskAuthor extends CountRowsColsTask { - - public CountRowsColsTaskAuthor(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, int splitIndex) { - super(split, informat, job, rowIndex, null, null, splitIndex); - } - - @Override public DatasetMetaDataAuthor call() throws Exception { - RecordReader reader = _informat.getRecordReader(_split, _job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - - int ncol = 7; - int maxAffiliations = 0; - int maxResearchInterest = 0; - int row = -1; - - while(reader.next(key, value)) { - String raw = 
value.toString().trim(); - if(raw.startsWith("#index ")) - row++; - else if(raw.startsWith("#a")) //affiliations (separated by semicolons) - maxAffiliations = Math.max(maxAffiliations, IOUtilFunctions.countTokensCSV(raw, ";")); - - else if(raw.startsWith("#t")) // research interests of this author (separated by semicolons) - maxResearchInterest = Math.max(maxResearchInterest, IOUtilFunctions.countTokensCSV(raw, ";")); - - this._rowIndex.add(row); - } - - ncol += maxAffiliations + maxResearchInterest; - - DatasetMetaDataAuthor datasetMetaDataAuthor = new DatasetMetaDataAuthor(ncol, row + 1, maxAffiliations, maxResearchInterest); - datasetMetaDataAuthor.setIndex(_splitIndex); - return datasetMetaDataAuthor; - } - } - - private static class CountRowsColsTaskPaper extends CountRowsColsTask { - - public CountRowsColsTaskPaper(InputSplit split, TextInputFormat informat, JobConf job, ArrayList rowIndex, - ArrayList colBeginIndex, BitSet bitSet, int splitIndex) { - super(split, informat, job, rowIndex, colBeginIndex, bitSet, splitIndex); - } - - @Override public DatasetMetaDataPaper call() throws Exception { - RecordReader reader = _informat.getRecordReader(_split, _job, Reporter.NULL); - LongWritable key = new LongWritable(); - Text value = new Text(); - int ncol = 5; - int maxAuthors = 0; - int maxAffiliations = 0; - int maxReferences = 0; - int row = -1; - int refCount = 0; - int bIndex = 0; - - while(reader.next(key, value)) { - String raw = value.toString().trim(); - bIndex++; - if(raw.startsWith("#index ")) { - row++; - maxReferences = Math.max(maxReferences, refCount); - refCount = 0; - this._colBeginIndex.add(0); - } - else if(raw.startsWith("#@")) { //authors (separated by semicolons) - maxAuthors = Math.max(maxAuthors, IOUtilFunctions.countTokensCSV(raw, ";")); - this._colBeginIndex.add(0); - } - else if(raw.startsWith("#o")) { //(separated by semicolons, and each affiliaiton corresponds to an author in order) - maxAffiliations = Math.max(maxAffiliations, 
IOUtilFunctions.countTokensCSV(raw, ";")); - this._colBeginIndex.add(0); - } - else if(raw.startsWith("#%")) { // the id of references of this paper (there are multiple lines, with each indicating a reference) - this._colBeginIndex.add(refCount); - this._bitSet.set(bIndex); - refCount++; - } - else - this._colBeginIndex.add(0); - - this._rowIndex.add(row); - } - - ncol += maxAuthors + maxAffiliations + maxReferences; - DatasetMetaDataPaper datasetMetaDataPaper = new DatasetMetaDataPaper(ncol, row + 1, maxAuthors, maxAffiliations, maxReferences); - datasetMetaDataPaper.setIndex(_splitIndex); - return datasetMetaDataPaper; - } - } - - private class ReadRowsTask implements Callable { - private final InputSplit _split; - private final TextInputFormat _informat; - private final JobConf _job; - private final FrameBlock _dest; - private final ArrayList _rowIndex; - private final ArrayList _colBeginIndex; - private final ValueType[] _schema; - - public ReadRowsTask(InputSplit split, ArrayList rowIndex, ArrayList colBeginIndex, TextInputFormat informat, JobConf job, - FrameBlock dest, ValueType[] schema) { - _split = split; - _informat = informat; - _job = job; - _dest = dest; - _rowIndex = rowIndex; - _colBeginIndex = colBeginIndex; - _schema = schema; - } - - @Override public Object call() throws Exception { - if(_props.getType().equals("paper")) - readAMinerPaperFrameFromInputSplit(_split, _rowIndex, _colBeginIndex, _informat, _job, _dest, _schema); - else - readAMinerAuthorFrameFromInputSplit(_split, _rowIndex, _informat, _job, _dest, _schema); - return null; - } - } -} From 2f66e416ea07c756edaabf4abbb081814522183a Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 5 Aug 2022 01:37:31 +0200 Subject: [PATCH 80/84] Initial Integration of GIO with SystemDS (DML) --- .../java/org/apache/sysds/common/Types.java | 13 +- .../java/org/apache/sysds/hops/DataOp.java | 43 ++- .../apache/sysds/hops/GenerateReaderOp.java | 158 ++++++++ 
.../java/org/apache/sysds/lops/DataIOGen.java | 344 ++++++++++++++++++ src/main/java/org/apache/sysds/lops/Lop.java | 11 +- .../java/org/apache/sysds/lops/ReaderGen.java | 71 ++++ .../org/apache/sysds/lops/compile/Dag.java | 34 +- .../apache/sysds/parser/DMLTranslator.java | 71 +++- .../apache/sysds/parser/DataExpression.java | 78 +++- .../apache/sysds/parser/StatementBlock.java | 15 +- .../instructions/CPInstructionParser.java | 1 + .../cp/VariableCPInstruction.java | 97 ++++- .../sysds/runtime/io/FrameReaderFactory.java | 19 + .../sysds/runtime/io/MatrixReaderFactory.java | 25 ++ .../sysds/runtime/iogen/CustomProperties.java | 41 ++- .../sysds/runtime/iogen/GenerateReader.java | 80 ++-- .../sysds/runtime/iogen/SampleProperties.java | 70 +++- .../iogen/FrameSingleRowFlatTest.java | 86 ----- .../iogen/FrameSingleRowNestedTest.java | 80 ---- ...erMatrixTest.java => GIOMatrixReader.java} | 100 ++--- .../test/functions/iogen/GIOReadCSVTest1.java | 39 ++ .../functions/iogen/GenerateRandomFrame.java | 312 ---------------- .../functions/iogen/GenerateRandomMatrix.java | 319 ---------------- .../iogen/GenerateReaderFrameTest.java | 204 ----------- .../iogen/MatrixMultiRowNestedTest.java | 171 --------- .../iogen/MatrixSingleRowFlatTest.java | 162 --------- .../iogen/MatrixSingleRowNestedTest.java | 116 ------ .../scripts/functions/iogen/ReaderCSV_1.dml | 25 ++ .../scripts/functions/iogen/in/dataset_1.dat | 3 + .../functions/iogen/in/sampleMatrix_1.mtx | 3 + .../functions/iogen/in/sampleMatrix_1.mtx.mtd | 6 + .../functions/iogen/in/sampleMatrix_1.raw | 3 + 32 files changed, 1206 insertions(+), 1594 deletions(-) create mode 100644 src/main/java/org/apache/sysds/hops/GenerateReaderOp.java create mode 100644 src/main/java/org/apache/sysds/lops/DataIOGen.java create mode 100644 src/main/java/org/apache/sysds/lops/ReaderGen.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java delete mode 100644 
src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java rename src/test/java/org/apache/sysds/test/functions/iogen/{GenerateReaderMatrixTest.java => GIOMatrixReader.java} (53%) create mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/GIOReadCSVTest1.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomMatrix.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java delete mode 100644 src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java create mode 100644 src/test/scripts/functions/iogen/ReaderCSV_1.dml create mode 100644 src/test/scripts/functions/iogen/in/dataset_1.dat create mode 100644 src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx create mode 100644 src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx.mtd create mode 100644 src/test/scripts/functions/iogen/in/sampleMatrix_1.raw diff --git a/src/main/java/org/apache/sysds/common/Types.java b/src/main/java/org/apache/sysds/common/Types.java index a7cfa823aa7..6d3170e4fef 100644 --- a/src/main/java/org/apache/sysds/common/Types.java +++ b/src/main/java/org/apache/sysds/common/Types.java @@ -535,6 +535,16 @@ public String toString() { } } + public enum OpOpGenerateReader { + GENERATEREADER; + public boolean isGenerateReader(){return this == GENERATEREADER;} + + @Override + public String toString() { + return "GRead"; + } + } + public enum FileFormat { TEXT, // text cell IJV representation (mm w/o header) MM, // text matrix market IJV representation @@ -544,7 +554,8 @@ public enum FileFormat { BINARY, // binary block representation (dense/sparse/ultra-sparse) 
FEDERATED, // A federated matrix PROTO, // protocol buffer representation - HDF5; // Hierarchical Data Format (HDF) + HDF5, // Hierarchical Data Format (HDF) + IOGEN; // Generated Reader public boolean isIJV() { return this == TEXT || this == MM; diff --git a/src/main/java/org/apache/sysds/hops/DataOp.java b/src/main/java/org/apache/sysds/hops/DataOp.java index 42c51e452b7..fb9dc4e4eb9 100644 --- a/src/main/java/org/apache/sysds/hops/DataOp.java +++ b/src/main/java/org/apache/sysds/hops/DataOp.java @@ -32,6 +32,7 @@ import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.hops.rewrite.HopRewriteUtils; import org.apache.sysds.lops.Data; +import org.apache.sysds.lops.DataIOGen; import org.apache.sysds.lops.Federated; import org.apache.sysds.lops.Lop; import org.apache.sysds.common.Types.ExecType; @@ -54,11 +55,15 @@ public class DataOp extends Hop { //read dataop properties private FileFormat _inFormat = FileFormat.TEXT; + private String _inIOGenFormat; private long _inBlocksize = -1; private boolean _hasOnlyRDD = false; private boolean _recompileRead = true; + private boolean _ioGenRead = false; + private GenerateReaderOp _generateReaderOp; + /** * List of "named" input parameters. 
They are maintained as a hashmap: * parameter names (String) are mapped as indices (Integer) into getInput() @@ -247,6 +252,26 @@ public void setFileName(String fn) { _fileName = fn; } + public void setIOGenRead(boolean isIOGenRead) { + _ioGenRead = isIOGenRead; + } + + public boolean isIOGenRead(){ + return _ioGenRead; + } + + public String getIOGenFormat() { + return _inIOGenFormat; + } + + public void setIOGenFormat(String ioGenFormat) { + this._inIOGenFormat = ioGenFormat; + } + + public void setGenerateReaderOp(GenerateReaderOp op){ + _generateReaderOp = op; + } + public String getFileName() { return _fileName; } @@ -283,20 +308,28 @@ public Lop constructLops() for (Entry cur : _paramIndexMap.entrySet()) { inputLops.put(cur.getKey(), getInput().get(cur.getValue()).constructLops()); } + if(_ioGenRead) + inputLops.put("iogenformat", _generateReaderOp.constructLops()); // Create the lop switch(_op) { case TRANSIENTREAD: - l = new Data(_op, null, inputLops, getName(), null, - getDataType(), getValueType(), getFileFormat()); + if(!_ioGenRead) + l = new Data(_op, null, inputLops, getName(), null, getDataType(), getValueType(), getFileFormat()); + else + l = new DataIOGen(_op, null, inputLops, getName(), null, getDataType(), getValueType(), getIOGenFormat()); setOutputDimensions(l); break; case PERSISTENTREAD: - l = new Data(_op, null, inputLops, getName(), null, - getDataType(), getValueType(), getFileFormat()); - l.getOutputParameters().setDimensions(getDim1(), getDim2(), _inBlocksize, getNnz(), getUpdateType()); + if(!_ioGenRead){ + l = new Data(_op, null, inputLops, getName(), null, getDataType(), getValueType(), getFileFormat()); + l.getOutputParameters().setDimensions(getDim1(), getDim2(), _inBlocksize, getNnz(), getUpdateType()); + } + else + l = new DataIOGen(_op, null, inputLops, getName(), null, getDataType(), getValueType(), getIOGenFormat()); + break; case PERSISTENTWRITE: diff --git a/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java 
b/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java new file mode 100644 index 00000000000..e410715c458 --- /dev/null +++ b/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.hops; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types; +import org.apache.sysds.common.Types.DataType; +import org.apache.sysds.lops.Lop; +import org.apache.sysds.lops.ReaderGen; +import org.apache.sysds.runtime.meta.DataCharacteristics; + +import java.util.HashMap; +import java.util.Map.Entry; + + +public class GenerateReaderOp extends Hop { + private static final Log LOG = LogFactory.getLog(GenerateReaderOp.class.getName()); + private Types.OpOpGenerateReader _op; + + /** + * List of "named" input parameters. They are maintained as a hashmap: + * parameter names (String) are mapped as indices (Integer) into getInput() + * arraylist. + *

+ * i.e., getInput().get(_paramIndexMap.get(parameterName)) refers to the Hop + * that is associated with parameterName. + */ + private HashMap _paramIndexMap = new HashMap<>(); + + private GenerateReaderOp() { + //default constructor for clone + } + + @Override public void checkArity() { + + } + + @Override public boolean allowsAllExecTypes() { + return false; + } + + @Override protected DataCharacteristics inferOutputCharacteristics(MemoTable memo) { + return null; + } + + @Override public Lop constructLops() { + //return already created lops + if( getLops() != null ) + return getLops(); + + Types.ExecType et = Types.ExecType.CP; + + + // construct lops for all input parameters + HashMap inputLops = new HashMap<>(); + for (Entry cur : _paramIndexMap.entrySet()) { + inputLops.put(cur.getKey(), getInput().get(cur.getValue()).constructLops()); + } + + Lop l = new ReaderGen(getInput().get(0).constructLops(),_dataType, _valueType, et, inputLops); + + setLineNumbers(l); + setPrivacy(l); + setLops(l); + + //add reblock/checkpoint lops if necessary + constructAndSetLopsDataFlowProperties(); + + return getLops(); + } + + @Override protected Types.ExecType optFindExecType(boolean transitive) { + return null; + } + + @Override public String getOpString() { + String s = new String(""); + s += _op.toString(); + s += " "+getName(); + return s; + } + + @Override public boolean isGPUEnabled() { + return false; + } + + @Override protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) { + return 0; + } + + @Override protected double computeIntermediateMemEstimate(long dim1, long dim2, long nnz) { + return 0; + } + + @Override public void refreshSizeInformation() { + + } + + @Override public Object clone() throws CloneNotSupportedException { + return null; + } + + @Override public boolean compare(Hop that) { + return false; + } + + /** + * Generate Reader operation for Matrix + * This constructor supports expression in parameters + * + * @param dt data type + * 
@param dop data operator type + * @param in high-level operator + * @param inputParameters input parameters + */ + public GenerateReaderOp(String l, DataType dt, Types.OpOpGenerateReader dop, Hop in, HashMap inputParameters) { + _dataType = dt; + _op = dop; + _name = l; + getInput().add(0, in); + in.getParent().add(this); + + if(inputParameters != null) { + int index = 1; + for(Entry e : inputParameters.entrySet()) { + String s = e.getKey(); + Hop input = e.getValue(); + getInput().add(input); + input.getParent().add(this); + + _paramIndexMap.put(s, index); + index++; + } + } + } + + public Types.OpOpGenerateReader getOp() { + return _op; + } +} diff --git a/src/main/java/org/apache/sysds/lops/DataIOGen.java b/src/main/java/org/apache/sysds/lops/DataIOGen.java new file mode 100644 index 00000000000..803bef1bb41 --- /dev/null +++ b/src/main/java/org/apache/sysds/lops/DataIOGen.java @@ -0,0 +1,344 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.lops; + +import org.apache.sysds.common.Types.DataType; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.common.Types.FileFormat; +import org.apache.sysds.common.Types.OpOpData; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.parser.DataExpression; + +import java.util.HashMap; + +/** + * Lop to represent data objects. Data objects represent matrices, vectors, + * variables, literals. Can be for both input and output. + */ +public class DataIOGen extends Lop { + public static final String PREAD_PREFIX = "pREAD"; + private final String formatType; + private final OpOpData _op; + private final boolean literal_var; + private HashMap _inputParams; + + /** + * Method to create literal LOPs. + * + * @param vt value type + * @param literalValue literal value + * @return literal low-level operator + */ + public static DataIOGen createLiteralLop(ValueType vt, String literalValue) { + // All literals have default format type of TEXT + return new DataIOGen(OpOpData.PERSISTENTREAD, null, null, null, literalValue, DataType.SCALAR, vt, FileFormat.TEXT.toString()); + } + + /** + * Constructor to setup read or write LOP + * In case of write: input must be provided. This will always be added as the first element in input array. + * For literals: this function is invoked through the static method createLiteralLop. + * + * @param op operation type + * @param input low-level operator + * @param inputParametersLops input lops + * @param name string name + * @param literal string literal + * @param dt data type + * @param vt value type + * @param fmt file format + */ + public DataIOGen(OpOpData op, Lop input, HashMap inputParametersLops, String name, String literal, DataType dt, ValueType vt, + String fmt) { + super(Type.Data, dt, vt); + _op = op; + literal_var = (literal != null); + + // Either name or literal can be non-null. 
+ if(literal_var) { + if(_op.isTransient()) + throw new LopsException("Invalid parameter values while setting up a Data LOP -- transient flag is invalid for a literal."); + getOutputParameters().setLabel(literal); + } + else if(name != null) { + if(_op.isTransient()) + getOutputParameters().setLabel(name); // tvar+name + else { + String code = _op == OpOpData.FUNCTIONOUTPUT ? "" : _op.isRead() ? "pREAD" : "pWRITE"; + getOutputParameters().setLabel(code + name); + } + } + else { + throw new LopsException("Invalid parameter values while setting up a Data LOP -- the lop must have either literal value or a name."); + } + + // WRITE operation must have an input Lops, we always put this + // input Lops as the first element of WRITE input. The parameters of + // WRITE operation are then put as the following input elements. + if(input != null && op.isWrite()) { + addInput(input); + input.addOutput(this); + } + + _inputParams = inputParametersLops; + + if(_inputParams != null) { + for(Lop lop : inputParametersLops.values()) { + addInput(lop); + lop.addOutput(this); + } + if(inputParametersLops.get(DataExpression.IO_FILENAME) != null) { + OutputParameters outParams = (inputParametersLops.get(DataExpression.IO_FILENAME)).getOutputParameters(); + String fName = outParams.getLabel(); + this.getOutputParameters().setFile_name(fName); + } + } + + //set output format + formatType = fmt; + //outParams.setFormat(fmt); + setLopProperties(); + } + + private void setLopProperties() { + lps.setProperties(inputs, ExecType.INVALID); + } + + /** + * Data-Lop-specific method to set the execution type for persistent write. + * TODO: split lops into MR/CP lop. + * + * @param et execution type + */ + public void setExecType(ExecType et) { + lps.execType = et; + } + + /** + * method to get format type for input, output files. 
+ * + * @return file format + */ + public String getFileFormatType() { + return formatType; + } + + @Override public String toString() { + return getID() + ":" + "File_Name: " + getOutputParameters().getFile_name() + " " + "Label: " + getOutputParameters().getLabel() + " " + "Operation: = " + _op + " " + "Format: " + outParams.getFormat() + " Datatype: " + getDataType() + " Valuetype: " + getValueType() + " num_rows = " + getOutputParameters().getNumRows() + " num_cols = " + getOutputParameters().getNumCols() + " UpdateInPlace: " + getOutputParameters().getUpdateType(); + } + + /** + * method to get operation type, i.e. read/write. + * + * @return operation type + */ + + public OpOpData getOperationType() { + return _op; + } + + /** + * method to get inputParams + * + * @return input parameters + */ + public HashMap getInputParams() { + return _inputParams; + } + + public Lop getNamedInputLop(String name) { + return _inputParams.get(name); + } + + public Lop getNamedInputLop(String name, String defaultVal) { + if(_inputParams.containsKey(name)) + return _inputParams.get(name); + else + return DataIOGen.createLiteralLop(ValueType.STRING, defaultVal); + } + + /** + * method to check if this data lop represents a literal. 
+ * + * @return true if data lop is a literal + */ + public boolean isLiteral() { + return literal_var; + } + + public boolean getBooleanValue() { + if(literal_var) { + return Boolean.parseBoolean(getOutputParameters().getLabel()); + } + else + throw new LopsException("Cannot obtain the value of a non-literal variable at compile time."); + } + + public double getDoubleValue() { + if(literal_var) { + return Double.parseDouble(getOutputParameters().getLabel()); + } + else + throw new LopsException("Cannot obtain the value of a non-literal variable at compile time."); + } + + public long getLongValue() { + if(literal_var) { + ValueType vt = getValueType(); + switch(vt) { + case INT64: + return Long.parseLong(getOutputParameters().getLabel()); + case FP64: + return (long) Double.parseDouble(getOutputParameters().getLabel()); + + default: + throw new LopsException("Encountered a non-numeric value " + (vt) + ", while a numeric value is expected."); + } + } + else + throw new LopsException("Can not obtain the value of a non-literal variable at compile time."); + } + + public String getStringValue() { + if(literal_var) { + return getOutputParameters().getLabel(); + } + else + throw new LopsException("Cannot obtain the value of a non-literal variable at compile time."); + } + + public boolean isPersistentWrite() { + return _op == OpOpData.PERSISTENTWRITE; + } + + public boolean isPersistentRead() { + return _op == OpOpData.PERSISTENTREAD && !literal_var; + } + + /** + * Method to get CP instructions for reading/writing scalars and matrices from/to HDFS. + * This method generates CP read/write instructions. 
+ */ + @Override public String getInstructions(String input1, String input2) { + if(getOutputParameters().getFile_name() == null && _op.isRead()) + throw new LopsException( + this.printErrorLocation() + "Data.getInstructions(): Exepecting a SCALAR data type, encountered " + getDataType()); + + StringBuilder sb = new StringBuilder(); + if(this.getExecType() == ExecType.SPARK) + sb.append("SPARK"); + else + sb.append("CP"); + sb.append(OPERAND_DELIMITOR); + if(_op.isRead()) { + sb.append("read"); + sb.append(OPERAND_DELIMITOR); + sb.append(this.prepInputOperand(input1)); + } + else if(_op.isWrite()) { + sb.append("write"); + sb.append(OPERAND_DELIMITOR); + sb.append(getInputs().get(0).prepInputOperand(input1)); + } + else + throw new LopsException(this.printErrorLocation() + "In Data Lop, Unknown operation: " + _op); + + sb.append(OPERAND_DELIMITOR); + Lop fnameLop = _inputParams.get(DataExpression.IO_FILENAME); + boolean literal = (fnameLop instanceof DataIOGen && ((DataIOGen) fnameLop).isLiteral()); + sb.append(prepOperand(input2, DataType.SCALAR, ValueType.STRING, literal)); + + // attach outputInfo in case of matrices + OutputParameters oparams = getOutputParameters(); + if(_op.isWrite()) { + sb.append(OPERAND_DELIMITOR); + String fmt = getFileFormatType(); + sb.append(prepOperand(fmt, DataType.SCALAR, ValueType.STRING, true)); + } + + if(_op.isWrite()) { + sb.append(OPERAND_DELIMITOR); + Lop descriptionLop = getInputParams().get(DataExpression.DESCRIPTIONPARAM); + if(descriptionLop != null) { + boolean descLiteral = (descriptionLop instanceof DataIOGen && ((DataIOGen) descriptionLop).isLiteral()); + sb.append(prepOperand(descriptionLop.getOutputParameters().getLabel(), DataType.SCALAR, ValueType.STRING, descLiteral)); + } + else { + sb.append(prepOperand("", DataType.SCALAR, ValueType.STRING, true)); + } + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getBlocksize()); + } + + return sb.toString(); + } + + /** + * Method to generate createvar instruction that 
updates symbol table with metadata, hdfsfile name, etc. + */ + @Override public String getInstructions() { + return getCreateVarInstructions(getOutputParameters().getFile_name(), getOutputParameters().getLabel()); + } + + @Override public String getInstructions(String outputFileName) { + return getCreateVarInstructions(outputFileName, getOutputParameters().getLabel()); + } + + public String getCreateVarInstructions(String outputFileName, String outputLabel) { + if(getDataType() == DataType.MATRIX || getDataType() == DataType.FRAME) { + + if(_op.isTransient()) + throw new LopsException("getInstructions() should not be called for transient nodes."); + + OutputParameters oparams = getOutputParameters(); + + StringBuilder sb = new StringBuilder(); + sb.append("CP"); + sb.append(OPERAND_DELIMITOR); + sb.append("createvar"); + sb.append(OPERAND_DELIMITOR); + sb.append(outputLabel); + sb.append(OPERAND_DELIMITOR); + sb.append(outputFileName); + sb.append(OPERAND_DELIMITOR); + sb.append(false); + sb.append(OPERAND_DELIMITOR); + sb.append(getDataType()); + sb.append(OPERAND_DELIMITOR); // only persistent reads come here! 
+ sb.append("IOGEN"); + sb.append(OPERAND_DELIMITOR); + sb.append(getFileFormatType()); + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getNumRows()); + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getNumCols()); + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getBlocksize()); + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getNnz()); + sb.append(OPERAND_DELIMITOR); + sb.append(oparams.getUpdateType().toString().toLowerCase()); + return sb.toString(); + } + else { + throw new LopsException(this.printErrorLocation() + "In Data Lop, Unexpected data type " + getDataType()); + } + } +} diff --git a/src/main/java/org/apache/sysds/lops/Lop.java b/src/main/java/org/apache/sysds/lops/Lop.java index 440669d13ac..1673fa6da1b 100644 --- a/src/main/java/org/apache/sysds/lops/Lop.java +++ b/src/main/java/org/apache/sysds/lops/Lop.java @@ -64,7 +64,8 @@ public enum Type { PlusMult, MinusMult, //CP SpoofFused, //CP/SP generated fused operator Sql, //CP sql read - Federated //FED federated read + Federated, //FED federated read + ReaderGen // IO Gen } @@ -202,6 +203,14 @@ public boolean isDataExecLocation() { return this instanceof Data; } + public boolean isDataIOGenExecLocation(){ + return this instanceof DataIOGen; + } + + public boolean isReaderGenExecLocation(){ + return this instanceof ReaderGen; + } + protected void setupLopProperties(ExecType et) { //setup Spark parameters lps.setProperties( inputs, et); diff --git a/src/main/java/org/apache/sysds/lops/ReaderGen.java b/src/main/java/org/apache/sysds/lops/ReaderGen.java new file mode 100644 index 00000000000..b10454888ed --- /dev/null +++ b/src/main/java/org/apache/sysds/lops/ReaderGen.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.lops; + +import org.apache.sysds.common.Types.DataType; +import org.apache.sysds.common.Types.ExecType; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.parser.DataExpression; + +import java.util.HashMap; + +public class ReaderGen extends Lop +{ + public static final String OPCODE = "readergen"; + private HashMap _inputParams; + + public ReaderGen(Lop input, DataType dt, ValueType vt, ExecType et, HashMap inputParametersLops) { + super(Type.Checkpoint, dt, vt); + addInput(input); + input.addOutput(this); + lps.setProperties(inputs, et); + _inputParams = inputParametersLops; + } + + @Override + public String toString() { + return "ReaderGen"; + } + + @Override + public String getInstructions(){ + StringBuilder sb = new StringBuilder(); + sb.append( getExecType() ); + sb.append( Lop.OPERAND_DELIMITOR ); + sb.append( OPCODE ); + + // sample matrix/frame + Lop inSample = _inputParams.get(DataExpression.SAMPLE); + sb.append( OPERAND_DELIMITOR ); + sb.append ( inSample.prepInputOperand(inSample.getOutputParameters().getLabel())); + + // sample Raw + Lop inSampleRaw = _inputParams.get(DataExpression.SAMPLE_RAW); + sb.append( OPERAND_DELIMITOR ); + sb.append (inSampleRaw.getOutputParameters().getLabel()); + + // src output + Lop inOutput = _inputParams.get(DataExpression.FORMAT_TYPE); + sb.append( OPERAND_DELIMITOR ); + 
sb.append(inOutput.getOutputParameters().getLabel()); + + return sb.toString(); + } +} diff --git a/src/main/java/org/apache/sysds/lops/compile/Dag.java b/src/main/java/org/apache/sysds/lops/compile/Dag.java index b089b1068f6..20b9adb2d4a 100644 --- a/src/main/java/org/apache/sysds/lops/compile/Dag.java +++ b/src/main/java/org/apache/sysds/lops/compile/Dag.java @@ -43,6 +43,7 @@ import org.apache.sysds.lops.Checkpoint; import org.apache.sysds.lops.CoVariance; import org.apache.sysds.lops.Data; +import org.apache.sysds.lops.DataIOGen; import org.apache.sysds.lops.FunctionCallCP; import org.apache.sysds.lops.GroupedAggregate; import org.apache.sysds.lops.GroupedAggregateM; @@ -56,6 +57,7 @@ import org.apache.sysds.lops.ParameterizedBuiltin; import org.apache.sysds.lops.PickByCount; import org.apache.sysds.lops.ReBlock; +import org.apache.sysds.lops.ReaderGen; import org.apache.sysds.lops.SpoofFused; import org.apache.sysds.lops.UAggOuterChain; import org.apache.sysds.lops.UnaryCP; @@ -304,9 +306,9 @@ private ArrayList doPlainInstructionGen(StatementBlock sb, List execNodes = nodes.stream() - .filter(l -> (!l.isDataExecLocation() + .filter(l -> (!l.isDataIOGenExecLocation() && !l.isReaderGenExecLocation() && (!l.isDataExecLocation() || (((Data)l).getOperationType().isWrite() && !isTransientWriteRead((Data)l)) - || (((Data)l).isPersistentRead() && l.getDataType().isScalar()))) + || (((Data)l).isPersistentRead() && l.getDataType().isScalar())))) .collect(Collectors.toList()); // generate executable instruction @@ -380,10 +382,11 @@ private static List deleteUpdatedTransientReadVariables(StatementBl // first capture all transient read variables for ( Lop node : nodeV ) { - if (node.isDataExecLocation() + if ((node.isDataIOGenExecLocation() && node.getDataType() == DataType.MATRIX) + || (node.isDataExecLocation() && ((Data) node).getOperationType().isTransient() && ((Data) node).getOperationType().isRead() - && ((Data) node).getDataType() == DataType.MATRIX) { + && 
((Data) node).getDataType() == DataType.MATRIX)) { // "node" is considered as updated ONLY IF the old value is not used any more // So, make sure that this READ node does not feed into any (transient/persistent) WRITE boolean hasWriteParent=false; @@ -466,13 +469,13 @@ private static List generateRemoveInstructions(StatementBlock sb) { private static ArrayList generateInstructionsForInputVariables(List nodes_v) { ArrayList insts = new ArrayList<>(); for(Lop n : nodes_v) { - if (n.isDataExecLocation() + if (n.isReaderGenExecLocation() || n.isDataIOGenExecLocation() || ( n.isDataExecLocation() && !((Data) n).getOperationType().isTransient() && ((Data) n).getOperationType().isRead() && (n.getDataType() == DataType.MATRIX || n.getDataType() == DataType.FRAME - || n.getDataType() == DataType.LIST) ) + || n.getDataType() == DataType.LIST))) { - if ( !((Data)n).isLiteral() ) { + if ( n.isDataIOGenExecLocation() || n.isReaderGenExecLocation() || (n.isDataExecLocation() && !((Data)n).isLiteral()) ) { try { String inst_string = n.getInstructions(); CPInstruction currInstr = CPInstructionParser.parseSingleInstruction(inst_string); @@ -605,7 +608,7 @@ private void generateControlProgramJobs(List execNodes, } } - if ( !hasTransientWriteParent ) { + if ( !hasTransientWriteParent) { deleteInst.addAll(out.getLastInstructions()); } else { @@ -646,6 +649,13 @@ else if (node.getType() == Lop.Type.Nary) { inst_string = node.getInstructions(inputs, node.getOutputParameters().getLabel()); } + else if(node instanceof ReaderGen) + inst_string = node.getInstructions(); + + else if(node instanceof DataIOGen){ + inst_string = node.getInstructions(); + } + else { if ( node.getInputs().isEmpty() ) { // currently, such a case exists only for Rand lop @@ -745,8 +755,12 @@ else if ( !node.getInputs().isEmpty() ) + inst_string, e); } - markedNodes.add(node); - doRmVar = true; + if(node instanceof DataIOGen || node instanceof ReaderGen) + doRmVar = false; + else { + markedNodes.add(node); + doRmVar 
= true; + } } else if (node.isDataExecLocation() ) { Data dnode = (Data)node; diff --git a/src/main/java/org/apache/sysds/parser/DMLTranslator.java b/src/main/java/org/apache/sysds/parser/DMLTranslator.java index c196f780c33..f73663c0fb4 100644 --- a/src/main/java/org/apache/sysds/parser/DMLTranslator.java +++ b/src/main/java/org/apache/sysds/parser/DMLTranslator.java @@ -32,6 +32,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Builtins; +import org.apache.sysds.common.Types; import org.apache.sysds.common.Types.AggOp; import org.apache.sysds.common.Types.DataType; import org.apache.sysds.common.Types.Direction; @@ -65,6 +66,7 @@ import org.apache.sysds.hops.NaryOp; import org.apache.sysds.hops.OptimizerUtils; import org.apache.sysds.hops.ParameterizedBuiltinOp; +import org.apache.sysds.hops.GenerateReaderOp; import org.apache.sysds.hops.ReorgOp; import org.apache.sysds.hops.TernaryOp; import org.apache.sysds.hops.UnaryOp; @@ -95,6 +97,7 @@ public class DMLTranslator { private static final Log LOG = LogFactory.getLog(DMLTranslator.class.getName()); private DMLProgram _dmlProg; + private HashMap _ioGenHops = new HashMap<>(); public DMLTranslator(DMLProgram dmlp) { _dmlProg = dmlp; @@ -1151,8 +1154,10 @@ else if (ptype == PRINTTYPE.STOP) { ae = HopRewriteUtils.createBinary(ids.get(target.getName()), ae, OpOp2.PLUS); target.setProperties(accum.getOutput()); } - else - target.setProperties(source.getOutput()); + else { + if(target != null) + target.setProperties(source.getOutput()); + } if (source instanceof BuiltinFunctionExpression){ BuiltinFunctionExpression BuiltinSource = (BuiltinFunctionExpression)source; @@ -1160,16 +1165,18 @@ else if (ptype == PRINTTYPE.STOP) { sb.setSplitDag(true); } - ids.put(target.getName(), ae); - - //add transient write if needed - Integer statementId = liveOutToTemp.get(target.getName()); - if ((statementId != null) && (statementId.intValue() == i)) { - DataOp 
transientwrite = new DataOp(target.getName(), target.getDataType(), target.getValueType(), ae, OpOpData.TRANSIENTWRITE, null); - transientwrite.setOutputParams(ae.getDim1(), ae.getDim2(), ae.getNnz(), ae.getUpdateType(), ae.getBlocksize()); - transientwrite.setParseInfo(target); - updatedLiveOut.addVariable(target.getName(), target); - output.add(transientwrite); + if(target != null) { + ids.put(target.getName(), ae); + + //add transient write if needed + Integer statementId = liveOutToTemp.get(target.getName()); + if((statementId != null) && (statementId.intValue() == i)) { + DataOp transientwrite = new DataOp(target.getName(), target.getDataType(), target.getValueType(), ae, OpOpData.TRANSIENTWRITE, null); + transientwrite.setOutputParams(ae.getDim1(), ae.getDim2(), ae.getNnz(), ae.getUpdateType(), ae.getBlocksize()); + transientwrite.setParseInfo(target); + updatedLiveOut.addVariable(target.getName(), target); + output.add(transientwrite); + } } } // CASE: target is indexed identifier (left-hand side indexed expression) @@ -1561,8 +1568,13 @@ else if( source instanceof DataExpression ) { Hop ae = processDataExpression((DataExpression)source, target, hops); if (ae instanceof DataOp && ((DataOp) ae).getOp() != OpOpData.SQLREAD && ((DataOp) ae).getOp() != OpOpData.FEDERATED) { - String formatName = ((DataExpression)source).getVarParam(DataExpression.FORMAT_TYPE).toString(); - ((DataOp)ae).setFileFormat(Expression.convertFormatType(formatName)); + String formatName = ((DataExpression) source).getVarParam(DataExpression.FORMAT_TYPE).toString(); + if( !((DataOp)ae).isIOGenRead()) + ((DataOp) ae).setFileFormat(Expression.convertFormatType(formatName)); + else { + ((DataOp) ae).setIOGenFormat(formatName); + } + } return ae; } @@ -2102,12 +2114,30 @@ private Hop processDataExpression(DataExpression source, DataIdentifier target, if (target == null) { target = createTarget(source); } - + + boolean isIOGEN = false; // construct hop based on opcode 
switch(source.getOpCode()) { case READ: - currBuiltinOp = new DataOp(target.getName(), target.getDataType(), target.getValueType(), OpOpData.PERSISTENTREAD, paramHops); - ((DataOp)currBuiltinOp).setFileName(((StringIdentifier)source.getVarParam(DataExpression.IO_FILENAME)).getValue()); + if(source.getVarParam(DataExpression.SAMPLE_RAW) !=null && source.getVarParam(DataExpression.SAMPLE)!=null + && source.getVarParam(DataExpression.FORMAT_TYPE) !=null && source.getVarParam(DataExpression.DATATYPEPARAM) != null) + isIOGEN = true; + + if(!isIOGEN) { + currBuiltinOp = new DataOp(target.getName(), target.getDataType(), target.getValueType(), OpOpData.PERSISTENTREAD, paramHops); + ((DataOp) currBuiltinOp).setFileName(((StringIdentifier) source.getVarParam(DataExpression.IO_FILENAME)).getValue()); + if(source.getVarParam(DataExpression.IS_IOGEN_FORMAT) != null && + ((BooleanIdentifier) source.getVarParam(DataExpression.IS_IOGEN_FORMAT)).getValue()){ + ((DataOp) currBuiltinOp).setIOGenRead(true); + ((DataOp) currBuiltinOp).setGenerateReaderOp(_ioGenHops.get(((StringIdentifier)source.getVarParam(DataExpression.FORMAT_TYPE)).getValue())); + + } + } + else { + currBuiltinOp = new GenerateReaderOp(((StringIdentifier)source.getVarParam(DataExpression.FORMAT_TYPE)).getValue(),source.getDataType(), + Types.OpOpGenerateReader.GENERATEREADER, hops.get(hops.keySet().iterator().next()), paramHops); + _ioGenHops.put(((StringIdentifier)source.getVarParam(DataExpression.FORMAT_TYPE)).getValue(), (GenerateReaderOp)currBuiltinOp); + } break; case WRITE: @@ -2158,7 +2188,7 @@ private Hop processDataExpression(DataExpression source, DataIdentifier target, //set identifier meta data (incl dimensions and blocksizes) setIdentifierParams(currBuiltinOp, source.getOutput()); - if( source.getOpCode()==DataExpression.DataOp.READ ) + if( source.getOpCode()==DataExpression.DataOp.READ && !isIOGEN) ((DataOp)currBuiltinOp).setInputBlocksize(target.getBlocksize()); else if ( source.getOpCode() == 
DataExpression.DataOp.WRITE ) { ((DataOp)currBuiltinOp).setPrivacy(hops.get(target.getName()).getPrivacy()); @@ -2931,6 +2961,11 @@ else if( s instanceof AssignmentStatement { DataExpression dexpr = (DataExpression) ((AssignmentStatement)s).getSource(); if (dexpr.isRead()) { + // checks for IOGEN + if(dexpr.getVarParam(DataExpression.SAMPLE_RAW) !=null && dexpr.getVarParam(DataExpression.SAMPLE)!=null + && dexpr.getVarParam(DataExpression.FORMAT_TYPE) !=null && dexpr.getVarParam(DataExpression.DATATYPEPARAM) != null) + return true; + String pfname = dexpr.getVarParam(DataExpression.IO_FILENAME).toString(); // found read-after-write if (pWrites.containsKey(pfname) && !pfname.trim().isEmpty()) { diff --git a/src/main/java/org/apache/sysds/parser/DataExpression.java b/src/main/java/org/apache/sysds/parser/DataExpression.java index e2e3996cea4..d4f35156c96 100644 --- a/src/main/java/org/apache/sysds/parser/DataExpression.java +++ b/src/main/java/org/apache/sysds/parser/DataExpression.java @@ -118,7 +118,13 @@ public class DataExpression extends DataIdentifier public static final String HDF5_DATASET_NAME = "dataset"; public static final String DELIM_SPARSE = "sparse"; // applicable only for write - + + // Parameter names relevant to IOGEN reader + public static final String SAMPLE_RAW = "sample_raw"; + public static final String SAMPLE = "sample"; + public static final String IS_IOGEN_FORMAT = "isiogenformat"; + public static final Set IOGEN_SUBMITTED_FORMATS = new HashSet<>(); + public static final Set RAND_VALID_PARAM_NAMES = new HashSet<>( Arrays.asList(RAND_ROWS, RAND_COLS, RAND_DIMS, RAND_MIN, RAND_MAX, RAND_SPARSITY, RAND_SEED, RAND_PDF, RAND_LAMBDA)); @@ -147,7 +153,10 @@ public class DataExpression extends DataIdentifier //Parameters related to dataset name/HDF4 files. 
HDF5_DATASET_NAME, // Parameters related to privacy - PRIVACY, FINE_GRAINED_PRIVACY)); + PRIVACY, FINE_GRAINED_PRIVACY, + // Parameters related to IOGEN + SAMPLE_RAW, SAMPLE + )); /** Valid parameter names in arguments to read instruction */ public static final Set READ_VALID_PARAM_NAMES = new HashSet<>( @@ -158,7 +167,9 @@ public class DataExpression extends DataIdentifier // Parameters related to delimited/libsvm files. LIBSVM_INDEX_DELIM, //Parameters related to dataset name/HDF4 files. - HDF5_DATASET_NAME)); + HDF5_DATASET_NAME, + // Parameters related to IOGEN + SAMPLE_RAW, SAMPLE)); /* Default Values for delimited (CSV/LIBSVM) files */ public static final String DEFAULT_DELIM_DELIMITER = ","; @@ -244,12 +255,34 @@ private static DataExpression processReadDataExpression(String functionName, } ParameterExpression pexpr = (passedParamExprs.size() == 0) ? null : passedParamExprs.get(0); - - if ( (pexpr != null) && (!(pexpr.getName() == null) || (pexpr.getName() != null && pexpr.getName().equalsIgnoreCase(DataExpression.IO_FILENAME)))){ - errorListener.validationError(parseInfo, "first parameter to read statement must be filename"); - return null; - } else if( pexpr != null ){ - dataExpr.addVarParam(DataExpression.IO_FILENAME, pexpr.getExpr()); + + // checks for IOGEN + boolean isIOGEN = false; + if(pexpr !=null && pexpr.getName()!=null && passedParamExprs.size() == 4){ + HashSet names = new HashSet<>(); + for(int i = 0; i < 4; i++){ + ParameterExpression pexprtmp = passedParamExprs.get(i); + String pexprName = pexprtmp.getName(); + names.add(pexprName); + } + if(names.size() == 4 && names.contains(SAMPLE_RAW) && names.contains(SAMPLE) && + names.contains(FORMAT_TYPE) && names.contains(DATATYPEPARAM)){ + isIOGEN = true; + } + } + + if(!isIOGEN) { + if((pexpr != null) && (!(pexpr.getName() == null) || (pexpr.getName() != null && pexpr.getName() + .equalsIgnoreCase(DataExpression.IO_FILENAME)))) { + errorListener.validationError(parseInfo, "first parameter to read 
statement must be filename"); + return null; + } + else if(pexpr != null) { + dataExpr.addVarParam(DataExpression.IO_FILENAME, pexpr.getExpr()); + } + } + else { + dataExpr.addVarParam(pexpr.getName(), pexpr.getExpr()); } // validate all parameters are added only once and valid name @@ -950,6 +983,8 @@ public void validateExpression(HashMap ids, HashMap ids, HashMap ids, HashMap ids, HashMap= 10) { - // matrix characteristics - mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7])); - mc.setBlocksize(Integer.parseInt(parts[8])); - mc.setNonZeros(Long.parseLong(parts[9])); - } - else { - throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str); + if(!isIOGen) { + if(parts.length == 6) { + // do nothing + } + else if(parts.length >= 10) { + // matrix characteristics + mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7])); + mc.setBlocksize(Integer.parseInt(parts[8])); + mc.setNonZeros(Long.parseLong(parts[9])); + } + else { + throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str); + } } iimd = new MetaDataFormat(mc, FileFormat.safeValueOf(fmt)); } @@ -406,8 +426,12 @@ else if (parts.length >= 10) { iimd = new MetaDataFormat(tc, FileFormat.safeValueOf(fmt)); } UpdateType updateType = UpdateType.COPY; - if ( parts.length >= 11 ) - updateType = UpdateType.valueOf(parts[10].toUpperCase()); + if(!isIOGen) { + if(parts.length >= 11) + updateType = UpdateType.valueOf(parts[10].toUpperCase()); + } + else + updateType = UpdateType.valueOf(parts[11].toUpperCase()); //handle frame schema String schema = (dt==DataType.FRAME && parts.length>=12) ? 
parts[parts.length-1] : null; @@ -468,6 +492,12 @@ else if(fmt.equalsIgnoreCase("hdf5")) { return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str); } + else if(fmt.equalsIgnoreCase("iogen")) { + FileFormatProperties fmtProperties = new CustomProperties(parts[6]); + ((CustomProperties)fmtProperties).setParallel(ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS)); + return new VariableCPInstruction(VariableOperationCode.CreateVariable, + in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str); + } else { return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str); } @@ -561,6 +591,12 @@ else if(in3.getName().equalsIgnoreCase("hdf5") ){ in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN); // option: remote or local break; + case ReaderGen: + in1 = new CPOperand(parts[1]); + fprops = new SampleProperties(parts[2], parts[3]); + VariableCPInstruction instGen = new VariableCPInstruction( + getVariableOperationCode(opcode), in1, null, null, null, null, fprops, null, null, opcode, str); + return instGen; } return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, opcode, str); } @@ -643,6 +679,9 @@ public void processInstruction(ExecutionContext ec) { processSetFileNameInstruction(ec); break; + case ReaderGen: + processReaderGenInstruction(ec); + break; default: throw new DMLRuntimeException("Unknown opcode: " + opcode ); } @@ -1197,6 +1236,38 @@ private void writeMMFile(ExecutionContext ec, String fname) { } } + /** + * Handler for generate reader instructions. 
+ * @param ec execution context + */ + private void processReaderGenInstruction(ExecutionContext ec) { + if( getInput1().getDataType() == DataType.MATRIX ) { + MatrixBlock mBlock = ec.getMatrixInput(getInput1().getName()); + SampleProperties sampleProperties = ((SampleProperties)_formatProperties); + sampleProperties.setParallel(ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS)); + sampleProperties.setSampleMatrix(mBlock); + try { + GenerateReader.GenerateReaderMatrix grm = new GenerateReader.GenerateReaderMatrix(sampleProperties); + } + catch(Exception e) { + throw new DMLRuntimeException(e); + } + } + else { + FrameBlock fBlock = ec.getFrameInput(getInput1().getName()); + SampleProperties sampleProperties = ((SampleProperties)_formatProperties); + sampleProperties.setParallel(ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_READ_TEXTFORMATS)); + sampleProperties.setSampleFrame(fBlock); + try { + GenerateReader.GenerateReaderFrame grf = new GenerateReader.GenerateReaderFrame(sampleProperties); + } + catch(Exception e) { + throw new DMLRuntimeException(e); + } + } + } + + private static void cleanDataOnHDFS(MatrixObject mo) { try { String fpath = mo.getFileName(); diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java index 3906acad12b..8df9994352d 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java @@ -25,6 +25,9 @@ import org.apache.sysds.conf.CompilerConfig.ConfigType; import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.runtime.DMLRuntimeException; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.GenerateReader; +import org.apache.sysds.runtime.matrix.data.Pair; public class FrameReaderFactory { protected static final Log LOG = 
LogFactory.getLog(FrameReaderFactory.class.getName()); @@ -69,6 +72,22 @@ public static FrameReader createFrameReader(FileFormat fmt, FileFormatProperties reader = new FrameReaderProto(); break; + case IOGEN: + CustomProperties customProperties = (CustomProperties) props; + String[] path = customProperties.getFormat().split("/"); + String className = path[path.length -1].split("\\.")[0]; + Pair loadSrcReaderAndProperties = customProperties.loadSrcReaderAndProperties(); + + GenerateReader.GenerateReaderFrame frm = new GenerateReader.GenerateReaderFrame(loadSrcReaderAndProperties.getValue(), + loadSrcReaderAndProperties.getKey(), className); + try { + reader = frm.getReader(); + } + catch(Exception e) { + throw new DMLRuntimeException("IOGEN Matrix Reader Error: " + e); + } + break; + default: throw new DMLRuntimeException("Failed to create frame reader for unknown format: " + fmt.toString()); } diff --git a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java index 473f6441a2a..c428a1f6b02 100644 --- a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java @@ -26,7 +26,11 @@ import org.apache.sysds.conf.ConfigurationManager; import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.iogen.CustomProperties; +import org.apache.sysds.runtime.iogen.GenerateReader; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.matrix.data.Pair; + public class MatrixReaderFactory { private static final Log LOG = LogFactory.getLog(MatrixReaderFactory.class.getName()); @@ -117,6 +121,27 @@ public static MatrixReader createMatrixReader( ReadProperties props ) { reader = (par & mcsr) ? 
new ReaderHDF5Parallel(fileFormatPropertiesHDF5) : new ReaderHDF5( fileFormatPropertiesHDF5); break; + case IOGEN: + CustomProperties customProperties; + if(props.formatProperties != null) { + customProperties = (CustomProperties) props.formatProperties; + } + else { + throw new DMLRuntimeException("Failed to create matrix reader with NULL CustomProperties"); + } + String[] path = customProperties.getFormat().split("/"); + String className = path[path.length -1].split("\\.")[0]; + Pair loadSrcReaderAndProperties = customProperties.loadSrcReaderAndProperties(); + + GenerateReader.GenerateReaderMatrix grm = new GenerateReader.GenerateReaderMatrix(loadSrcReaderAndProperties.getValue(), + loadSrcReaderAndProperties.getKey(), className); + try { + reader = grm.getReader(); + } + catch(Exception e) { + throw new DMLRuntimeException("IOGEN Matrix Reader Error: " + e); + } + break; default: throw new DMLRuntimeException("Failed to create matrix reader for unknown format: " + fmt.toString()); diff --git a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java index 3c0b2712c6a..789cb06b7c0 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/CustomProperties.java @@ -19,9 +19,15 @@ package org.apache.sysds.runtime.iogen; +import com.google.gson.Gson; import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.io.FileFormatProperties; +import org.apache.sysds.runtime.matrix.data.Pair; +import java.io.IOException; import java.io.Serializable; +import java.nio.file.Files; +import java.nio.file.Paths; import java.util.HashSet; public class CustomProperties extends FileFormatProperties implements Serializable { @@ -35,6 +41,11 @@ public class CustomProperties extends FileFormatProperties implements Serializab private int ncols; private boolean sparse; private 
boolean parallel; + private String format; + + public CustomProperties(String format) { + this.format = format; + } public CustomProperties(MappingProperties mappingProperties, RowIndexStructure rowIndexStructure, ColIndexStructure colIndexStructure) { this.mappingProperties = mappingProperties; @@ -83,7 +94,7 @@ public void setSchema(Types.ValueType[] schema) { } public HashSet[] endWithValueStrings() { - if(colKeyPatterns !=null) { + if(colKeyPatterns != null) { HashSet[] endWithValueString = new HashSet[colKeyPatterns.length]; for(int i = 0; i < colKeyPatterns.length; i++) if(colKeyPatterns[i] != null) @@ -125,4 +136,32 @@ public boolean isParallel() { public void setParallel(boolean parallel) { this.parallel = parallel; } + + public String getFormat() { + return format; + } + + public void setFormat(String format) { + this.format = format; + } + + public Pair loadSrcReaderAndProperties() { + String textProp = readEntireTextFile(format + ".prop").trim(); + String textSrc = readEntireTextFile(format).trim(); + + Gson gson = new Gson(); + CustomProperties customProperties = gson.fromJson(textProp, CustomProperties.class); + return new Pair<>(textSrc, customProperties); + } + + private String readEntireTextFile(String fileName) { + String text; + try { + text = Files.readString(Paths.get(fileName)); + } + catch(IOException e) { + throw new DMLRuntimeException(e); + } + return text; + } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index deb7bd23c26..922efe91d4b 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -19,6 +19,7 @@ package org.apache.sysds.runtime.iogen; +import com.google.gson.Gson; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.runtime.codegen.CodegenUtils; @@ -26,45 +27,62 @@ import 
org.apache.sysds.runtime.io.FrameReader; import org.apache.sysds.runtime.iogen.codegen.FrameCodeGen; import org.apache.sysds.runtime.iogen.codegen.MatrixCodeGen; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import java.util.Random; +import java.io.File; +import java.io.FileWriter; public abstract class GenerateReader { protected static final Log LOG = LogFactory.getLog(GenerateReader.class.getName()); protected CustomProperties properties; + protected String src; + protected String className; public GenerateReader(SampleProperties sampleProperties) throws Exception { - FormatIdentifying formatIdentifying = sampleProperties.getDataType().isMatrix() ? new FormatIdentifying(sampleProperties.getSampleRaw(), - sampleProperties.getSampleMatrix()) : new FormatIdentifying(sampleProperties.getSampleRaw(), - sampleProperties.getSampleFrame()); + sampleProperties.getSampleMatrix()) : new FormatIdentifying(sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); properties = formatIdentifying.getFormatProperties(); if(properties == null) { throw new Exception("The file format couldn't recognize!!"); } - if(sampleProperties.getDataType().isFrame()){ + if(sampleProperties.getDataType().isFrame()) { properties.setSchema(sampleProperties.getSampleFrame().getSchema()); } + properties.setParallel(sampleProperties.isParallel()); + + String[] path = sampleProperties.getFormat().split("/"); + String fileName = path[path.length - 1]; + if(path.length > 1) { + String dirPath = sampleProperties.getFormat().substring(0, sampleProperties.getFormat().length() - fileName.length()); + File outDir = new File(dirPath); + outDir.getParentFile().mkdirs(); + } + className = fileName.split("\\.")[0]; + String srcJava = getReaderString(); + FileWriter srcWriter = new FileWriter(sampleProperties.getFormat()); + srcWriter.write(srcJava); + srcWriter.close(); + + Gson gson = new Gson(); + FileWriter propWriter = new 
FileWriter(sampleProperties.getFormat() + ".prop"); + propWriter.write(gson.toJson(properties)); + propWriter.close(); } - public String getRandomClassName() { - Random r = new Random(); - int low = 0; - int high = 100000000; - int result = r.nextInt(high - low) + low; - - return "GIOReader_" + result; + public GenerateReader(CustomProperties properties, String src, String className) { + this.properties = properties; + this.src = src; + this.className = className; } public CustomProperties getProperties() { return properties; } + public abstract String getReaderString(); + // Generate Reader for Matrix public static class GenerateReaderMatrix extends GenerateReader { @@ -74,20 +92,24 @@ public GenerateReaderMatrix(SampleProperties sampleProperties) throws Exception super(sampleProperties); } - public GenerateReaderMatrix(String sampleRaw, MatrixBlock sampleMatrix, boolean parallel) throws Exception { - super(new SampleProperties(sampleRaw, sampleMatrix)); - properties.setParallel(parallel); + public GenerateReaderMatrix(CustomProperties properties, String src, String className) { + super(properties, src, className); } public MatrixReader getReader() throws Exception { - String className = getRandomClassName(); + Class[] cArg = new Class[1]; + cArg[0] = CustomProperties.class; + matrixReader = (MatrixReader) CodegenUtils.compileClass(className, src).getDeclaredConstructor(cArg).newInstance(properties); + return matrixReader; + } + + @Override public String getReaderString() { MatrixCodeGen src = new MatrixCodeGen(properties, className); // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; - String srcJava = src.generateCodeJava(); - matrixReader = (MatrixReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); - return matrixReader; + String srcJava = src.generateCodeJava(); + return srcJava; } } @@ -100,20 +122,24 @@ public 
GenerateReaderFrame(SampleProperties sampleProperties) throws Exception { super(sampleProperties); } - public GenerateReaderFrame(String sampleRaw, FrameBlock sampleFrame, boolean parallel) throws Exception { - super(new SampleProperties(sampleRaw, sampleFrame)); - properties.setParallel(parallel); + public GenerateReaderFrame(CustomProperties properties, String src, String className) { + super(properties, src, className); } public FrameReader getReader() throws Exception { - String className = getRandomClassName(); + Class[] cArg = new Class[1]; + cArg[0] = CustomProperties.class; + frameReader = (FrameReader) CodegenUtils.compileClass(className, src).getDeclaredConstructor(cArg).newInstance(properties); + return frameReader; + } + + @Override public String getReaderString() { FrameCodeGen src = new FrameCodeGen(properties, className); // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; cArg[0] = CustomProperties.class; String srcJava = src.generateCodeJava(); - frameReader = (FrameReader) CodegenUtils.compileClass(className, srcJava).getDeclaredConstructor(cArg).newInstance(properties); - return frameReader; + return srcJava; } } } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/SampleProperties.java b/src/main/java/org/apache/sysds/runtime/iogen/SampleProperties.java index 06b1afebef0..c59dd7fffb9 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/SampleProperties.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/SampleProperties.java @@ -22,10 +22,17 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.sysds.common.Types; +import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.io.FileFormatProperties; import org.apache.sysds.runtime.matrix.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import 
java.nio.file.InvalidPathException; +import java.nio.file.Paths; + public class SampleProperties extends FileFormatProperties { protected static final Log LOG = LogFactory.getLog(CustomProperties.class.getName()); @@ -34,21 +41,30 @@ public class SampleProperties extends FileFormatProperties { private MatrixBlock sampleMatrix; private FrameBlock sampleFrame; private Types.DataType dataType; + private String format; + private boolean parallel; public SampleProperties(String sampleRaw) { - this.sampleRaw = sampleRaw; + this.sampleRaw = checkAndExtractSampleRaw(sampleRaw); } - public SampleProperties(String sampleRaw, MatrixBlock sampleMatrix) { - this.sampleRaw = sampleRaw; + public SampleProperties(String sampleRaw, MatrixBlock sampleMatrix, boolean parallel) { + this.sampleRaw = checkAndExtractSampleRaw(sampleRaw); this.sampleMatrix = sampleMatrix; this.dataType = Types.DataType.MATRIX; + this.parallel = parallel; } - public SampleProperties(String sampleRaw, FrameBlock sampleFrame) { - this.sampleRaw = sampleRaw; + public SampleProperties(String sampleRaw, FrameBlock sampleFrame, boolean parallel) { + this.sampleRaw = checkAndExtractSampleRaw(sampleRaw); this.sampleFrame = sampleFrame; this.dataType = Types.DataType.FRAME; + this.parallel = parallel; + } + + public SampleProperties(String sampleRaw, String format) { + this.sampleRaw = checkAndExtractSampleRaw(sampleRaw); + this.format = format; } public String getSampleRaw() { @@ -76,4 +92,48 @@ public void setSampleFrame(FrameBlock sampleFrame) { this.sampleFrame = sampleFrame; dataType = Types.DataType.FRAME; } + + private boolean checkPath(String path) { + try { + File filePath = new File(path); + if(filePath.exists()) + return true; + else + return false; + } + catch(InvalidPathException | NullPointerException ex) { + return false; + } + } + + private String checkAndExtractSampleRaw(String sampleRaw) { + if(checkPath(sampleRaw)) + return readEntireTextFile(sampleRaw); + else + return sampleRaw; + + } + + 
private String readEntireTextFile(String fileName) { + String text; + try { + text = Files.readString(Paths.get(fileName)); + } + catch(IOException e) { + throw new DMLRuntimeException(e); + } + return text; + } + + public boolean isParallel() { + return parallel; + } + + public void setParallel(boolean parallel) { + this.parallel = parallel; + } + + public String getFormat() { + return format; + } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java deleted file mode 100644 index 637762ea534..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowFlatTest.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.apache.sysds.common.Types; -import org.junit.Test; - -public class FrameSingleRowFlatTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameSingleRowFlatTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - - // CSV: Frame - // 1. 
dataset contain INT32 values - @Test - public void test1() { - sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(false); - } - - // 2. dataset contain different value types - @Test - public void test2() { - sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; - data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(false); - } - - @Test - public void test3() { - sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; - data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(false); - } - - @Test - public void test4() { - sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; - data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(false); - } - - @Test - public void test5() { - sampleRaw = "1,2,a,b,c\n" + "6,7,aa,bb,cc\n" + "11,12,13,14,15"; - data = new String[][] {{"1", "2", "b"}, {"6", "7", "bb"}, {"11", "12", "14"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.FP64, Types.ValueType.STRING}; - runGenerateReaderTest(false); - } - - // CSV with empty values - @Test - public void test6() { - sampleRaw = "1,2,a,,c\n" + "6,,aa,bb,cc\n" + ",12,13,14,15"; - data = new String[][] {{"1", "2", ""}, {"6", "0", "bb"}, {"0", "12", "14"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(false); - } -} diff --git 
a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java deleted file mode 100644 index 787801b6f2d..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/FrameSingleRowNestedTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.apache.sysds.common.Types; -import org.junit.Test; - -public class FrameSingleRowNestedTest extends GenerateReaderFrameTest { - - private final static String TEST_NAME = "FrameSingleRowNestedTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - // JSON Dataset - //1. flat object, in-order values - @Test - public void test1() { - sampleRaw = "{\"a\":1,\"b\":2,\"c\":3,\"d\":4,\"e\":5}\n" + - "{\"a\":6,\"b\":7,\"c\":8,\"d\":9,\"e\":10}\n" + - "{\"a\":11,\"b\":12,\"c\":13,\"d\":14,\"e\":15}"; - - data = new String[][] {{"1", "2"}, {"6", "7"}, {"11", "12"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.INT32}; - runGenerateReaderTest(false); - } - - //2. 
flat object, out-of-order values, contain different value types - @Test - public void test2() { - sampleRaw = "{\"b\":\"string\",\"a\":\"1\",\"e\":5,\"c\":3,\"d\":4}\n" + - "{\"d\":9,\"b\":\"string2\",\"c\":8,\"a\":\"6\",\"e\":10}\n" + - "{\"d\":14,\"a\":\"11\",\"e\":15,\"b\":\"string3\",\"c\":13}"; - - data = new String[][] {{"1", "string"}, {"6", "string2"}, {"11", "string3"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING}; - runGenerateReaderTest(false); - } - //3. nested object with unique attribute names - @Test - public void test3() { - sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; - data = new String[][] {{"1", "2", "5"}, {"6", "7", "10"}, {"11", "12", "15"}}; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64}; - runGenerateReaderTest(false); - } - - //5. 
nested object with repeated attribute names, out-of-order - @Test - public void test5() { - sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; - schema = new Types.ValueType[] {Types.ValueType.INT32, Types.ValueType.STRING, Types.ValueType.FP64, - Types.ValueType.FP32, Types.ValueType.INT64}; - data = new String[][] {{"1", "2", "3", "4", "5"}, {"6", "7", "8", "9", "10"}, {"11", "12", "13", "14", "15"}}; - runGenerateReaderTest(false); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java similarity index 53% rename from src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java rename to src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java index a77baf1de88..04dd854c8fd 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderMatrixTest.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java @@ -22,50 +22,49 @@ import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types; import org.apache.sysds.conf.CompilerConfig; -import org.apache.sysds.runtime.io.MatrixReader; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.MatrixBlock; -import org.apache.sysds.runtime.util.DataConverter; import org.apache.sysds.test.AutomatedTestBase; import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; +import org.junit.Test; -import java.io.BufferedWriter; import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -public abstract class GenerateReaderMatrixTest extends AutomatedTestBase { +public abstract class GIOMatrixReader extends AutomatedTestBase { protected final static String TEST_DIR = "functions/iogen/"; - protected 
final static String TEST_CLASS_DIR = TEST_DIR + GenerateReaderMatrixTest.class.getSimpleName() + "/"; - protected String sampleRaw; - protected double[][] sampleMatrix; + protected final static String TEST_CLASS_DIR = TEST_DIR + GIOMatrixReader.class.getSimpleName() + "/"; + + protected abstract int getId(); + + protected String getInputDatasetFileName() { + return "dataset_" + getId() + ".dat"; + } + + protected String getInputSampleMatrixFileName() { + return "sampleMatrix_" + getId() + ".mtx"; + } + + protected String getInputSampleRawFileName() { + return "sampleMatrix_" + getId() + ".raw"; + } + + protected String getOutputGIO() { + return "GIO" + getTestName()+"_"+ getId()+".java"; + } + + @Test + public void testSequential_CP1() { + runGIOTest(getId(), false); + } protected abstract String getTestName(); - @Override - public void setUp() { + @Override public void setUp() { TestUtils.clearAssertionInformation(); addTestConfiguration(getTestName(), new TestConfiguration(TEST_DIR, getTestName(), new String[] {"Y"})); } - protected void generateRandomSymmetric(int size, double min, double max, double sparsity, boolean isSkew) { - sampleMatrix = getRandomMatrix(size, size, min, max, sparsity, 714); - int conf = isSkew ? 
-1 : 1; - for(int i = 0; i < size; i++) { - for(int j = 0; j <= i; j++) { - - if(i != j) - sampleMatrix[i][j] = sampleMatrix[j][i] * conf; - else - sampleMatrix[i][j] = 0; - } - } - } - - @SuppressWarnings("unused") - protected void runGenerateReaderTest(boolean parallel) { + @SuppressWarnings("unused") protected void runGIOTest(int testNumber, boolean parallel) { Types.ExecMode oldPlatform = rtplatform; rtplatform = Types.ExecMode.SINGLE_NODE; @@ -74,26 +73,35 @@ protected void runGenerateReaderTest(boolean parallel) { boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT; try { + CompilerConfig.FLAG_DYN_RECOMPILE = false; CompilerConfig.FLAG_PARREADWRITE_TEXT = false; TestConfiguration config = getTestConfiguration(getTestName()); loadTestConfiguration(config); - - MatrixBlock sampleMB = DataConverter.convertToMatrixBlock(sampleMatrix); + setOutputBuffering(true); + setOutAndExpectedDeletionDisabled(true); String HOME = SCRIPT_DIR + TEST_DIR; - File directory = new File(HOME); - if(!directory.exists()) { - directory.mkdir(); - } - String dataPath = HOME + "matrix_data.raw"; - int clen = sampleMatrix[0].length; - writeRawString(sampleRaw, dataPath); - - GenerateReader.GenerateReaderMatrix gr = new GenerateReader.GenerateReaderMatrix(sampleRaw, sampleMB, parallel); - MatrixReader mr = gr.getReader(); - MatrixBlock matrixBlock = mr.readMatrixFromHDFS(dataPath, sampleMB.getNumRows(), clen, -1, -1); - TestUtils.compareMatrices(sampleMB, matrixBlock, 0); + String inputDataset = HOME + INPUT_DIR + getInputDatasetFileName(); + String inputSampleMatrix = HOME + INPUT_DIR + getInputSampleMatrixFileName(); + String inputSampleRaw = HOME + INPUT_DIR + getInputSampleRawFileName(); + String outputSrc = HOME +"iogensrc/" + getOutputGIO(); + String outputMatrix = output(getInputDatasetFileName()); + + File outDir = new File(HOME + OUTPUT_DIR); + if(!outDir.exists()) + outDir.mkdirs(); + + outDir = new File(HOME +"iogensrc/"); + if(!outDir.exists()) + outDir.mkdirs(); + + + 
fullDMLScriptName = HOME + getTestName() + "_" + testNumber + ".dml"; + programArgs = new String[] {"-args", inputDataset, inputSampleMatrix, inputSampleRaw, outputSrc, outputMatrix }; + + runTest(true, false, null, -1); + } catch(Exception exception) { exception.printStackTrace(); @@ -104,10 +112,4 @@ protected void runGenerateReaderTest(boolean parallel) { DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; } } - - private static void writeRawString(String raw, String fileName) throws IOException { - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName)); - writer.write(raw); - writer.close(); - } } diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GIOReadCSVTest1.java b/src/test/java/org/apache/sysds/test/functions/iogen/GIOReadCSVTest1.java new file mode 100644 index 00000000000..4fbb8c5eab6 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GIOReadCSVTest1.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.sysds.test.functions.iogen; + + +public class GIOReadCSVTest1 extends GIOMatrixReader { + + private final static String TEST_NAME = "ReaderCSV"; + private final static String TEST_CLASS_DIR = TEST_DIR + GIOReadCSVTest1.class.getSimpleName() + "/"; + + protected String getTestName() { + return TEST_NAME; + } + + protected String getTestClassDir() { + return TEST_CLASS_DIR; + } + + protected int getId() { + return 1; + } +} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java deleted file mode 100644 index ebe0cfe1e64..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateRandomFrame.java +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen; - -import org.apache.sysds.common.Types; -import org.apache.sysds.runtime.util.UtilFunctions; -import org.apache.sysds.test.AutomatedTestBase; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.BufferedWriter; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Random; - -public class GenerateRandomFrame extends AutomatedTestBase { - - protected final static String TEST_DIR = "functions/iogen/"; - - @Override - public void setUp() { - - } - - - protected Types.ValueType[] types = {Types.ValueType.STRING, Types.ValueType.INT32, Types.ValueType.INT64, - Types.ValueType.FP32, Types.ValueType.FP64}; - - protected String[][] generateRandomData(Types.ValueType[] types, int nrows, int ncols, double min, double max, - double sparsity, String[] naStrings) { - String[][] data = new String[nrows][ncols]; - for(int i = 0; i < ncols; i++) { - if(types[i] == Types.ValueType.STRING) - generateRandomString(nrows, 100, naStrings, sparsity, data, i); - if(types[i].isNumeric()) { - generateRandomNumeric(nrows, types[i], min, max, naStrings, sparsity, data, i); - } - } - return data; - } - - protected String getRandomString(int length) { - String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - StringBuilder salt = new StringBuilder(); - Random rnd = new Random(); - while(salt.length() < length) { // length of the random string. 
- int index = (int) (rnd.nextFloat() * alphabet.length()); - salt.append(alphabet.charAt(index)); - } - String saltStr = salt.toString(); - return saltStr; - - } - - protected void generateRandomString(int size, int maxStringLength, String[] naStrings, double sparsity, - String[][] data, int colIndex) { - - double[][] lengths = getRandomMatrix(size, 1, 10, maxStringLength, sparsity, 714); - - for(int i = 0; i < size; i++) { - int length = (int) lengths[i][0]; - if(length > 0) { - String generatedString = getRandomString(length); - data[i][colIndex] = generatedString; - } - else { - data[i][colIndex] = null; - } - } - } - - @SuppressWarnings("incomplete-switch") - protected void generateRandomNumeric(int size, Types.ValueType type, double min, double max, String[] naStrings, - double sparsity, String[][] data, int colIndex) { - - double[][] randomData = getRandomMatrix(size, 1, min, max, sparsity, -1); - for(int i = 0; i < size; i++) { - if(randomData[i][0] != 0) { - Object o = null; - switch(type) { - case INT32: - o = UtilFunctions.objectToObject(type, (int) randomData[i][0]); - break; - case INT64: - o = UtilFunctions.objectToObject(type, (long) randomData[i][0]); - break; - case FP32: - o = UtilFunctions.objectToObject(type, (float) randomData[i][0]); - break; - case FP64: - o = UtilFunctions.objectToObject(type, randomData[i][0]); - break; - } - String s = UtilFunctions.objectToString(o); - data[i][colIndex] = s; - } - else { - data[i][colIndex] = "0"; - } - } - } - // Write 2D Data in CSV format - private static void writeInCSVFormat(String[][] data, int nrows, int ncols, String fileName, String separator, - String[] naString) throws Exception { - - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".raw")); - - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - row.append(data[r][c]); - if(c != ncols - 1) - row.append(separator); - } - writer.write(row.toString()); - if(r != 
nrows - 1) - writer.write("\n"); - } - writer.close(); - } - - // Write 2D in LIBSVM format - private static String[][] writeInLIBSVMFormat(int firstIndex,Types.ValueType[] schema, String[][] data, int nrows, int ncols, String fileName, - String separator, String indexSeparator) throws IOException { - - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".raw")); - int mid = ncols/2; - String[][] dataLibSVM = new String[2 * nrows][ncols+1]; - StringBuilder sb = new StringBuilder(); - int indexRow = 0; - for(int r = 0; r < nrows; r++) { - StringBuilder row1 = new StringBuilder(); - StringBuilder row2 = new StringBuilder(); - row1.append("+1"); - for(int c = 0; c < ncols; c++) { - if(mid > c) { - dataLibSVM[indexRow][c] = data[r][c]; - row1.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else { - if(schema[c].isNumeric() || schema[c] == Types.ValueType.BOOLEAN){ - dataLibSVM[indexRow][c] = "0"; - } - else if(schema[c] == Types.ValueType.STRING) - dataLibSVM[indexRow][c] = ""; - } - } - dataLibSVM[indexRow++][ncols] = "+1"; - - row2.append("-1"); - for(int c = 0; c < ncols ; c++) { - if(mid <= c) { - dataLibSVM[indexRow][c] = data[r][c]; - row2.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else { - if(schema[c].isNumeric() || schema[c] == Types.ValueType.BOOLEAN){ - dataLibSVM[indexRow][c] = "0"; - } - else if(schema[c] == Types.ValueType.STRING) - dataLibSVM[indexRow][c] = ""; - } - } - dataLibSVM[indexRow++][ncols] = "-1"; - writer.write(row1.toString()); - writer.write("\n"); - writer.write(row2.toString()); - if(r != nrows - 1) - writer.append("\n"); - - sb.append(row1).append("\n"); - sb.append(row2); - if(r != nrows - 1) - sb.append("\n"); - } - writer.close(); - return dataLibSVM; - } - - // Write in Matrix Market Format - private static void writeInMatrixMarketFormat(int firstIndex, String[][] data, int nrows, int ncols, String fileName, - String separator) 
throws IOException { - - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".raw")); - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(data[r][c] != null && !data[r][c].equals("0")) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + data[r][c]; - writer.write(rs); - if(r != nrows - 1 || c != ncols - 1) - writer.write("\n"); - } - } - } - writer.close(); - } - - - - // Write 2D Data in CSV format - private static void writeSampleFrame(Types.ValueType[] schema, String[][] sample, String fileName, int nrows, int ncols) - throws Exception { - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".frame")); - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - row.append(sample[r][c]); - if(c != ncols - 1) - row.append(","); - } - writer.write(row.toString()); - if(r != nrows - 1) - writer.write("\n"); - } - writer.close(); - - writer = new BufferedWriter(new FileWriter(fileName + ".schema")); - StringBuilder sb = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - sb.append(schema[c]); - if(c != ncols - 1) - sb.append(","); - } - - writer.write(sb.toString()); - writer.close(); - } - - @Test - @Ignore - public void generateDataset() throws Exception { - int nrows = 5000; - int ncols = 5000; - double sparsity = 1; - String HOME = SCRIPT_DIR + TEST_DIR; - String[] naStrings = {"Nan", "NAN", "", "inf", "null", "NULL"}; - String[] names = new String[ncols]; - Types.ValueType[] schema = new Types.ValueType[ncols]; - - for(int i = 0; i < nrows; i++) { - names[i] = "C_" + i; - Random rn = new Random(); - int rnt = rn.nextInt(types.length); - schema[i] = types[rnt]; - } - String[][] data = generateRandomData(schema, nrows, ncols, -100, 100, sparsity, naStrings); - saveData(schema, data, nrows, ncols, " ", ":", naStrings, HOME + "/data/", sparsity, false); - - for(int r = 10; r <= 100; r += 10) { - saveData(schema, data, r, r, 
" ", ":", naStrings, HOME + "/samples/", sparsity, true); - } - - BufferedWriter writer = new BufferedWriter(new FileWriter(HOME+"/data/data"+"_nrows_" + nrows + "_ncols_" + ncols + "_sparsity_" + sparsity + ".schema")); - StringBuilder sb = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - sb.append(schema[c]); - if(c != ncols - 1) - sb.append(","); - } - writer.write(sb.toString()); - writer.close(); - } - - private static void saveData(Types.ValueType[] schema, String[][] data, int nrows, int ncols, String separator, - String indexSeparator, String[] naStrings, String HOME, double sparsity, boolean saveSampleFrame) - throws Exception { - - String baseFileName = "_nrows_" + nrows + "_ncols_" + ncols + "_sparsity_" + sparsity; - - String csv = HOME + "CSV" + baseFileName; - - String libsvmFirstZero = HOME + "LIBSVM-FZ" + baseFileName; - String libsvmFirstOne = HOME + "LIBSVM-FO" + baseFileName; - - String mmFirstZero = HOME + "MM-FZ" + baseFileName; - String mmFirstOne = HOME + "MM-FO" + baseFileName; - - // Write all data as a source dataset - writeInCSVFormat(data, nrows, ncols, csv, separator, naStrings); - String[][] libsvm = writeInLIBSVMFormat(0,schema, data, nrows, ncols, libsvmFirstZero, separator, indexSeparator); - writeInLIBSVMFormat(1,schema, data, nrows, ncols, libsvmFirstOne, separator, indexSeparator); - writeInMatrixMarketFormat(0, data, nrows, ncols, mmFirstZero, separator); - writeInMatrixMarketFormat(1, data, nrows, ncols, mmFirstOne, separator); - - if(saveSampleFrame) { - writeSampleFrame(schema,data, csv, nrows, ncols); - Types.ValueType[] libsvmSchema = new Types.ValueType[ncols+1]; - for(int i=0;i 0) { - sampleMatrix[indexRow][c] = data[r][c]; - row1.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else { - sampleMatrix[indexRow][c] = 0; - } - } - sampleMatrix[indexRow++][ncols] = 1; - - row2.append("-1"); - for(int c = 0; c < ncols; c++) { - if(data[r][c] < 0) { - sampleMatrix[indexRow][c] = 
data[r][c]; - row2.append(separator).append(c + firstIndex).append(indexSeparator).append(data[r][c]); - } - else { - sampleMatrix[indexRow][c] = 0; - } - } - - sampleMatrix[indexRow++][ncols] = -1; - writer.write(row1.toString()); - writer.write("\n"); - writer.write(row2.toString()); - if(r != nrows - 1) - writer.append("\n"); - } - writer.close(); - return sampleMatrix; - } - - // Write in Matrix Market Format - private static void writeInMatrixMarketFormat(int firstIndex, double[][] data, int nrows, int ncols, String fileName, - String separator) throws IOException { - - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".raw")); - - for(int r = 0; r < nrows; r++) { - for(int c = 0; c < ncols; c++) { - if(data[r][c] != 0) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + data[r][c]; - writer.write(rs); - if(r != nrows - 1 || c != ncols - 1) - writer.write("\n"); - } - } - } - writer.close(); - } - - private static void writeInSymmetricMatrixMarketFormat(int firstIndex, double[][] data, String fileName, int size, - String separator, boolean isUpperTriangular) throws IOException { - - int start, end; - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".raw")); - for(int r = 0; r < size; r++) { - if(isUpperTriangular) { - start = r; - end = size; - } - else { - start = 0; - end = r + 1; - } - for(int c = start; c < end; c++) { - if(data[r][c] != 0) { - String rs = (r + firstIndex) + separator + (c + firstIndex) + separator + data[r][c]; - writer.write(rs); - if(r != size - 1 || c != size - 1) - writer.write("\n"); - } - } - } - writer.close(); - } - - // Write 2D Data in CSV format - private static void writeSampleMatrix(double[][] sample, String fileName, int nrows, int ncols) throws Exception { - - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName + ".matrix")); - for(int r = 0; r < nrows; r++) { - StringBuilder row = new StringBuilder(); - for(int c = 0; c < ncols; c++) { - 
row.append(sample[r][c]); - if(c != ncols - 1) - row.append(","); - } - writer.write(row.toString()); - if(r != nrows - 1) - writer.write("\n"); - } - writer.close(); - } - - @Test - @Ignore - public void generateDataset() throws Exception { - int nrows = 5000; - int ncols = 5000; - double sparsity = 1; - String HOME = SCRIPT_DIR + TEST_DIR; - String[] naString = {"Nan", "NAN", "", "inf", "null", "NULL"}; - double[][] data = generateRandom2DData(nrows, ncols, -100, 100, sparsity); - saveData(data, nrows, ncols, " ", ":", naString, HOME + "/data/", sparsity, false); - - for(int r = 10; r <= 100; r += 10) { - saveData(data, r, r, " ", ":", naString, HOME + "/samples/", sparsity, true); - } - } - - private void saveData(double[][] data, int nrows, int ncols, String separator, String indexSeparator, - String[] naStrings, String HOME, double sparsity, boolean saveSampleMatrix) throws Exception { - - String baseFileName = "_nrows_" + nrows + "_ncols_" + ncols + "_sparsity_" + sparsity; - - String csv = HOME + "CSV" + baseFileName; - - String libsvmFirstZero = HOME + "LIBSVM-FZ" + baseFileName; - String libsvmFirstOne = HOME + "LIBSVM-FO" + baseFileName; - - String mmFirstZero = HOME + "MM-FZ" + baseFileName; - String mmFirstOne = HOME + "MM-FO" + baseFileName; - - String mmFirstZeroSymUT = HOME + "MM-FZ-SYM-UT" + baseFileName; - String mmFirstZeroSymLT = HOME + "MM-FZ-SYM-LT" + baseFileName; - String mmFirstOneSymUT = HOME + "MM-FO-SYM-UT" + baseFileName; - String mmFirstOneSymLT = HOME + "MM-FO-SYM-LT" + baseFileName; - - String mmFirstZeroSkewUT = HOME + "MM-FZ-SKEW-UT" + baseFileName; - String mmFirstZeroSkewLT = HOME + "MM-FZ-SKEW-LT" + baseFileName; - String mmFirstOneSkewUT = HOME + "MM-FO-SKEW-UT" + baseFileName; - String mmFirstOneSkewLT = HOME + "MM-FO-SKEW-LT" + baseFileName; - - // Write all data as a source dataset - writeInCSVFormat(data, nrows, ncols, csv, separator, naStrings); - double[][] libsvm = writeInLIBSVMFormat(0, data, nrows, ncols, 
libsvmFirstZero, separator, indexSeparator); - writeInLIBSVMFormat(1, data, nrows, ncols, libsvmFirstOne, separator, indexSeparator); - writeInMatrixMarketFormat(0, data, nrows, ncols, mmFirstZero, separator); - writeInMatrixMarketFormat(1, data, nrows, ncols, mmFirstOne, separator); - - if(saveSampleMatrix) { - writeSampleMatrix(data, csv, nrows, ncols); - writeSampleMatrix(libsvm, HOME + "LIBSVM" + baseFileName, 2 * nrows, ncols + 1); - writeSampleMatrix(data, HOME + "MM" + baseFileName, nrows, ncols); - } - - // Write MM Symmetric and Skew - if(nrows == ncols) { - double[][] mm = getSymmetric2DData(data, nrows, false); - writeInSymmetricMatrixMarketFormat(0, mm, mmFirstZeroSymUT, ncols, separator, true); - writeInSymmetricMatrixMarketFormat(1, mm, mmFirstOneSymUT, ncols, separator, true); - writeInSymmetricMatrixMarketFormat(0, mm, mmFirstZeroSymLT, ncols, separator, false); - writeInSymmetricMatrixMarketFormat(1, mm, mmFirstOneSymLT, ncols, separator, false); - if(saveSampleMatrix) - writeSampleMatrix(mm, HOME + "MM-SYM" + baseFileName, nrows, nrows); - - mm = getSymmetric2DData(data, nrows, true); - writeInSymmetricMatrixMarketFormat(0, mm, mmFirstZeroSkewUT, ncols, separator, true); - writeInSymmetricMatrixMarketFormat(1, mm, mmFirstOneSkewUT, ncols, separator, true); - writeInSymmetricMatrixMarketFormat(0, mm, mmFirstZeroSkewLT, ncols, separator, false); - writeInSymmetricMatrixMarketFormat(1, mm, mmFirstOneSkewLT, ncols, separator, false); - if(saveSampleMatrix) - writeSampleMatrix(mm, HOME + "MM-SKEW" + baseFileName, nrows, nrows); - } - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java deleted file mode 100644 index e785eb9bb90..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GenerateReaderFrameTest.java +++ /dev/null @@ -1,204 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - 
* or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.apache.sysds.api.DMLScript; -import org.apache.sysds.common.Types; -import org.apache.sysds.conf.CompilerConfig; -import org.apache.sysds.runtime.io.FrameReader; -import org.apache.sysds.runtime.iogen.GenerateReader; -import org.apache.sysds.runtime.matrix.data.FrameBlock; -import org.apache.sysds.runtime.util.DataConverter; -import org.apache.sysds.runtime.util.UtilFunctions; -import org.apache.sysds.test.AutomatedTestBase; -import org.apache.sysds.test.TestConfiguration; -import org.apache.sysds.test.TestUtils; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Random; - -public abstract class GenerateReaderFrameTest extends AutomatedTestBase { - - protected final static String TEST_DIR = "functions/iogen/"; - protected final static String TEST_CLASS_DIR = TEST_DIR + GenerateReaderFrameTest.class.getSimpleName() + "/"; - protected String sampleRaw; - protected String[][] data; - protected String[] names; - protected Types.ValueType[] schema; - protected Types.ValueType[] types= { - Types.ValueType.STRING, - Types.ValueType.INT32, - Types.ValueType.INT64, - Types.ValueType.FP32, - 
Types.ValueType.FP64 - }; - - protected abstract String getTestName(); - - @Override public void setUp() { - TestUtils.clearAssertionInformation(); - addTestConfiguration(getTestName(), new TestConfiguration(TEST_DIR, getTestName(), new String[] {"Y"})); - } - - protected String getRandomString(int length) { - String alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - StringBuilder salt = new StringBuilder(); - Random rnd = new Random(); - while (salt.length() < length) { // length of the random string. - int index = (int) (rnd.nextFloat() * alphabet.length()); - salt.append(alphabet.charAt(index)); - } - String saltStr = salt.toString(); - return saltStr; - } - - @SuppressWarnings("incomplete-switch") - protected String defaultValue(Types.ValueType vt){ - switch(vt){ - case STRING: return ""; - case BOOLEAN: return null; - case FP32: - case FP64: - case INT32: - case INT64: - return "0"; - } - return null; - } - - protected void generateRandomString(int size, int maxStringLength, String[] naStrings, double sparsity, String[][] data, int colIndex) { - - double[][] lengths = getRandomMatrix(size, 1, 10, maxStringLength, sparsity, 714); - - for(int i = 0; i < size; i++) { - int length = (int) lengths[i][0]; - if(length > 0) { - String generatedString = getRandomString(length); - data[i][colIndex] = generatedString; - } - else { - data[i][colIndex] = null; - } - } - } - - @SuppressWarnings("incomplete-switch") - protected void generateRandomNumeric(int size, Types.ValueType type, double min, double max, String[] naStrings, - double sparsity, String[][] data, int colIndex) { - - double[][] randomData = getRandomMatrix(size, 1, min, max, sparsity, -1); - for(int i = 0; i < size; i++) { - if(randomData[i][0] != 0) { - Object o = null; - switch(type){ - case INT32: o = UtilFunctions.objectToObject(type,(int)randomData[i][0]); break; - case INT64: o = UtilFunctions.objectToObject(type,(long)randomData[i][0]); break; - case FP32: o = 
UtilFunctions.objectToObject(type,(float)randomData[i][0]); break; - case FP64: o = UtilFunctions.objectToObject(type,randomData[i][0]); break; - case BOOLEAN: Boolean b= randomData[i][0] >0 ? true: null; o = UtilFunctions.objectToObject(type, b); break; - } - String s = UtilFunctions.objectToString(o); - data[i][colIndex] = s; - } - else { - if(type.isNumeric()) - data[i][colIndex] ="0"; - else - data[i][colIndex] =null; - } - } - } - - protected void generateRandomData(int nrows, int ncols, double min, double max, double sparsity, String[] naStrings) { - - names = new String[ncols]; - schema = new Types.ValueType[ncols]; - data = new String[nrows][ncols]; - - for(int i = 0; i < ncols; i++) { - names[i] = "C_" + i; - - Random rn = new Random(); - int rnt = rn.nextInt(types.length); - if(i == 0|| i==ncols-1) - rnt = 3; - schema[i] = types[rnt]; - - if(types[rnt] == Types.ValueType.STRING) - generateRandomString(nrows,100,naStrings,sparsity,data,i); - else if(types[rnt].isNumeric() || types[rnt] == Types.ValueType.BOOLEAN) - generateRandomNumeric(nrows, types[rnt],min,max,naStrings, sparsity,data,i); - } - } - @SuppressWarnings("unused") - protected void runGenerateReaderTest(boolean parallel) { - - Types.ExecMode oldPlatform = rtplatform; - rtplatform = Types.ExecMode.SINGLE_NODE; - - boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG; - boolean oldpar = CompilerConfig.FLAG_PARREADWRITE_TEXT; - - try { - CompilerConfig.FLAG_PARREADWRITE_TEXT = false; - - TestConfiguration config = getTestConfiguration(getTestName()); - loadTestConfiguration(config); - - FrameBlock sampleFrame = new FrameBlock(schema, data); - - String HOME = SCRIPT_DIR + TEST_DIR; - File directory = new File(HOME); - if(!directory.exists()) { - directory.mkdir(); - } - String dataPath = HOME + "frame_data.raw"; - int clen = data[0].length; - writeRawString(sampleRaw, dataPath); - GenerateReader.GenerateReaderFrame gr = new GenerateReader.GenerateReaderFrame(sampleRaw, sampleFrame, parallel); 
- - FrameReader fr = gr.getReader(); - FrameBlock frameBlock = fr.readFrameFromHDFS(dataPath, schema, data.length, clen); - - String[][] expected = DataConverter.convertToStringFrame(sampleFrame); - String[][] actual = DataConverter.convertToStringFrame(frameBlock); - - TestUtils.compareFrames(expected, actual, sampleFrame.getNumRows(), sampleFrame.getNumColumns()); - - } - catch(Exception exception) { - exception.printStackTrace(); - } - finally { - rtplatform = oldPlatform; - CompilerConfig.FLAG_PARREADWRITE_TEXT = oldpar; - DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld; - } - } - - private static void writeRawString(String raw, String fileName) throws IOException { - BufferedWriter writer = new BufferedWriter(new FileWriter(fileName)); - writer.write(raw); - writer.close(); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java deleted file mode 100644 index b56a803cf04..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixMultiRowNestedTest.java +++ /dev/null @@ -1,171 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class MatrixMultiRowNestedTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixSingleRowFlatTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - // XML Dataset - //1. flat object, in-order values - @Test - public void test1() { - sampleRaw = "\n" + - "1\n" + - "2\n" + - "3\n" + - "\n" + - "\n" + - "4\n" + - "5\n" + - "6\n" + - "" + - "\n" + - "7\n" + - "8\n" + - "9\n" + - ""; - sampleMatrix = new double[][] {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}; - runGenerateReaderTest(false); - } - - //2. flat object, out-of-order values - @Test - public void test2() { - sampleRaw = "{\"b\":2,\"a\":1,\"e\":5,\"c\":3,\"d\":4}\n" + - "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + - "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - //3. nested object with unique attribute names - @Test - public void test3() { - sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; - sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(false); - } - - //4. nested object with unique attribute names, out-of-order - @Test - public void test4() { - sampleRaw = "{\"a\":1,\"f\":5,\"b\":{\"c\":2,\"d\":3,\"e\":4}}\n" + - "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + - "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; - sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(false); - } - - //5. 
nested object with repeated attribute names, out-of-order - @Test - public void test5() { - sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - // XML - //6. nested object with unique attribute names, in-order - // single type of object, "article" is an object - @Test - public void test6() { - sampleRaw = "

12345
\n" + - "
678910
\n" + - "
1112131415
"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - //6. nested object with unique attribute names, in-order - // multi types of object, "article", "book", and "homepage" are the object types - @Test - public void test7() { - sampleRaw = "
12345
\n" + - "678910\n" + - "1112131415"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - //7. nested object with unique attribute names, in-order - // multi types of object, "article", "book", and "homepage" are the object types - @Test - public void test8() { - sampleRaw = "
122022GIO45
\n" + - "671980DB910\n" + - "11122012CEP1415\n"; - sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; - runGenerateReaderTest(false); - } - - @Test - public void test9() { - sampleRaw = "#index 1\n" + - "#* 12\n" + - "#@ 13;14;15\n" + - "#o 16;17;18\n" + - "#t 19\n" + - "#c 110\n" + - "#% 111\n" + - "#% 112\n" + - "\n" + - "#index 2\n" + - "#* 22\n" + - "#@ 23;24;25\n" + - "#o 26;27;28\n" + - "#t 29\n" + - "#c 210\n" + - "#% 211\n" + - "#% 212\n" + - "\n" + - "\n" + - "#index 3\n" + - "#* 32\n" + - "#@ 33;34;35\n" + - "#o 36;37;38\n" + - "#t 39\n" + - "#c 310\n" + - "#% 311\n" + - "#% 500\n"+ - "\n" + - "#index 4\n" + - "#* 42\n" + - "#@ 43;44;45\n" + - "#o 46;47;48\n" + - "#t 49\n" + - "#c 410\n" + - "#% 411\n" + - "#% 600"; - - sampleMatrix = new double[][] {{1,12,13,14,15},{2,22,23,24,25},{3,32,33,34,35}, {4,42,43,44,45}}; - runGenerateReaderTest(false); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java deleted file mode 100644 index 6514d74a9dc..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowFlatTest.java +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class MatrixSingleRowFlatTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixSingleRowFlatTest"; - - @Override protected String getTestName() { - return TEST_NAME; - } - - // CSV Dataset - // 1. matrix and dataset are dense and "," is delim - @Test public void test1() { - sampleRaw = "1,2,3,4,5\n" + "6,7,8,9,10\n" + "11,12,13,14,15"; - sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(false); - } - - // 2. matrix and dataset are dense and ",a" is delim - @Test public void test2() { - sampleRaw = "1,a2,a3,a4,a5\n" + "6,a7,a8,a9,a10\n" + "11,a12,a13,a14,a15"; - sampleMatrix = new double[][] {{1, 5}, {6, 10}, {11, 15}}; - runGenerateReaderTest(false); - } - - //3. matrix and dataset are dense and ",," is delim - @Test public void test3() { - sampleRaw = "1,,2,,3,,4,,5\n" + "6,,7,,8,,9,,10\n" + "11,,12,,13,,14,,15"; - sampleMatrix = new double[][] {{1, 3, 5}, {6, 8, 10}, {11, 13, 15}}; - runGenerateReaderTest(false); - } - - //4. matrix and dataset contain empty/0 values and "," is delim - @Test public void test4() { - sampleRaw = "1,2,,4,5\n" + ",7,8,9,10\n" + "11,12,,,\n" + "13,14,,,16"; - sampleMatrix = new double[][] {{1, 2, 5}, {0, 7, 10}, {11, 12, 0}, {13, 14, 16}}; - runGenerateReaderTest(false); - } - - // LibSVM - //5. LibSVM with in-order col indexes and numeric col indexes - @Test public void test5() { - sampleRaw = "+1 1:10 2:20 3:30\n" + "-1 4:40 5:50 6:60\n" + "+1 1:101 2:201 \n" + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, - {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(false); - } - - //6. 
LibSVM with out-of-order col indexes and numeric col indexes - @Test public void test6() { - sampleRaw = "+1 3:30 1:10 2:20\n" + "-1 5:50 6:60 4:40\n" + "+1 1:101 2:201 \n" + "-1 6:601 \n" + "-1 5:501\n" + "+1 3:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, - {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(false); - } - - //7. Special LibSVM with in-order col indexes and none-numeric col indexes - // a -> 1, b->2, c->3, d->4, e->5, f->6 - @Test public void test7() { - sampleRaw = "+1 a:10 b:20 c:30\n" + "-1 d:40 e:50 f:60\n" + "+1 a:101 b:201 \n" + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, - {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(false); - } - - //8. Special LibSVM with out-of-order col indexes and none-numeric col indexes - // a -> 1, b->2, c->3, d->4, e->5, f->6 - @Test public void test8() { - sampleRaw = "+1 c:30 a:10 b:20\n" + "-1 e:50 f:60 d:40\n" + "+1 a:101 b:201 \n" + "-1 f:601 \n" + "-1 e:501\n" + "+1 c:301"; - sampleMatrix = new double[][] {{1, 10, 20, 30, 0, 0, 0}, {-1, 0, 0, 0, 40, 50, 60}, {1, 101, 201, 0, 0, 0, 0}, {-1, 0, 0, 0, 0, 0, 601}, - {-1, 0, 0, 0, 0, 501, 0}, {1, 0, 0, 301, 0, 0, 0}}; - runGenerateReaderTest(false); - } - - // MatrixMarket(MM) - //9. MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) - @Test public void test9() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,5,50\n" + "2,1,101\n" + "2,2,201\n" + "4,1,104\n" + "4,5,504\n" + "5,3,305"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - //10. MM with inorder dataset, (RowIndex,Col Index,Value). 
Row begin index: Row & Col begin index: (0,1) - @Test public void test10() { - sampleRaw = "0,1,10\n" + "0,2,20\n" + "0,3,30\n" + "0,5,50\n" + "1,1,101\n" + "1,2,201\n" + "3,1,104\n" + "3,5,504\n" + "4,3,305"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - //11. MM with inorder dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,0) - @Test public void test11() { - sampleRaw = "1,0,10\n" + "1,1,20\n" + "1,2,30\n" + "1,4,50\n" + "2,0,101\n" + "2,1,201\n" + "4,0,104\n" + "4,4,504\n" + "5,2,305"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - //12. MM with inorder dataset, (RowIndex,Col Index,Value). Row begin index: Row & Col begin index: (0,0) - @Test public void test12() { - sampleRaw = "0,0,10\n" + "0,1,20\n" + "0,2,30\n" + "0,4,50\n" + "1,0,101\n" + "1,1,201\n" + "2,0,104\n" + "2,4,504\n" + "3,2,305"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - //13. MM with out-of-order dataset, (RowIndex,Col Index,Value). Row & Col begin index: (1,1) - @Test public void test13() { - sampleRaw = "4,5,504\n" + "1,2,20\n" + "1,1,10\n" + "2,1,101\n" + "1,3,30\n" + "1,5,50\n" + "2,2,201\n" + "4,1,104\n" + "5,3,305"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 0, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - //14. MM with out-of-order dataset, (ColIndex,Row Index, Value). 
Row & Col begin index: (1,1) - @Test public void test14() { - sampleRaw = "5,4,504\n" + "2,1,20\n" + "1,1,10\n" + "1,2,101\n" + "3,1,30\n" + "5,1,50\n" + "2,2,201\n" + "1,4,104\n" + "3,5,305\n" + "2,4,204"; - sampleMatrix = new double[][] {{10, 20, 30}, {101, 201, 0}, {0, 0, 0}, {104, 204, 0}, {0, 0, 305}}; - runGenerateReaderTest(false); - } - - @Test public void test15() { - sampleRaw = "1,2,3,4\n" + "5,6,7,8\n" + "9,10,11,12\n" + "13,14,15,16"; - sampleMatrix = new double[][] {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}, {13, 14, 15, 16}}; - runGenerateReaderTest(false); - } - - @Test public void test16() { - sampleRaw = "1,2,3,0\n" + "5,0,7,8\n" + "9,0,0,12\n" + "13,14,0,0"; - sampleMatrix = new double[][] {{1, 2, 3, 0}, {5, 0, 7, 8}, {9, 0, 0, 12}, {13, 14, 0, 0}}; - runGenerateReaderTest(false); - } - - @Test public void test17() { - sampleRaw = "0:10 1:20 2:30\n" + "3:40 4:50\n" + "0:60 1:70 2:80\n" + "3:90 4:100"; - sampleMatrix = new double[][] {{10, 20, 30, 0, 0}, {0, 0, 0, 40, 50}, {60, 70, 80, 0, 0}, {0, 0, 0, 90, 100}}; - runGenerateReaderTest(false); - } - - @Test public void test18() { - sampleRaw = "1,1,10\n" + "1,2,20\n" + "1,3,30\n" + "1,4,40\n" + "2,2,20\n" + "2,3,30\n" + "2,4,40\n" + "3,3,30\n" + "3,4,40\n" + "4,4,40\n"; - sampleMatrix = new double[][] {{10, 20, 30, 40}, {0, 20, 30, 40}, {0, 0, 30, 40}, {0, 0, 0, 40}}; - runGenerateReaderTest(false); - } -} diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java b/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java deleted file mode 100644 index 42b25fe1b7a..00000000000 --- a/src/test/java/org/apache/sysds/test/functions/iogen/MatrixSingleRowNestedTest.java +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.sysds.test.functions.iogen; - -import org.junit.Test; - -public class MatrixSingleRowNestedTest extends GenerateReaderMatrixTest { - - private final static String TEST_NAME = "MatrixSingleRowFlatTest"; - - @Override - protected String getTestName() { - return TEST_NAME; - } - - // JSON Dataset - //1. flat object, in-order values - @Test - public void test1() { - sampleRaw = "{\"a\":1,\"b\":2,\"c\":3,\"d\":4,\"e\":5}\n" + - "{\"a\":6,\"b\":7,\"c\":8,\"d\":9,\"e\":10}\n" + - "{\"a\":11,\"b\":12,\"c\":13,\"d\":14,\"e\":15}"; - sampleMatrix = new double[][] {{1, 2}, {6, 7}, {11, 12}}; - runGenerateReaderTest(false); - } - - //2. flat object, out-of-order values - @Test - public void test2() { - sampleRaw = "{\"b\":2,\"a\":1,\"e\":5,\"c\":3,\"d\":4}\n" + - "{\"d\":9,\"b\":7,\"c\":8,\"a\":6,\"e\":10}\n" + - "{\"d\":14,\"a\":11,\"e\":15,\"b\":12,\"c\":13}"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - //3. nested object with unique attribute names - @Test - public void test3() { - sampleRaw = "{\"a\":1,\"b\":{\"c\":2,\"d\":3,\"e\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"c\":7,\"d\":8,\"e\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"c\":12,\"d\":13,\"e\":14},\"f\":15}\n"; - sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(false); - } - - //4. 
nested object with unique attribute names, out-of-order - @Test - public void test4() { - sampleRaw = "{\"a\":1,\"f\":5,\"b\":{\"c\":2,\"d\":3,\"e\":4}}\n" + - "{\"a\":6,\"f\":10,\"b\":{\"e\":9,\"c\":7,\"d\":8}}\n" + - "{\"b\":{\"d\":13,\"c\":12,\"e\":14},\"a\":11,\"f\":15}\n"; - sampleMatrix = new double[][] {{1, 2, 5}, {6, 7, 10}, {11, 12, 15}}; - runGenerateReaderTest(false); - } - - //5. nested object with repeated attribute names, out-of-order - @Test - public void test5() { - sampleRaw = "{\"a\":1,\"b\":{\"a\":2,\"b\":3,\"f\":4},\"f\":5}\n" + - "{\"a\":6,\"b\":{\"a\":7,\"b\":8,\"f\":9},\"f\":10}\n" + - "{\"a\":11,\"b\":{\"a\":12,\"b\":13,\"f\":14},\"f\":15}"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - // XML - //6. nested object with unique attribute names, in-order - // single type of object, "article" is an object - @Test - public void test6() { - sampleRaw = "
12345
\n" + - "
678910
\n" + - "
1112131415
"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - //6. nested object with unique attribute names, in-order - // multi types of object, "article", "book", and "homepage" are the object types - @Test - public void test7() { - sampleRaw = "
12345
\n" + - "678910\n" + - "1112131415"; - sampleMatrix = new double[][] {{1, 2, 3, 4, 5}, {6, 7, 8, 9, 10}, {11, 12, 13, 14, 15}}; - runGenerateReaderTest(false); - } - - //7. nested object with unique attribute names, in-order - // multi types of object, "article", "book", and "homepage" are the object types - @Test - public void test8() { - sampleRaw = "
122022GIO45
\n" + - "671980DB910\n" + - "11122012CEP1415\n"; - sampleMatrix = new double[][] {{1, 2022}, {6, 1980}, {11, 2012}}; - runGenerateReaderTest(false); - } -} diff --git a/src/test/scripts/functions/iogen/ReaderCSV_1.dml b/src/test/scripts/functions/iogen/ReaderCSV_1.dml new file mode 100644 index 00000000000..de12dbcd720 --- /dev/null +++ b/src/test/scripts/functions/iogen/ReaderCSV_1.dml @@ -0,0 +1,25 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +x = read($2) +read(sample=x, sample_raw=$3, format=$4, data_type="matrix") +y = read($1, format=$4) +write(y, $5, format="csv"); diff --git a/src/test/scripts/functions/iogen/in/dataset_1.dat b/src/test/scripts/functions/iogen/in/dataset_1.dat new file mode 100644 index 00000000000..1a24f59a4e6 --- /dev/null +++ b/src/test/scripts/functions/iogen/in/dataset_1.dat @@ -0,0 +1,3 @@ +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 diff --git a/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx b/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx new file mode 100644 index 00000000000..1a24f59a4e6 --- /dev/null +++ b/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx @@ -0,0 +1,3 @@ +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 diff --git a/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx.mtd b/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx.mtd new file mode 100644 index 00000000000..aae90f3ea75 --- /dev/null +++ b/src/test/scripts/functions/iogen/in/sampleMatrix_1.mtx.mtd @@ -0,0 +1,6 @@ +{ + "data_type": "matrix" + ,"format": "csv" + ,"header": false + ,"description": { "author": "Saeed Fathollahzadeh" } +} diff --git a/src/test/scripts/functions/iogen/in/sampleMatrix_1.raw b/src/test/scripts/functions/iogen/in/sampleMatrix_1.raw new file mode 100644 index 00000000000..1a24f59a4e6 --- /dev/null +++ b/src/test/scripts/functions/iogen/in/sampleMatrix_1.raw @@ -0,0 +1,3 @@ +1,2,3,4,5 +6,7,8,9,10 +11,12,13,14,15 From d612708dbea406f7f03976f26130b43a48f02596 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 5 Aug 2022 02:02:36 +0200 Subject: [PATCH 81/84] Fix Code Style --- .../apache/sysds/hops/GenerateReaderOp.java | 38 ++++++++++++------- .../java/org/apache/sysds/lops/DataIOGen.java | 12 ++++-- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java b/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java index 
e410715c458..da6649f7f38 100644 --- a/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java +++ b/src/main/java/org/apache/sysds/hops/GenerateReaderOp.java @@ -49,19 +49,23 @@ private GenerateReaderOp() { //default constructor for clone } - @Override public void checkArity() { + @Override + public void checkArity() { } - @Override public boolean allowsAllExecTypes() { + @Override + public boolean allowsAllExecTypes() { return false; } - @Override protected DataCharacteristics inferOutputCharacteristics(MemoTable memo) { + @Override + protected DataCharacteristics inferOutputCharacteristics(MemoTable memo) { return null; } - @Override public Lop constructLops() { + @Override + public Lop constructLops() { //return already created lops if( getLops() != null ) return getLops(); @@ -87,45 +91,53 @@ private GenerateReaderOp() { return getLops(); } - @Override protected Types.ExecType optFindExecType(boolean transitive) { + @Override + protected Types.ExecType optFindExecType(boolean transitive) { return null; } - @Override public String getOpString() { + @Override + public String getOpString() { String s = new String(""); s += _op.toString(); s += " "+getName(); return s; } - @Override public boolean isGPUEnabled() { + @Override + public boolean isGPUEnabled() { return false; } - @Override protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) { + @Override + protected double computeOutputMemEstimate(long dim1, long dim2, long nnz) { return 0; } - @Override protected double computeIntermediateMemEstimate(long dim1, long dim2, long nnz) { + @Override + protected double computeIntermediateMemEstimate(long dim1, long dim2, long nnz) { return 0; } - @Override public void refreshSizeInformation() { + @Override + public void refreshSizeInformation() { } - @Override public Object clone() throws CloneNotSupportedException { + @Override + public Object clone() throws CloneNotSupportedException { return null; } - @Override public boolean compare(Hop that) { 
+ @Override + public boolean compare(Hop that) { return false; } /** * Generate Reader operation for Matrix * This constructor supports expression in parameters - * + * @param l ? * @param dt data type * @param dop data operator type * @param in high-level operator diff --git a/src/main/java/org/apache/sysds/lops/DataIOGen.java b/src/main/java/org/apache/sysds/lops/DataIOGen.java index 803bef1bb41..44111148a9d 100644 --- a/src/main/java/org/apache/sysds/lops/DataIOGen.java +++ b/src/main/java/org/apache/sysds/lops/DataIOGen.java @@ -140,7 +140,8 @@ public String getFileFormatType() { return formatType; } - @Override public String toString() { + @Override + public String toString() { return getID() + ":" + "File_Name: " + getOutputParameters().getFile_name() + " " + "Label: " + getOutputParameters().getLabel() + " " + "Operation: = " + _op + " " + "Format: " + outParams.getFormat() + " Datatype: " + getDataType() + " Valuetype: " + getValueType() + " num_rows = " + getOutputParameters().getNumRows() + " num_cols = " + getOutputParameters().getNumCols() + " UpdateInPlace: " + getOutputParameters().getUpdateType(); } @@ -236,7 +237,8 @@ public boolean isPersistentRead() { * Method to get CP instructions for reading/writing scalars and matrices from/to HDFS. * This method generates CP read/write instructions. */ - @Override public String getInstructions(String input1, String input2) { + @Override + public String getInstructions(String input1, String input2) { if(getOutputParameters().getFile_name() == null && _op.isRead()) throw new LopsException( this.printErrorLocation() + "Data.getInstructions(): Exepecting a SCALAR data type, encountered " + getDataType()); @@ -293,11 +295,13 @@ else if(_op.isWrite()) { /** * Method to generate createvar instruction that updates symbol table with metadata, hdfsfile name, etc. 
*/ - @Override public String getInstructions() { + @Override + public String getInstructions() { return getCreateVarInstructions(getOutputParameters().getFile_name(), getOutputParameters().getLabel()); } - @Override public String getInstructions(String outputFileName) { + @Override + public String getInstructions(String outputFileName) { return getCreateVarInstructions(outputFileName, getOutputParameters().getLabel()); } From 21bddc075c34ab090e3098e5fdefaff577e09c36 Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Fri, 5 Aug 2022 02:16:58 +0200 Subject: [PATCH 82/84] Minor Rename Input File Name --- .../org/apache/sysds/test/functions/iogen/GIOMatrixReader.java | 2 +- .../iogen/in/{sampleMatrix_1.raw => sampleMatrix_1.dat} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename src/test/scripts/functions/iogen/in/{sampleMatrix_1.raw => sampleMatrix_1.dat} (100%) diff --git a/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java b/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java index 04dd854c8fd..bc078541f41 100644 --- a/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java +++ b/src/test/java/org/apache/sysds/test/functions/iogen/GIOMatrixReader.java @@ -45,7 +45,7 @@ protected String getInputSampleMatrixFileName() { } protected String getInputSampleRawFileName() { - return "sampleMatrix_" + getId() + ".raw"; + return "sampleMatrix_" + getId() + ".dat"; } protected String getOutputGIO() { diff --git a/src/test/scripts/functions/iogen/in/sampleMatrix_1.raw b/src/test/scripts/functions/iogen/in/sampleMatrix_1.dat similarity index 100% rename from src/test/scripts/functions/iogen/in/sampleMatrix_1.raw rename to src/test/scripts/functions/iogen/in/sampleMatrix_1.dat From f096a225c94bab1880213fc74587011de104cb3d Mon Sep 17 00:00:00 2001 From: Saeed Fathollahzadeh Date: Wed, 24 Aug 2022 19:58:36 +0200 Subject: [PATCH 83/84] Formatting, resolve some comments,and rename class name --- 
.../java/org/apache/sysds/lops/DataIOGen.java | 40 ++++++++------ .../org/apache/sysds/lops/compile/Dag.java | 52 ++++++++++--------- .../apache/sysds/parser/StatementBlock.java | 14 +++-- .../sysds/runtime/io/FrameReaderFactory.java | 2 +- .../sysds/runtime/io/FrameReaderJSONL.java | 3 +- .../sysds/runtime/io/MatrixReaderFactory.java | 2 +- ...Identifying.java => FormatIdentifyer.java} | 6 +-- .../sysds/runtime/iogen/GenerateReader.java | 12 +++-- 8 files changed, 71 insertions(+), 60 deletions(-) rename src/main/java/org/apache/sysds/runtime/iogen/{FormatIdentifying.java => FormatIdentifyer.java} (99%) diff --git a/src/main/java/org/apache/sysds/lops/DataIOGen.java b/src/main/java/org/apache/sysds/lops/DataIOGen.java index 44111148a9d..df6618127f0 100644 --- a/src/main/java/org/apache/sysds/lops/DataIOGen.java +++ b/src/main/java/org/apache/sysds/lops/DataIOGen.java @@ -48,7 +48,8 @@ public class DataIOGen extends Lop { */ public static DataIOGen createLiteralLop(ValueType vt, String literalValue) { // All literals have default format type of TEXT - return new DataIOGen(OpOpData.PERSISTENTREAD, null, null, null, literalValue, DataType.SCALAR, vt, FileFormat.TEXT.toString()); + return new DataIOGen(OpOpData.PERSISTENTREAD, null, null, null, literalValue, DataType.SCALAR, vt, + FileFormat.TEXT.toString()); } /** @@ -65,8 +66,8 @@ public static DataIOGen createLiteralLop(ValueType vt, String literalValue) { * @param vt value type * @param fmt file format */ - public DataIOGen(OpOpData op, Lop input, HashMap inputParametersLops, String name, String literal, DataType dt, ValueType vt, - String fmt) { + public DataIOGen(OpOpData op, Lop input, HashMap inputParametersLops, String name, String literal, + DataType dt, ValueType vt, String fmt) { super(Type.Data, dt, vt); _op = op; literal_var = (literal != null); @@ -74,7 +75,8 @@ public DataIOGen(OpOpData op, Lop input, HashMap inputParametersLop // Either name or literal can be non-null. 
if(literal_var) { if(_op.isTransient()) - throw new LopsException("Invalid parameter values while setting up a Data LOP -- transient flag is invalid for a literal."); + throw new LopsException( + "Invalid parameter values while setting up a Data LOP -- transient flag is invalid for a literal."); getOutputParameters().setLabel(literal); } else if(name != null) { @@ -86,7 +88,8 @@ else if(name != null) { } } else { - throw new LopsException("Invalid parameter values while setting up a Data LOP -- the lop must have either literal value or a name."); + throw new LopsException( + "Invalid parameter values while setting up a Data LOP -- the lop must have either literal value or a name."); } // WRITE operation must have an input Lops, we always put this @@ -105,7 +108,8 @@ else if(name != null) { lop.addOutput(this); } if(inputParametersLops.get(DataExpression.IO_FILENAME) != null) { - OutputParameters outParams = (inputParametersLops.get(DataExpression.IO_FILENAME)).getOutputParameters(); + OutputParameters outParams = (inputParametersLops.get( + DataExpression.IO_FILENAME)).getOutputParameters(); String fName = outParams.getLabel(); this.getOutputParameters().setFile_name(fName); } @@ -123,8 +127,6 @@ private void setLopProperties() { /** * Data-Lop-specific method to set the execution type for persistent write. - * TODO: split lops into MR/CP lop. - * * @param et execution type */ public void setExecType(ExecType et) { @@ -133,7 +135,6 @@ public void setExecType(ExecType et) { /** * method to get format type for input, output files. 
- * * @return file format */ public String getFileFormatType() { @@ -142,12 +143,20 @@ public String getFileFormatType() { @Override public String toString() { - return getID() + ":" + "File_Name: " + getOutputParameters().getFile_name() + " " + "Label: " + getOutputParameters().getLabel() + " " + "Operation: = " + _op + " " + "Format: " + outParams.getFormat() + " Datatype: " + getDataType() + " Valuetype: " + getValueType() + " num_rows = " + getOutputParameters().getNumRows() + " num_cols = " + getOutputParameters().getNumCols() + " UpdateInPlace: " + getOutputParameters().getUpdateType(); + return getID() + ":" + + " File_Name: " + getOutputParameters().getFile_name() + + " Label: " + getOutputParameters().getLabel() + + " Operation: = " + _op + + " Format: " + outParams.getFormat() + + " Datatype: " + getDataType() + + " Valuetype: " + getValueType() + + " num_rows = " + getOutputParameters().getNumRows() + + " num_cols = " + getOutputParameters().getNumCols() + + " UpdateInPlace: " + getOutputParameters().getUpdateType(); } /** * method to get operation type, i.e. read/write. - * * @return operation type */ @@ -157,7 +166,6 @@ public OpOpData getOperationType() { /** * method to get inputParams - * * @return input parameters */ public HashMap getInputParams() { @@ -177,7 +185,6 @@ public Lop getNamedInputLop(String name, String defaultVal) { /** * method to check if this data lop represents a literal. 
- * * @return true if data lop is a literal */ public boolean isLiteral() { @@ -210,7 +217,8 @@ public long getLongValue() { return (long) Double.parseDouble(getOutputParameters().getLabel()); default: - throw new LopsException("Encountered a non-numeric value " + (vt) + ", while a numeric value is expected."); + throw new LopsException( + "Encountered a non-numeric value " + (vt) + ", while a numeric value is expected."); } } else @@ -280,7 +288,9 @@ else if(_op.isWrite()) { Lop descriptionLop = getInputParams().get(DataExpression.DESCRIPTIONPARAM); if(descriptionLop != null) { boolean descLiteral = (descriptionLop instanceof DataIOGen && ((DataIOGen) descriptionLop).isLiteral()); - sb.append(prepOperand(descriptionLop.getOutputParameters().getLabel(), DataType.SCALAR, ValueType.STRING, descLiteral)); + sb.append( + prepOperand(descriptionLop.getOutputParameters().getLabel(), DataType.SCALAR, ValueType.STRING, + descLiteral)); } else { sb.append(prepOperand("", DataType.SCALAR, ValueType.STRING, true)); diff --git a/src/main/java/org/apache/sysds/lops/compile/Dag.java b/src/main/java/org/apache/sysds/lops/compile/Dag.java index 20b9adb2d4a..273c001915f 100644 --- a/src/main/java/org/apache/sysds/lops/compile/Dag.java +++ b/src/main/java/org/apache/sysds/lops/compile/Dag.java @@ -306,10 +306,13 @@ private ArrayList doPlainInstructionGen(StatementBlock sb, List execNodes = nodes.stream() - .filter(l -> (!l.isDataIOGenExecLocation() && !l.isReaderGenExecLocation() && (!l.isDataExecLocation() - || (((Data)l).getOperationType().isWrite() && !isTransientWriteRead((Data)l)) - || (((Data)l).isPersistentRead() && l.getDataType().isScalar())))) - .collect(Collectors.toList()); + .filter(l -> (!l.isDataIOGenExecLocation() && + !l.isReaderGenExecLocation() && + (!l.isDataExecLocation()|| + (((Data)l).getOperationType().isWrite() && !isTransientWriteRead((Data)l)) || + (((Data)l).isPersistentRead() && l.getDataType().isScalar()) + ) + )).collect(Collectors.toList()); // 
generate executable instruction generateControlProgramJobs(execNodes, inst, writeInst, deleteInst); @@ -381,15 +384,16 @@ private static List deleteUpdatedTransientReadVariables(StatementBl HashMap updatedLabelsLineNum = new HashMap<>(); // first capture all transient read variables - for ( Lop node : nodeV ) { - if ((node.isDataIOGenExecLocation() && node.getDataType() == DataType.MATRIX) - || (node.isDataExecLocation() - && ((Data) node).getOperationType().isTransient() - && ((Data) node).getOperationType().isRead() - && ((Data) node).getDataType() == DataType.MATRIX)) { + for(Lop node : nodeV) { + if((node.isDataIOGenExecLocation() && node.getDataType() == DataType.MATRIX) || + (node.isDataExecLocation() && + ((Data) node).getOperationType().isTransient()&& + ((Data) node).getOperationType().isRead() && + ((Data) node).getDataType() == DataType.MATRIX + )){ // "node" is considered as updated ONLY IF the old value is not used any more // So, make sure that this READ node does not feed into any (transient/persistent) WRITE - boolean hasWriteParent=false; + boolean hasWriteParent = false; for(Lop p : node.getOutputs()) { if(p.isDataExecLocation()) { // if the "p" is of type Data, then it has to be a WRITE @@ -397,7 +401,7 @@ private static List deleteUpdatedTransientReadVariables(StatementBl break; } } - if ( !hasWriteParent ) { + if(!hasWriteParent) { // node has no parent of type WRITE, so this is a CANDIDATE variable // add it to labelNodeMapping so that it is considered in further processing labelNodeMapping.put(node.getOutputParameters().getLabel(), node); @@ -469,13 +473,16 @@ private static List generateRemoveInstructions(StatementBlock sb) { private static ArrayList generateInstructionsForInputVariables(List nodes_v) { ArrayList insts = new ArrayList<>(); for(Lop n : nodes_v) { - if (n.isReaderGenExecLocation() || n.isDataIOGenExecLocation() || ( n.isDataExecLocation() - && !((Data) n).getOperationType().isTransient() - && ((Data) 
n).getOperationType().isRead() - && (n.getDataType() == DataType.MATRIX || n.getDataType() == DataType.FRAME - || n.getDataType() == DataType.LIST))) - { - if ( n.isDataIOGenExecLocation() || n.isReaderGenExecLocation() || (n.isDataExecLocation() && !((Data)n).isLiteral()) ) { + if( n.isReaderGenExecLocation() || + n.isDataIOGenExecLocation() || + (n.isDataExecLocation() && + !((Data) n).getOperationType().isTransient() && + ((Data) n).getOperationType().isRead() && + (n.getDataType() == DataType.MATRIX || n.getDataType() == DataType.FRAME || n.getDataType() == DataType.LIST) + )) { + if ( n.isDataIOGenExecLocation() || + n.isReaderGenExecLocation() || + (n.isDataExecLocation() && !((Data)n).isLiteral()) ) { try { String inst_string = n.getInstructions(); CPInstruction currInstr = CPInstructionParser.parseSingleInstruction(inst_string); @@ -649,13 +656,8 @@ else if (node.getType() == Lop.Type.Nary) { inst_string = node.getInstructions(inputs, node.getOutputParameters().getLabel()); } - else if(node instanceof ReaderGen) + else if(node instanceof ReaderGen || node instanceof DataIOGen) inst_string = node.getInstructions(); - - else if(node instanceof DataIOGen){ - inst_string = node.getInstructions(); - } - else { if ( node.getInputs().isEmpty() ) { // currently, such a case exists only for Rand lop diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java b/src/main/java/org/apache/sysds/parser/StatementBlock.java index a4e2b86333a..f1bd83dcbab 100644 --- a/src/main/java/org/apache/sysds/parser/StatementBlock.java +++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java @@ -917,14 +917,12 @@ private void validateAssignmentStatement(Statement current, DMLProgram dmlProg, else { //all builtin functions and expressions if( target == null ) { // check if IOGEN - if (source instanceof DataExpression && ((DataExpression)source).getOpCode() == Expression.DataOp.READ){ - if(!(((DataExpression)source).getVarParam(DataExpression.SAMPLE_RAW) != null && - 
((DataExpression)source).getVarParam(DataExpression.SAMPLE) != null && - ((DataExpression)source).getVarParam(DataExpression.FORMAT_TYPE) != null && - ((DataExpression)source).getVarParam(DataExpression.DATATYPEPARAM) != null)){ - raiseValidateError("Missing variable assignment.", false); - } - } + if(source instanceof DataExpression && ((DataExpression) source).getOpCode() == Expression.DataOp.READ && + !(((DataExpression) source).getVarParam(DataExpression.SAMPLE_RAW) != null && + ((DataExpression) source).getVarParam(DataExpression.SAMPLE) != null && + ((DataExpression) source).getVarParam(DataExpression.FORMAT_TYPE) != null && + ((DataExpression) source).getVarParam(DataExpression.DATATYPEPARAM) != null)) + raiseValidateError("Missing variable assignment.", false); else raiseValidateError("Missing variable assignment.", false); } diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java index 8df9994352d..e6e6dca461f 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderFactory.java @@ -84,7 +84,7 @@ public static FrameReader createFrameReader(FileFormat fmt, FileFormatProperties reader = frm.getReader(); } catch(Exception e) { - throw new DMLRuntimeException("IOGEN Matrix Reader Error: " + e); + throw new DMLRuntimeException("IOGEN Matrix Reader Error: ", e); } break; diff --git a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java index 1f9d6c72c03..f43ae5670c3 100644 --- a/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java +++ b/src/main/java/org/apache/sysds/runtime/io/FrameReaderJSONL.java @@ -128,8 +128,7 @@ private static String getStringFromJSONPath(JSONObject jsonObject, String path) } if(temp == null){ - return null; - //throw new IOException("Could not traverse the JSON path: '" + path + 
"'!"); + throw new IOException("Could not traverse the JSON path: '" + path + "'!"); } return temp.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java index c428a1f6b02..297a5c0439d 100644 --- a/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java +++ b/src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java @@ -139,7 +139,7 @@ public static MatrixReader createMatrixReader( ReadProperties props ) { reader = grm.getReader(); } catch(Exception e) { - throw new DMLRuntimeException("IOGEN Matrix Reader Error: " + e); + throw new DMLRuntimeException("IOGEN Matrix Reader Error: ", e); } break; diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java similarity index 99% rename from src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java rename to src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java index fabb1875ae7..369999d2536 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifying.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java @@ -31,7 +31,7 @@ import java.util.Map; import java.util.Set; -public class FormatIdentifying { +public class FormatIdentifyer { private int[][] mapRow; private int[][] mapCol; @@ -49,12 +49,12 @@ public class FormatIdentifying { private ReaderMapping mappingValues; private CustomProperties properties; - public FormatIdentifying(String raw, MatrixBlock matrix) throws Exception { + public FormatIdentifyer(String raw, MatrixBlock matrix) throws Exception { this.mappingValues = new ReaderMapping(raw, matrix); this.runIdentification(); } - public FormatIdentifying(String raw, FrameBlock frame) throws Exception { + public FormatIdentifyer(String raw, FrameBlock frame) throws Exception { this.mappingValues = new ReaderMapping(raw, frame); 
this.runIdentification(); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java index 922efe91d4b..d7d342d13b8 100644 --- a/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/GenerateReader.java @@ -40,10 +40,10 @@ public abstract class GenerateReader { protected String className; public GenerateReader(SampleProperties sampleProperties) throws Exception { - FormatIdentifying formatIdentifying = sampleProperties.getDataType().isMatrix() ? new FormatIdentifying(sampleProperties.getSampleRaw(), - sampleProperties.getSampleMatrix()) : new FormatIdentifying(sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); + FormatIdentifyer formatIdentifyer = sampleProperties.getDataType().isMatrix() ? new FormatIdentifyer(sampleProperties.getSampleRaw(), + sampleProperties.getSampleMatrix()) : new FormatIdentifyer(sampleProperties.getSampleRaw(), sampleProperties.getSampleFrame()); - properties = formatIdentifying.getFormatProperties(); + properties = formatIdentifyer.getFormatProperties(); if(properties == null) { throw new Exception("The file format couldn't recognize!!"); } @@ -103,7 +103,8 @@ public MatrixReader getReader() throws Exception { return matrixReader; } - @Override public String getReaderString() { + @Override + public String getReaderString() { MatrixCodeGen src = new MatrixCodeGen(properties, className); // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; @@ -133,7 +134,8 @@ public FrameReader getReader() throws Exception { return frameReader; } - @Override public String getReaderString() { + @Override + public String getReaderString() { FrameCodeGen src = new FrameCodeGen(properties, className); // constructor with arguments as CustomProperties Class[] cArg = new Class[1]; From ac02ff1932fac5a3d7e8d7815291994c689948ae Mon Sep 17 00:00:00 2001 From: Saeed 
Fathollahzadeh Date: Wed, 24 Aug 2022 20:11:26 +0200 Subject: [PATCH 84/84] Minor --- .../org/apache/sysds/parser/StatementBlock.java | 14 ++++++++------ .../sysds/runtime/iogen/FormatIdentifyer.java | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/apache/sysds/parser/StatementBlock.java b/src/main/java/org/apache/sysds/parser/StatementBlock.java index f1bd83dcbab..013ca4f8244 100644 --- a/src/main/java/org/apache/sysds/parser/StatementBlock.java +++ b/src/main/java/org/apache/sysds/parser/StatementBlock.java @@ -917,12 +917,14 @@ private void validateAssignmentStatement(Statement current, DMLProgram dmlProg, else { //all builtin functions and expressions if( target == null ) { // check if IOGEN - if(source instanceof DataExpression && ((DataExpression) source).getOpCode() == Expression.DataOp.READ && - !(((DataExpression) source).getVarParam(DataExpression.SAMPLE_RAW) != null && - ((DataExpression) source).getVarParam(DataExpression.SAMPLE) != null && - ((DataExpression) source).getVarParam(DataExpression.FORMAT_TYPE) != null && - ((DataExpression) source).getVarParam(DataExpression.DATATYPEPARAM) != null)) - raiseValidateError("Missing variable assignment.", false); + if (source instanceof DataExpression && ((DataExpression)source).getOpCode() == Expression.DataOp.READ){ + if(!(((DataExpression)source).getVarParam(DataExpression.SAMPLE_RAW) != null && + ((DataExpression)source).getVarParam(DataExpression.SAMPLE) != null && + ((DataExpression)source).getVarParam(DataExpression.FORMAT_TYPE) != null && + ((DataExpression)source).getVarParam(DataExpression.DATATYPEPARAM) != null)){ + raiseValidateError("Missing variable assignment.", false); + } + } else raiseValidateError("Missing variable assignment.", false); } diff --git a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java index 369999d2536..5e3b6677857 100644 --- 
a/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java +++ b/src/main/java/org/apache/sysds/runtime/iogen/FormatIdentifyer.java @@ -40,8 +40,8 @@ public class FormatIdentifyer { private MappingProperties mappingProperties; private ArrayList sampleRawIndexes; - private static int nrows; - private static int ncols; + private int nrows; + private int ncols; private int nlines; private int windowSize = 20;