Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions docs/site/dml-language-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -869,6 +869,7 @@ SystemDS supports 4 file formats:
* Matrix Market (coordinate)
* Text (i,j,v)
* Binary
* HDF5

The CSV format is a standard text-based format where columns are separated by delimiter characters, typically commas, and
rows are represented on separate lines.
Expand All @@ -889,6 +890,10 @@ can span multiple part files.

The binary format can only be read and written by SystemDS.

Hierarchical Data Format (HDF) is a file format designed to store and organize large amounts of data. SystemDS supports
a subset of HDF5 features: two-dimensional data (matrices), matrices with the FP64 (double) value type,
a single dataset, a single group, and contiguous datasets.

Let's look at a matrix and examples of its data represented in the supported formats with corresponding metadata. In the table below, we have
a matrix consisting of 4 rows and 3 columns.

Expand Down Expand Up @@ -995,6 +1000,24 @@ Below, we have examples of this matrix in the CSV, Matrix Market, IJV, and Binar
}
</div>

<div data-lang="HDF5" markdown="1">
HDF5 is not a text-based format.
</div>

<div data-lang="HDF5 MTD" markdown="1">
{
"data_type": "matrix",
"value_type": "double",
"rows": 4,
"cols": 3,
"nnz": 6,
"dataset": "systemds",
"format": "hdf5",
"author": "SystemDS",
"created": "2021-06-11 13:36:15 CET"
}
</div>

</div>

As another example, here we see the content of the MTD file `scalar.mtd` associated with a scalar data file `scalar`
Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/apache/sysds/common/Types.java
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,8 @@ public enum FileFormat {
JSONL, // text nested JSON (Line) representation
BINARY, // binary block representation (dense/sparse/ultra-sparse)
FEDERATED, // A federated matrix
PROTO; // protocol buffer representation
PROTO, // protocol buffer representation
HDF5; // Hierarchical Data Format (HDF)

public boolean isIJV() {
return this == TEXT || this == MM;
Expand Down
42 changes: 41 additions & 1 deletion src/main/java/org/apache/sysds/lops/Data.java
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,21 @@ else if (_op.isWrite()) {
}
}

if(oparams.getFormat() == FileFormat.HDF5) {
Data datasetNameLop = (Data) getNamedInputLop(DataExpression.HDF5_DATASET_NAME);
if(datasetNameLop.isVariable())
throw new LopsException(
this.printErrorLocation() + "Parameter " + DataExpression.HDF5_DATASET_NAME + " must be a literal for a seq operation.");

sb.append(OPERAND_DELIMITOR);
sb.append(datasetNameLop.getStringValue());

if(this.getExecType() == ExecType.SPARK) {
sb.append(OPERAND_DELIMITOR);
sb.append(true); //isInputMatrixBlock
}
}

}

if (_op.isWrite()) {
Expand Down Expand Up @@ -425,6 +440,12 @@ public String getCreateVarInstructions(String outputFileName, String outputLabel
sb.append( createVarLIBSVMHelper() );
}

// Format-specific properties
if ( oparams.getFormat() == FileFormat.HDF5 ) {
sb.append(OPERAND_DELIMITOR);
sb.append( createVarHDF5Helper() );
}

// Frame-specific properties
if( getDataType()==DataType.FRAME ) {
Data schema = (Data) getNamedInputLop(DataExpression.SCHEMAPARAM);
Expand Down Expand Up @@ -538,4 +559,23 @@ private String createVarLIBSVMHelper() {
}
return sb.toString();
}
}

private String createVarHDF5Helper() {
StringBuilder sb = new StringBuilder();
if ( _op.isRead() ) {
Data datasetNameLop = (Data) getNamedInputLop(DataExpression.HDF5_DATASET_NAME);
sb.append(datasetNameLop.getStringValue());
sb.append(OPERAND_DELIMITOR);
}
else { // (operation == OperationTypes.WRITE)
Data datasetNameLop = (Data) getNamedInputLop(DataExpression.HDF5_DATASET_NAME);
if(datasetNameLop.isVariable())
throw new LopsException(
this.printErrorLocation() + "Parameter " + DataExpression.HDF5_DATASET_NAME + " must be a literal for a seq operation.");

sb.append(datasetNameLop.getStringValue());
sb.append(OPERAND_DELIMITOR);
}
return sb.toString();
}
}
4 changes: 4 additions & 0 deletions src/main/java/org/apache/sysds/parser/DMLTranslator.java
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,10 @@ public void constructHops(StatementBlock sb) {
case FEDERATED:
ae.setOutputParams(ae.getDim1(), ae.getDim2(), -1, ae.getUpdateType(), -1);
break;
case HDF5:
// write output in HDF5 format
ae.setOutputParams(ae.getDim1(), ae.getDim2(), ae.getNnz(), ae.getUpdateType(), -1);
break;
default:
throw new LanguageException("Unrecognized file format: " + ae.getFileFormat());
}
Expand Down
14 changes: 11 additions & 3 deletions src/main/java/org/apache/sysds/parser/DataExpression.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ public class DataExpression extends DataIdentifier
public static final String DELIM_NA_STRING_SEP = "\u00b7";
// Parameter names relevant to reading/writing delimited index/libsvmv files
public static final String LIBSVM_INDEX_DELIM = "indSep";

// Parameter names relevant to reading/writing dataset name/hdf5 files
public static final String HDF5_DATASET_NAME = "dataset";

public static final String DELIM_SPARSE = "sparse"; // applicable only for write

Expand Down Expand Up @@ -140,6 +143,8 @@ public class DataExpression extends DataIdentifier
DELIM_FILL_VALUE, DELIM_DELIMITER, DELIM_FILL, DELIM_HAS_HEADER_ROW, DELIM_NA_STRINGS,
// Parameters related to delimited/libsvm files.
LIBSVM_INDEX_DELIM,
// Parameters related to dataset name/HDF5 files.
HDF5_DATASET_NAME,
// Parameters related to privacy
PRIVACY, FINE_GRAINED_PRIVACY));

Expand All @@ -150,7 +155,9 @@ public class DataExpression extends DataIdentifier
// Parameters related to delimited/csv files.
DELIM_FILL_VALUE, DELIM_DELIMITER, DELIM_FILL, DELIM_HAS_HEADER_ROW, DELIM_NA_STRINGS,
// Parameters related to delimited/libsvm files.
LIBSVM_INDEX_DELIM));
LIBSVM_INDEX_DELIM,
// Parameters related to dataset name/HDF5 files.
HDF5_DATASET_NAME));

/* Default Values for delimited (CSV/LIBSVM) files */
public static final String DEFAULT_DELIM_DELIMITER = ",";
Expand Down Expand Up @@ -1205,7 +1212,8 @@ else if( getVarParam(READNNZPARAM) != null ) {
}
}
}

boolean isHDF5 = (formatTypeString != null && formatTypeString.equalsIgnoreCase(FileFormat.HDF5.toString()));

dataTypeString = (getVarParam(DATATYPEPARAM) == null) ? null : getVarParam(DATATYPEPARAM).toString();

if ( dataTypeString == null || dataTypeString.equalsIgnoreCase(Statement.MATRIX_DATA_TYPE)
Expand All @@ -1232,7 +1240,7 @@ else if( getVarParam(READNNZPARAM) != null ) {
// initialize size of target data identifier to UNKNOWN
getOutput().setDimensions(-1, -1);

if (!isCSV && !isLIBSVM && ConfigurationManager.getCompilerConfig()
if (!isCSV && !isLIBSVM && !isHDF5 && ConfigurationManager.getCompilerConfig()
.getBool(ConfigType.REJECT_READ_WRITE_UNKNOWNS) //skip check for csv/libsvm format / jmlc api
&& (getVarParam(READROWPARAM) == null || getVarParam(READCOLPARAM) == null) ) {
raiseValidateError("Missing or incomplete dimension information in read statement: "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@
import org.apache.sysds.runtime.io.FileFormatProperties;
import org.apache.sysds.runtime.io.FileFormatPropertiesCSV;
import org.apache.sysds.runtime.io.FileFormatPropertiesLIBSVM;
import org.apache.sysds.runtime.io.FileFormatPropertiesHDF5;
import org.apache.sysds.runtime.io.IOUtilFunctions;
import org.apache.sysds.runtime.io.ListReader;
import org.apache.sysds.runtime.io.ListWriter;
import org.apache.sysds.runtime.io.WriterMatrixMarket;
import org.apache.sysds.runtime.io.WriterTextCSV;
import org.apache.sysds.runtime.io.WriterHDF5;
import org.apache.sysds.runtime.lineage.LineageItem;
import org.apache.sysds.runtime.lineage.LineageItemUtils;
import org.apache.sysds.runtime.lineage.LineageTraceable;
Expand Down Expand Up @@ -367,6 +369,11 @@ else if(fmt.equalsIgnoreCase("libsvm")) {
if(parts.length < 12 + extSchema)
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
}
else if(fmt.equalsIgnoreCase("hdf5")) {
// 11 inputs: createvar corresponding to WRITE/READ -- includes the HDF5 dataset name property
if(parts.length < 11 + extSchema)
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
}
else {
if ( parts.length != 6 && parts.length != 11+extSchema )
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
Expand Down Expand Up @@ -458,6 +465,17 @@ else if(fmt.equalsIgnoreCase("libsvm")) {
return new VariableCPInstruction(VariableOperationCode.CreateVariable,
in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
}
else if(fmt.equalsIgnoreCase("hdf5")) {
// Createvar instructions for the HDF5 format have 13 parts.
// 11 inputs: createvar corresponding to WRITE/READ -- includes the HDF5 dataset name property
int curPos = 11;
String datasetName = parts[curPos];
FileFormatProperties fmtProperties = new FileFormatPropertiesHDF5(datasetName);

return new VariableCPInstruction(VariableOperationCode.CreateVariable,
in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
}

else {
return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str);
}
Expand Down Expand Up @@ -524,6 +542,10 @@ else if ( in3.getName().equalsIgnoreCase("libsvm") ) {
boolean sparse = Boolean.parseBoolean(parts[6]);
fprops = new FileFormatPropertiesLIBSVM(delim, indexDelim, sparse);
}
else if(in3.getName().equalsIgnoreCase("hdf5") ){
String datasetName = parts[4];
fprops = new FileFormatPropertiesHDF5(datasetName);
}
else {
fprops = new FileFormatProperties();
in4 = new CPOperand(parts[5]); // blocksize in empty description
Expand Down Expand Up @@ -962,7 +984,7 @@ private void processWriteInstruction(ExecutionContext ec) {
String fname = ec.getScalarInput(getInput2().getName(), ValueType.STRING, getInput2().isLiteral()).getStringValue();
String fmtStr = getInput3().getName();
FileFormat fmt = FileFormat.safeValueOf(fmtStr);
if( fmt != FileFormat.LIBSVM ) {
if( fmt != FileFormat.LIBSVM && fmt != FileFormat.HDF5) {
String desc = ec.getScalarInput(getInput4().getName(), ValueType.STRING, getInput4().isLiteral()).getStringValue();
_formatProperties.setDescription(desc);
}
Expand All @@ -975,8 +997,10 @@ else if( getInput1().getDataType() == DataType.MATRIX ) {
writeMMFile(ec, fname);
else if( fmt == FileFormat.CSV )
writeCSVFile(ec, fname);
else if(fmt == FileFormat.LIBSVM)
writeLIBSVMFile(ec, fname);
else if(fmt == FileFormat.LIBSVM)
writeLIBSVMFile(ec, fname);
else if(fmt == FileFormat.HDF5)
writeHDF5File(ec, fname);
else {
// Default behavior
MatrixObject mo = ec.getMatrixObject(getInput1().getName());
Expand Down Expand Up @@ -1102,6 +1126,40 @@ private void writeLIBSVMFile(ExecutionContext ec, String fname) {
}
}

/**
* Helper function to write HDF5 files to HDFS.
*
* @param ec execution context
* @param fname file name
*/
private void writeHDF5File(ExecutionContext ec, String fname) {
MatrixObject mo = ec.getMatrixObject(getInput1().getName());
String outFmt = "hdf5";

if(mo.isDirty()) {
// there exist data computed in CP that is not backed up on HDFS
// i.e., it is either in-memory or in evicted space
mo.exportData(fname, outFmt, _formatProperties);
}
else {
try {
FileFormat fmt = ((MetaDataFormat) mo.getMetaData()).getFileFormat();
DataCharacteristics dc = (mo.getMetaData()).getDataCharacteristics();
if(fmt == FileFormat.HDF5 && !getInput1().getName().startsWith(org.apache.sysds.lops.Data.PREAD_PREFIX)) {
WriterHDF5 writer = new WriterHDF5((FileFormatPropertiesHDF5) _formatProperties);
}
else {
mo.exportData(fname, outFmt, _formatProperties);
}
HDFSTool.writeMetaDataFile(fname + ".mtd", mo.getValueType(), dc, FileFormat.HDF5, _formatProperties,
mo.getPrivacyConstraint());
}
catch (IOException e) {
throw new DMLRuntimeException(e);
}
}
}

/**
* Helper function to write MM files to HDFS.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.sysds.runtime.io;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import java.io.Serializable;

public class FileFormatPropertiesHDF5 extends FileFormatProperties implements Serializable {
protected static final Log LOG = LogFactory.getLog(FileFormatPropertiesHDF5.class.getName());
private static final long serialVersionUID = 8646275033790103030L;

private String datasetName;

public FileFormatPropertiesHDF5() {
this.datasetName = "systemdsh5";
}

public FileFormatPropertiesHDF5(String datasetName) {
this.datasetName = datasetName;
}

public String getDatasetName() {
return datasetName;
}

@Override public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(" datasetName " + datasetName);
return sb.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ public InputOutputInfo(Class<? extends InputFormat> formatClsIn, Class<? extends
TextInputFormat.class, TextOutputFormat.class, LongWritable.class, Text.class);
public static final InputOutputInfo LIBSVMInputOutputInfo = new InputOutputInfo(
TextInputFormat.class, TextOutputFormat.class, LongWritable.class, Text.class);
public static final InputOutputInfo HDF5InputOutputInfo = new InputOutputInfo(
TextInputFormat.class, TextOutputFormat.class, LongWritable.class, Text.class);

@SuppressWarnings("incomplete-switch")
public static InputOutputInfo get(DataType dt, FileFormat fmt) {
Expand All @@ -82,6 +84,7 @@ public static InputOutputInfo get(DataType dt, FileFormat fmt) {
case MM: return MatrixMarketInputOutputInfo;
case CSV: return CSVInputOutputInfo;
case LIBSVM: return LIBSVMInputOutputInfo;
case HDF5: return HDF5InputOutputInfo;
case BINARY: {
switch( dt ) {
case MATRIX: return BinaryBlockInputOutputInfo;
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/apache/sysds/runtime/io/MatrixReaderFactory.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ public static MatrixReader createMatrixReader(FileFormat fmt) {
new ReaderBinaryBlockParallel(false) : new ReaderBinaryBlock(false);
break;

case HDF5:
reader = (par & mcsr) ? new ReaderHDF5Parallel(
new FileFormatPropertiesHDF5()) : new ReaderHDF5(new FileFormatPropertiesHDF5());
break;
default:
throw new DMLRuntimeException("Failed to create matrix reader for unknown format: " + fmt.toString());
}
Expand Down Expand Up @@ -108,6 +112,12 @@ public static MatrixReader createMatrixReader( ReadProperties props ) {
new ReaderBinaryBlockParallel(props.localFS) : new ReaderBinaryBlock(props.localFS);
break;

case HDF5:
FileFormatPropertiesHDF5 fileFormatPropertiesHDF5 = props.formatProperties != null ? (FileFormatPropertiesHDF5) props.formatProperties : new FileFormatPropertiesHDF5();
reader = (par & mcsr) ? new ReaderHDF5Parallel(fileFormatPropertiesHDF5) : new ReaderHDF5(
fileFormatPropertiesHDF5);
break;

default:
throw new DMLRuntimeException("Failed to create matrix reader for unknown format: " + fmt.toString());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ public static MatrixWriter createMatrixWriter(FileFormat fmt, int replication, F
writer = new WriterBinaryBlock(replication);
break;

case HDF5:
if(props != null && !(props instanceof FileFormatPropertiesHDF5))
throw new DMLRuntimeException("Wrong type of file format properties for HDF5 writer.");
else if( ConfigurationManager.getCompilerConfigFlag(ConfigType.PARALLEL_CP_WRITE_TEXTFORMATS) )
return new WriterHDF5Parallel((FileFormatPropertiesHDF5) props);
else
return new WriterHDF5((FileFormatPropertiesHDF5) props);

default:
throw new DMLRuntimeException("Failed to create matrix writer for unknown format: " + fmt.toString());
}
Expand Down
Loading