From ecdf74fda5e39d0ea479ae436af0a0577c3b2509 Mon Sep 17 00:00:00 2001 From: baunsgaard Date: Fri, 26 Aug 2022 17:13:35 +0200 Subject: [PATCH 1/2] [SYSTEMDS-3409] Read CSV directly without MTD python This patch allows reading CSV files without a MTD file, just like inside dml, but through the Python API. Source requires imports that are not used in code, but is dynamically loaded therefore a comment was added to enforce this. Closes #1691 --- .../getting_started/simpleExamples/l2svm.py | 3 +- .../simpleExamples/l2svm_internal.py | 3 +- .../simpleExamples/multiply.py | 3 +- .../simpleExamples/multiplyMatrix.py | 3 +- .../code/guide/algorithms/FullScript.py | 3 +- .../federated/federatedTutorial_part2.py | 4 +- .../federated/federatedTutorial_part3.py | 3 +- .../systemds/context/systemds_context.py | 80 ++++++++----- .../systemds/examples/tutorials/adult.py | 6 +- .../python/systemds/operator/nodes/frame.py | 3 +- .../python/systemds/operator/nodes/list.py | 2 +- .../python/systemds/operator/nodes/matrix.py | 3 +- .../systemds/operator/nodes/multi_return.py | 2 +- .../python/systemds/operator/nodes/source.py | 19 +-- .../tests/docs/test_algorithms_basics.py | 2 + .../python/tests/docs/test_simple_example.py | 3 + src/main/python/tests/iotests/__init__.py | 20 ++++ src/main/python/tests/iotests/test_io_csv.py | 112 ++++++++++++++++++ 18 files changed, 221 insertions(+), 53 deletions(-) create mode 100644 src/main/python/tests/iotests/__init__.py create mode 100644 src/main/python/tests/iotests/test_io_csv.py diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py index 75ecc20202d..66482c375f9 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py @@ -20,6 +20,7 @@ # ------------------------------------------------------------- # Import numpy and SystemDS import numpy as np +import logging from systemds.context import SystemDSContext from systemds.operator.algorithm import l2svm @@ -41,4 +42,4 @@ with SystemDSContext() as sds: model = l2svm(sds.from_numpy(features), sds.from_numpy(labels)).compute() - print(model) + logging.info(model) diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py index a1f32a1892b..de10ce32249 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py @@ -19,6 +19,7 @@ # # ------------------------------------------------------------- # Import SystemDS +import logging from systemds.context import SystemDSContext from systemds.operator.algorithm import l2svm @@ -31,4 +32,4 @@ labels = sds.rand(10, 1, 1, 1, sparsity = 0.5) model = l2svm(features, labels).compute() - print(model) + logging.info(model) diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py b/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py index 69b884e05c8..314b6c363f2 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py @@ -19,6 +19,7 @@ # # ------------------------------------------------------------- # Import SystemDSContext +import logging from systemds.context import SystemDSContext # Create a context and if necessary (no SystemDS py4j instance running) # it starts a subprocess which does the execution in SystemDS @@ -30,5 +31,5 @@ m_res = m * 3.1 # Do the calculation in SystemDS by calling compute(). # The returned value is an numpy array that can be directly printed. - print(m_res.compute()) + logging.info(m_res.compute()) # context will automatically be closed and process stopped diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py b/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py index ad56f5d4387..418888b3e76 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py @@ -19,6 +19,7 @@ # # ------------------------------------------------------------- import numpy as np +import logging from systemds.context import SystemDSContext # create a random array @@ -34,4 +35,4 @@ m_res = sds.from_numpy(m1) * sds.from_numpy(m2) # lets do the actual computation in SystemDS! The result is an numpy array m_res_np = m_res.compute() - print(m_res_np) + logging.info(m_res_np) diff --git a/src/main/python/docs/source/code/guide/algorithms/FullScript.py b/src/main/python/docs/source/code/guide/algorithms/FullScript.py index 28fbe14cf14..97af857c6a5 100644 --- a/src/main/python/docs/source/code/guide/algorithms/FullScript.py +++ b/src/main/python/docs/source/code/guide/algorithms/FullScript.py @@ -18,6 +18,7 @@ # under the License. # # ------------------------------------------------------------- +import logging from systemds.context import SystemDSContext from systemds.operator.algorithm import multiLogReg, multiLogRegPredict from systemds.examples.tutorials.mnist import DataManager @@ -39,4 +40,4 @@ Yt_ds = sds.from_numpy(Yt) + 1.0 [m, y_pred, acc] = multiLogRegPredict(Xt_ds, bias, Yt_ds).compute() -print(acc) +logging.info(acc) diff --git a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py index ac9c0bfc831..c0c507dde8e 100644 --- a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py +++ b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py @@ -19,7 +19,7 @@ # # ------------------------------------------------------------- # Python -import numpy as np +import logging from systemds.context import SystemDSContext # Create a federated matrix @@ -35,5 +35,5 @@ with SystemDSContext() as sds: fed_a = sds.federated([address], [dims]) # Sum the federated matrix and call compute to execute - print(fed_a.sum().compute()) + logging.info(fed_a.sum().compute()) # Result should be 45. diff --git a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py index 1d1125aa1b8..c1026a2b716 100644 --- a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py +++ b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py @@ -19,6 +19,7 @@ # # ------------------------------------------------------------- # Python +import logging import numpy as np from systemds.context import SystemDSContext @@ -44,4 +45,4 @@ # Multiply local and federated ret = loc @ fed # execute the lazy script and print - print(ret.compute()) + logging.info(ret.compute()) diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py index e467a4d65a0..1e13a217c5a 100644 --- a/src/main/python/systemds/context/systemds_context.py +++ b/src/main/python/systemds/context/systemds_context.py @@ -22,29 +22,30 @@ __all__ = ["SystemDSContext"] import json +import logging import os import socket import sys +from contextlib import contextmanager from glob import glob from queue import Queue from subprocess import PIPE, Popen from threading import Thread from time import sleep from typing import Dict, Iterable, Sequence, Tuple, Union -from contextlib import contextmanager import numpy as np import pandas as pd from py4j.java_gateway import GatewayParameters, JavaGateway from systemds.operator import (Frame, List, Matrix, OperationNode, Scalar, Source) -from systemds.script_building import OutputType +from systemds.script_building import DMLScript, OutputType from systemds.utils.consts import VALID_INPUT_TYPES from systemds.utils.helpers import get_module_dir class SystemDSContext(object): - """A context with a connection to a java instance with which SystemDS operations are executed. + """A context with a connection to a java instance with which SystemDS operations are executed. The java process is started and is running using a random tcp port for instruction parsing. This class is used as the starting point for all SystemDS execution. It gives the ability to create @@ -54,22 +55,35 @@ class SystemDSContext(object): java_gateway: JavaGateway _capture_statistics: bool _statistics: str + _log: logging.Logger - def __init__(self, port: int = -1, capture_statistics: bool = False): + def __init__(self, port: int = -1, capture_statistics: bool = False, logging_level: int = 20): """Starts a new instance of SystemDSContext, in which the connection to a JVM systemds instance is handled Any new instance of this SystemDS Context, would start a separate new JVM. Standard out and standard error form the JVM is also handled in this class, filling up Queues, that can be read from to get the printed statements from the JVM. + + :param port: default -1, giving a random port for communication with JVM + :param capture_statistics: If the statistics of the execution in SystemDS should be captured + :param logging_level: Specify the logging level used for informative messages, default 20 indicating INFO. + The logging levels are as follows: 10 DEBUG, 20 INFO, 30 WARNING, 40 ERROR, 50 CRITICAL. """ actual_port = self.__start(port) process = self.__process self._statistics = "" self._capture_statistics = capture_statistics + + self._log = logging.Logger(self.__class__.__name__) + self._log.setLevel(logging_level) + if process.poll() is None: self.__start_gateway(actual_port) else: - self.exception_and_close("Java process stopped before gateway could connect") + self.exception_and_close( + "Java process stopped before gateway could connect") + + self._log.debug("Started JVM and SystemDS python context manager") def get_stdout(self, lines: int = -1): """Getter for the stdout of the java subprocess @@ -189,10 +203,9 @@ def __build_startup_command(self, port: int): files = glob(os.path.join(root, "conf", "log4j*.properties")) if len(files) > 1: - print( - "WARNING: Multiple logging files found selecting: " + files[0]) + self._log.warning("Multiple logging files found selecting: " + files[0]) if len(files) == 0: - print("WARNING: No log4j file found at: " + self._log.warning("No log4j file found at: " + os.path.join(root, "conf") + " therefore using default settings") else: @@ -200,12 +213,11 @@ def __build_startup_command(self, port: int): command.append("org.apache.sysds.api.PythonDMLScript") - files = glob(os.path.join(root, "conf", "SystemDS*.xml")) + files=glob(os.path.join(root, "conf", "SystemDS*.xml")) if len(files) > 1: - print( - "WARNING: Multiple config files found selecting: " + files[0]) + self._log.warning("Multiple config files found selecting: " + files[0]) if len(files) == 0: - print("WARNING: No log4j file found at: " + self._log.warning("No log4j file found at: " + os.path.join(root, "conf") + " therefore using default settings") else: @@ -279,7 +291,7 @@ def close(self): self.__kill_Popen(self.java_gateway.java_process) self.java_gateway.shutdown() if hasattr(self, '__process'): - print("Has process variable") + logging.error("Has process variable") self.__kill_Popen(self.__process) if hasattr(self, '__stdout_thread') and self.__stdout_thread.is_alive(): self.__stdout_thread.join(0) @@ -311,7 +323,7 @@ def __get_open_port(self): s.close() return port - def _execution_completed(self, script: 'DMLScript'): + def _execution_completed(self, script: DMLScript): """ Should/will be called after execution of a script. Used to update statistics. @@ -448,6 +460,10 @@ def rand(self, rows: int, cols: int, return Matrix(self, 'rand', [], named_input_nodes=named_input_nodes) + def __fix_string_args(self, arg: str) -> str: + nf = str(arg).replace('"', "").replace("'", "") + return f'"{nf}"' + def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> OperationNode: """ Read an file from disk. Supportted types include: CSV, Matrix Market(coordinate), Text(i,j,v), SystemDS Binary, etc. @@ -455,33 +471,44 @@ def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Ope :return: an Operation Node, containing the read data the operationNode read can be of types, Matrix, Frame or Scalar. """ mdt_filepath = path + ".mtd" - if os.path.exists(mdt_filepath): + if os.path.exists(mdt_filepath): # If metadata file is existing, then simply use that and force data type to mtd file with open(mdt_filepath) as jspec_file: mtd = json.load(jspec_file) kwargs["data_type"] = mtd["data_type"] + if kwargs.get("format", None): + kwargs["format"] = self.__fix_string_args(kwargs["format"]) + elif kwargs.get("format", None): # If format is specified. Then use that format + kwargs["format"] = self.__fix_string_args(kwargs["format"]) + else: #Otherwise guess at what format the file is based on file extension + if ".csv" in path[-4:]: + kwargs["format"] = '"csv"' + self._log.warning("Guessing '"+path+"' is a csv file, please add a mtd file, or specify in arguments") + if not ("header" in kwargs) and "data_type" in kwargs and kwargs["data_type"] == "frame": + kwargs["header"] = True data_type = kwargs.get("data_type", None) - file_format = kwargs.get("format", None) + if data_type == "matrix": kwargs["data_type"] = f'"{data_type}"' return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs) elif data_type == "frame": kwargs["data_type"] = f'"{data_type}"' - if isinstance(file_format, str): - kwargs["format"] = f'"{kwargs["format"]}"' return Frame(self, "read", [f'"{path}"'], named_input_nodes=kwargs) elif data_type == "scalar": kwargs["data_type"] = f'"{data_type}"' output_type = OutputType.from_str(kwargs.get("value_type", None)) - kwargs["value_type"] = f'"{output_type.name}"' - return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs, output_type=output_type) + if output_type: + kwargs["value_type"] = f'"{output_type.name}"' + return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs, output_type=output_type) + else: + raise ValueError("Invalid arguments for reading scalar, value_type must be specified") elif data_type == "list": # Reading a list have no extra arguments. return List(self, "read", [f'"{path}"']) - - kwargs["data_type"] = None - print("WARNING: Unknown type read please add a mtd file, or specify in arguments") - return OperationNode(self, "read", [f'"{path}"'], named_input_nodes=kwargs) + else: + kwargs["data_type"] = '"matrix"' + self._log.warning("Unknown type read please add a mtd file, or specify in arguments, defaulting to matrix") + return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs) def scalar(self, v: Dict[str, VALID_INPUT_TYPES]) -> Scalar: """ Construct an scalar value, this can contain str, float, double, integers and booleans. @@ -572,7 +599,7 @@ def federated(self, addresses: Iterable[str], named_params.update(kwargs) return Matrix(self, 'federated', args, named_params) - def source(self, path: str, name: str, print_imported_methods: bool = False) -> Source: + def source(self, path: str, name: str) -> Source: """Import methods from a given dml file. The importing is done thorugh the DML command source, and adds all defined methods from @@ -587,9 +614,8 @@ def source(self, path: str, name: str, print_imported_methods: bool = False) -> :param path: The absolute or relative path to the file to import :param name: The name to give the imported file in the script, this name must be unique - :param print_imported_methods: boolean specifying if the imported methods should be printed. """ - return Source(self, path, name, print_imported_methods) + return Source(self, path, name) def list(self, *args: Sequence[VALID_INPUT_TYPES], **kwargs: Dict[str, VALID_INPUT_TYPES]) -> List: """ Create a List object containing the given nodes. diff --git a/src/main/python/systemds/examples/tutorials/adult.py b/src/main/python/systemds/examples/tutorials/adult.py index 575697ae01a..6bcfc8057fb 100644 --- a/src/main/python/systemds/examples/tutorials/adult.py +++ b/src/main/python/systemds/examples/tutorials/adult.py @@ -25,7 +25,7 @@ import pandas as pd import requests from systemds.context import SystemDSContext - +from systemds.operator import Frame, Scalar class DataManager: @@ -41,8 +41,8 @@ class DataManager: _test_data_loc: str _test_labels_loc: str - _data_columns: [] - _data_string_labels: [] + _data_columns: list + _data_string_labels: list def __init__(self): self._data_zip_url = "https://systemds.apache.org/assets/datasets/adult/data.zip" diff --git a/src/main/python/systemds/operator/nodes/frame.py b/src/main/python/systemds/operator/nodes/frame.py index a609929e2cf..2ff656af212 100644 --- a/src/main/python/systemds/operator/nodes/frame.py +++ b/src/main/python/systemds/operator/nodes/frame.py @@ -75,8 +75,7 @@ def code_line(self, var_name: str, unnamed_input_vars: Sequence[str], named_inpu def compute(self, verbose: bool = False, lineage: bool = False) -> pd.DataFrame: if self._is_pandas(): - if verbose: - print("[Pandas Frame - No Compilation necessary]") + self.sds_context._log.info("Pandas Frame - No Compilation necessary") return self._pd_dataframe else: return super().compute(verbose, lineage) diff --git a/src/main/python/systemds/operator/nodes/list.py b/src/main/python/systemds/operator/nodes/list.py index 578535e939e..9a31c7bb927 100644 --- a/src/main/python/systemds/operator/nodes/list.py +++ b/src/main/python/systemds/operator/nodes/list.py @@ -34,7 +34,7 @@ class List(OperationNode): - def __init__(self, sds_context: 'SystemDSContext', func='list', + def __init__(self, sds_context, func='list', unnamed_input_nodes: Union[str, Iterable[VALID_INPUT_TYPES]] = None, named_input_nodes: Dict[str, VALID_INPUT_TYPES] = None): diff --git a/src/main/python/systemds/operator/nodes/matrix.py b/src/main/python/systemds/operator/nodes/matrix.py index 6816245a508..040575fc9a3 100644 --- a/src/main/python/systemds/operator/nodes/matrix.py +++ b/src/main/python/systemds/operator/nodes/matrix.py @@ -68,8 +68,7 @@ def code_line(self, var_name: str, unnamed_input_vars: Sequence[str], def compute(self, verbose: bool = False, lineage: bool = False) -> np.array: if self._is_numpy(): - if verbose: - print('[Numpy Array - No Compilation necessary]') + self.sds_context._log.info('Numpy Array - No Compilation necessary') return self._np_array else: return super().compute(verbose, lineage) diff --git a/src/main/python/systemds/operator/nodes/multi_return.py b/src/main/python/systemds/operator/nodes/multi_return.py index b766157119d..cb6b923d2c5 100644 --- a/src/main/python/systemds/operator/nodes/multi_return.py +++ b/src/main/python/systemds/operator/nodes/multi_return.py @@ -35,7 +35,7 @@ class MultiReturn(OperationNode): - def __init__(self, sds_context: 'SystemDSContext', operation, + def __init__(self, sds_context, operation, output_nodes: List[OperationNode], unnamed_input_nodes: Union[str, Iterable[VALID_INPUT_TYPES]] = None, diff --git a/src/main/python/systemds/operator/nodes/source.py b/src/main/python/systemds/operator/nodes/source.py index 3a61f1149ff..7191fc8208e 100644 --- a/src/main/python/systemds/operator/nodes/source.py +++ b/src/main/python/systemds/operator/nodes/source.py @@ -22,11 +22,12 @@ __all__ = ["Source"] from types import MethodType -from typing import (TYPE_CHECKING, Dict, Iterable, Optional, Sequence, Tuple, - Union) +from typing import TYPE_CHECKING, Dict, Iterable, Sequence -import numpy as np -from systemds.operator import List, Matrix, OperationNode, Scalar +# Import more node types than used, +# since source dynamically adds code and the import is needed. +from systemds.operator import (List, ListAccess, Matrix, MultiReturn, + OperationNode, Scalar) from systemds.script_building.dag import OutputType @@ -43,7 +44,8 @@ def __init__(self, name: str, inputs: str, outputs: str): else: self._outputs = None - def get_func(self, sds_context: "SystemDSContext", source_name, id: int, print_imported_methods: bool = False) -> MethodType: + def get_func(self, sds_context, source_name) -> MethodType: + operation = f'"{source_name}::{self._name}"' argument_string, named_arguments = self.parse_inputs() named_intput_nodes = f'named_arguments = {{{named_arguments}}}' @@ -61,8 +63,7 @@ def get_func(self, sds_context: "SystemDSContext", source_name, id: int, print_i full_function = "\n\t".join(lines) - if print_imported_methods: - print(full_function) + sds_context._log.debug(full_function) # Use Exec to build the function from the string exec(full_function) @@ -130,7 +131,7 @@ class Source(OperationNode): __name: str - def __init__(self, sds_context: "SystemDSContext", path: str, name: str, print_imported_methods: bool = False) -> "Import": + def __init__(self, sds_context, path: str, name: str): super().__init__(sds_context, f'"{path}"', output_type=OutputType.IMPORT) self.__name = name @@ -138,7 +139,7 @@ def __init__(self, sds_context: "SystemDSContext", path: str, name: str, print_i # Add all the functions found in the source file to this object. for id, f in enumerate(functions): - func = f.get_func(sds_context, name, id, print_imported_methods) + func = f.get_func(sds_context, name) setattr(self, f._name, MethodType(func, self)) def __parse_functions_from_script(self, path: str) -> Iterable[Func]: diff --git a/src/main/python/tests/docs/test_algorithms_basics.py b/src/main/python/tests/docs/test_algorithms_basics.py index e008361d3a6..358f414d208 100644 --- a/src/main/python/tests/docs/test_algorithms_basics.py +++ b/src/main/python/tests/docs/test_algorithms_basics.py @@ -20,7 +20,9 @@ # ------------------------------------------------------------- import unittest +import logging +logging.getLogger("root").setLevel(50) class TestAlgorithmsBasics(unittest.TestCase): def test_algorithms_script(self): diff --git a/src/main/python/tests/docs/test_simple_example.py b/src/main/python/tests/docs/test_simple_example.py index 37118d5583f..ea60b412042 100644 --- a/src/main/python/tests/docs/test_simple_example.py +++ b/src/main/python/tests/docs/test_simple_example.py @@ -20,6 +20,9 @@ # ------------------------------------------------------------- import unittest +import logging + +logging.getLogger("root").setLevel(50) class TestSimpleExample(unittest.TestCase): diff --git a/src/main/python/tests/iotests/__init__.py b/src/main/python/tests/iotests/__init__.py new file mode 100644 index 00000000000..e66abb4646f --- /dev/null +++ b/src/main/python/tests/iotests/__init__.py @@ -0,0 +1,20 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- diff --git a/src/main/python/tests/iotests/test_io_csv.py b/src/main/python/tests/iotests/test_io_csv.py new file mode 100644 index 00000000000..9929658e92a --- /dev/null +++ b/src/main/python/tests/iotests/test_io_csv.py @@ -0,0 +1,112 @@ +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import os +import shutil +import unittest + +import numpy as np +import pandas as pd +from systemds.context import SystemDSContext + + +class TestReadCSV(unittest.TestCase): + + sds: SystemDSContext = None + temp_dir: str = "tests/iotests/temp_write_csv/" + n_cols = 3 + n_rows = 100 + + df = pd.DataFrame( + { + "col1": [f"ss{i}s.{i}" for i in range(n_rows)], + "col2": [i for i in range(n_rows)], + "col3": [i * 0.1 for i in range(n_rows)], + } + ) + + df2 = pd.DataFrame( + { + "col2": [i for i in range(n_rows)], + "col3": [i * 0.1 for i in range(n_rows)], + } + ) + + @classmethod + def setUpClass(cls): + cls.sds = SystemDSContext(logging_level=50) + if not os.path.exists(cls.temp_dir): + os.makedirs(cls.temp_dir) + + @classmethod + def tearDownClass(cls): + cls.sds.close() + shutil.rmtree(cls.temp_dir, ignore_errors=True) + + def test_write_read_data_frame_csv_header(self): + filename = self.temp_dir + "data_frame_header.csv" + self.df.to_csv(filename, index=False, header=True) + result_df = self.sds.read(filename, data_type="frame").compute() + self.compare_frame(result_df, self.df) + + def test_write_read_data_frame_csv_header_active(self): + filename = self.temp_dir + "data_frame_header_active.csv" + self.df.to_csv(filename, index=False, header=True) + result_df = self.sds.read( + filename, data_type="frame", header=True).compute() + self.compare_frame(result_df, self.df) + + def test_write_read_data_frame_csv_no_header(self): + filename = self.temp_dir + "data_frame_no_header.csv" + self.df.to_csv(filename, index=False, header=False) + result_df = self.sds.read( + filename, data_type="frame", header=False).compute() + self.compare_frame(result_df, self.df) + + def test_write_read_matrix_csv_no_extra_argument(self): + filename = self.temp_dir + "data_matrix_no_header.csv" + self.df2.to_csv(filename, index=False, header=False) + result_df = (self.sds.read(filename)).compute() + self.assertTrue(np.allclose(self.df2.to_numpy(), result_df)) + + def test_write_read_matrix_csv_no_extra_argument_header(self): + filename = self.temp_dir + "data_matrix_header.csv" + self.df2.to_csv(filename, index=False, header=True) + result_df = (self.sds.read(filename, header=True)).compute() + self.assertTrue(np.allclose(self.df2.to_numpy(), result_df)) + + def test_write_read_matrix_csv_no_extra_argument_header_csv(self): + filename = self.temp_dir + "data_matrix_header_2.csv" + self.df2.to_csv(filename, index=False, header=True) + result_df = (self.sds.read( + filename, format="csv", header=True)).compute() + self.assertTrue(np.allclose(self.df2.to_numpy(), result_df)) + + def compare_frame(self, a: pd.DataFrame, b: pd.DataFrame): + a = a.astype(str) + b = b.astype(str) + self.assertTrue(isinstance(a, pd.DataFrame)) + self.assertTrue(isinstance(b, pd.DataFrame)) + self.assertTrue((a.values == b.values).all()) + + +if __name__ == "__main__": + unittest.main(exit=False) From b20779a62a734647d64e69592aedf0b012f87fdc Mon Sep 17 00:00:00 2001 From: baunsgaard Date: Fri, 2 Sep 2022 11:06:42 +0200 Subject: [PATCH 2/2] [SYSTEMDS-3433] Python docs test fail in IDE This patch fixes IDE integration of the docs tests for python, the bug is only apparent in IDE since the docs folder for tests is called the same as the docs folder in root of the python tests. Also included is some minor formatting of the python tests, to make all tests runnable directly using python "test_name.py". --- .../getting_started/simpleExamples/l2svm.py | 5 +-- .../simpleExamples/l2svm_internal.py | 4 +-- .../simpleExamples/multiply.py | 4 ++- .../simpleExamples/multiplyMatrix.py | 3 +- .../code/guide/algorithms/FullScript.py | 3 +- .../federated/federatedTutorial_part1.py | 4 ++- .../federated/federatedTutorial_part2.py | 1 + .../federated/federatedTutorial_part3.py | 9 ++--- .../systemds/context/systemds_context.py | 34 +++++++++++-------- .../systemds/examples/tutorials/adult.py | 13 +++---- src/main/python/tests/algorithms/test_lm.py | 6 ++-- src/main/python/tests/algorithms/test_pca.py | 2 +- .../python/tests/algorithms/test_signal.py | 4 ++- .../tests/basics/test_context_creation.py | 3 ++ .../python/tests/basics/test_context_stats.py | 7 ++-- .../tests/{docs => docs_test}/__init__.py | 0 .../test_algorithms_basics.py | 7 +++- .../test_simple_example.py | 6 +++- .../federated/test_federated_tutorial.py | 4 +++ src/main/python/tests/frame/test_rIndexing.py | 20 +++++------ src/main/python/tests/frame/test_r_c_bind.py | 34 +++++++++++-------- src/main/python/tests/frame/test_replace.py | 7 ++-- src/main/python/tests/frame/test_slice.py | 4 +++ .../tests/manual_tests/multi_log_reg_mnist.py | 4 ++- src/main/python/tests/matrix/test_slice.py | 3 ++ .../python/tests/source/test_source_list.py | 3 ++ 26 files changed, 123 insertions(+), 71 deletions(-) rename src/main/python/tests/{docs => docs_test}/__init__.py (100%) rename src/main/python/tests/{docs => docs_test}/test_algorithms_basics.py (94%) rename src/main/python/tests/{docs => docs_test}/test_simple_example.py (96%) diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py index 66482c375f9..fdc00309241 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm.py @@ -18,9 +18,10 @@ # under the License. # # ------------------------------------------------------------- -# Import numpy and SystemDS -import numpy as np + import logging + +import numpy as np from systemds.context import SystemDSContext from systemds.operator.algorithm import l2svm diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py index de10ce32249..2f9fc625748 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/l2svm_internal.py @@ -18,8 +18,8 @@ # under the License. # # ------------------------------------------------------------- -# Import SystemDS import logging + from systemds.context import SystemDSContext from systemds.operator.algorithm import l2svm @@ -29,7 +29,7 @@ # Add value to all cells in features features += 1.1 # Generate labels of all ones and zeros - labels = sds.rand(10, 1, 1, 1, sparsity = 0.5) + labels = sds.rand(10, 1, 1, 1, sparsity=0.5) model = l2svm(features, labels).compute() logging.info(model) diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py b/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py index 314b6c363f2..b4d70ddce36 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/multiply.py @@ -18,9 +18,11 @@ # under the License. # # ------------------------------------------------------------- -# Import SystemDSContext + import logging + from systemds.context import SystemDSContext + # Create a context and if necessary (no SystemDS py4j instance running) # it starts a subprocess which does the execution in SystemDS with SystemDSContext() as sds: diff --git a/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py b/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py index 418888b3e76..926c890725b 100644 --- a/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py +++ b/src/main/python/docs/source/code/getting_started/simpleExamples/multiplyMatrix.py @@ -18,8 +18,9 @@ # under the License. # # ------------------------------------------------------------- -import numpy as np import logging + +import numpy as np from systemds.context import SystemDSContext # create a random array diff --git a/src/main/python/docs/source/code/guide/algorithms/FullScript.py b/src/main/python/docs/source/code/guide/algorithms/FullScript.py index 97af857c6a5..9c213d01294 100644 --- a/src/main/python/docs/source/code/guide/algorithms/FullScript.py +++ b/src/main/python/docs/source/code/guide/algorithms/FullScript.py @@ -19,9 +19,10 @@ # # ------------------------------------------------------------- import logging + from systemds.context import SystemDSContext -from systemds.operator.algorithm import multiLogReg, multiLogRegPredict from systemds.examples.tutorials.mnist import DataManager +from systemds.operator.algorithm import multiLogReg, multiLogRegPredict d = DataManager() diff --git a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part1.py b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part1.py index 7795c4b1ebb..86370bab5df 100644 --- a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part1.py +++ b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part1.py @@ -19,8 +19,10 @@ # # ------------------------------------------------------------- # Python -import numpy as np import os + +import numpy as np + if not os.path.isdir("temp"): os.mkdir("temp") a = np.asarray([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) diff --git a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py index c0c507dde8e..8925bb895bb 100644 --- a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py +++ b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part2.py @@ -20,6 +20,7 @@ # ------------------------------------------------------------- # Python import logging + from systemds.context import SystemDSContext # Create a federated matrix diff --git a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py index c1026a2b716..ebb13604313 100644 --- a/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py +++ b/src/main/python/docs/source/code/guide/federated/federatedTutorial_part3.py @@ -20,6 +20,7 @@ # ------------------------------------------------------------- # Python import logging + import numpy as np from systemds.context import SystemDSContext @@ -38,10 +39,10 @@ ([6, 0], [9, 3])]) # local matrix to multiply with loc = sds.from_numpy(np.array([ - [1,2,3,4,5,6,7,8,9], - [1,2,3,4,5,6,7,8,9], - [1,2,3,4,5,6,7,8,9] - ])) + [1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 5, 6, 7, 8, 9], + [1, 2, 3, 4, 5, 6, 7, 8, 9] + ])) # Multiply local and federated ret = loc @ fed # execute the lazy script and print diff --git a/src/main/python/systemds/context/systemds_context.py b/src/main/python/systemds/context/systemds_context.py index 1e13a217c5a..32f0f900296 100644 --- a/src/main/python/systemds/context/systemds_context.py +++ b/src/main/python/systemds/context/systemds_context.py @@ -203,23 +203,25 @@ def __build_startup_command(self, port: int): files = glob(os.path.join(root, "conf", "log4j*.properties")) if len(files) > 1: - self._log.warning("Multiple logging files found selecting: " + files[0]) + self._log.warning( + "Multiple logging files found selecting: " + files[0]) if len(files) == 0: self._log.warning("No log4j file found at: " - + os.path.join(root, "conf") - + " therefore using default settings") + + os.path.join(root, "conf") + + " therefore using default settings") else: command.append("-Dlog4j.configuration=file:" + files[0]) command.append("org.apache.sysds.api.PythonDMLScript") - files=glob(os.path.join(root, "conf", "SystemDS*.xml")) + files = glob(os.path.join(root, "conf", "SystemDS*.xml")) if len(files) > 1: - self._log.warning("Multiple config files found selecting: " + files[0]) + self._log.warning( + "Multiple config files found selecting: " + files[0]) if len(files) == 0: self._log.warning("No log4j file found at: " - + os.path.join(root, "conf") - + " therefore using default settings") + + os.path.join(root, "conf") + + " therefore using default settings") else: command.append("-config") command.append(files[0]) @@ -471,19 +473,21 @@ def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Ope :return: an Operation Node, containing the read data the operationNode read can be of types, Matrix, Frame or Scalar. """ mdt_filepath = path + ".mtd" - if os.path.exists(mdt_filepath): # If metadata file is existing, then simply use that and force data type to mtd file + # If metadata file is existing, then simply use that and force data type to mtd file + if os.path.exists(mdt_filepath): with open(mdt_filepath) as jspec_file: mtd = json.load(jspec_file) kwargs["data_type"] = mtd["data_type"] if kwargs.get("format", None): kwargs["format"] = self.__fix_string_args(kwargs["format"]) - elif kwargs.get("format", None): # If format is specified. Then use that format + elif kwargs.get("format", None): # If format is specified. Then use that format kwargs["format"] = self.__fix_string_args(kwargs["format"]) - else: #Otherwise guess at what format the file is based on file extension + else: # Otherwise guess at what format the file is based on file extension if ".csv" in path[-4:]: kwargs["format"] = '"csv"' - self._log.warning("Guessing '"+path+"' is a csv file, please add a mtd file, or specify in arguments") - if not ("header" in kwargs) and "data_type" in kwargs and kwargs["data_type"] == "frame": + self._log.warning( + "Guessing '"+path+"' is a csv file, please add a mtd file, or specify in arguments") + if not ("header" in kwargs) and "data_type" in kwargs and kwargs["data_type"] == "frame": kwargs["header"] = True data_type = kwargs.get("data_type", None) @@ -501,13 +505,15 @@ def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> Ope kwargs["value_type"] = f'"{output_type.name}"' return Scalar(self, "read", [f'"{path}"'], named_input_nodes=kwargs, output_type=output_type) else: - raise ValueError("Invalid arguments for reading scalar, value_type must be specified") + raise ValueError( + "Invalid arguments for reading scalar, value_type must be specified") elif data_type == "list": # Reading a list have no extra arguments. return List(self, "read", [f'"{path}"']) else: kwargs["data_type"] = '"matrix"' - self._log.warning("Unknown type read please add a mtd file, or specify in arguments, defaulting to matrix") + self._log.warning( + "Unknown type read please add a mtd file, or specify in arguments, defaulting to matrix") return Matrix(self, "read", [f'"{path}"'], named_input_nodes=kwargs) def scalar(self, v: Dict[str, VALID_INPUT_TYPES]) -> Scalar: diff --git a/src/main/python/systemds/examples/tutorials/adult.py b/src/main/python/systemds/examples/tutorials/adult.py index 6bcfc8057fb..cf0e05963f4 100644 --- a/src/main/python/systemds/examples/tutorials/adult.py +++ b/src/main/python/systemds/examples/tutorials/adult.py @@ -27,6 +27,7 @@ from systemds.context import SystemDSContext from systemds.operator import Frame, Scalar + class DataManager: _data_zip_url: str @@ -59,7 +60,7 @@ def get_train_data_pandas(self) -> pd.DataFrame: def get_train_data(self, sds: SystemDSContext) -> 'Frame': self._get_data(self._train_data_loc) - return sds.read(self._train_data_loc)[:,0:14] + return sds.read(self._train_data_loc)[:, 0:14] def get_train_labels_pandas(self) -> pd.DataFrame: self._get_data(self._train_data_loc) @@ -67,16 +68,16 @@ def get_train_labels_pandas(self) -> pd.DataFrame: def get_train_labels(self, sds: SystemDSContext) -> 'Frame': self._get_data(self._train_data_loc) - return sds.read(self._train_data_loc)[:,14] + return sds.read(self._train_data_loc)[:, 14] def get_test_data_pandas(self) -> pd.DataFrame: self._get_data(self._test_data_loc) return self._parse_data(self._test_data_loc)\ .drop(labels=["income"], axis=1) - + def get_test_data(self, sds: SystemDSContext) -> 'Frame': self._get_data(self._test_data_loc) - return sds.read(self._test_data_loc)[:,0:14] + return sds.read(self._test_data_loc)[:, 0:14] def get_test_labels_pandas(self) -> pd.DataFrame: self._get_data(self._test_data_loc) @@ -84,13 +85,13 @@ def get_test_labels_pandas(self) -> pd.DataFrame: def get_test_labels(self, sds: SystemDSContext) -> 'Frame': self._get_data(self._test_data_loc) - return sds.read(self._test_data_loc)[:,14] + return sds.read(self._test_data_loc)[:, 14] def get_jspec_string(self) -> str: self._get_data(self._jspec_loc) with open(self._jspec_loc, "r") as f: return f.read() - + def get_jspec(self, sds: SystemDSContext) -> 'Scalar': self._get_data(self._jspec_loc) return sds.read(self._jspec_loc, data_type="scalar", value_type="string") diff --git a/src/main/python/tests/algorithms/test_lm.py b/src/main/python/tests/algorithms/test_lm.py index a6a05c49dc5..779c5437751 100644 --- a/src/main/python/tests/algorithms/test_lm.py +++ b/src/main/python/tests/algorithms/test_lm.py @@ -28,6 +28,7 @@ np.random.seed(7) + class TestLm(unittest.TestCase): sds: SystemDSContext = None @@ -47,8 +48,8 @@ def test_lm_simple(self): regressor = LinearRegression(fit_intercept=False) model = regressor.fit(X, Y).coef_ - X_sds = self.sds.from_numpy( X) - Y_sds = self.sds.from_numpy( Y) + X_sds = self.sds.from_numpy(X) + Y_sds = self.sds.from_numpy(Y) sds_model_weights = lm(X_sds, Y_sds).compute() model = model.reshape(sds_model_weights.shape) @@ -60,6 +61,5 @@ def test_lm_simple(self): "All elements are not close") - if __name__ == "__main__": unittest.main(exit=False) diff --git a/src/main/python/tests/algorithms/test_pca.py b/src/main/python/tests/algorithms/test_pca.py index 0f774c005dd..d410e12dd8b 100644 --- a/src/main/python/tests/algorithms/test_pca.py +++ b/src/main/python/tests/algorithms/test_pca.py @@ -50,7 +50,7 @@ def test_500x2(self): """ m1 = self.generate_matrices_for_pca(30, seed=1304) X = self.sds.from_numpy( m1) - # print(features) + [res, model, _, _] = pca(X, K=1, scale="FALSE", center="FALSE").compute() for (x, y) in zip(m1, res): self.assertTrue((x[0] > 0 and y > 0) or (x[0] < 0 and y < 0)) diff --git a/src/main/python/tests/algorithms/test_signal.py b/src/main/python/tests/algorithms/test_signal.py index 85b0438b027..0680761bdd7 100644 --- a/src/main/python/tests/algorithms/test_signal.py +++ b/src/main/python/tests/algorithms/test_signal.py @@ -51,4 +51,6 @@ def test_create_signal(self): Xa = M.cos() @ signal Xb = M.sin() @ signal DFT = signal.cbind(Xa).cbind(Xb).compute() - print(DFT) + +if __name__ == "__main__": + unittest.main(exit=False) \ No newline at end of file diff --git a/src/main/python/tests/basics/test_context_creation.py b/src/main/python/tests/basics/test_context_creation.py index 6a72757255d..702a4a68598 100644 --- a/src/main/python/tests/basics/test_context_creation.py +++ b/src/main/python/tests/basics/test_context_creation.py @@ -50,3 +50,6 @@ def test_create_multiple_context(self): c.close() d.close() + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/basics/test_context_stats.py b/src/main/python/tests/basics/test_context_stats.py index 745629868c3..8578cf8a9ba 100644 --- a/src/main/python/tests/basics/test_context_stats.py +++ b/src/main/python/tests/basics/test_context_stats.py @@ -23,6 +23,7 @@ import numpy as np from systemds.context import SystemDSContext + np.random.seed(1412) @@ -30,7 +31,6 @@ class TestContextCreation(unittest.TestCase): sds: SystemDSContext = None - @classmethod def setUpClass(cls): cls.sds = SystemDSContext() @@ -57,7 +57,10 @@ def test_stats_v1(self): stats = self.sds.get_stats() self.sds.clear_stats() - instructions = "\n".join(stats.split("Heavy hitter instructions:")[1].split("\n")[2:]) + instructions = "\n".join(stats.split( + "Heavy hitter instructions:")[1].split("\n")[2:]) assert("+" in instructions and "*" in instructions and "/" in instructions) +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/docs/__init__.py b/src/main/python/tests/docs_test/__init__.py similarity index 100% rename from src/main/python/tests/docs/__init__.py rename to src/main/python/tests/docs_test/__init__.py diff --git a/src/main/python/tests/docs/test_algorithms_basics.py b/src/main/python/tests/docs_test/test_algorithms_basics.py similarity index 94% rename from src/main/python/tests/docs/test_algorithms_basics.py rename to src/main/python/tests/docs_test/test_algorithms_basics.py index 358f414d208..e6a07bb1023 100644 --- a/src/main/python/tests/docs/test_algorithms_basics.py +++ b/src/main/python/tests/docs_test/test_algorithms_basics.py @@ -19,11 +19,16 @@ # # ------------------------------------------------------------- -import unittest import logging +import unittest logging.getLogger("root").setLevel(50) + class TestAlgorithmsBasics(unittest.TestCase): def test_algorithms_script(self): import docs.source.code.guide.algorithms.FullScript + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/docs/test_simple_example.py b/src/main/python/tests/docs_test/test_simple_example.py similarity index 96% rename from src/main/python/tests/docs/test_simple_example.py rename to src/main/python/tests/docs_test/test_simple_example.py index ea60b412042..0bf4613089d 100644 --- a/src/main/python/tests/docs/test_simple_example.py +++ b/src/main/python/tests/docs_test/test_simple_example.py @@ -19,8 +19,8 @@ # # ------------------------------------------------------------- -import unittest import logging +import unittest logging.getLogger("root").setLevel(50) @@ -37,3 +37,7 @@ def test_l2svm(self): def test_l2svm_internal(self): import docs.source.code.getting_started.simpleExamples.l2svm_internal + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/federated/test_federated_tutorial.py b/src/main/python/tests/federated/test_federated_tutorial.py index 6ef7caab05b..05132536fc7 100644 --- a/src/main/python/tests/federated/test_federated_tutorial.py +++ b/src/main/python/tests/federated/test_federated_tutorial.py @@ -37,3 +37,7 @@ def test_part2(self): def test_part3(self): import docs.source.code.guide.federated.federatedTutorial_part3 + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/frame/test_rIndexing.py b/src/main/python/tests/frame/test_rIndexing.py index cb69f03ab3c..43d6c367734 100644 --- a/src/main/python/tests/frame/test_rIndexing.py +++ b/src/main/python/tests/frame/test_rIndexing.py @@ -63,37 +63,37 @@ def test_3(self): def test_4(self): m1 = self.sds.from_pandas(self.df) - npres = np.array(self.df.loc[:,4]) - res = np.array(m1[:,4].compute()).flatten() + npres = np.array(self.df.loc[:, 4]) + res = np.array(m1[:, 4].compute()).flatten() self.assertTrue(np.allclose(res, npres)) def test_5(self): m1 = self.sds.from_pandas(self.df) - npres = np.array(self.df.loc[:,4:5]) - res = np.array(m1[:,4:6].compute()) + npres = np.array(self.df.loc[:, 4:5]) + res = np.array(m1[:, 4:6].compute()) self.assertTrue(np.allclose(res, npres)) def test_6(self): m1 = self.sds.from_pandas(self.df) - npres = self.df.loc[1:1,4:5] - res = m1[1:2,4:6].compute() + npres = self.df.loc[1:1, 4:5] + res = m1[1:2, 4:6].compute() self.assertTrue(np.allclose(res, npres)) def test_7(self): m1 = self.sds.from_pandas(self.df) - npres = self.df.loc[1,4:5] - res = m1[1,4:6].compute() + npres = self.df.loc[1, 4:5] + res = m1[1, 4:6].compute() self.assertTrue(np.allclose(res, npres)) def test_8(self): m1 = self.sds.from_pandas(self.df) with self.assertRaises(NotImplementedError) as context: - res = m1[1:,4:6].compute() + res = m1[1:, 4:6].compute() def test_9(self): m1 = self.sds.from_pandas(self.df) with self.assertRaises(NotImplementedError) as context: - res = m1[:3,4:6].compute() + res = m1[:3, 4:6].compute() if __name__ == "__main__": diff --git a/src/main/python/tests/frame/test_r_c_bind.py b/src/main/python/tests/frame/test_r_c_bind.py index 71d79f0bb1c..6a6e99a1896 100644 --- a/src/main/python/tests/frame/test_r_c_bind.py +++ b/src/main/python/tests/frame/test_r_c_bind.py @@ -31,16 +31,19 @@ class TestRCBind(unittest.TestCase): # shape (2, 3) df_cb_1 = pd.DataFrame( - {"col1": ["col1_hello", "col1_world"], "col2": [0, 1], "col3": [0.0, 0.1]} + {"col1": ["col1_hello", "col1_world"], + "col2": [0, 1], "col3": [0.0, 0.1]} ) # shape (2, 2) - df_cb_2 = pd.DataFrame({"col4": ["col4_hello", "col4_world"], "col5": [0, 1]}) - df_cb_3 = pd.DataFrame({"col6": ["col6_hello", "col6_world"], "col7": [0, 1]}) - + df_cb_2 = pd.DataFrame( + {"col4": ["col4_hello", "col4_world"], "col5": [0, 1]}) + df_cb_3 = pd.DataFrame( + {"col6": ["col6_hello", "col6_world"], "col7": [0, 1]}) #shape (2, 3) df_rb_1 = pd.DataFrame( - {"col1": ["col1_hello_1", "col1_world_1"], "col2": [0, 1], "col3": [0.0, 0.1]} + {"col1": ["col1_hello_1", "col1_world_1"], + "col2": [0, 1], "col3": [0.0, 0.1]} ) #shape (4, 3) df_rb_2 = pd.DataFrame( @@ -59,7 +62,6 @@ class TestRCBind(unittest.TestCase): } ) - @classmethod def setUpClass(cls): cls.sds = SystemDSContext() @@ -82,9 +84,10 @@ def test_r_bind_triple(self): f3 = self.sds.from_pandas(self.df_rb_3) result_df = f1.rbind(f2).rbind(f3).compute() self.assertTrue(isinstance(result_df, pd.DataFrame)) - target_df = pd.concat([self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True) + target_df = pd.concat( + [self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True) self.assertTrue(target_df.equals(result_df)) - + def test_r_bind_triple_twostep(self): f1 = self.sds.from_pandas(self.df_rb_1) f2 = self.sds.from_pandas(self.df_rb_2) @@ -92,15 +95,16 @@ def test_r_bind_triple_twostep(self): tmp_df = f1.rbind(f2).compute() result_df = self.sds.from_pandas(tmp_df).rbind(f3).compute() self.assertTrue(isinstance(result_df, pd.DataFrame)) - target_df = pd.concat([self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True) + target_df = pd.concat( + [self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True) self.assertTrue(target_df.equals(result_df)) - + def test_c_bind_pair(self): f1 = self.sds.from_pandas(self.df_cb_1) f2 = self.sds.from_pandas(self.df_cb_2) result_df = f1.cbind(f2).compute() self.assertTrue(isinstance(result_df, pd.DataFrame)) - target_df= pd.concat([self.df_cb_1, self.df_cb_2], axis=1) + target_df = pd.concat([self.df_cb_1, self.df_cb_2], axis=1) self.assertTrue(target_df.equals(result_df)) def test_c_bind_triple(self): @@ -109,7 +113,8 @@ def test_c_bind_triple(self): f3 = self.sds.from_pandas(self.df_cb_3) result_df = f1.cbind(f2).cbind(f3).compute() self.assertTrue(isinstance(result_df, pd.DataFrame)) - target_df = pd.concat([self.df_cb_1, self.df_cb_2, self.df_cb_3], axis=1) + target_df = pd.concat( + [self.df_cb_1, self.df_cb_2, self.df_cb_3], axis=1) self.assertTrue(target_df.equals(result_df)) def test_c_bind_triple_twostep(self): @@ -119,11 +124,10 @@ def test_c_bind_triple_twostep(self): tmp_df = f1.cbind(f2).compute() result_df = self.sds.from_pandas(tmp_df).cbind(f3).compute() self.assertTrue(isinstance(result_df, pd.DataFrame)) - target_df = pd.concat([self.df_cb_1, self.df_cb_2, self.df_cb_3], axis=1) + target_df = pd.concat( + [self.df_cb_1, self.df_cb_2, self.df_cb_3], axis=1) self.assertTrue(target_df.equals(result_df)) - - if __name__ == "__main__": unittest.main(exit=False) diff --git a/src/main/python/tests/frame/test_replace.py b/src/main/python/tests/frame/test_replace.py index 7be735c19e5..c731a8c8e1a 100644 --- a/src/main/python/tests/frame/test_replace.py +++ b/src/main/python/tests/frame/test_replace.py @@ -57,11 +57,10 @@ def test_apply_recode_bin(self): format="csv", header=True, ) - ret = F1.replace("north", "south").replace("west", "south").replace("east","south").compute() + ret = F1.replace("north", "south").replace( + "west", "south").replace("east", "south").compute() self.assertTrue(any(ret.district == "south")) - self.assertTrue(not( any(ret.district == "north"))) - - + self.assertTrue(not(any(ret.district == "north"))) if __name__ == "__main__": diff --git a/src/main/python/tests/frame/test_slice.py b/src/main/python/tests/frame/test_slice.py index 8bfd0a0b674..5abb46cd3b4 100644 --- a/src/main/python/tests/frame/test_slice.py +++ b/src/main/python/tests/frame/test_slice.py @@ -93,3 +93,7 @@ def test_slice_last_row(self): def test_slice_row_col_both(self): with self.assertRaises(NotImplementedError): self.sds.from_pandas(df)[[1, 2], [0, 2]] + + +if __name__ == "__main__": + unittest.main(exit=False) diff --git a/src/main/python/tests/manual_tests/multi_log_reg_mnist.py b/src/main/python/tests/manual_tests/multi_log_reg_mnist.py index d4c34ed5be5..8e8690bb2c0 100644 --- a/src/main/python/tests/manual_tests/multi_log_reg_mnist.py +++ b/src/main/python/tests/manual_tests/multi_log_reg_mnist.py @@ -19,6 +19,8 @@ # # ------------------------------------------------------------- +import logging + from systemds.context import SystemDSContext from systemds.operator.algorithm import multiLogReg, multiLogRegPredict from systemds.examples.tutorials.mnist import DataManager @@ -35,4 +37,4 @@ Yt = sds.from_numpy(d.get_test_labels()) + 1.0 [_, _, acc] = multiLogRegPredict(Xt, bias, Yt).compute() -print(acc) +logging.info(acc) diff --git a/src/main/python/tests/matrix/test_slice.py b/src/main/python/tests/matrix/test_slice.py index bb988a6995f..a7ddfac79d4 100644 --- a/src/main/python/tests/matrix/test_slice.py +++ b/src/main/python/tests/matrix/test_slice.py @@ -79,3 +79,6 @@ def test_slice_single_col(self): def test_slice_row_col_both(self): with self.assertRaises(NotImplementedError): self.sds.from_numpy(m)[[1, 2], [0, 3]] + +if __name__ == "__main__": + unittest.main(exit=False) \ No newline at end of file diff --git a/src/main/python/tests/source/test_source_list.py b/src/main/python/tests/source/test_source_list.py index dc868916a8a..4a3cc7f60a1 100644 --- a/src/main/python/tests/source/test_source_list.py +++ b/src/main/python/tests/source/test_source_list.py @@ -61,3 +61,6 @@ def test_input_multireturn(self): # [b, c] = self.sds.source(self.source_path, "test", True).func2(arr) # res = c.sum().compute() # self.assertTrue(res == 10*10*4) + +if __name__ == "__main__": + unittest.main(exit=False) \ No newline at end of file