Skip to content

Commit

Permalink
Merge pull request #4 from burtonrj/DataFrameTransform
Browse files Browse the repository at this point in the history
DataFrame transform
  • Loading branch information
burtonrj committed Jul 24, 2023
2 parents 302d9d1 + 4e7265f commit 03057c0
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 16 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
name: Cytotransform test & build

on: push
on:
push:
branches: [master, main]

jobs:
test:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
with:
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ pip install cytotransform

## Usage

The `transform` and `inverse_transform` methods take a numpy array or Pandas DataFrame as input and return the
transformed data in the same form: an array for array input, a DataFrame for DataFrame input.

### Parametrized logarithmic transformation

```python
Expand Down
44 changes: 38 additions & 6 deletions cytotransform/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Any, Callable

import numpy as np
import pandas as pd
from joblib import Parallel, cpu_count, delayed


Expand All @@ -23,11 +24,17 @@ def __init__(
def validation(self):
...

def transform(self, data: np.ndarray) -> np.ndarray:
return self._multiprocess_call(data, self._transform_function)
def transform(self, data: np.ndarray | pd.DataFrame) -> np.ndarray | pd.DataFrame:
if isinstance(data, pd.DataFrame):
return self._multiprocess_call_df(data, self._transform_function)
return self._multiprocess_call_array(data, self._transform_function)

def inverse_transform(self, data: np.ndarray) -> np.ndarray:
return self._multiprocess_call(data, self._inverse_transform_function)
def inverse_transform(
self, data: np.ndarray | pd.DataFrame
) -> np.ndarray | pd.DataFrame:
if isinstance(data, pd.DataFrame):
return self._multiprocess_call_df(data, self._inverse_transform_function)
return self._multiprocess_call_array(data, self._inverse_transform_function)

def _batches(self, data: np.ndarray) -> list[np.ndarray[Any, np.dtype[Any]]]:
"""
Expand All @@ -43,14 +50,39 @@ def _batches(self, data: np.ndarray) -> list[np.ndarray[Any, np.dtype[Any]]]:
np.ndarray
Batches of data.
"""
n = self.n_jobs if len(data) > 100 else 1
n = self.n_jobs if len(data) > 10000 else 1
return np.array_split(data, n)

def _multiprocess_call(self, data: np.ndarray, func: Callable) -> np.ndarray:
def _multiprocess_call_array(self, data: np.ndarray, func: Callable) -> np.ndarray:
    """
    Apply ``func`` to an array, fanning out over joblib workers when useful.

    Parameters
    ----------
    data : np.ndarray
        Input array.
    func : Callable
        Called as ``func(chunk, **self.parameters)``.

    Returns
    -------
    np.ndarray
        ``func`` applied to ``data``. When several batches are processed the
        per-batch results are joined with ``np.concatenate``.
    """
    if self.n_jobs in [0, 1]:
        return func(data, **self.parameters)
    batches = self._batches(data)
    if len(batches) == 1:
        # A lone batch gains nothing from a worker pool; run it in-process
        # and skip the pool start-up and pickling overhead.
        return func(batches[0], **self.parameters)
    with Parallel(n_jobs=self.n_jobs) as parallel:
        return np.concatenate(
            parallel(delayed(func)(batch, **self.parameters) for batch in batches)
        )

def _multiprocess_call_df(self, data: pd.DataFrame, func: Callable) -> pd.DataFrame:
    """
    Apply ``func`` to every column of a DataFrame, one column per task.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; each column is passed to ``func`` as a Series.
    func : Callable
        Called as ``func(column, **self.parameters)``.

    Returns
    -------
    pd.DataFrame
        Frame with the same index, column labels and column order as
        ``data``, holding the transformed values.
    """
    if data.shape[1] == 0:
        # pd.concat raises ValueError on an empty list; a frame with no
        # columns has nothing to transform, so hand back a copy.
        return data.copy()

    # Iterate positionally so duplicated column labels still yield one
    # Series per column (``data[label]`` would return a DataFrame).
    columns = [series for _, series in data.items()]

    if self.n_jobs in [0, 1]:
        transformed = [func(column, **self.parameters) for column in columns]
    else:
        with Parallel(n_jobs=self.n_jobs) as parallel:
            transformed = parallel(
                delayed(func)(column, **self.parameters) for column in columns
            )

    return pd.concat(
        [
            pd.Series(values, name=column.name, index=data.index)
            for values, column in zip(transformed, columns)
        ],
        axis=1,
    )
94 changes: 93 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "cytotransform"
version = "0.1.13"
version = "0.2.0"
description = "Rapid transformations for cytometry data"
authors = ["Ross Burton <burtonrossj@gmail.com>"]
license = "MIT"
Expand All @@ -14,6 +14,7 @@ scipy = "^1.10.1"
matplotlib = "^3.7.1"
setuptools = "^67.7.1"
pybind11 = "^2.10.4"
pandas = "^2.0.3"


[tool.poetry.group.dev.dependencies]
Expand Down
2 changes: 1 addition & 1 deletion tests/report.xml
Original file line number Diff line number Diff line change
@@ -1 +1 @@
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="6" time="1.260" timestamp="2023-07-22T15:57:12.881162" hostname="baymax"><testcase classname="tests.test_transformer" name="test_transforms[1-group0]" time="0.001" /><testcase classname="tests.test_transformer" name="test_transforms[-1-group1]" time="0.672" /><testcase classname="tests.test_transformer" name="test_transforms[1-group2]" time="0.001" /><testcase classname="tests.test_transformer" name="test_transforms[-1-group3]" time="0.131" /><testcase classname="tests.test_transformer" name="test_transforms[1-group4]" time="0.001" /><testcase classname="tests.test_transformer" name="test_transforms[-1-group5]" time="0.087" /></testsuite></testsuites>
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="6" time="1.849" timestamp="2023-07-24T07:00:30.680886" hostname="baymax"><testcase classname="tests.test_transformer" name="test_transforms_dataframe[1-group0]" time="0.005" /><testcase classname="tests.test_transformer" name="test_transforms_dataframe[-1-group1]" time="1.245" /><testcase classname="tests.test_transformer" name="test_transforms_dataframe[1-group2]" time="0.003" /><testcase classname="tests.test_transformer" name="test_transforms_dataframe[-1-group3]" time="0.228" /><testcase classname="tests.test_transformer" name="test_transforms_dataframe[1-group4]" time="0.005" /><testcase classname="tests.test_transformer" name="test_transforms_dataframe[-1-group5]" time="0.140" /></testsuite></testsuites>
41 changes: 36 additions & 5 deletions tests/test_transformer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import NamedTuple, Type

import numpy as np
import pandas as pd
import pytest

from cytotransform.asinh import AsinhTransform
Expand Down Expand Up @@ -174,22 +175,52 @@ class TestGroup(NamedTuple):
(-1, LogicleGroup),
],
)
def test_transforms(n_jobs: int, group: TestGroup):
def test_transforms_array(n_jobs: int, group: TestGroup):
for case in group.cases:
transformer = group.klass(**case.params, n_jobs=n_jobs)
if n_jobs == -1:
assert np.allclose(
transformer.transform(np.concatenate([group.x for _ in range(1000)])),
np.concatenate([case.y for _ in range(1000)]),
transformer.transform(np.concatenate([group.x for _ in range(10000)])),
np.concatenate([case.y for _ in range(10000)]),
atol=1e-5,
)
assert np.allclose(
transformer.inverse_transform(
np.concatenate([case.y for _ in range(1000)])
np.concatenate([case.y for _ in range(10000)])
),
np.concatenate([group.x for _ in range(1000)]),
np.concatenate([group.x for _ in range(10000)]),
atol=1e-5,
)

assert np.allclose(transformer.transform(group.x), case.y, atol=1e-5)
assert np.allclose(transformer.inverse_transform(case.y), group.x, atol=1e-5)


@pytest.mark.parametrize(
    "n_jobs,group",
    [
        (jobs, grp)
        for grp in (AsinhGroup, LogGroup, LogicleGroup)
        for jobs in (1, -1)
    ],
)
def test_transforms_dataframe(n_jobs: int, group: TestGroup):
    """
    Round-trip every case of ``group`` through the DataFrame code path.

    Each case builds a three-column frame whose columns all hold the same
    input, checks every transformed column against the expected output, then
    checks that the inverse transform recovers the input. With
    ``n_jobs == -1`` the input and expected output are repeated 10000 times.
    """
    for case in group.cases:
        transformer = group.klass(**case.params, n_jobs=n_jobs)
        raw, expected = group.x, case.y
        if n_jobs == -1:
            raw = np.concatenate([raw] * 10000)
            expected = np.concatenate([expected] * 10000)

        frame = pd.DataFrame({label: raw for label in ("x1", "x2", "x3")})

        forward = transformer.transform(frame)
        for label in forward.columns:
            assert np.allclose(forward[label], expected, atol=1e-5)

        backward = transformer.inverse_transform(forward)
        for label in backward.columns:
            assert np.allclose(backward[label], raw, atol=1e-5)

0 comments on commit 03057c0

Please sign in to comment.