In [3]:
import json
import tkinter as tk
from pathlib import Path
from tkinter import filedialog

import numpy as np
import pandas as pd
import flowkit as fk

import ipywidgets as widgets
from tqdm.notebook import tqdm

from utils import ask_directory
from data import get_df
from logicle_scaling import logicle_transform


# Seed NumPy's global random generator once, up front, so that any NumPy-based
# random draws made later in the notebook are repeatable between runs.
SEED = 7
np.random.seed(seed=SEED)

  from pandas.core import (


### Set the raw data directory (*FCS* files):

In [4]:
# Prompt for the folder that holds the raw FCS files, then echo the choice so
# it stays visible in the notebook output.
raw_data_dir = ask_directory(
    title="Select your data directory",
)

print(raw_data_dir)

In [6]:
# Normalise the chosen folder to a Path, then pair every ".fcs" file found at
# any depth below it with a flag telling whether the file's own folder contains
# at least one entry whose name ends in "images".
raw_data_dir = Path(raw_data_dir)
fcs_files = raw_data_dir.glob("**/*.fcs")
sample_files = [
    (fcs_path, any(fcs_path.parent.glob("*images")))
    for fcs_path in fcs_files
]

### Select samples to transform into the Logicle scale:

In [5]:
# One human-readable label per discovered sample: file name plus its images flag.
sample_labels = [
    f"{fcs_path.name} (images: {has_images})"
    for fcs_path, has_images in sample_files
]

# Multi-select list of all samples; those flagged as having images start selected.
selector = widgets.SelectMultiple(
    options=sample_labels,
    value=[
        label
        for label, (_, has_images) in zip(sample_labels, sample_files)
        if has_images
    ],
    rows=14,
    description="Select Samples:",
)

display(selector)

SelectMultiple(description='Select Samples:', index=(1, 2, 3, 4, 5, 9), options=('Amphidinium.fcs (images: Fal…

### Select the folder where you want to save transformed data:

In [17]:
# Ask where the transformed data should be saved, using a native folder dialog.
# A throw-away Tk root is needed as the dialog's parent; it is made borderless
# and withdrawn so that only the dialog itself is shown.
# Requires `tk` (tkinter) and `filedialog` (tkinter.filedialog) from the import cell.
root = tk.Tk()
try:
    root.overrideredirect(1)
    root.withdraw()

    transformed_data_dir = filedialog.askdirectory(
        title="Where to save transformed data",
        mustexist=False,  # the folder may not exist yet; it is created in the next cell
        parent=root
    )
finally:
    # Always tear the hidden root down, even if the dialog raised.
    root.quit()
    root.destroy()

# A cancelled dialog yields an empty value; wrapping that in Path() would
# silently point at the current working directory, so stop here instead.
if not transformed_data_dir:
    raise ValueError("No output directory was selected for the transformed data.")

In [18]:
# Turn the chosen location into a Path and make sure the folder (and any
# missing parents) exists; an already existing folder is accepted as-is.
transformed_data_dir = Path(transformed_data_dir)
transformed_data_dir.mkdir(parents=True, exist_ok=True)

print(transformed_data_dir)

/Users/mehdi.seifi/Projects/AI4Life/project_10_sorting/data/transformed_data


In [19]:
# Persist the raw and transformed data locations (as absolute paths) to
# ./config.json, relative to the current working directory.
with open("./config.json", mode="w") as config_file:
    config = {
        "raw_data_dir": str(raw_data_dir.absolute()),
        "logicle_data_dir": str(transformed_data_dir.absolute()),
    }
    json.dump(config, config_file, indent=4)

### Cleaning data and dropping uninformative channels (columns)

In [8]:
# Translate the positions picked in the selection widget back into the
# corresponding (path, has_images) entries of `sample_files`.
selected_samples = [
    sample_files[position]
    for position in selector.index
]

In [9]:
# Load every selected sample and collect the columns that should be removed.
# "Saturated" and "Time" are always dropped; whatever `get_df` reports for a
# sample is appended, so the list ends up covering all samples (duplicates are
# possible at this point).
all_drop_cols = ["Saturated", "Time"]
dataframes = {}

progress = tqdm(selected_samples, desc="processing/cleaning samples")
for sample_path, _has_images in progress:
    sample_name = sample_path.stem
    print(f"proessing {sample_name}")
    # `get_df` hands back the sample's dataframe together with its drop candidates.
    dataframes[sample_name], sample_drop_cols = get_df(sample_path)
    all_drop_cols += sample_drop_cols

processing/cleaning samples:   0%|          | 0/6 [00:00<?, ?it/s]

proessing Skeletonema
proessing Lingodinium
proessing C.6818
proessing Emiliana_huxley
proessing Phaeodactylum
proessing Synechococcus


In [10]:
# De-duplicate the collected column names, then remove them from every sample.
# errors="ignore" lets a sample lack some of the listed columns without failing.
all_drop_cols = set(all_drop_cols)

cleaned_dataframes = {}
for sample_name, frame in dataframes.items():
    cleaned_dataframes[sample_name] = frame.drop(columns=all_drop_cols, errors="ignore")
dataframes = cleaned_dataframes

### Transform selected data into the Logicle scale:

In [11]:
# Transform every channel of every cleaned sample with `logicle_transform` and
# save, per sample, an HDF5 file "<name>.h5" in `transformed_data_dir` holding:
#   "df"       - the transformed dataframe
#   "logicles" - one row per channel with the parameters of the transform used
logicle_dataframes = {}
# Column labels for the per-channel parameters returned by `get_params()`.
# NOTE(review): assumes `get_params()` yields four values in this order - confirm
# against `logicle_scaling`.
logicle_cols = [
    "top_of_scale", "m_positive_decades",
    "width_of_linear", "addition_negative"
]

for name, cleaned_df in tqdm(dataframes.items(), desc="Dataframes", position=0):
    # Work on a copy so the cleaned frames in `dataframes` are left untouched.
    df = cleaned_df.copy()
    logicle_transforms = {}
    for col in tqdm(df.columns, desc=f"{name}: applying logicle transform on channels", position=1, leave=True):
        df[col], logicle = logicle_transform(df[col].to_numpy(), return_transform=True)
        logicle_transforms[col] = logicle.get_params()

    logicle_params = pd.DataFrame(
        data=list(logicle_transforms.values()),
        index=list(logicle_transforms.keys()),
        columns=logicle_cols
    )

    # Open the store only once the data is ready, and let the context manager
    # close it: a failure during the transform no longer truncates an existing
    # file, and a failure during writing no longer leaves the HDF5 handle open.
    with pd.HDFStore(transformed_data_dir.joinpath(f"./{name}.h5"), mode="w") as storage:
        storage["df"] = df
        storage["logicles"] = logicle_params

    logicle_dataframes[name] = df

Dataframes:   0%|          | 0/6 [00:00<?, ?it/s]

Skeletonema: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]

Lingodinium: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]

C.6818: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]

Emiliana_huxley: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]

Phaeodactylum: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]

Synechococcus: applying logicle transform on channels:   0%|          | 0/421 [00:00<?, ?it/s]