In [6]:
import sys

sys.path.append("../scripts")
from pathlib import Path

from tqdm.auto import tqdm

import pandas as pd
from sklearn.decomposition import PCA, KernelPCA

In [7]:
# Registry of the profile tables processed below: each key spells out the
# pipeline steps that produced the table, each value is the parquet file
# (relative to this notebook) holding it. Insertion order is the processing order.
profiles = dict(
    (
        (
            "subset->preprocess",
            "../1.load/output/subset_processed_profiles.parquet",
        ),
        (
            "subset->cc_adjust->preprocess",
            "../3.correct/output/subset_profiles_cc_corrected.parquet",
        ),
        (
            "subset->well_correct->preprocess",
            "../3.correct/output/subset_profiles_mean_corrected_preprocessed.parquet",
        ),
        (
            "subset->cc_adjust->well_correct->preprocess",
            "../3.correct/output/subset_profiles_cc_well_corrected.parquet",
        ),
    )
)

In [8]:
# For every registered profile table: report its size and how many feature
# columns each name prefix contributes, then write a sibling copy without the
# "Image_" columns as "<original stem>_no_image.parquet".
for source in profiles.values():
    source_path = Path(source)
    table = pd.read_parquet(source)

    # Feature columns are everything not prefixed "Metadata_"; the group label
    # is the text before the first underscore of the column name.
    feature_names = table.filter(regex="^(?!Metadata_)").columns
    prefix_counts = feature_names.str.split("_").str[0].value_counts()
    print(table.shape, "\n", prefix_counts)

    # Boolean column mask: keep every column that is not an Image_ feature.
    is_image_column = table.columns.str.startswith("Image_")
    table_no_image = table.loc[:, ~is_image_column]
    print(table_no_image.shape)

    # Same directory as the input, "_no_image" appended to the file stem.
    table_no_image.to_parquet(
        source_path.with_name(source_path.stem + "_no_image.parquet")
    )

(19143, 842) 
 Cells        223
Cytoplasm    216
Nuclei       195
Image        191
dtype: int64
(19143, 651)
(19143, 905) 
 Cytoplasm    243
Cells        223
Image        222
Nuclei       200
dtype: int64
(19143, 683)
(19143, 972) 
 Cytoplasm    278
Nuclei       241
Cells        226
Image        210
dtype: int64
(19143, 762)
(19143, 984) 
 Cytoplasm    276
Image        249
Nuclei       232
Cells        210
dtype: int64
(19143, 735)


#### Also try PCA (keeping the components that explain 99% of the variance)

In [9]:
# Reduce every registered profile table with PCA and save the result next to
# the input as "<original stem>_pca.parquet". Metadata_ columns are carried
# over unchanged; all other columns are replaced by the principal components.
for source in profiles.values():
    source_path = Path(source)
    table = pd.read_parquet(source)
    print(table.shape)

    metadata = table.filter(regex="^Metadata_")
    features = table.filter(regex="^(?!Metadata_)")

    # A float n_components with svd_solver="full" keeps as many components as
    # are needed to explain that fraction (99%) of the variance.
    pca = PCA(n_components=0.99, svd_solver="full")

    # Give the component frame the table's own index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and pad with
    # NaN) whenever the parquet was stored with a different index.
    embedding = pd.DataFrame(pca.fit_transform(features), index=table.index)

    reduced = pd.concat([metadata, embedding], axis=1)
    # Component columns are labelled 0..k-1 as integers; convert every label
    # to str before writing (to_parquet expects string column names).
    reduced.columns = reduced.columns.map(str)
    print(reduced.shape)

    reduced.to_parquet(source_path.with_name(source_path.stem + "_pca.parquet"))

(19143, 842)
(19143, 45)
(19143, 905)
(19143, 26)
(19143, 972)
(19143, 20)
(19143, 984)
(19143, 20)


In [10]:
# Same PCA reduction as for the full feature set, but with the "Image_"
# columns removed first. Output goes next to the input as
# "<original stem>_no_image_pca.parquet".
for source in profiles.values():
    source_path = Path(source)
    table = pd.read_parquet(source)

    # Drop the Image_ features before fitting.
    table = table.loc[:, ~table.columns.str.startswith("Image_")]
    print(table.shape)

    metadata = table.filter(regex="^Metadata_")
    features = table.filter(regex="^(?!Metadata_)")

    # A float n_components with svd_solver="full" keeps as many components as
    # are needed to explain that fraction (99%) of the variance.
    pca = PCA(n_components=0.99, svd_solver="full")

    # Give the component frame the table's own index: concat(axis=1) aligns on
    # index labels, so a fresh RangeIndex would misalign rows (and pad with
    # NaN) whenever the parquet was stored with a different index.
    embedding = pd.DataFrame(pca.fit_transform(features), index=table.index)

    reduced = pd.concat([metadata, embedding], axis=1)
    # Component columns are labelled 0..k-1 as integers; convert every label
    # to str before writing (to_parquet expects string column names).
    reduced.columns = reduced.columns.map(str)
    print(reduced.shape)

    reduced.to_parquet(
        source_path.with_name(source_path.stem + "_no_image_pca.parquet")
    )

(19143, 651)
(19143, 301)
(19143, 683)
(19143, 25)
(19143, 762)
(19143, 50)
(19143, 735)
(19143, 19)


#### Kernel PCA with a cosine kernel (takes >15 min on an M1 with 32 GB RAM)

In [5]:
# Numbers of kernel-PCA components to produce; one output file is written per
# (component count, profile table) pair.
components = [30, 20]

# Cosine-kernel PCA of every registered profile table. Metadata_ columns are
# carried over unchanged; all other columns are replaced by the projections.
# Output goes next to the input as "<original stem>_cospca_<c_num>.parquet".
for c_num in components:
    print(f"{c_num=}")
    for source in tqdm(profiles.values()):
        source_path = Path(source)
        table = pd.read_parquet(source)
        print(source, table.shape)

        metadata = table.filter(regex="^Metadata_")
        features = table.filter(regex="^(?!Metadata_)")

        kpca = KernelPCA(n_components=c_num, kernel="cosine")

        # Give the projection frame the table's own index: concat(axis=1)
        # aligns on index labels, so a fresh RangeIndex would misalign rows
        # (and pad with NaN) whenever the parquet was stored with a different
        # index.
        embedding = pd.DataFrame(kpca.fit_transform(features), index=table.index)

        reduced = pd.concat([metadata, embedding], axis=1)
        # Component columns are labelled 0..k-1 as integers; convert every
        # label to str before writing (to_parquet expects string column names).
        reduced.columns = reduced.columns.map(str)
        print(reduced.shape)

        reduced.to_parquet(
            source_path.with_name(source_path.stem + f"_cospca_{c_num}.parquet")
        )

c_num=30


  0%|          | 0/4 [00:00<?, ?it/s]

../1.load/output/subset_processed_profiles.parquet (19143, 842)
(19143, 47)
../3.correct/output/subset_profiles_cc_corrected.parquet (19143, 905)
(19143, 47)
../3.correct/output/subset_profiles_mean_corrected_preprocessed.parquet (19143, 972)
(19143, 47)
../3.correct/output/subset_profiles_cc_well_corrected.parquet (19143, 984)
(19143, 47)
c_num=20


  0%|          | 0/4 [00:00<?, ?it/s]

../1.load/output/subset_processed_profiles.parquet (19143, 842)
(19143, 37)
../3.correct/output/subset_profiles_cc_corrected.parquet (19143, 905)
(19143, 37)
../3.correct/output/subset_profiles_mean_corrected_preprocessed.parquet (19143, 972)
(19143, 37)
../3.correct/output/subset_profiles_cc_well_corrected.parquet (19143, 984)
(19143, 37)
