In [1]:
import sys
sys.path.append("../1.load")
from pathlib import Path

import pandas as pd
from sklearn.decomposition import PCA, KernelPCA

from preprocess import preprocess_profiles

In [2]:
# Profile sets to post-process. Each label spells out the pipeline steps
# (joined by "->") and maps to the parquet file holding those profiles.
# Built from ordered pairs so the iteration order matches the listing below.
profiles = dict(
    [
        (
            "subset->preprocess",
            "../1.load/output/subset_processed_profiles.parquet",
        ),
        (
            "subset->cc_adjust->preprocess",
            "../3.correct/output/subset_profiles_cc_corrected.parquet",
        ),
        (
            "subset->well_correct->preprocess",
            "../3.correct/output/subset_profiles_mean_corrected_preprocessed.parquet",
        ),
        (
            "subset->cc_adjust->well_correct->preprocess",
            "../3.correct/output/subset_profiles_cc_well_corrected.parquet",
        ),
    ]
)

In [3]:
# For every profile set: report how many non-metadata columns each prefix
# contributes, drop the columns whose name starts with "Image_", and write the
# result next to the input file with "_no_image" appended to the file stem.
for profile_path in profiles.values():
    source = Path(profile_path)
    table = pd.read_parquet(profile_path)

    # Prefix = text before the first underscore of each non-"Metadata_" column.
    feature_columns = table.filter(regex="^(?!Metadata_)").columns
    prefix_counts = feature_columns.str.split("_").str[0].value_counts()
    print(table.shape, "\n", prefix_counts)

    is_image_column = table.columns.str.startswith("Image_")
    table = table.loc[:, ~is_image_column]
    print(table.shape)

    table.to_parquet(source.with_name(source.stem + "_no_image.parquet"))

(19143, 614) 
 Cytoplasm    166
Image        160
Nuclei       140
Cells        131
dtype: int64
(19143, 454)
(19143, 665) 
 Cytoplasm    200
Image        172
Nuclei       141
Cells        135
dtype: int64
(19143, 493)
(19143, 644) 
 Image        174
Cytoplasm    172
Nuclei       144
Cells        137
dtype: int64
(19143, 470)
(19143, 680) 
 Cytoplasm    198
Image        183
Nuclei       145
Cells        137
dtype: int64
(19143, 497)


#### Also try PCA (keep the components that explain 99% of the variance)

In [5]:
# For every profile set: replace the non-metadata columns by their PCA scores
# and write metadata + scores next to the input file with a "_pca" suffix.
for p in profiles:
    df = pd.read_parquet(profiles[p])
    print(df.shape)
    # A fractional n_components with svd_solver='full' keeps as many components
    # as are needed to explain that share (99%) of the variance.
    pca = PCA(n_components=0.99, svd_solver='full')
    scores = pd.DataFrame(
        pca.fit_transform(df.filter(regex="^(?!Metadata_)")),
        # Reuse the source index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and pad with NaN) whenever the
        # parquet file restores a non-default index.
        index=df.index,
    )
    df = pd.concat([df.filter(regex="^Metadata_"), scores], axis=1)
    # Score columns are labelled with integers; cast all labels to str before
    # saving (presumably because the parquet writer expects string names).
    df.columns = df.columns.map(str)
    print(df.shape)
    df.to_parquet(Path(profiles[p]).parent / (Path(profiles[p]).stem + "_pca.parquet"))

(19143, 614)
(19143, 33)
(19143, 665)
(19143, 26)
(19143, 644)
(19143, 19)
(19143, 680)
(19143, 21)


#### PCA using cosine kernel (takes ~15 min on M1 32GB)

In [6]:
# Number of kernel-PCA components per profile set, in the iteration order of
# `profiles`.
# NOTE(review): these values equal the *total* column counts printed by the PCA
# cell (metadata columns + components), not the number of components PCA kept.
# The outputs of this cell (50/43/36/38 columns) imply 17 metadata columns, so
# the PCA runs kept 16/9/2/4 components. Confirm which counts are intended.
components = [33, 26, 19, 21]

# The counts are matched to the profile sets by position, so fail loudly if the
# two ever get out of sync instead of raising IndexError midway or silently
# skipping entries.
if len(components) != len(profiles):
    raise ValueError(
        f"expected one component count per profile set, got "
        f"{len(components)} counts for {len(profiles)} profile sets"
    )

# For every profile set: replace the non-metadata columns by their cosine-kernel
# PCA scores and write metadata + scores with a "_cospca" filename suffix.
for profile_path, n_components in zip(profiles.values(), components):
    df = pd.read_parquet(profile_path)
    print(profile_path, df.shape)
    kpca = KernelPCA(n_components=n_components, kernel='cosine')
    scores = pd.DataFrame(
        kpca.fit_transform(df.filter(regex="^(?!Metadata_)")),
        # Reuse the source index: concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and pad with NaN) whenever the
        # parquet file restores a non-default index.
        index=df.index,
    )
    df = pd.concat([df.filter(regex="^Metadata_"), scores], axis=1)
    # Score columns are labelled with integers; cast all labels to str before
    # saving (presumably because the parquet writer expects string names).
    df.columns = df.columns.map(str)
    print(df.shape)
    df.to_parquet(Path(profile_path).parent / (Path(profile_path).stem + "_cospca.parquet"))

../1.load/output/subset_processed_profiles.parquet (19143, 614)
(19143, 50)
../3.correct/output/subset_profiles_cc_corrected.parquet (19143, 665)
(19143, 43)
../3.correct/output/subset_profiles_mean_corrected_preprocessed.parquet (19143, 644)
(19143, 36)
../3.correct/output/subset_profiles_cc_well_corrected.parquet (19143, 680)
(19143, 38)
