In [3]:
import torch
import glob
import json
import os

import pandas as pd
import torch
import numpy as np


In [7]:

# Experiment folders, one per data split, ordered by name.
# Only the first five "bf" folders are kept; the "fl" list is left untruncated.
bf_folders = glob.glob("/proj/haste_berzelius/exps/specs_non_grit_based/*bf*")
bf_folders.sort()
bf_folders = bf_folders[:5]

fl_folders = glob.glob("/proj/haste_berzelius/exps/specs_non_grit_based/*fl*")
fl_folders.sort()

print(bf_folders, fl_folders)

['/proj/haste_berzelius/exps/specs_non_grit_based/bf_exps_1_split1', '/proj/haste_berzelius/exps/specs_non_grit_based/bf_exps_1_split2', '/proj/haste_berzelius/exps/specs_non_grit_based/bf_exps_1_split3', '/proj/haste_berzelius/exps/specs_non_grit_based/bf_exps_1_split4', '/proj/haste_berzelius/exps/specs_non_grit_based/bf_exps_1_split5'] ['/proj/haste_berzelius/exps/specs_non_grit_based/fl_exps_1_split1', '/proj/haste_berzelius/exps/specs_non_grit_based/fl_exps_1_split2', '/proj/haste_berzelius/exps/specs_non_grit_based/fl_exps_1_split3', '/proj/haste_berzelius/exps/specs_non_grit_based/fl_exps_1_split4', '/proj/haste_berzelius/exps/specs_non_grit_based/fl_exps_1_split5']


In [74]:
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from utils.cka.cka_features import CKA

# CP feature table that the network feature vectors are compared against.
cp_df_cell = pd.read_csv("stats/non_grit_based/CP_features_cells.csv")
# Everything from the 7th column onwards is treated as a CP feature.
# NOTE(review): assumes the first six columns are metadata/keys - confirm against the CSV.
cp_feature_columns = cp_df_cell.columns[6:]
feature_groups = ['AreaShape', 'Correlation', 'Granularity', 'Intensity', 'Neighbors', 'RadialDistribution']

# Names of the network feature-vector columns, plain and with the suffixes that the
# FL/BF merge below appends to overlapping columns.
fv_columns = [f"fv_{i}" for i in range(2048)]
fl_columns = [f"{col}_fl" for col in fv_columns]
bf_columns = [f"{col}_bf" for col in fv_columns]

# BF site labels are translated to the FL site labels so both modalities share merge keys.
site_conversion = pd.DataFrame(
    {"bf_sites": ["s1", "s2", "s3", "s4", "s5"], "f_sites": ["s2", "s4", "s5", "s6", "s8"]}
)
bf_to_fl_site = site_conversion.set_index("bf_sites")["f_sites"]

# A single CKA instance serves every comparison; it also stays available to later cells.
cka = CKA()


def _standardize(values):
    """Return `values` with every column scaled by a freshly fitted StandardScaler."""
    return preprocessing.StandardScaler().fit_transform(values)


def _impute_and_standardize(values):
    """Replace NaNs with the column mean, then scale every column."""
    filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(values)
    return _standardize(filled)


def _cka_record(split, mode1, mode2, first, second):
    """Build one result row with the linear and kernel CKA of two feature matrices.

    `first` and `second` are handed to `linear_CKA` / `kernel_CKA` in that order.
    The "rkernel_cka" key is kept unchanged because it is a column name of `cka_df`.
    """
    return {
        "split": split,
        "mode1": mode1,
        "mode2": mode2,
        "linear_cka": cka.linear_CKA(first, second),
        "rkernel_cka": cka.kernel_CKA(first, second),
    }


def _cp_records(split, mode, mode_df):
    """Compare one modality's network features with the CP features.

    Returns the "cp_all" row followed by one row per entry of `feature_groups`.
    """
    merged = pd.merge(mode_df, cp_df_cell, on=["plate", "well", "site", "compound"])
    network_features = _standardize(merged[fv_columns].values)

    records = [
        _cka_record(
            split, mode, "cp_all",
            _impute_and_standardize(merged[cp_feature_columns].values),
            network_features,
        )
    ]
    for feature_group in feature_groups:
        # Match on the CP feature columns only, so that key/metadata columns or network
        # columns of the merged frame can never be picked up by the substring test.
        group_columns = [col for col in cp_feature_columns if feature_group in col]
        records.append(
            _cka_record(
                split, mode, f"cp_{feature_group}",
                _impute_and_standardize(merged[group_columns].values),
                network_features,
            )
        )
    return records


cka_dict_list = []

for split, (bf_folder, fl_folder) in enumerate(zip(bf_folders, fl_folders), start=1):
    fl_exp_folder = os.path.join(fl_folder, "fl_11cls_basic_aug_dmso_norm_750e_sgd/ResNet_resnet50/")
    fl_df = pd.read_csv(os.path.join(fl_exp_folder, "feature_data_test.csv"))

    bf_exp_folder = os.path.join(bf_folder, "bf_11cls_basic_aug_dmsonorm_750e_sgd/ResNet_resnet50/")
    bf_df = pd.read_csv(os.path.join(bf_exp_folder, "feature_data_test.csv"))
    # Sites missing from the mapping become NaN and therefore drop out of the inner merges.
    bf_df["site"] = bf_df["site"].map(bf_to_fl_site)

    # FL vs BF on the rows present in both modalities.
    fl_bf_merge = pd.merge(fl_df, bf_df, on=["plate", "well", "compound", "site"], suffixes=("_fl", "_bf"))
    cka_dict_list.append(
        _cka_record(
            split, "fl", "bf",
            _standardize(fl_bf_merge[fl_columns].values),
            _standardize(fl_bf_merge[bf_columns].values),
        )
    )

    # FL vs CP, then BF vs CP (overall first, then each feature group).
    for mode, mode_df in (("fl", fl_df), ("bf", bf_df)):
        cka_dict_list.extend(_cp_records(split, mode, mode_df))

cka_df = pd.DataFrame(cka_dict_list, columns=["split", "mode1", "mode2", "linear_cka", "rkernel_cka"])
print(cka_df)


    split mode1                  mode2  linear_cka  rkernel_cka
0       1    fl                     bf    0.374020     0.470067
1       1    fl                 cp_all    0.247485     0.334978
2       1    fl           cp_AreaShape    0.168469     0.258229
3       1    fl         cp_Correlation    0.252894     0.359747
4       1    fl         cp_Granularity    0.221514     0.309515
..    ...   ...                    ...         ...          ...
70      5    bf         cp_Correlation    0.267929     0.406174
71      5    bf         cp_Granularity    0.252181     0.411151
72      5    bf           cp_Intensity    0.204782     0.361768
73      5    bf           cp_Neighbors    0.343397     0.391400
74      5    bf  cp_RadialDistribution    0.203854     0.384160

[75 rows x 5 columns]


In [76]:
# Write the CKA table to disk without the row index.
cka_df.to_csv(path_or_buf="cka_df.csv", index=False)


In [47]:
from sklearn.impute import SimpleImputer

# Single-split check: CKA between all CP features and the FL network features.
# Uses names left in the kernel by the per-split loop cell: `fl_df` (whichever split was
# loaded last), `fv_columns`, `cka` and `preprocessing`.
cp_df_cell = pd.read_csv("stats/non_grit_based/CP_features_cells.csv")

x = pd.merge(cp_df_cell, fl_df, on=["plate", "well", "site", "compound"])
# This merge keeps the plain "fv_<i>" column names. The "_fl"-suffixed names held in
# `fl_columns` exist only in the FL/BF merge, so selecting them here raises KeyError.
fl_features = preprocessing.StandardScaler().fit_transform(x[fv_columns].values)
# NOTE(review): assumes the first six CP columns are metadata/keys - confirm against the CSV.
cp_features = preprocessing.StandardScaler().fit_transform(
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[cp_df_cell.columns[6:]].values)
)
linear_cka = cka.linear_CKA(cp_features, fl_features)
kernel_cka = cka.kernel_CKA(cp_features, fl_features)
print(linear_cka, kernel_cka)

0.3130609035422896 0.4327822092817944


In [48]:
# Single-split check: CKA between all CP features and the BF network features.
# Uses names left in the kernel by earlier cells: `bf_df` (whichever split was loaded
# last, with its sites already mapped), `cp_df_cell`, `fv_columns`, `cka`,
# `preprocessing` and `SimpleImputer`.
x = pd.merge(cp_df_cell, bf_df, on=["plate", "well", "site", "compound"])
# This merge keeps the plain "fv_<i>" column names. The "_bf"-suffixed names held in
# `bf_columns` exist only in the FL/BF merge, so selecting them here raises KeyError.
bf_features = preprocessing.StandardScaler().fit_transform(x[fv_columns].values)
# NOTE(review): assumes the first six CP columns are metadata/keys - confirm against the CSV.
cp_features = preprocessing.StandardScaler().fit_transform(
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[cp_df_cell.columns[6:]].values)
)
linear_cka = cka.linear_CKA(cp_features, bf_features)
kernel_cka = cka.kernel_CKA(cp_features, bf_features)
print(linear_cka, kernel_cka)

0.32074577766791257 0.5312861691513319


In [55]:
# CKA between the BF network features and the CP features: all of them first, then one
# feature group at a time. Uses `bf_df`, `cp_df_cell`, `fv_columns`, `cka`,
# `preprocessing` and `SimpleImputer` left in the kernel by earlier cells.
feature_groups = ['AreaShape', 'Correlation', 'Granularity', 'Intensity', 'Neighbors', 'RadialDistribution']

x = pd.merge(cp_df_cell, bf_df, on=["plate", "well", "site", "compound"])
# NOTE(review): assumes the first six CP columns are metadata/keys - confirm against the CSV.
cp_feature_columns = cp_df_cell.columns[6:]

# This merge keeps the plain "fv_<i>" names; the "_bf"-suffixed `bf_columns` are absent here.
# The scaled network features do not depend on the feature group, so compute them once.
bf_features = preprocessing.StandardScaler().fit_transform(x[fv_columns].values)

cp_features = preprocessing.StandardScaler().fit_transform(
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[cp_feature_columns].values)
)
linear_cka = cka.linear_CKA(cp_features, bf_features)
kernel_cka = cka.kernel_CKA(cp_features, bf_features)
print("Overall", linear_cka, kernel_cka)

for feature_group in feature_groups:
    # `col` rather than `x`, so the comprehension does not shadow the merged DataFrame.
    feature_columns = [col for col in cp_feature_columns if feature_group in col]
    cp_features = preprocessing.StandardScaler().fit_transform(
        SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[feature_columns].values)
    )
    linear_cka = cka.linear_CKA(cp_features, bf_features)
    kernel_cka = cka.kernel_CKA(cp_features, bf_features)
    print(feature_group, linear_cka, kernel_cka)


Overall 0.32074577766791257 0.5312861691513319
AreaShape 0.21452502655963746 0.4252961088318483
Correlation 0.32589881732014125 0.5002984544429935
Granularity 0.2937203549788155 0.5082503450563838
Intensity 0.23312933536660416 0.45004064467799487
Neighbors 0.39817254534097063 0.4785261225206233
RadialDistribution 0.23665023800461593 0.46678059689676116


In [56]:
# CKA between the FL network features and the CP features: all of them first, then one
# feature group at a time. Uses `fl_df`, `cp_df_cell`, `fv_columns`, `cka`,
# `preprocessing` and `SimpleImputer` left in the kernel by earlier cells.
feature_groups = ['AreaShape', 'Correlation', 'Granularity', 'Intensity', 'Neighbors', 'RadialDistribution']

x = pd.merge(cp_df_cell, fl_df, on=["plate", "well", "site", "compound"])
# NOTE(review): assumes the first six CP columns are metadata/keys - confirm against the CSV.
cp_feature_columns = cp_df_cell.columns[6:]

# This merge keeps the plain "fv_<i>" names; the "_fl"-suffixed `fl_columns` are absent here.
# The scaled network features do not depend on the feature group, so compute them once.
fl_features = preprocessing.StandardScaler().fit_transform(x[fv_columns].values)

cp_features = preprocessing.StandardScaler().fit_transform(
    SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[cp_feature_columns].values)
)
linear_cka = cka.linear_CKA(cp_features, fl_features)
kernel_cka = cka.kernel_CKA(cp_features, fl_features)
print("Overall", linear_cka, kernel_cka)

for feature_group in feature_groups:
    # `col` rather than `x`, so the comprehension does not shadow the merged DataFrame.
    feature_columns = [col for col in cp_feature_columns if feature_group in col]
    cp_features = preprocessing.StandardScaler().fit_transform(
        SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(x[feature_columns].values)
    )
    linear_cka = cka.linear_CKA(cp_features, fl_features)
    kernel_cka = cka.kernel_CKA(cp_features, fl_features)
    print(feature_group, linear_cka, kernel_cka)


Overall 0.3130609035422896 0.4327822092817944
AreaShape 0.18705810730408373 0.3163120473222049
Correlation 0.3031421266001194 0.45014942276013176
Granularity 0.2545782300344608 0.38195592547576124
Intensity 0.24751656929927032 0.3887707566232473
Neighbors 0.2509795225575832 0.3217496472097429
RadialDistribution 0.24589908479104142 0.3789290935606736
