#  INIT

In [1]:
import sys
sys.path.append('../../../')
sys.path.append('../../infras/cellMix/')
sys.path.append('../../infras/cytof_data/')
sys.path.append('../../infras/')
sys.path.append('../../infras/dashboards/')
sys.path.append('../../experiments/')
sys.path.append('../../experiments/pipeline/')
sys.path.append('../../preprocess/intra_variance/')
sys.path.append('../../models/cell_proportions/')
sys.path.append('../../measures/cell_proportions_measures/')
sys.path.append('../../preprocess/cell_specifics/')
sys.path.append('../../preprocess/data_sets/')


In [2]:
from data_factory import DataFactory
from global_utils import GlobalUtils
from cytof_cell_count_infra import CytofCellCountInfra
from cell_proportions_experiments import  CellProportionsExperiments
import exploration_cytof_plots as cytof_plots
from basic import BasicDeconv
from pp_entropy_based import PpEntropyBased
from pp_dep_de_based import  PpDepDeBased
from cell_proportions_measure import CellProportionsMeasure
from pp_clean_high_intra_var import PpCleanHighIntraVar
from pp_clean_irrelevant_proteins import PpCleanIrrelevantProteins
from pp_empty import PpEmpty
from pp_entropy_based_only_largest import PpEntropyBasedOnlyLargest
from aggregate_intra_variance import AggregateIntraVariance
from pipeline_deconv import PipelineDeconv
from deconv_py.preprocess.base import BasePreprocess as PP_base
from deconv_py.preprocess.cell_specific import CellSpecific as PP_proteins
from deconv_py.preprocess.cell_specifics.pp_svm_signature import PpSvmSignature
from deconv_py.preprocess.cell_specifics.pp_entropy_based_totel_sum import PpEntropyBasedTotelSum
from deconv_py.preprocess.cell_specifics.pp_floor_under_quantile import PpFloorUnderQuantile
from pick_data_set import PickDataSet
# from deconvolution_results_plots import DeconvolutionResultsPlots



from pp_keep_specific_cells  import  PpKeepSpecificCells
from pp_agg_to_specific_cells import PpAggToSpecificCells

# from deconv_py.infras.data_factory import DataFactory
from deconv_py.infras.data_loader import DataLoader
from deconv_py.models.base import Base as Models_base
from deconv_py.models.cell_proportions_models import CellProportions
from deconv_py.models.cell_specific_models import CellSpecificPerPermutation
from deconv_py.experiments.cell_specific import CellSpecificMetricsPlot
from cellMix_coordinator import CellMixCoordinator

import pandas as pd
import numpy as np
from functools import partial
import multiprocessing
from sklearn import pipeline
import itertools
from scipy.optimize import least_squares
from sklearn.metrics import mean_squared_error
from functools import partial
from scipy.optimize import minimize
import scipy.optimize
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import os
import pickle as pkl 
from sklearn.decomposition import PCA
from IPython.display import display, HTML


%connect_info

{
  "shell_port": 63534,
  "iopub_port": 63535,
  "stdin_port": 63536,
  "control_port": 63537,
  "hb_port": 63538,
  "ip": "127.0.0.1",
  "key": "920e6d62-fee106acbb5bbf71ce9b42e4",
  "transport": "tcp",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Paste the above JSON into a file, and connect with:
    $> jupyter <app> --existing <file>
or, if you are local, you can connect with just:
    $> jupyter <app> --existing kernel-7d0fa404-f647-459b-9811-5e3033ee6ad6.json
or even just:
    $> jupyter <app> --existing
if this is the most recent Jupyter kernel you have started.


# Read and build the data

In [3]:
# Load the two variants of the IBD "all_vs" data set. In both cases the index
# function keeps only the text before the first ';' of each index label.
data_factory = DataFactory()

# Default loader, with log2_transformation=True.
A_all_vs, B_all_vs = data_factory.load_IBD_all_vs(
    "Intensity",
    index_func=lambda x: x.split(";")[0],
    log2_transformation=True,
)

# Explicit "no imputation" loader, with log2_transformation=False.
A_all_vs_not_impu, B_all_vs_not_impu = data_factory.load_no_imputation_IBD_all_vs(
    "Intensity",
    index_func=lambda x: x.split(";")[0],
    log2_transformation=False,
)


  from ipykernel import kernelapp as app
  app.launch_new_instance()


# run meta analysis

In [4]:
# A_all_vs.to_csv("../../profile.csv")
# B_all_vs.to_csv("../../mixtures.csv")

## configuration

In [5]:
# One instance per pipeline building block; the same objects are referenced
# from the hyper-configuration lists below.
# NOTE(review): `pick_set` is created here but not referenced anywhere in this cell.
pick_set = PickDataSet()
spec_cells,agg_spec_cells = PpKeepSpecificCells(),PpAggToSpecificCells()
agg_iv,pp_irl_prot  = AggregateIntraVariance(),PpCleanIrrelevantProteins()
pp_chiv = PpCleanHighIntraVar()
pp_entropy_only_largest, pp_entropy, pp_empty,pp_dep = PpEntropyBasedOnlyLargest(), PpEntropyBased(), PpEmpty(),PpDepDeBased()
pp_svm_signature,pp_totel_sum = PpSvmSignature(),PpEntropyBasedTotelSum()
pp_floor_quantile = PpFloorUnderQuantile()

bd = BasicDeconv()
cpm = CellProportionsMeasure(how="groups")

# Ordered list of pipeline stages. Every stage has a "step_name" and a list of
# alternative "steps"; each alternative names a function object and a "params"
# dict mapping a parameter name to a list of candidate values.
# NOTE(review): presumably PipelineDeconv expands stages x alternatives x
# parameter values into a grid of runs -- confirm in pipeline_deconv.
# NOTE(review): `pp_empty` (PpEmpty) appears as an alternative in several stages,
# presumably as a pass-through option -- confirm.
# The "function_name" / "step_name" strings (including the spellings
# "kepp_specific_cells" and "cleen_irrelevant_proteins") are runtime identifiers;
# check how they are consumed before renaming them.
# The commented-out "params" lines are wider value grids kept as alternatives.
hyper_configuration = [ 
                         {"step_name": "floor",
                         "steps": [
                             {"function_name": "floor_quantile", "function": pp_floor_quantile,
                              "params": {}},
                             {"function_name": "PpEmpty_floor", "function": pp_empty,
                              "params": {}}
                         ]},
#                        -----------------------------------
                        {"step_name": "per_cells_filter",
                         "steps": [
                            {"function_name": "kepp_specific_cells", "function": spec_cells,
                                  "params": {}},
                            {"function_name": "agg_to_specific_cells", "function": agg_spec_cells,
                                  "params": {}},
#                          {"function_name": "PpEmpty_cells_filt", "function": pp_empty,
#                              "params": {}}
                         ]},
                        # -------------------------------
                        {"step_name": "cleanHighIntraVariance",
                        "steps": [
                            {"function_name": "PpCleanHighIntraVar", "function": pp_chiv,
#                              "params": {"how": ["std"],"std_trh":[1,2]}},
                            "params": {"how": ["std"],"std_trh":[1]}},
                        {"function_name": "PpEmpty_clean_iv", "function": pp_empty,
                             "params": {}}]},
                        # -------------------------------
                        {"step_name": "AggregateIntraVariance",
                        "steps": [
                            {"function_name": "AggregateIntraVariance", "function": agg_iv,
#                              "params": {"how": ["mean", "median","max"]}}]},
                                "params": {"how": ["mean", "median"]}}]},
                       # --------------------------------
                       {"step_name": "cleen_irrelevant_proteins",
                        "steps": [
                            {"function_name": "CleanIrrelevantProteins", "function": pp_irl_prot,
                             "params": {}}]},
                       # --------------------------------
                       {"step_name": "Cytof_X_Building",
                        "steps": [
                            {"function_name": "Cytof_X_Building", "function": pp_empty,
                             "params": {"keep_labels":[True],"with_label_prop":[False]}}]},
                       # --------------------------------
                       {"step_name": "preprocess",
                        "steps": [
                            {"function_name": "pp_totel_sum", "function": pp_totel_sum,
#                     "params": {"totel_sum_percentage": [0.001, 0.0001],"with_norm": [True,False],"number_of_bins" :[0,10,20] ,
                            "params": {"totel_sum_percentage": [0.001, 0.0001],"with_norm": [False],"number_of_bins" :[0,20] ,
                               "only_largest":[True,False]}},
                            {"function_name": "PpEntropyBased", "function": pp_entropy,
#                              "params": {"n_genes_per_cell": [20,100], "gene_entropy_trh": [1,3],"number_of_bins" :[0,10,20] ,
                             "params": {"n_genes_per_cell": [20,100], "gene_entropy_trh": [1,3],"number_of_bins" :[0,20] ,
                                        "with_norm": [True,False]}},
                            {"function_name": "PpEntropyBasedOnlyLargest", "function": pp_entropy_only_largest,
#                              "params": {"n_genes_per_cell": [20,80],"number_of_bins" :[0,10,20] ,"with_norm": [True, False]}},
                            "params": {"n_genes_per_cell": [20,80],"number_of_bins" :[0,20] ,"with_norm": [True, False]}},
                            {"function_name": "PpDepDeBased", "function": pp_dep,
                              "params": {"n_of_genes": [20,80] ,"is_agg_cells":[True,False]}},
                            {"function_name": "PpSvm", "function": pp_svm_signature,
                  "params": {"n_features": [20,80], "with_norm": [ False]}},
                            {"function_name": "PpEmpty_prepro", "function": pp_empty,
                             "params": {}}
                        ]},
                       # --------------------------------
                       {"step_name": "deconv",
                        "steps": [
                            {"function_name": "BasicDeconv", "function": bd,
                             "params": {"normalize": [True], "cellMix": [ False]}}]}]

# Same structure as above, for the measurement stage. The `cpm` object was built
# with how="groups" while the candidate "how" values listed here are
# "correlation", "MI" and "entropy".
hyper_measure_configuration = [
    {"step_name": "measure",
     "steps": [
         {"function_name": "CellProportionsMeasure", "function": cpm,
#           "params": {"how": ["correlation","RMSE","MI"],"with_pvalue":[True],"with_iso_test":[False]}}]}]
          "params": {"how": ["correlation","MI","entropy"],"with_pvalue":[True],"with_iso_test":[True]}}]}]

# Pipeline object used by every later cell (runs and archive look-ups).
_pipe = PipelineDeconv(hyper_configuration=hyper_configuration,
                                 hyper_measure_configuration=hyper_measure_configuration)

## run

In [6]:
# Run the full hyper-configuration on both data variants.
# NOTE(review): the results are bound to `meta_results_original_data` and
# `meta_results_not_imputed`, but the cells below read a variable named
# `meta_results` that no visible cell assigns -- on a fresh kernel they would
# fail with NameError unless it is loaded elsewhere (e.g. from the pickle cache).
meta_results_original_data = _pipe.run_cytof_pipeline(
    A_all_vs,
    B_all_vs,
    per_cell_analysis=False,
)
meta_results_not_imputed = _pipe.run_cytof_pipeline(
    A_all_vs_not_impu,
    B_all_vs_not_impu,
    per_cell_analysis=False,
)

 30%|███████████████████████▌                                                      | 188/624 [18:29<4:00:19, 33.07s/it]

pipeline is : Pipeline(memory=None,
         steps=[('floor_quantile', PpFloorUnderQuantile(quantile=0.1)),
                ('agg_to_specific_cells',
                 PpAggToSpecificCells(cells_mapping={'NOT_BCellmemory': 'B '
                                                                        'cells ',
                                                     'NOT_BCellnaive': 'B '
                                                                       'cells ',
                                                     'NOT_BCellplasma': 'B '
                                                                        'cells ',
                                                     'NOT_CD4TCellTem': 'CD4+ '
                                                                        'effector '
                                                                        'memory '
                                                                        'T '
                                                    

 30%|████████████████████████                                                        | 188/624 [18:35<43:06,  5.93s/it]


KeyError: 'NK'

In [None]:
# A bare `raise` with no active exception fails with
# "RuntimeError: No active exception to reraise".
# NOTE(review): presumably a deliberate stop so that "Run All" halts here --
# confirm, or replace with an explicit guard.
raise

In [None]:
# All runs, ordered by the "corrIso" column, highest first.
meta_results.sort_values(by="corrIso", ascending=False)

In [None]:
# with open("../cache/meta_result_example.pkl",'rb') as f :
#     meta_results = pkl.load(f)
    

In [None]:
# Columns of meta_results whose name contains "cellcorr".
cellcorr_cols = [col for col in meta_results.columns if "cellcorr" in col]

# Keep only the rows that have a value in every "cellcorr" column.
valid_corr = meta_results[cellcorr_cols].dropna(axis=0).index
valid_meta = meta_results.loc[valid_corr]
# _res = valid_meta[(valid_meta["corrPval"] < 0.15)].sort_values(by="corrMean",ascending=False)
# _res = _res.drop_duplicates(subset=["corrMean","entropy"])
# _res.head()["uuid"]

# For each "cellcorr" column, the row where that column is maximal.
best_row_per_column = valid_meta[cellcorr_cols].idxmax()
valid_meta.loc[best_row_per_column][["uuid"] + cellcorr_cols]

In [None]:
# Maximum of every "cellcorr" column over the rows of valid_meta.
valid_meta.loc[:, [col for col in meta_results.columns if "cellcorr" in col]].max()

In [None]:
# NOTE(review): `describe_results` is defined in a later cell ("get best results"),
# so this cell only works after that cell has been executed.
describe_results("1773278244", with_mixtures_pca=True)

In [None]:
# Rebinds `cpm` to a correlation-based measure before the multi-signature run.
cpm = CellProportionsMeasure(
    how="correlation",
    return_dict_measure_statas=True,
    with_pvalue=True,
)
multi_result = _pipe.run_multi_signature_pipeline(
    A_all_vs,
    B_all_vs,
    meta_results,
    cpm,
)
multi_result

In [None]:
# meta_results[~meta_results["PpEntropyBasedTotelSum_with_norm"].isna()][["corrMean","corrPval","entropy","uuid"]]

## result

In [None]:
# Runs with corrPval below 0.15, best corrMean first, keeping one row per
# distinct (corrMean, entropy) pair.
_res = (
    meta_results.loc[meta_results["corrPval"] < 0.15]
    .sort_values(by="corrMean", ascending=False)
    .drop_duplicates(subset=["corrMean", "entropy"])
)
_res.head()

In [None]:
# Summary columns only, ordered by corrMean (highest first).
_res = meta_results.sort_values(by="corrMean", ascending=False)[
    ["corrMean", "corrPval", "entropy", "uuid"]
]

# Runs passing all three thresholds, ordered by entropy.
_res[
    (_res["corrMean"] > 0.5)
    & (_res["entropy"] > 0.5)
    & (_res["corrPval"] < 0.05)
].sort_values(by="entropy")

In [None]:
# meta_results[(meta_results["rmsePval"] < 0.15)&(meta_results["corrPval"] < 0.15)].sort_values(by="corrMean",ascending=False).head(10)

# Best results: run "1773278244" is, I think, the best

## get best results

In [None]:
def build_distance_matrix():
    """Group archived runs whose deconvolution results are exactly identical.

    Despite the name, no distance matrix is built: runs are grouped by exact
    equality of their result values (after sorting rows and columns by label),
    and only groups with at least two members are reported.

    Reads the notebook globals ``meta_results`` (one row per run, with a
    ``"uuid"`` column) and ``_pipe`` (used to load each run from the archive).

    Returns
    -------
    list of pandas.DataFrame
        One frame per group of identical runs. Each column is one run of the
        group and holds that run's non-null ``meta_results`` entries.
    """
    # Load every run once; sort both axes so label order does not affect equality.
    uuid_to_results = {}
    for uuid in meta_results["uuid"]:
        archived = _pipe.load_results_from_archive(uuid)
        result_frame = archived["result"]
        uuid_to_results[uuid] = result_frame.sort_index(axis=0).sort_index(axis=1).values

    groups = {}
    seen = set()
    for uuid, res in uuid_to_results.items():
        if uuid in seen:
            continue

        # np.array_equal returns False for arrays of different shape, whereas
        # `(a == b).all()` fails when the shapes cannot be broadcast.
        neighbors = [
            other_uuid
            for other_uuid, other_res in uuid_to_results.items()
            if np.array_equal(res, other_res)
        ]
        seen.update(neighbors)
        if len(neighbors) > 1:
            groups[uuid] = neighbors

    clusters = []
    for members in groups.values():
        per_run_params = [
            meta_results[meta_results["uuid"] == int(member)].T.copy(deep=True).dropna()
            for member in members
        ]
        clusters.append(pd.concat(per_run_params, axis=1))

    return clusters
      
def plot_correlation_per_cell(uuids):
    """Find, per row label of the result frame, the run with the best correlation.

    For every uuid in the notebook global ``meta_results`` the archived result
    and known proportions are loaded through ``_pipe``, aligned on their shared
    mixture columns, and correlated row by row (Spearman). The highest
    correlation seen for each row label is kept together with the parameters
    of the run that produced it. Going by the variable names, the row labels
    are cell types. Despite the name, nothing is plotted.

    Parameters
    ----------
    uuids : str or list
        NOTE(review): accepted for interface compatibility but not used -- the
        scan always covers every uuid in ``meta_results``. Confirm whether the
        scan was meant to be restricted to these uuids.

    Returns
    -------
    (dict, dict)
        ``max_corr_per_cell`` maps a row label to its best correlation and
        ``best_param_per_cell`` maps it to the non-null ``meta_results``
        entries of the corresponding run.
    """
    max_corr_per_cell = {}
    best_param_per_cell = {}

    for uuid in meta_results["uuid"]:
        params = meta_results[meta_results["uuid"] == int(uuid)].T.copy(deep=True).dropna()

        archived = _pipe.load_results_from_archive(uuid)
        best_results = archived["result"]
        best_known = archived["known"]

        # Rename the known-proportion columns to the matching result columns.
        mapping = GlobalUtils.get_corospanding_mixtures_map(best_known, best_results)
        best_known = best_known.rename(columns=mapping)
        best_known = best_known[list(mapping.values())]

        mutual_col = best_known.columns.intersection(best_results.columns)
        best_results = best_results[mutual_col]
        best_known = best_known[mutual_col]

        corr_per_cell = best_results.T.corrwith(best_known.T, method="spearman")

        for cell, corr in corr_per_cell.items():
            if cell not in max_corr_per_cell:
                max_corr_per_cell[cell] = corr
                best_param_per_cell[cell] = params
                continue

            current = max_corr_per_cell[cell]
            # A stored NaN must be replaceable: `corr > NaN` is always False, so
            # without this check a NaN from the first run would stick forever.
            if pd.notna(corr) and (pd.isna(current) or corr > current):
                max_corr_per_cell[cell] = corr
                best_param_per_cell[cell] = params

    return max_corr_per_cell, best_param_per_cell

def plot_results_vs_known_pca(best_results, best_known):
    """Scatter deconvolution results and known proportions in a shared 2-D PCA space.

    A two-component PCA is fitted on the columns of both frames together; each
    column becomes one point. Result points are drawn in blue, known points in
    red, and the i-th result point is joined to the i-th known point by a
    dotted line, so both frames are expected to list their columns in the same
    order.

    Parameters
    ----------
    best_results : pandas.DataFrame
        Deconvolution output; one column per plotted point.
    best_known : pandas.DataFrame
        Known proportions with the same row labels; one column per plotted point.
    """
    component_names = ['principal component 1', 'principal component 2']

    pca = PCA(n_components=2)
    pca.fit(pd.concat([best_results, best_known], axis=1).T)

    deconv_principalcomp = pca.transform(best_results.T)
    known_principalcomp = pca.transform(best_known.T)

    deconv_principalDf = pd.DataFrame(data=deconv_principalcomp,
                                      columns=component_names,
                                      index=best_results.columns)
    known_principalDf = pd.DataFrame(data=known_principalcomp,
                                     columns=component_names,
                                     index=best_known.columns)

    deconv_principalDf["color"] = "b"
    known_principalDf["color"] = "r"

    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    principalDf = pd.concat([deconv_principalDf, known_principalDf])

    fig = plt.figure(figsize=(25, 15))
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel('Principal Component 1', fontsize=15)
    ax.set_ylabel('Principal Component 2', fontsize=15)
    ax.set_title('blue - deconvolution result,red -  known proportions', fontsize=20)

    ax.scatter(principalDf['principal component 1'],
               principalDf['principal component 2'],
               c=principalDf['color'], s=50)

    # Join each result point to the known point at the same position.
    for mixture in range(deconv_principalcomp.shape[0]):
        deconv_point = deconv_principalcomp[mixture, :]
        known_point = known_principalcomp[mixture, :]
        ax.plot([deconv_point[0], known_point[0]],
                [deconv_point[1], known_point[1]], ':')

    # Label every point with its column name.
    for i, txt in enumerate(principalDf.index):
        ax.annotate(txt, (principalDf['principal component 1'].iloc[i],
                          principalDf['principal component 2'].iloc[i]))

def describe_results(uuids, with_per_mixture_plot=False, with_mixtures_pca=False, meta_results=None):
    """Display the archived deconvolution result of one or more runs.

    For every uuid the archived result and known proportions are loaded through
    the notebook global ``_pipe``, aligned on their shared mixture columns, and
    shown as: the result table, summary statistics of the per-mixture Spearman
    correlation against the known proportions, and a combined scatter plot.

    Parameters
    ----------
    uuids : str, int or iterable
        A single run uuid or a collection of them. A single integer is
        converted to its string form.
    with_per_mixture_plot : bool, default False
        Also draw ``cytof_plots.plot_mass_to_cytof_scatter``.
    with_mixtures_pca : bool, default False
        Also draw the PCA comparison (``plot_results_vs_known_pca``).
    meta_results : pandas.DataFrame, optional
        When given, the non-null entries of the run's row are displayed first.
        This parameter shadows the notebook-level ``meta_results``, so nothing
        is shown unless it is passed explicitly.
    """
    # A lone uuid (string or integer) is treated as a one-element list; a bare
    # integer used to fail because it is not iterable.
    if isinstance(uuids, (str, int, np.integer)):
        uuids = [str(uuids)]

    for uuid in uuids:
        if meta_results is not None:
            params = meta_results[meta_results["uuid"] == int(uuid)].T.copy(deep=True).dropna()
            print("params : ")
            display(HTML(params.to_html()))

        archived = _pipe.load_results_from_archive(uuid)
        best_results = archived["result"]
        best_known = archived["known"]

        # Rename the known-proportion columns to the matching result columns.
        mapping = GlobalUtils.get_corospanding_mixtures_map(best_known, best_results)
        best_known = best_known.rename(columns=mapping)
        best_known = best_known[list(mapping.values())]

        mutual_col = best_known.columns.intersection(best_results.columns)
        best_results = best_results[mutual_col]
        best_known = best_known[mutual_col]

        print("mixtures : ")
        display(HTML(best_results.to_html()))

        print("mixtures details :")
        per_mixture_corr = best_results.corrwith(best_known, method="spearman")
        display(HTML(per_mixture_corr.describe().to_frame().to_html()))

        cytof_plots.plot_mass_to_cytof_scatter_all_on_one(best_results, best_known, best_results)
        if with_per_mixture_plot:
            cytof_plots.plot_mass_to_cytof_scatter(best_results, best_known, best_results)
        if with_mixtures_pca:
            plot_results_vs_known_pca(best_results, best_known)

In [None]:
# Show run "1773278244" including the per-mixture scatter plots.
describe_results("1773278244", with_per_mixture_plot=True)

# test

In [None]:
# Same call as in the previous section: run "1773278244" with per-mixture plots.
describe_results("1773278244", with_per_mixture_plot=True)