In [1]:
import sys
sys.path.append('../../')
sys.path.append('../infras/cellMix/')
from cellMix_coordinator import CellMixCoordinator

In [2]:
import pandas as pd
import numpy as np
from functools import partial
import multiprocessing

from scipy.optimize import least_squares
from sklearn.metrics import mean_squared_error
from functools import partial
from scipy.optimize import minimize
import scipy.optimize
from itertools import combinations
import matplotlib.pyplot as plt

from IPython.display import display, HTML


import os

In [3]:
from deconv_py.preprocess.base import BasePreprocess as PP_base
from deconv_py.preprocess.cell_specific import CellSpecific as PP_proteins

from deconv_py.infras.data_factory import DataFactory
from deconv_py.infras.data_loader import DataLoader

from deconv_py.models.base import Base as Models_base
from deconv_py.models.cell_proportions_models import CellProportions
from deconv_py.models.cell_specific_models import CellSpecificPerPermutation

from deconv_py.experiments.cell_specific import CellSpecificMetricsPlot

# from deconv_py.infras.cellMix.cellMix_coordinator import CellMixCoordinator

# Calc cell proportions - LFQ

## read data

### mixture & cell specific data

In [None]:
# Both input files are addressed relative to the current working directory
# and converted to absolute paths before being handed to the loader.
data_loader = DataLoader(
    mass_spec_path=os.path.abspath('../data/20150208_mixture_proteinGroups.xls'),
    protein_profile_path=os.path.abspath('../data/20150718_Cerberus_proteinGroups.txt'),
)
data_factory = DataFactory(data_loader)

# No intensity_type is passed here, so the factory's default is used
# (the later sections request 'iBAQ' and 'Intensity' explicitly).
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile()
mixtures, mixtures_relvent_data = data_factory.build_mixture_data()

### cell_proportions

In [None]:
# Known composition of the 8 mixtures, in percent.
# Each tuple below is one mixture, ordered as (CD4 T cell Tcm, B cell memory,
# non-classical monocytes); the array is transposed so that rows are cell types
# and columns are mixture numbers 1..8.
# Note: the equal-parts mixture is written as 33/33/33, i.e. it sums to 99.
cell_proportions_df = pd.DataFrame(
    data=np.array([
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    ]).T,
    index=[
        'LFQ intensity NOT_CD4TCellTcm_01',
        'LFQ intensity NOT_BCellmemory_01',
        'LFQ intensity NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A ,X and B 

### set unique index

In [None]:
# Index values are used as-is when matching proteins between the two tables.
# Alternative kept for reference (first ';'-separated token only):
# index_func = lambda x:x.split(';')[0]
index_func = lambda x: x

_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data,
    mixtures,
    index_func=index_func,
)

# A and B are selected with the "relevant data" selectors returned by the data factory;
# X is the table of known mixture compositions.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]

X = cell_proportions_df

In [None]:
# Strip the 'LFQ intensity ' prefix from the labels of A, B and X, and name the
# mixture columns of X 'mixture1'..'mixture8'.
#
# split(prefix, 1)[-1] yields the text after the prefix, or the label unchanged
# when the prefix is absent. The previous `split(prefix)[1]` raised IndexError on
# labels that were already stripped, so the cell could not be run twice.
B = B.rename(columns=lambda name: name.split('LFQ intensity ', 1)[-1])

X = X.rename(index=lambda name: name.split('LFQ intensity ', 1)[-1])
# Only add the 'mixture' prefix once, so re-running does not give 'mixturemixture1'.
X = X.rename(columns=lambda col: col if str(col).startswith('mixture') else 'mixture' + str(col))

A = A.rename(columns=lambda name: name.split('LFQ intensity ', 1)[-1])

In [None]:
# Quick check of the dimensions of the mixture matrix B built in the cells above.
B.shape

## models

In [None]:
cmc = CellMixCoordinator()


def _calc_and_display_with_cellmix(_a, _b, X, with_cellMix=False, as_heatmap=False):
    """Fit cell proportions for a profile/mixture pair and display the result.

    The model estimate comes from ``CellProportions.fit_as_df(_a, _b)``. When
    ``with_cellMix`` is set, the estimate returned by
    ``cmc.cell_prop_with_bash(_b, _a)`` is displayed first for comparison.

    Parameters
    ----------
    _a : object exposing ``.empty`` (callers in this notebook pass the profile table ``A``)
        First argument to ``CellProportions.fit_as_df``.
    _b : object exposing ``.empty`` (callers in this notebook pass the mixture table ``B``)
        Second argument to ``CellProportions.fit_as_df``.
    X : displayable object
        Known cell proportions. Only shown in table mode; it is not used when
        ``as_heatmap`` is True.
    with_cellMix : bool, default False
        Also compute and display the cellMix result.
    as_heatmap : bool, default False
        Draw seaborn heatmaps instead of rendering tables.

    Returns
    -------
    None
        Everything is displayed as a side effect.

    Raises
    ------
    ValueError
        If ``_a`` or ``_b`` is empty.
    """
    # ValueError is still caught by any existing `except Exception` handler.
    if _a.empty:
        raise ValueError("A is empty")
    if _b.empty:
        raise ValueError("B is empty")

    if as_heatmap:
        # `sns` was referenced without being imported anywhere in the notebook,
        # so heatmap mode failed with NameError. Import lazily so the default
        # table mode keeps working even when seaborn is not installed.
        import seaborn as sns

    cell_abundance_over_samples = CellProportions.fit_as_df(_a, _b)

    if with_cellMix:
        # NOTE(review): cellMix receives (_b, _a), i.e. the reverse order of fit_as_df;
        # kept as in the original - confirm against CellMixCoordinator.
        cellMax_cell_abundance_over_samples = (
            cmc.cell_prop_with_bash(_b, _a)
            .rename({"Unnamed: 0": "cells"}, axis=1)
            .set_index("cells")
        )
        if as_heatmap:
            sns.heatmap(cellMax_cell_abundance_over_samples)
            plt.show()
        else:
            display(HTML(' <span style="color:blue"><h1>cellMix : </h1> </span>  '))
            display(cellMax_cell_abundance_over_samples)

    if as_heatmap:
        sns.heatmap(cell_abundance_over_samples)
        # Flush the figure explicitly, matching the cellMix branch above.
        plt.show()
    else:
        display(HTML(' <span style="color:blue"><h1>model : </h1> </span>  '))
        display(cell_abundance_over_samples)
        display(HTML(' <span style="color:blue"><h1>known cell proportion : </h1> </span>  '))
        display(X)

### naive

In [None]:
# Dimensions of the profile matrix A that is passed to the naive run below.
A.shape

In [None]:
# Naive run on the unfiltered LFQ matrices, with the cellMix comparison enabled.
_calc_and_display_with_cellmix(A, B, X, with_cellMix=True)

### with preprocess

#### naive discriminative

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_naive_discriminative_proteins.
_A, _B = PP_proteins.pp_naive_discriminative_proteins(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### binary occurrence

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_binary_occurrence.
_A, _B = PP_proteins.pp_binary_occurrence(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### TopMargin

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_margin_quantile.
_A, _B = PP_proteins.pp_margin_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### under quantile 

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_under_quantile.
_A, _B = PP_proteins.pp_under_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

# calc cell proportions - iBAQ

## read data - iBAQ

### mixture & cell specific data

In [None]:
# Rebuild the factory on the existing loader and request the iBAQ intensity
# type for both the cell-specific profile and the mixtures.
data_factory = DataFactory(data_loader)
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile(
    intensity_type='iBAQ'
)
mixtures, mixtures_relvent_data = data_factory.build_mixture_data(
    intensity_type='iBAQ'
)

### cell proportions

In [None]:
# Known composition of the 8 mixtures, in percent.
# Each tuple below is one mixture, ordered as (CD4 T cell Tcm, B cell memory,
# non-classical monocytes); the array is transposed so that rows are cell types
# and columns are mixture numbers 1..8.
# Note: the equal-parts mixture is written as 33/33/33, i.e. it sums to 99.
cell_proportions_df = pd.DataFrame(
    data=np.array([
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    ]).T,
    index=[
        'iBAQ NOT_CD4TCellTcm_01',
        'iBAQ NOT_BCellmemory_01',
        'iBAQ NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A ,X and B 

### set unique index

In [None]:
# Index values are used as-is when matching proteins between the two tables.
# Alternative kept for reference (first ';'-separated token only):
# index_func = lambda x:x.split(';')[0]
index_func = lambda x: x

_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data,
    mixtures,
    index_func=index_func,
)

# A and B are selected with the "relevant data" selectors returned by the data factory;
# X is the table of known mixture compositions.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]

X = cell_proportions_df

In [None]:
# Strip the 'iBAQ ' prefix from the labels of A, B and X, and name the
# mixture columns of X 'mixture1'..'mixture8'.
#
# split(prefix, 1)[-1] yields the text after the prefix, or the label unchanged
# when the prefix is absent. The previous `split(prefix)[1]` raised IndexError on
# labels that were already stripped, so the cell could not be run twice.
B = B.rename(columns=lambda name: name.split('iBAQ ', 1)[-1])

X = X.rename(index=lambda name: name.split('iBAQ ', 1)[-1])
# Only add the 'mixture' prefix once, so re-running does not give 'mixturemixture1'.
X = X.rename(columns=lambda col: col if str(col).startswith('mixture') else 'mixture' + str(col))

A = A.rename(columns=lambda name: name.split('iBAQ ', 1)[-1])

## models

### naive

In [None]:
# Naive run on the unfiltered iBAQ matrices (no cellMix comparison here).
_calc_and_display_with_cellmix(A, B, X)

### with preprocess

#### naive discriminative

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_naive_discriminative_proteins.
_A, _B = PP_proteins.pp_naive_discriminative_proteins(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### binary occurrence

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_binary_occurrence.
_A, _B = PP_proteins.pp_binary_occurrence(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### TopMargin

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_margin_quantile.
_A, _B = PP_proteins.pp_margin_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### under quantile 

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_under_quantile.
_A, _B = PP_proteins.pp_under_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

# calc cell proportions - intensities

## read data - intensities

### mixture & cell specific data

In [None]:
# Rebuild the factory on the existing loader and request the raw 'Intensity'
# type for both the cell-specific profile and the mixtures.
data_factory = DataFactory(data_loader)
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile(
    intensity_type='Intensity'
)
mixtures, mixtures_relvent_data = data_factory.build_mixture_data(
    intensity_type='Intensity'
)

### cell proportions

In [None]:
# Known composition of the 8 mixtures, in percent.
# Each tuple below is one mixture, ordered as (CD4 T cell Tcm, B cell memory,
# non-classical monocytes); the array is transposed so that rows are cell types
# and columns are mixture numbers 1..8.
# Note: the equal-parts mixture is written as 33/33/33, i.e. it sums to 99.
cell_proportions_df = pd.DataFrame(
    data=np.array([
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    ]).T,
    index=[
        'Intensity NOT_CD4TCellTcm_01',
        'Intensity NOT_BCellmemory_01',
        'Intensity NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A ,X and B 

### set unique index

In [None]:
# Index values are used as-is when matching proteins between the two tables.
# Alternative kept for reference (first ';'-separated token only):
# index_func = lambda x:x.split(';')[0]
index_func = lambda x: x

_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data,
    mixtures,
    index_func=index_func,
)

# A and B are selected with the "relevant data" selectors returned by the data factory;
# X is the table of known mixture compositions.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]

X = cell_proportions_df

In [None]:
# Strip the 'Intensity ' prefix from the labels of A, B and X, and name the
# mixture columns of X 'mixture1'..'mixture8'.
#
# split(prefix, 1)[-1] yields the text after the prefix, or the label unchanged
# when the prefix is absent. The previous `split(prefix)[1]` raised IndexError on
# labels that were already stripped, so the cell could not be run twice.
B = B.rename(columns=lambda name: name.split('Intensity ', 1)[-1])

X = X.rename(index=lambda name: name.split('Intensity ', 1)[-1])
# Only add the 'mixture' prefix once, so re-running does not give 'mixturemixture1'.
X = X.rename(columns=lambda col: col if str(col).startswith('mixture') else 'mixture' + str(col))

A = A.rename(columns=lambda name: name.split('Intensity ', 1)[-1])

## models

### naive

In [None]:
# Naive run on the unfiltered Intensity matrices, with the cellMix comparison enabled.
_calc_and_display_with_cellmix(A, B, X, with_cellMix=True)

### with preprocess

#### naive discriminative

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_naive_discriminative_proteins.
_A, _B = PP_proteins.pp_naive_discriminative_proteins(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### binary occurrence

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_binary_occurrence.
_A, _B = PP_proteins.pp_binary_occurrence(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### TopMargin

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_margin_quantile.
_A, _B = PP_proteins.pp_margin_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

#### under quantile 

In [None]:
# Two-stage preprocessing: the output pair of pp_clean_irrelevant_proteins
# is fed straight into pp_under_quantile.
_A, _B = PP_proteins.pp_under_quantile(
    *PP_proteins.pp_clean_irrelevant_proteins(A, B)
)

_calc_and_display_with_cellmix(_A, _B, X)

# most important proteins from genes 

### read gene data

In [None]:
# raw_gene_abundance = pd.read_excel('../../../data/ni.3693-S7.xlsx')
# raw_gene_abundance.set_index('Gene name',inplace=True)
# gene_abundance_over_cells = raw_gene_abundance[['Bmemory_not_2', 'MOclassical_not_3', 'T4cm_not_5']]

# wrong_indexs = pd.Index([g for g in gene_abundance_over_cells.index if type(g) != str])
# clean_indexs = gene_abundance_over_cells.index.difference(wrong_indexs)
# gene_abundance_over_cells = gene_abundance_over_cells.loc[clean_indexs]
# gene_abundance_over_cells = gene_abundance_over_cells.dropna()

#### take high std and binary differentiate genes 

In [None]:
# high_std_gene_abundance_idx = gene_abundance_over_cells[gene_abundance_over_cells.std(axis=1) > gene_abundance_over_cells.std(axis=1).quantile(0.99)].index
# gene_binary_filtering_idx = gene_abundance_over_cells[abs(gene_abundance_over_cells)<abs(gene_abundance_over_cells).quantile(0.01)].dropna(how='all').index

# genes_to_take = high_std_gene_abundance_idx.union(gene_binary_filtering_idx)
# gene_to_major_protein_dict = mixtures[['major_protein_id','Gene names']].set_index('Gene names').to_dict()['major_protein_id']
# proteins_to_take = [gene_to_major_protein_dict[gene] for gene in genes_to_take if gene in gene_to_major_protein_dict.keys()]

In [None]:
# _A,_B = (A.loc[proteins_to_take],B.loc[proteins_to_take])
# _A,_B = PP_proteins.pp_under_quantile(_A,_B)

# _calc_and_display_with_cellmix(_A,_B,X)

In [None]:
# from sklearn.decomposition.pca  import PCA 

In [None]:
# from mpl_toolkits.mplot3d import Axes3D
# plt.figure(figsize=(240, 240))


# # Run The PCA
# pca = PCA(n_components=3)
# pca.fit(B.T)
 
# # Store results of PCA in a data frame
# result=pd.DataFrame(pca.transform(B.T), columns=['PCA%i' % i for i in range(3)], index=range(8))
 
# # Plot initialisation
# fig = plt.figure()
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(result['PCA0'], result['PCA1'], result['PCA2'], cmap="Set2_r", s=60)
 
# # make simple, bare axis lines through space:
# xAxisLine = ((min(result['PCA0']), max(result['PCA0'])), (0, 0), (0,0))
# ax.plot(xAxisLine[0], xAxisLine[1], xAxisLine[2], 'r')
# yAxisLine = ((0, 0), (min(result['PCA1']), max(result['PCA1'])), (0,0))
# ax.plot(yAxisLine[0], yAxisLine[1], yAxisLine[2], 'r')
# zAxisLine = ((0, 0), (0,0), (min(result['PCA2']), max(result['PCA2'])))
# ax.plot(zAxisLine[0], zAxisLine[1], zAxisLine[2], 'r')
 
# # label the axes
# ax.set_xlabel("PC1")
# ax.set_ylabel("PC2")
# ax.set_zlabel("PC3")
# plt.show()