In [1]:
import sys
sys.path.append('../../')
sys.path.append('../infras/cellMix/')

In [2]:
import pandas as pd
import numpy as np
from functools import partial
import multiprocessing

from scipy.optimize import least_squares
from sklearn.metrics import mean_squared_error
from functools import partial
from scipy.optimize import minimize
import scipy.optimize
from itertools import combinations
import matplotlib.pyplot as plt

import os

In [3]:
from deconv_py.preprocess.base import BasePreprocess as PP_base
from deconv_py.preprocess.cell_specific import CellSpecific as PP_proteins

from deconv_py.infras.data_factory import DataFactory
from deconv_py.infras.data_loader import DataLoader

from deconv_py.models.base import Base as Models_base
from deconv_py.models.cell_proportions_models import CellProportions
from deconv_py.models.cell_specific_models import CellSpecificPerPermutation

from deconv_py.experiments.cell_specific import CellSpecificMetricsPlot

from deconv_py.infras.cellMix.cellMix_coordinator import CellMixCoordinator

# Calc cell specific - LFQ

## read data

### mixture & cell specific data

In [4]:
# Wire the loader to the two raw proteinGroups files (mixture measurements and
# the per-cell-type profile), then let the factory build both tables.
data_loader = DataLoader(
    mass_spec_path=os.path.abspath('../data/20150208_mixture_proteinGroups.xls'),
    protein_profile_path=os.path.abspath('../data/20150718_Cerberus_proteinGroups.txt'),
)
data_factory = DataFactory(data_loader)

# Each builder returns a pair; the second element is used further down as the
# column selector for the first (no intensity_type given -> factory default).
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile()
mixtures, mixtures_relvent_data = data_factory.build_mixture_data()

  if self.run_code(code, result):


### cell_proportions

In [5]:
# Known composition of the eight mixtures. Every inner tuple describes one
# mixture as (CD4 T-cell Tcm, B-cell memory, non-classical monocyte); the
# transpose yields one row per cell type and one column per mixture (1..8).
cell_proportions_df = pd.DataFrame(
    data=np.array((
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    )).T,
    index=[
        'LFQ intensity NOT_CD4TCellTcm_01',
        'LFQ intensity NOT_BCellmemory_01',
        'LFQ intensity NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A, X and B

### set unique index

In [6]:
def index_func(x):
    """Return the part of ``x`` that precedes the first ';'."""
    return x.split(';')[0]


# Re-index both tables with ``index_func``; judging by the helper's name this
# presumably keeps only proteins found in both tables (confirm in PP_base).
_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data, mixtures, index_func=index_func
)

# A: selected profile columns, B: selected mixture columns,
# X: the known proportions table defined earlier.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]
X = cell_proportions_df

In [7]:
# Strip the 'LFQ intensity ' prefix from the sample labels of B, X and A.
# Taking the LAST split piece (rather than index 1) leaves an already-stripped
# label unchanged, so re-running this cell no longer raises IndexError.
B = B.rename(columns=lambda f: f.split('LFQ intensity ')[-1])

X = X.rename(index=lambda f: f.split('LFQ intensity ')[-1])
# Mixture columns start out as the integers 1..8; prefix them once only, so a
# second run does not produce 'mixturemixture1'.
X = X.rename(columns=lambda f: f if str(f).startswith('mixture') else 'mixture' + str(f))

A = A.rename(columns=lambda f: f.split('LFQ intensity ')[-1])

## models

In [8]:
# Shared coordinator used by the optional cellMix comparison below.
cmc = CellMixCoordinator()


def _calc_and_display_with_cellmix(_a, _b, X, with_cellMix=False):
    """Fit cell proportions for one (_a, _b) pair and display the outcome.

    ``CellProportions.fit`` is called with ``_a`` and the raw values of ``_b``.
    The estimate is displayed, followed by ``X``, so both can be compared by
    eye. Nothing is returned.

    Parameters
    ----------
    _a : first argument of ``CellProportions.fit`` (callers pass the profile
        table ``A`` or a filtered version of it).
    _b : object exposing ``.values`` (callers pass the mixture table ``B`` or
        a filtered version of it).
    X : object displayed last (callers pass the known proportions table).
    with_cellMix : bool, default False
        When true, ``cmc.cell_prop_with_bashop_with_bash(_b, _a)`` is also
        evaluated and its result is displayed before the other two outputs.
    """
    estimated_proportions = CellProportions.fit(_a, _b.values)

    if with_cellMix:
        # note the swapped argument order: the coordinator takes (_b, _a)
        display(cmc.cell_prop_with_bashop_with_bash(_b, _a))

    display(estimated_proportions, X)

### with negative values

In [9]:
#  cell_specific_per_permutation = CellSpecificPerPermutation.fit(X.T,B.T)

In [10]:
# with dfs
# cell_specific_per_permutation_dfs =CellSpecificPerPermutation.fit_as_df(X.T,B.T)

### no negative values

In [11]:
# The plain ``nn_fit`` variant is kept here, disabled, for reference:
# cell_specific_per_permutation_nn = CellSpecificPerPermutation.nn_fit(X.T,B.T)

# Both inputs are transposed before the call; the result is a pair that is
# unpacked into the per-permutation frames and the permutations themselves.
(cell_specific_per_permutation_nn_dfs,
 permuts) = CellSpecificPerPermutation.nn_fit_as_df(X.T, B.T)

# calc cell proportions - iBAQ

## read data - iBAQ

### mixture & cell specific data

In [12]:
# Rebuild both tables from the same loader, this time asking for iBAQ values.
data_factory = DataFactory(data_loader)
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile(
    intensity_type='iBAQ'
)
mixtures, mixtures_relvent_data = data_factory.build_mixture_data(
    intensity_type='iBAQ'
)

### cell proportions

In [13]:
# Known composition of the eight mixtures, labelled with the iBAQ column names.
# Every inner tuple describes one mixture as (CD4 T-cell Tcm, B-cell memory,
# non-classical monocyte); the transpose yields one row per cell type.
cell_proportions_df = pd.DataFrame(
    data=np.array((
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    )).T,
    index=[
        'iBAQ NOT_CD4TCellTcm_01',
        'iBAQ NOT_BCellmemory_01',
        'iBAQ NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A, X and B

### set unique index

In [14]:
def index_func(x):
    """Return the part of ``x`` that precedes the first ';'."""
    return x.split(';')[0]


# Re-index the iBAQ tables with ``index_func``; judging by the helper's name
# this presumably keeps only proteins found in both tables (confirm in PP_base).
_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data, mixtures, index_func=index_func
)

# A: selected profile columns, B: selected mixture columns,
# X: the known proportions table defined earlier.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]
X = cell_proportions_df

In [15]:
# Strip the 'iBAQ ' prefix from the sample labels of B, X and A.
# Taking the LAST split piece (rather than index 1) leaves an already-stripped
# label unchanged, so re-running this cell no longer raises IndexError.
B = B.rename(columns=lambda f: f.split('iBAQ ')[-1])

X = X.rename(index=lambda f: f.split('iBAQ ')[-1])
# Mixture columns start out as the integers 1..8; prefix them once only, so a
# second run does not produce 'mixturemixture1'.
X = X.rename(columns=lambda f: f if str(f).startswith('mixture') else 'mixture' + str(f))

A = A.rename(columns=lambda f: f.split('iBAQ ')[-1])

## models

### naive

In [16]:
# Baseline: fit on the full, unfiltered iBAQ tables.
_calc_and_display_with_cellmix(A, B, X)

array([[0.76, 0.19, 0.05],
       [0.42, 0.58, 0.  ],
       [0.29, 0.  , 0.71],
       [0.63, 0.22, 0.14],
       [0.44, 0.39, 0.18],
       [0.73, 0.  , 0.27],
       [0.59, 0.32, 0.1 ],
       [0.37, 0.58, 0.05]])

Unnamed: 0.1,Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
0,NOT_CD4TCellTcm_01,0.763596,0.423652,0.294335,0.634969,0.435925,0.733819,0.587157,0.366978
1,NOT_BCellmemory_01,0.18803,0.576348,0.0,0.224542,0.385447,0.0,0.315142,0.580757
2,NOT_Monocytesnonclassical_01,0.048374,0.0,0.705665,0.140489,0.178628,0.266181,0.097701,0.052265


Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
NOT_CD4TCellTcm_01,100.0,0.0,0.0,33.0,25.0,25.0,50.0,47.5
NOT_BCellmemory_01,0.0,100.0,0.0,33.0,25.0,50.0,25.0,47.5
NOT_Monocytesnonclassical_01,0.0,0.0,100.0,33.0,50.0,25.0,25.0,5.0


### with preprocess

#### naive discriminative

In [17]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the "naive discriminative" protein selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_naive_discriminative_proteins(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

array([[0.89, 0.11, 0.  ],
       [0.  , 1.  , 0.  ],
       [0.  , 0.  , 1.  ],
       [0.3 , 0.5 , 0.2 ],
       [0.08, 0.68, 0.24],
       [0.46, 0.02, 0.52],
       [0.41, 0.44, 0.15],
       [0.27, 0.71, 0.01]])

Unnamed: 0.1,Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
0,NOT_CD4TCellTcm_01,0.889542,0.0,0,0.295997,0.081861,0.461521,0.41488,0.272538
1,NOT_BCellmemory_01,0.110458,0.995714,0,0.50366,0.679006,0.019773,0.438832,0.713898
2,NOT_Monocytesnonclassical_01,0.0,0.004286,1,0.200343,0.239133,0.518706,0.146289,0.013565


Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
NOT_CD4TCellTcm_01,100.0,0.0,0.0,33.0,25.0,25.0,50.0,47.5
NOT_BCellmemory_01,0.0,100.0,0.0,33.0,25.0,50.0,25.0,47.5
NOT_Monocytesnonclassical_01,0.0,0.0,100.0,33.0,50.0,25.0,25.0,5.0


#### binary occurrence

In [18]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the "binary occurrence" selection.
# NOTE(review): the output recorded for this cell is an R `lm.fit` error
# ("0 (non-NA) cases"); presumably it came from a run with the cellMix
# comparison enabled and a filter that left no usable rows - confirm.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_binary_occurrence(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

Exception: Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...): 0 (non-NA) cases



#### Top margin

In [22]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the margin-quantile selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_margin_quantile(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

array([[0.77, 0.18, 0.05],
       [0.43, 0.57, 0.  ],
       [0.3 , 0.  , 0.7 ],
       [0.64, 0.21, 0.14],
       [0.44, 0.37, 0.18],
       [0.73, 0.  , 0.27],
       [0.59, 0.31, 0.1 ],
       [0.37, 0.58, 0.05]])

Unnamed: 0.1,Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
0,NOT_CD4TCellTcm_01,0.77055,0.429671,0.295269,0.644214,0.444104,0.734059,0.594403,0.370674
1,NOT_BCellmemory_01,0.180191,0.570329,0.0,0.21337,0.374703,0.0,0.306613,0.576275
2,NOT_Monocytesnonclassical_01,0.049259,0.0,0.704731,0.142417,0.181193,0.265941,0.098984,0.053051


Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
NOT_CD4TCellTcm_01,100.0,0.0,0.0,33.0,25.0,25.0,50.0,47.5
NOT_BCellmemory_01,0.0,100.0,0.0,33.0,25.0,50.0,25.0,47.5
NOT_Monocytesnonclassical_01,0.0,0.0,100.0,33.0,50.0,25.0,25.0,5.0


#### under quantile 

In [23]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the under-quantile selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_under_quantile(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

array([[0.65, 0.35, 0.  ],
       [0.19, 0.8 , 0.02],
       [0.07, 0.27, 0.66],
       [0.37, 0.53, 0.1 ],
       [0.23, 0.67, 0.11],
       [0.44, 0.34, 0.22],
       [0.4 , 0.53, 0.07],
       [0.36, 0.63, 0.01]])

Unnamed: 0.1,Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
0,NOT_CD4TCellTcm_01,0.645109,0.185666,0.070559,0.366191,0.226082,0.440234,0.396027,0.360395
1,NOT_BCellmemory_01,0.351032,0.797093,0.267529,0.529373,0.666119,0.336887,0.529219,0.626288
2,NOT_Monocytesnonclassical_01,0.003859,0.01724,0.661912,0.104436,0.107799,0.222879,0.074754,0.013318


Unnamed: 0,mixture1,mixture2,mixture3,mixture4,mixture5,mixture6,mixture7,mixture8
NOT_CD4TCellTcm_01,100.0,0.0,0.0,33.0,25.0,25.0,50.0,47.5
NOT_BCellmemory_01,0.0,100.0,0.0,33.0,25.0,50.0,25.0,47.5
NOT_Monocytesnonclassical_01,0.0,0.0,100.0,33.0,50.0,25.0,25.0,5.0


# calc cell proportions - intensities

## read data - intensities

### mixture & cell specific data

In [None]:
# Rebuild both tables from the same loader, this time asking for raw
# 'Intensity' values.
data_factory = DataFactory(data_loader)
profile_data, profile_data_relvent_data = data_factory.build_cell_specific_profile(
    intensity_type='Intensity'
)
mixtures, mixtures_relvent_data = data_factory.build_mixture_data(
    intensity_type='Intensity'
)

### cell proportions

In [None]:
# Known composition of the eight mixtures, labelled with the Intensity column
# names. Every inner tuple describes one mixture as (CD4 T-cell Tcm, B-cell
# memory, non-classical monocyte); the transpose yields one row per cell type.
cell_proportions_df = pd.DataFrame(
    data=np.array((
        (100, 0, 0),
        (0, 100, 0),
        (0, 0, 100),
        (33, 33, 33),
        (25, 25, 50),
        (25, 50, 25),
        (50, 25, 25),
        (47.5, 47.5, 5.0),
    )).T,
    index=[
        'Intensity NOT_CD4TCellTcm_01',
        'Intensity NOT_BCellmemory_01',
        'Intensity NOT_Monocytesnonclassical_01',
    ],
    columns=list(range(1, 9)),
)

## build A, X and B

### set unique index

In [None]:
def index_func(x):
    """Return the part of ``x`` that precedes the first ';'."""
    return x.split(';')[0]


# Re-index the Intensity tables with ``index_func``; judging by the helper's
# name this presumably keeps only proteins found in both tables (confirm in
# PP_base).
_profile_data, _mixtures = PP_base.return_mutual_proteins_by_index(
    profile_data, mixtures, index_func=index_func
)

# A: selected profile columns, B: selected mixture columns,
# X: the known proportions table defined earlier.
A = _profile_data[profile_data_relvent_data]
B = _mixtures[mixtures_relvent_data]
X = cell_proportions_df

In [None]:
# Strip the 'Intensity ' prefix from the sample labels of B, X and A.
# Taking the LAST split piece (rather than index 1) leaves an already-stripped
# label unchanged, so re-running this cell no longer raises IndexError.
B = B.rename(columns=lambda f: f.split('Intensity ')[-1])

X = X.rename(index=lambda f: f.split('Intensity ')[-1])
# Mixture columns start out as the integers 1..8; prefix them once only, so a
# second run does not produce 'mixturemixture1'.
X = X.rename(columns=lambda f: f if str(f).startswith('mixture') else 'mixture' + str(f))

A = A.rename(columns=lambda f: f.split('Intensity ')[-1])

## models

### naive

In [None]:
# Baseline: fit on the full, unfiltered Intensity tables.
_calc_and_display_with_cellmix(A, B, X)

### with preprocess

#### naive discriminative

In [None]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the "naive discriminative" protein selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_naive_discriminative_proteins(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

#### binary occurrence

In [None]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the "binary occurrence" selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_binary_occurrence(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

#### Top margin

In [None]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the margin-quantile selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_margin_quantile(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

#### under quantile 

In [None]:
# Two filtering passes over (A, B) before fitting: the generic clean-up first,
# then the under-quantile selection.
_A, _B = PP_proteins.pp_clean_irrelevant_proteins(A, B)
_A, _B = PP_proteins.pp_under_quantile(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

# most important proteins from genes 

### read gene data

In [None]:
# Load the gene-level abundance sheet and index it by gene name. The index is
# set through a chained call instead of `inplace=True`, so the name is bound
# exactly once per run of the cell.
# NOTE(review): this path climbs three directory levels while the proteomics
# files are read from '../data/' - confirm the location is intended.
raw_gene_abundance = pd.read_excel('../../../data/ni.3693-S7.xlsx').set_index('Gene name')
gene_abundance_over_cells = raw_gene_abundance[['Bmemory_not_2', 'MOclassical_not_3', 'T4cm_not_5']]

# Drop rows whose index label is not a string, then rows with missing values.
# `isinstance` is the idiomatic type check and also accepts str subclasses.
wrong_indexs = pd.Index([g for g in gene_abundance_over_cells.index if not isinstance(g, str)])
clean_indexs = gene_abundance_over_cells.index.difference(wrong_indexs)
gene_abundance_over_cells = gene_abundance_over_cells.loc[clean_indexs].dropna()

#### take high std and binary differentiate genes 

In [None]:
# Genes whose spread across the three cell types is above the 99th percentile.
# The row-wise std is computed once and reused for both sides of the comparison.
gene_std = gene_abundance_over_cells.std(axis=1)
high_std_gene_abundance_idx = gene_abundance_over_cells[gene_std > gene_std.quantile(0.99)].index

# Genes that fall below the per-column 1% quantile of absolute abundance in at
# least one cell type (rows that are entirely masked out are dropped).
abs_abundance = gene_abundance_over_cells.abs()
gene_binary_filtering_idx = (
    gene_abundance_over_cells[abs_abundance < abs_abundance.quantile(0.01)]
    .dropna(how='all')
    .index
)

genes_to_take = high_std_gene_abundance_idx.union(gene_binary_filtering_idx)

# Map gene name -> major protein id using the mixtures table; when a gene name
# occurs more than once the last row wins.
# NOTE(review): if 'Gene names' can hold several ';'-joined names per row, the
# single-gene lookups below will miss those rows - confirm.
gene_to_major_protein_dict = mixtures.set_index('Gene names')['major_protein_id'].to_dict()
proteins_to_take = [
    gene_to_major_protein_dict[gene]
    for gene in genes_to_take
    if gene in gene_to_major_protein_dict
]

In [None]:
# `proteins_to_take` is derived from the raw `mixtures` table, whereas A and B
# were built from the re-indexed tables, so some ids may be absent from them
# (presumably A/B only hold proteins common to profile and mixtures - confirm).
# `.loc` with a list containing missing labels raises KeyError on pandas >= 1.0,
# so keep only the labels that both frames actually contain.
proteins_in_A_and_B = [p for p in proteins_to_take if p in A.index and p in B.index]
if not proteins_in_A_and_B:
    raise ValueError('none of the gene-selected proteins is present in both A and B')

_A, _B = A.loc[proteins_in_A_and_B], B.loc[proteins_in_A_and_B]
_A, _B = PP_proteins.pp_under_quantile(_A, _B)

_calc_and_display_with_cellmix(_A, _B, X)

In [None]:
# Import PCA from the public package path; the private `sklearn.decomposition.pca`
# module was deprecated and later removed from scikit-learn.
from sklearn.decomposition import PCA