In [2]:
import numpy as np
import pandas as pd
import sklearn
import math 
from typing import AnyStr, Callable, Tuple
from sklearn.base import ClusterMixin,BaseEstimator
from scipy.cluster.hierarchy import ward, cut_tree
from sklearn.metrics import make_scorer
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA

from load_data import *
from pred_score import *
from Filter_FS import *
from hybrid_FS import *
from overlap_genes import *
from crossValidation import *

# Using other genesets

In [21]:
#Load AE3 data
AE3 = pd.read_csv ('../data/merged_data/D0.csv')
AE3 = AE3.set_index('Unnamed: 0')
y_AE3 = np.squeeze(np.array(pd.read_csv('../data/merged_data/y_D0.csv')))

AE3_master = pyreadr.read_r('../data/data_master/Master_Almut_LSK_D2_exp1_20_3t5_nocellcyclesplit_lifted.rds')
print(AE3_master)
AE3_master = AE3_master[None]
AE3_master = AE3_master.set_index('gene')     

CV2_of_mean = AE3_master['P_value_estimate_CV2_ofmeans_20_']

CV2_of_mean = CV2_of_mean[CV2_of_mean <= 0.05].index
pd.DataFrame(CV2_of_mean).to_csv('../data/optimized_subsets/LK_exp1_CV2mean.csv', index=False)

OrderedDict([(None,                P_value_estimate_CV2_ofmeans_20_  \
rownames                                          
0610006L08Rik                               NaN   
0610007P14Rik                              0.20   
0610009B22Rik                              0.90   
0610009E02Rik                              0.45   
0610009L18Rik                               NaN   
...                                         ...   
mt-Nd3                                     0.00   
mt-Nd4                                     0.00   
mt-Nd4l                                    0.45   
mt-Nd5                                     0.45   
mt-Nd6                                     0.75   

               P_value_estimate_CV2_ofmeans_20_NA  \
rownames                                            
0610006L08Rik                                 NaN   
0610007P14Rik                                0.65   
0610009B22Rik                                0.85   
0610009E02Rik                                0.90  

KeyError: "None of ['gene'] are in the columns"

In [4]:
ind_CV2 = get_ind_genes(AE3.index, CV2_of_mean)
ind_CV2_genes = np.zeros((len(AE3.index),))
ind_CV2_genes[ind_CV2] = True
ind_CV2_genes = list(ind_CV2_genes.astype(bool))

NameError: name 'get_ind_genes' is not defined

In [5]:
def get_ind_genes(all_genes:np.array, subset:np.array):
    ind = []
    for gene in subset:
        ind = np.append(ind, int(np.squeeze(np.where(all_genes == gene))))
        ind = list(ind.astype(int))
        
    return ind

In [9]:
subsets = [np.squeeze([ind_CV2_genes]), np.squeeze([ind_intraCV2_genes]), np.squeeze([ind_corr_genes])]
model = EnsemblingHierarchical(np.unique(y_AE3),compute_precision,True,subsets = subsets, ensembling='voting', threshold_voting = 0.5)
result  = model.fit_predict(X = np.array(AE3).T, y= y_AE3)
print(model.score_, model.recovery)

(3, 333)
0.5174129353233831 0.7477477477477478


# Unsupervised genes subset selection

In [1]:
from scipy.cluster.hierarchy import cophenet
import numpy as np

def compute_cophe_coeff(orign_dists:np.array, Z:np.array):
    '''Compute the Cophenetic coefficient from the original distance matrix and generated dendrogram of clustering
        parameters:
        -------
        orign_dists : np.array,
            original distance matrix before clustering
        Z : np.array,
            dendrogram of the clustering to evaluate
    
        Returns
        -------
        corr_coeff:float [0,1],
            computed Cophenetic coefficient '''
    cophe_dists = cophenet(Z) 
    corr_coef = np.corrcoef(orign_dists, cophe_dists)[0,1]
    
    return corr_coef

In [2]:
import numpy as np
import pandas as pd
import sklearn
import math 
from typing import AnyStr, Callable, Tuple
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.base import ClusterMixin,BaseEstimator
from scipy.cluster.hierarchy import ward, cut_tree
from sklearn.metrics import make_scorer
from scipy_cut_tree_balanced import cut_tree_balanced

from pred_score import *

In [3]:
class FamiliesClusters(ClusterMixin, BaseEstimator):
    '''
    Families clustering
    Hierachical clustering with the ward2 criterion, use the spearman correlation as the distance measure.
    Parameters
    ----------
    family_interest: np.array,
        list of family of interest
    Scoring : Callable,
        scoring function use to evaluate the model
    maximize: bool,
        if True the scoring function is maximize, else it is minimize
        
    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm.
    labels_ : ndarray of shape (n_samples)
        Cluster labels for each point.
    family_interest: np.array,
        list of family of interest.
    Scoring_ : Callable,
        scoring function use to evaluate the model.
    maximize_: bool,
        if True the scoring function is maximize, else it is minimize.
    
    '''
    
    def __init__(self, family_interest_:np.array, Scoring_:Callable, maximize_:bool):
        super().__init__()
        self.family_interest_ = family_interest_
        self.Scoring_ = Scoring_
        self.maximize_ = maximize_
        
    def fit(self, X:np.array, y:np.array, NmaxCluster:int = None):
        '''Fit data using hierachical clustering with the ward2 criterion and use the spearman correlation as the distance measure and predict.
        
        parameters:
        -------
        x : np.array,
            features of each data points
        y : np.array,
            family of each data points
        NmaxCluster : int,
            max number of cells in a cluster
        

        Returns
        -------
        self,
            return fitted self'''
        
        #Compute the spearman correlation of X
        X_pd = pd.DataFrame(X.T)
        corr_expr= X_pd.corr(method= 'spearman')
        corr_expr = np.array((1 - corr_expr)/2)
        corr_expr = upper_tri_indexing(corr_expr)**2 #Squared for ward2 criterion
        
        if(np.shape(X.T)[0] == 1):
            corr_expr.fill(1)
        
        #Create clustering tree using hierarchical clustering with spearmann correlation and ward2 criterion
        Z = ward(corr_expr)
        
        #Cut the tree into clusters of maximum size equal to the number of cells in the largest family in data set
        if NmaxCluster == None:
            Nmax = round(np.mean(np.unique(y,return_counts=True)[1]))
        else:
            Nmax = NmaxCluster
        
        clustering = np.squeeze(cut_tree_balanced(Z, max_cluster_size = Nmax)[0])
        
        #Assign all cells predicted alone in a cluster the label 0
        clustering += 1
        values, counts = np.unique(clustering, return_counts=True)
        onecell_family = values[np.where(counts==1)]
        for fam in onecell_family:
            clustering[clustering == fam] = 0
        
        #Compute recovery
        self.recovery = compute_recovery(clustering)
    
        #Score the cluster and determine the number of clusters
        if self.Scoring_ != compute_cophe_coeff:
            score = self.Scoring_(y,clustering)
        else:
            score = self.Scoring_(corr_expr,Z)
            
        N = len(np.unique(clustering))
   
        self.n_clusters_, self.labels_, self.score_ = N, clustering, score
        return self
    
    def fit_predict(self, X:np.array, y:np.array,NmaxCluster:int = None):
        self.fit(X,y,NmaxCluster)
        
        return self.labels_      
    
    def score(self, X, y_true):
        #Error come from here y_true and X not same size as self.labels_ -> function fit_as
        return self.score_

In [4]:
AE3= np.array(pd.read_csv ('../data/processed_data/AE3.csv'))
y = np.array(AE3[:,-1],dtype=int)
AE3 = AE3[:,0:-1]
AE3.shape

(333, 11894)

In [5]:
model = FamiliesClusters(np.unique(y),compute_cophe_coeff,True)
pred = model.fit_predict(AE3,y)
print(model.score_, model.recovery)

0.22874411254286686 0.6726726726726727


In [6]:
genes_AE3 = np.squeeze(pd.read_csv ('../data/processed_data/AE3genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv ('../data/optimized_subsets/AE3genes_bestMIM.csv'))

ind_opt_genes = []
for gene in gene_optimized:
    ind_opt_genes = np.append(ind_opt_genes, int(np.squeeze(np.where(genes_AE3 == gene))))
ind_opt_genes= list(ind_opt_genes.astype(int))

#Only keep the optimized genes
AE3 = AE3[:,ind_opt_genes]
print(AE3.shape)

(333, 1010)


In [9]:
model = FamiliesClusters(np.unique(y),compute_cophe_coeff,True)
pred = model.fit_predict(AE3,y)
print(model.score_, model.recovery)

0.5333144434049149 0.8468468468468469
