In [2]:
import numpy as np
import pandas as pd
import sklearn
import math 
from typing import AnyStr, Callable, Tuple
from sklearn.base import ClusterMixin,BaseEstimator
from scipy.cluster.hierarchy import ward, cut_tree
from sklearn.metrics import make_scorer
from sklearn.cluster import AffinityPropagation
from sklearn.decomposition import PCA

from load_data import *
from pred_score import *
from Filter_FS import *
from hybrid_FS import *
from overlap_genes import *
from crossValidation import *

# Iterative clustering

In [13]:
class IterativeClustering(ClusterMixin, BaseEstimator):
    '''
    Iterative Families clustering
    Hierachical clustering with the ward2 criterion, use the pearson's correlation as the distance measure.
    Parameters
    ----------
    family_interest: np.array,
        list of family of interest
    Scoring : Callable,
        scoring function use to evaluate the model
    maximize: bool,
        if True the scoring function is maximize, else it is minimize
        
    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm.
    labels_ : ndarray of shape (n_samples)
        Cluster labels for each point.
    family_interest: np.array,
        list of family of interest.
    Scoring : Callable,
        scoring function use to evaluate the model.
    maximize: bool,
        if True the scoring function is maximize, else it is minimize.
    
    '''
    
    def __init__(self, family_interest_:np.array, Scoring_:Callable, maximize_:bool):
        super().__init__()
        self.family_interest_ = family_interest_
        self.Scoring_ = Scoring_
        self.maximize_ = maximize_
        
    def fit(self, X:np.array, y:np.array, N:int =2, iterations:int =20, subset: np.array = None):
        '''Fit data using hierachical clustering with the ward2 criterion and use the spearman correlation as the distance measure and predict.
        
        parameters:
        -------
        x : np.array,
            features of each data points
        y : np.array,
            family of each data points
        N : int,
            max number of
        iterations : int,
            number of iterative clustering
    

        Returns
        -------
        self,
            return fitted self'''
        
        #Iterative clustering algorithm
        result = iterative_clustering(X,y , N, iterations)
        r = result[0]
        rcsv = pd.DataFrame(r)
        rcsv.to_csv('../data/test.csv', index=False)
        
        
        #Score the cluster and determine the number of clusters
        TP,FP,tot = test_prediction_multiple_overlap_3(result, result, result,y)
        score = TP/(TP+FP)
        self.score_ = score
        
        return self, TP, FP  
    
    def score(self, X, y_true):
        return self.score_

In [15]:
A = [1,20,1,20,20,1,100,100,100]
B = [100,30,100,30,30,100,1,1,1]
norm_data = np.array([A,B])
y_test = [1,2,1,2,2,1,3,3,3]

In [16]:
model = IterativeClustering(np.unique(y_test),compute_precision,True)
result  = model.fit(norm_data,y_test,N=2, iterations=20)
model.score_

0.6666666666666666

In [38]:
#Load preprocess data
AE3= np.array(pd.read_csv ('../data/processed_data/AE3.csv'))
y = np.array(AE3[:,-1],dtype=int)
AE3 = AE3[:,0:-1]

In [19]:
model = IterativeClustering(np.unique(y),compute_precision,True)
result,TP,FP  = model.fit(AE3,y,N=2, iterations=20)
model.score_

print(TP,FP)

4 448


In [12]:
#Define parameters for MIM method
N = np.arange(80,1500,30)
kwargs = {'Model': IterativeClustering, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

subset, score_training, score_testing = cross_validation(y,AE3, IterativeClustering, compute_precision,True, 5,  MIM, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
model = IterativeClustering(np.unique(y),compute_precision,True)
result  = model.fit(AE3,y,N=2, iterations=20)
model.score_


TypeError: fit() missing 1 required positional argument: 'y'

# Repeated prediction

In [86]:
class EnsemblingHierarchical(ClusterMixin, BaseEstimator):
    '''
    Hierachical clustering with the ward2 criterion, use the spearmann's correlation as the distance measure, on N subsets of genes. 
    Then, use ensembling method to give final cluster assignments.
    Parameters
    ----------
    family_interest: np.array,
        list of family of interest
    Scoring : Callable,
        scoring function use to evaluate the model
    maximize: bool,
        if True the scoring function is maximize, else it is minimize
    subsets: list,
        list of the different subsets of genes
    ensembling_: str,
        ensembling method to produce final clustering
        
    Attributes
    ----------
    n_clusters_ : int
        The number of clusters found by the algorithm.
    labels_ : ndarray of shape (n_samples)
        Cluster labels for each point.
    family_interest: np.array,
        list of family of interest.
    Scoring_ : Callable,
        scoring function use to evaluate the model.
    maximize: bool,
        if True the scoring function is maximize, else it is minimize.
    subsets_: list,
        list of the different subsets of genes
    ensembling_: str,
        ensembling method to produce final clustering
    
    '''
    
    def __init__(self, family_interest_:np.array, Scoring_:Callable, maximize_:bool, subsets: list, ensembling: str):
        super().__init__()
        self.family_interest_ = family_interest_
        self.Scoring_ = Scoring_
        self.maximize_ = maximize_
        self.subsets_ = subsets
        self.ensembling_ = ensembling
        
    def fit(self, X:np.array, y:np.array, NmaxCluster:int=None):
        '''Fit data using hierachical clustering with the ward2 criterion and use the spearman correlation as the distance measure and predict 
        on provided subsets.
        
        parameters:
        -------
        x : np.array,
            features of each data points
        y : np.array,
            family of each data points
        NmaxCluster : int,
            max number of cells in a cluster

        return
        -------
        self,
            return fitted self'''
        
        clustering = []
        #Cluster data using the different subsets of features
        for subset in self.subsets_:
            model = FamiliesClusters(self.family_interest_, self.Scoring_, self.maximize_)
            pred = model.fit_predict(X,y)
            clustering.append(pred)
            
        #Get the final clustering from the individual clustering result
        if self.ensembling_ == 'voting':
            final_ensembling = ensembling_voting(clustering)
        elif self.esembling_ == 'all':
            final_ensembling = ensembling_all(clustering)
        
        #Score the cluster and determine the number of clusters
        score = self.Scoring_(y,final_ensembling)
        N = len(np.unique(final_ensembling))
   
        self.n_clusters_, self.labels_, self.score_ = N, final_ensembling, score
        return self
    
    def fit_predict(self, X:np.array, y:np.array,NmaxCluster:int = None):
        self.fit(X,y,NmaxCluster)
        
        return self.labels_ 
    
    def score(self, X, y_true):
        return self.score_

In [79]:
def ensembling_voting(clustering:list):
    """ Compute the final clustering from given individual clustering. 
    Two cells that are the most of the clusterings predict together are together in the final clustering. 
  
      parameters:
      clustering: list,
        list of the independent clusterings from which the final clustering is computed

      return:
      final_label: np.array,
        final computed 
    """
    #Compute co_occurrence matrix of the clustering
    co_occurrence = np.zeros((len(clustering[0]), len(clustering[0])))
    for cluster in clustering:
        co_occurrence += outer_equal(cluster)
    
    #Compute final clustering with majority voting
    N_vote = math.floor(len(clustering)/2 + 1) #Vote necessary to consider two cells same family
    same_family = co_occurrence >= N_vote 
    for i in range(0,len(clustering[0])):
        for j in range(0,i+1):
            same_family[i,j] = False
    #Get indexes of cell together
    ind_together = np.nonzero(same_family)
    final_label = np.zeros((len(clustering[0],)))
    
    for i in range(0,len(ind_together[0])):
        first_cell, second_cell = ind_together[0][i], ind_together[1][i]
        if final_label[first_cell] == 0 or  final_label[second_cell] == 0:
            if final_label[first_cell] == 0 and  final_label[second_cell] == 0:
                final_label[first_cell], final_label[second_cell] = np.max(final_label) + 1, np.max(final_label) + 1
            else:
                final_label[first_cell], final_label[second_cell] = np.max([final_label[first_cell], final_label[second_cell]]), np.max([final_label[first_cell], final_label[second_cell]])
        else:
            final_label[final_label == final_label[second_cell]] = final_label[first_cell]
    
    return final_label.astype(int)

In [80]:
def ensembling_all(clustering:list):
    """ Compute the final clustering from given individual clustering. 
    Two cells need to be predicted together in all the clusters, otherwise the cell family is undertermined. 
  
      parameters:
      clustering: list,
        list of the independent clusterings from which the final clustering is computed

      return:
      final_label: np.array,
        final computed 
    """
    ##Compute co_occurrence matrix of the clustering
    co_occurrence = np.zeros((len(clustering[0]), len(clustering[0])))
    for cluster in clustering:
        co_occurrence += outer_equal(cluster)
    
    #Compute final clustering 
    N_vote =  len(clustering)
    same_family = co_occurrence >= N_vote 
    for i in range(0,len(clustering[0])):
        for j in range(0,i+1):
            same_family[i,j] = False
    #Get indexes of cell together
    ind_together = np.nonzero(same_family)
    final_label = np.zeros((len(clustering[0],)))
    
    for i in range(0,len(ind_together[0])):
        first_cell, second_cell = ind_together[0][i], ind_together[1][i]
        if final_label[first_cell] == 0 or  final_label[second_cell] == 0:
            if final_label[first_cell] == 0 and  final_label[second_cell] == 0:
                final_label[first_cell], final_label[second_cell] = np.max(final_label) + 1, np.max(final_label) + 1
            else:
                final_label[first_cell], final_label[second_cell] = np.max([final_label[first_cell], final_label[second_cell]]), np.max([final_label[first_cell], final_label[second_cell]])
        else:
            final_label[final_label == final_label[second_cell]] = final_label[first_cell]
    
    return final_label.astype(int)

In [75]:
def compute_recovery(final_label:np.array):
    return len(np.nonzero(final_label)[0])/len(final_label)

In [76]:
clustering=[[1,1,2,2,1,2,3,3,3],[1,1,2,2,2,2,3,3,3],[1,1,2,2,1,2,3,3,1]]
final = ensembling_all(clustering)
compute_recovery(final)

0.7777777777777778

In [77]:
def subsampling_genes(subset:np.array, N:int, p_mutate:float):
    subsets = []
    
    for i in range(0,N):
        subsets.append(mutate(subset,p_mutate))
        
    return subsets

In [84]:
#Load preprocess data
AE3= np.array(pd.read_csv ('../data/processed_data/AE3.csv'))
y = np.array(AE3[:,-1],dtype=int)
AE3 = AE3[:,0:-1]

genes_AE3 = np.squeeze(pd.read_csv ('../data/processed_data/AE3genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv ('../data/optimized_subsets/AE3genes_bestANOVA.csv'))

ind_opt_genes = []
for gene in gene_optimized:
    ind_opt_genes = np.append(ind_opt_genes, int(np.squeeze(np.where(genes_AE3 == gene))))
ind_opt_genes= list(ind_opt_genes.astype(int))

#Only keep the optimized genes
AE3 = AE3[:,ind_opt_genes]
print(AE3.shape)

subset = np.ones((len(gene_optimized),))
subsets = subsampling_genes(subset, 1, 0.25)
print(len(subsets[0]))

(333, 280)
280


In [85]:
model = EnsemblingHierarchical(np.unique(y),compute_precision,True,subsets = subsets, ensembling='voting')
result  = model.fit_predict(X = AE3, y= y)
model.score_

1


0.01585014409221902

In [87]:
model = FamiliesClusters(np.unique(y),compute_precision,True)
pred = model.fit_predict(AE3,y)
model.score_

0.039301310043668124