# Import all necessary libraries and project modules (.py files)

In [12]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd  
import math

from load_data import *
from pred_score import *
from Filter_FS import *
from overlap_genes import *
from crossValidation import *
from evaluation_measure import *

# Fix the seeds of both random number generators so results are reproducible.
# `random` is imported explicitly here instead of relying on one of the
# star-imports above happening to leak it into the namespace.
import random

random.seed(3)
np.random.seed(3)

# Load normalized data 

In [89]:
# Path (as a str) to the normalized data whose families should be predicted.
filepath = '../data/processed_data/AE3.csv'

# `norm` is expected as number of cells x number of genes. If the CSV is stored
# the other way round, append `.T` to the read_csv call below.
norm = pd.read_csv(filepath)#.T
genes_names = norm.columns
norm = np.array(norm) #convert to array
print('Shape of the normalized data should be: number of cells x number of genes. ', norm.shape)

# Optional speed-up: if the optimization takes too much time, keep only the
# genes that are expressed in at least 5% of the cells. Comment out the lines
# below to keep every gene.
gene_expressed = filter_norm_data(norm.T, 0.05)
norm = (norm.T[gene_expressed]).T
# Apply the same selection to the gene names so that genes_names[j] still
# names column j of `norm` after filtering.
genes_names = pd.Index(np.asarray(genes_names)[gene_expressed])

Shape of the normalized data should be: number of cells x number of genes.  (333, 11895)


# Baseline with all genes

When clustering the data in an unsupervised manner, the maximum expected number of cells in one family/cluster needs to be provided.

In [7]:
Nmax = 3 # Maximum number of cells expected in a single family/cluster

# Predict the families of the dataset without labels (family_interest_ and y are None)
# and evaluate the clustering with the cophenetic coefficient (maximized).
model = FamiliesClusters(family_interest_ = None, Scoring_ = compute_cophe_coeff, maximize_ = True)
pred = model.fit_predict(X=norm, y=None, NmaxCluster=Nmax) 

# Report the model's score_ and recovery attributes after fitting.
print('Cophenetic coefficient and recovery: ', model.score_, ' ', model.recovery)

Cophenetic coefficient and recovery:  0.22877350018532344   0.6726726726726727


# Mutual information maximizer (MIM)

Mutual information maximization (MIM) ranks features by their mutual information, an estimate of the statistical dependency between two random variables that is also able to capture non-linear dependencies.
$$ MI = I(X ; Y) = H(X) - H(X \mid Y) $$
$$ H(X) = - \sum_{i=1}^{n} P(x_i) \log_2 P(x_i) $$
The simplest way of using this scoring criterion is by ranking the features with decreasing mutual information content and choosing the top N features.

A grid search is performed to find the final size of the gene subset. You might want to play around with the range of the final number of genes depending on the size of the dataset. The number of folds for k-fold cross-validation also needs to be provided. For MIM feature selection, the number of neighbors used for the computation of the mutual information of each gene is a tunable parameter.

In [83]:
def split_data_cv_unsupervised(x: np.ndarray, kfold: int):
    """ Randomly split the rows of x into kfold folds for CV.

      The first kfold - 1 folds each hold floor(n_cells / kfold) rows drawn
      without replacement; the last fold holds every remaining row (in
      ascending row order), so it also absorbs the remainder when n_cells is
      not divisible by kfold.

      parameters:
      x: np.ndarray,
        gene expression of each data point, one row per cell (2-D)
      kfold: int,
        number of folds for CV, between 1 and the number of rows of x

      returns:
      x_split : list of np.ndarray,
        split normalized data, x_split[k] = kth fold

      raises:
      ValueError,
        if kfold is smaller than 1 or larger than the number of rows of x
      """
    n_cells = x.shape[0]
    if kfold < 1 or kfold > n_cells:
        raise ValueError(
            "kfold must be between 1 and the number of rows of x "
            "(got kfold={}, rows={})".format(kfold, n_cells)
        )

    # Draw the row indices of the first kfold - 1 folds in a single call so
    # that no row can end up in two folds.
    cells_ind = np.arange(n_cells)
    n_cells_fold = n_cells // kfold
    drawn = np.random.choice(cells_ind, (kfold - 1, n_cells_fold), replace=False)

    # One index array per fold; the last fold is whatever was not drawn
    # (every row when kfold == 1, since nothing is drawn in that case).
    ind_folds = list(drawn)
    ind_folds.append(np.setdiff1d(cells_ind, drawn.ravel()))

    return [x[fold, :] for fold in ind_folds]

In [85]:
# Preview the splitter: divide the cells of `norm` into 5 random folds.
x_split = split_data_cv_unsupervised(x=norm, kfold=5)

In [None]:
def cross_validation(x: np.ndarray, Model_test: Callable, Scoring_test: Callable, maximize_test: bool, kfold: int, func: Callable, y=None, **kwargs):
    """ Cross validate any feature selection method in an unsupervised manner.

      For every fold, the feature selection method is run on the remaining
      folds (training set) and the selected subset is evaluated on the
      held-out fold (test set). No labels are used: None is passed wherever
      the feature selection method or the model expects labels.

      parameters:
      x : np.ndarray,
        genes expression of each data points (cells x genes)
      Model_test : Callable,
        the model is fitted using this method; called as
        Model_test(None, Scoring_test, maximize_test)
      Scoring_test: Callable,
        scoring function used to evaluate the model
      maximize_test: bool,
        if True the scoring function is maximized, else it is minimized
      kfold: int,
        number of folds for CV (at least 2)
      func: Callable,
        feature selection function, called as func(None, x_train, **kwargs);
        should return the selected subset and the associated score
      y : None,
        accepted only so that calls written as cross_validation(y=None, x=...)
        do not leak `y` into **kwargs; must be None
      **kwargs : dict,
        dictionary of parameters and their values (kwargs = {'param_name' : value}) to pass to the given method (func)

      returns:
      final_subset : np.array,
        subset of features selected on the fold with the best test score
      score_training : list,
        score returned by func for each fold
      score_testing : list,
        test score obtained on the held-out fold, for each fold

      raises:
      ValueError,
        if y is not None or kfold is smaller than 2
      """
    if y is not None:
        raise ValueError("cross_validation is unsupervised: y must be None")
    if kfold < 2:
        raise ValueError("kfold must be at least 2 to build a training set")

    #Store score training and best subset
    score_training = []
    score_testing = []
    final_subset = []
    best_test_score = None

    #Split the data in kfold
    split_x = split_data_cv_unsupervised(x, kfold)

    for i in range(kfold):
        #Held-out fold is the test set, all other folds stacked row-wise are
        #the training set (folds may differ in size, so they are concatenated)
        x_test = split_x[i]
        x_train = np.concatenate(split_x[:i] + split_x[i + 1:], axis=0)

        #Run feature selection on training set (no labels)
        subset, score = func(None, x_train, **kwargs)

        #Evaluate subset on test set
        #NOTE(review): the baseline cell also passes NmaxCluster to
        #fit_predict - confirm whether it has to be supplied here as well.
        model_test = Model_test(None, Scoring_test, maximize_test)
        model_test.fit_predict(x_test[:, subset], None)
        test_score = model_test.score(x_test[:, subset], None)

        #Store scores of the current fold
        score_training.append(score)
        score_testing.append(test_score)

        #Keep the subset of the fold with the best test score so far,
        #respecting the optimisation direction (ties keep the earlier fold)
        if best_test_score is None:
            improved = True
        elif maximize_test:
            improved = test_score > best_test_score
        else:
            improved = test_score < best_test_score
        if improved:
            best_test_score = test_score
            final_subset = subset

    return final_subset, score_training, score_testing

In [None]:
#Define parameters for MIM method
N = np.arange(100,700,300) #values passed to MIM as 'N' (here 100 and 400)
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_cophe_coeff, 'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

#Cross validate the MIM optimization (unsupervised: no labels are passed)
subset, score_training, score_testing = cross_validation(x=norm, Model_test=FamiliesClusters, Scoring_test=compute_cophe_coeff, maximize_test=True, kfold=3,  func=MIM, **kwargs)
mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test score and std ', mean_score_test, std_score_test)

#Predict and evaluate on whole data set, restricted to the selected genes.
#The data loaded from AE3.csv lives in `norm`, and no ground-truth families
#are loaded in this notebook, so the evaluation mirrors the baseline cell
#(cophenetic coefficient and recovery) instead of label-based metrics.
model = FamiliesClusters(family_interest_ = None, Scoring_ = compute_cophe_coeff, maximize_ = True)
x_subset = norm[:, subset]
pred = model.fit_predict(X=x_subset, y=None, NmaxCluster=Nmax)

print('Cophenetic coefficient and recovery: ', model.score_, ' ', model.recovery)