In [None]:
# Move the kernel's working directory one level up before anything else runs.
# NOTE: the path is relative, so re-running this cell climbs one more level each
# time; restart the kernel before running the notebook again.
%cd "../"
%pwd

import io
import random
from typing import Callable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr

from load_data import *
from pred_score import *
from Filter_FS import *
from hybrid_FS import *
from overlap_genes import *
from crossValidation import *

In [None]:
#Seed Python's and NumPy's global random generators with the same value so that runs are reproducible
for seed_generator in (random.seed, np.random.seed):
    seed_generator(3)

# Baseline with preselected genes 

In [None]:
#Load the preprocessed table: the last column is used as the integer label vector y,
#every column before it is kept as the feature matrix
all_table = np.array(pd.read_csv ('../data/processed_data/all.csv'))
all_norm, y = all_table[:, :-1], all_table[:, -1].astype(int)

In [None]:
#Fit the model on the full feature matrix and collect its predictions
families = np.unique(y)
model = FamiliesClusters(families, compute_precision, True)
pred = model.fit_predict(all_norm, y)

In [None]:
#Sanity check (feature matrix shape next to the number of predictions), then the score kept by the fitted model
acc = model.score_
print(all_norm.shape, len(pred))
print(acc)

In [None]:
#Print the statistics returned by compute_statTP behind a header naming them
stat_names = "TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = "
print(stat_names, compute_statTP(y,pred))

# Mutual information maximizer (MIM)

In [None]:
#Parameters forwarded to the MIM feature selection (N is the grid of candidate values)
N = np.arange(80,1500,30)
kwargs = dict(Model=FamiliesClusters, Scoring=compute_precision, maximize=True, N=N, n_neighbors=3, plot=True)

#Cross-validate the MIM feature selection (the 5 is presumably the number of folds)
subset, score_training, score_testing = cross_validation(
    y, all_norm, FamiliesClusters, compute_precision, True, 5, MIM, **kwargs
)

#Mean and standard deviation of the test scores
mean_score_test = np.mean(score_testing)
std_score_test = np.std(score_testing)
print('test', mean_score_test, std_score_test)

#Refit on the whole data set restricted to the selected features and report the statistics
x_subset = all_norm[:, subset]
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(x_subset, y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

# Anova F test

In [None]:
#Parameters forwarded to the ANOVA feature selection (N is the grid of candidate values)
N = np.arange(80,3000,30)
kwargs = {
    'Model': FamiliesClusters,
    'Scoring': compute_precision,
    'maximize': True,
    'N': N,
    'plot': True,
}

#Cross-validate the ANOVA feature selection (the 5 is presumably the number of folds)
cv_result = cross_validation(y, all_norm, FamiliesClusters, compute_precision, True, 5, ANOVA, **kwargs)
subset, score_training, score_testing = cv_result

#Mean and standard deviation of the test scores
mean_score_test = np.mean(score_testing)
std_score_test = np.std(score_testing)
print('test', mean_score_test, std_score_test)

#Refit on the whole data set restricted to the selected features and report the statistics
model = FamiliesClusters(np.unique(y), compute_precision, True)
x_subset = all_norm[:, subset]
pred = model.fit_predict(x_subset, y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

# Hybrid methods

# Mutual information and simulated annealing

In [None]:
#Parameters forwarded to the MI + simulated annealing feature selection
N = np.array([700])
kwargs = dict(
    Model=FamiliesClusters,
    Scoring=compute_precision,
    maximize=True,
    N=N,
    n_iter=1400,
    n_neighbors=3,
    p_mutate=0.1,
    c=1,
    plot=True,
)

#Cross-validate the hybrid feature selection (the 5 is presumably the number of folds)
subset, score_training, score_testing = cross_validation(
    y, all_norm, FamiliesClusters, compute_precision, True, 5, MI_stimulated_annealing, **kwargs
)

#Mean and standard deviation of the test scores
mean_score_test = np.mean(score_testing)
std_score_test = np.std(score_testing)
print('test', mean_score_test, std_score_test)

#Refit on the whole data set restricted to the selected features and report the statistics
x_subset = all_norm[:, subset]
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(x_subset, y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

# Leave-one-data-set-out: each data set in turn is held out for testing while the others are used for training

In [None]:
def getTrainTestAll(y:np.array, x:np.array, ind_dataset:list, i:int):
    """Hold out one data set of the merged data for testing and train on the rest.

      parameters:
      y : np.array,
        family of each data point
      x : np.array,
        features of each data point (one row per data point)
      ind_dataset : list,
        end row (exclusive) of each data set inside the merged data; data set i
        spans the rows from ind_dataset[i-1] (0 for the first one) up to ind_dataset[i]
      i: int,
        index of the data set to keep for testing

      returns:
      x_train : np.array,
        rows of x that do not belong to the test data set
      y_train : np.array,
        families of the training rows
      x_test : np.array,
        rows of x that belong to the test data set
      y_test : np.array,
        families of the test rows"""

    #The first data set starts at row 0, every other one starts where the previous one ends
    first_row = 0 if i == 0 else ind_dataset[i-1]
    held_out_rows = np.arange(first_row, ind_dataset[i], 1)

    #Test part: the held-out rows; training part: everything else
    x_test, y_test = x[held_out_rows,:], y[held_out_rows]
    x_train = np.delete(x, held_out_rows, axis=0)
    y_train = np.delete(y, held_out_rows)

    return x_train, y_train, x_test, y_test

In [None]:
 def optimization_on_allsets(y:np.array, x:np.array, ind_dataset:list, Model_test: Callable, Scoring_test: Callable, maximize_test:bool, kfold:int, func: Callable, **kwargs: dict):
    """Run a feature selection with every data set held out in turn and keep the best subset.

      For each data set, the feature selection (func) is run on all the other
      data sets, then the selected subset is scored on the held-out data set with
      a freshly built Model_test. The subset with the best test score is returned.

      parameters:
      y : np.array,
        family of each data point
      x : np.array,
        features of each data point
      ind_dataset : list,
        list of indices where each data set is stored
      Model_test : Callable,
        the model is fitted using this method
      Scoring_test: Callable,
        scoring function used to evaluate the model
      maximize_test: bool,
        if True the scoring function is maximized, else it is minimized
      kfold: int,
        number of folds for CV (currently not used by this function)
      func: Callable,
        feature selection function, should return the selected subset and the associated score
      kwargs: **kwargs : dict,
        dictionary of parameters and their values (kwargs = {'param_name' : value}) to pass to the given method (func)

      returns:
      final_subset :
        subset of features (as returned by func) that reached the best test score
      score_training : list,
        score returned by func for each held-out data set
      score_testing : list,
        test score obtained on each held-out data set """

    #The best test score is the largest one when maximizing and the smallest one when minimizing
    best_index = np.argmax if maximize_test else np.argmin

    #Store the scores of every split and the best subset found so far
    score_training = []
    score_testing = []
    final_subset = []

    for i in range(len(ind_dataset)):
        #Data set i is held out, the others are used for the feature selection
        x_train, y_train, x_test, y_test = getTrainTestAll(y, x, ind_dataset, i)

        #Run feature selection on training set
        subset, score = func(y_train, x_train, **kwargs)

        #Evaluate subset on test set; the third argument is presumably the model's
        #maximize flag, so the caller's maximize_test is forwarded instead of a hard-coded True
        model_test = Model_test(np.unique(y_test),Scoring_test,maximize_test)
        model_test.fit_predict(x_test[:, subset],y_test)
        test_score = model_test.score(x_test[:, subset],y_test)

        #Store the scores of the current split
        score_training.append(score)
        score_testing.append(test_score)

        #Keep the current subset when its test score is the best one seen so far
        if len(final_subset) == 0 or best_index(score_testing) == i:
            final_subset = subset

    return final_subset, score_training, score_testing

In [None]:
#Load the indices that tell where each data set is stored inside the merged data
ind_dataset_table = pd.read_csv ('../data/processed_data/ind_datasets.csv')
ind_dataset = np.array(ind_dataset_table)

# Mutual information maximizer (MIM)

In [None]:
#Define parameters for MIM method
N = np.arange(80,1500,30)
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True}

#Hold out each data set in turn: optimization_on_allsets is the function that takes
#ind_dataset; the other cells call cross_validation without that argument, so passing
#it there would shift every following argument
subset, score_training, score_testing = optimization_on_allsets(y,all_norm, ind_dataset, FamiliesClusters, compute_precision,True, 5,  MIM, **kwargs)

#Mean and standard deviation of the test scores over the held-out data sets
mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)
print('test', mean_score_test, std_score_test)

#Predict and evaluate on the whole data set restricted to the selected features
model = FamiliesClusters(np.unique(y),compute_precision,True)
x_subset = all_norm[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))