In [1]:
%cd "../"
%pwd

import io
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyreadr

from load_data import *
from pred_score import *
from Filter_FS import *
from hybrid_FS import *
from overlap_genes import *
from crossValidation import *

In [2]:
#Fix the seeds of Python's `random` module and of NumPy's legacy global RNG to get reproducible results.
#`random` must be imported explicitly in the imports cell; relying on it leaking through a star import is fragile.
SEED = 3 #single place to change the seed used by both generators
random.seed(SEED)
np.random.seed(SEED)

# AE: AE3, AE4 fused

In [6]:
#AE3: expression table indexed by its 'Unnamed: 0' column, plus the matching family labels
AE3 = pd.read_csv('../data/merged_data/AE3.csv').set_index('Unnamed: 0')
y_AE3 = np.array(pd.read_csv('../data/merged_data/y_AE3.csv')).squeeze()
print(AE3.shape, y_AE3.shape)

#AE4: same layout as AE3
AE4 = pd.read_csv('../data/merged_data/AE4.csv').set_index('Unnamed: 0')
y_AE4 = np.array(pd.read_csv('../data/merged_data/y_AE4.csv')).squeeze()
print(AE4.shape, y_AE4.shape)

#Fuse the family labels: shift the AE4 labels (in place) by the largest AE3 label
#so that family indices of the two data sets cannot collide.
y_AE4 += y_AE3.max()
y_AE = np.concatenate((y_AE3, y_AE4))

#Fuse the expression tables side by side (columns of AE3 followed by columns of AE4)
AE = pd.concat([AE3, AE4], axis=1)
print(AE.shape)

#Keep only the rows selected by filter_norm_data with threshold 0.05
#(the baseline section below describes this as genes expressed in at least 5% of cells)
gene_expressed = filter_norm_data(AE, 0.05)
AE = AE[gene_expressed]
print(AE.shape)

#Store the names of the retained genes (the index of the filtered table)
genes_interest = AE.index.values
pd.DataFrame(genes_interest).to_csv('../data/processed_data/AE_interest.csv', index=False)

#Store the preprocessed table (without its index) and the fused labels
AE.to_csv('../data/processed_data/AE.csv', index=False)
pd.DataFrame(y_AE).to_csv('../data/processed_data/y_AE.csv', index=False)

(32285, 333) (333,)
(32285, 1473) (1473,)
(32285, 1806)
(10470, 1806)


# Baseline with genes expressed in at least 5% of cells

In [3]:
#Read back the preprocessed AE table and its fused family labels from processed_data
AE = pd.read_csv('../data/processed_data/AE.csv')
y = np.array(pd.read_csv('../data/processed_data/y_AE.csv')).squeeze()
print(AE.shape, y.shape)

(10470, 1806) (1806,)


In [None]:
#Baseline on all retained genes: build the FamiliesClusters model from the distinct
#labels in y and get its predictions for AE / y
model = FamiliesClusters(
    np.unique(y),
    compute_precision,
    True,
)
pred = model.fit_predict(AE, y)

In [None]:
#Show the data shape next to the number of predictions
print(
    AE.shape,
    len(pred),
)
#Report the score and the recovery stored on the fitted model
acc = model.score_
print(
    acc,
    model.recovery,
)

In [None]:
#Print the statistics returned by compute_statTP for the baseline predictions
print(
    "TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Mutual information maximizer (MIM)

In [None]:
#Define parameters for MIM method
N = np.arange(80,1500,30) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

#Run cross_validation with the MIM selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE, FamiliesClusters, compute_precision,True, 5,  MIM, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
#NOTE(review): cross-validation above scores with compute_precision while this final model uses compute_RI — confirm this is intended.
model = FamiliesClusters(np.unique(y),compute_RI,True)
#AE is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AEgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE_interest.csv', '../data/optimized_subsets/AEgenes_bestMIM.csv')

# ANOVA F-test

In [None]:
#Define parameters for ANOVA method
N = np.arange(900,2000,10) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'plot': True} 

#Run cross_validation with the ANOVA selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE, FamiliesClusters, compute_precision,True, 5,  ANOVA, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
model = FamiliesClusters(np.unique(y),compute_precision,True)
#AE is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))    
print(len(subset))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AEgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE_interest.csv', '../data/optimized_subsets/AEgenes_bestANOVA.csv')

# AE3-MEF: AE3, MEF fused

In [5]:
#AE3: expression table indexed by its 'Unnamed: 0' column, plus the matching family labels
AE3 = pd.read_csv('../data/merged_data/AE3.csv').set_index('Unnamed: 0')
y_AE3 = np.array(pd.read_csv('../data/merged_data/y_AE3.csv')).squeeze()
print(AE3.shape, y_AE3.shape)

#BIDDY D0
D0 = pd.read_csv('../data/merged_data/D0.csv').set_index('Unnamed: 0')
y_D0 = np.array(pd.read_csv('../data/merged_data/y_D0.csv')).squeeze()
print(D0.shape, y_D0.shape)

#BIDDY D6
D6 = pd.read_csv('../data/merged_data/D6.csv').set_index('Unnamed: 0')
y_D6 = np.array(pd.read_csv('../data/merged_data/y_D6.csv')).squeeze()
print(D6.shape, y_D6.shape)

#BIDDY D15
D15 = pd.read_csv('../data/merged_data/D15.csv').set_index('Unnamed: 0')
y_D15 = np.array(pd.read_csv('../data/merged_data/y_D15.csv')).squeeze()
print(D15.shape, y_D15.shape)

#Fuse the expression tables side by side, in the order AE3, D0, D6, D15
AE3_MEF = pd.concat([AE3, D0, D6, D15], axis=1)
print(AE3_MEF.shape)

#Fuse the family labels: each label array is shifted in place by the largest label fused so far,
#then appended, so family indices never overlap between data sets.
y_AE3_MEF = y_AE3
for y_next in (y_D0, y_D6, y_D15):
    y_next += y_AE3_MEF.max()
    y_AE3_MEF = np.concatenate((y_AE3_MEF, y_next))

#Keep only the rows selected by filter_norm_data with threshold 0.05
#(the baseline section below describes this as genes expressed in at least 5% of cells)
gene_expressed = filter_norm_data(AE3_MEF, 0.05)
AE3_MEF = AE3_MEF[gene_expressed]
print(AE3_MEF.shape)

#Store the names of the retained genes (the index of the filtered table)
genes_interest = AE3_MEF.index.values
pd.DataFrame(genes_interest).to_csv('../data/processed_data/AE3_MEF_interest.csv', index=False)

#Store the preprocessed table (without its index) and the fused labels
AE3_MEF.to_csv('../data/processed_data/AE3_MEF.csv', index=False)
pd.DataFrame(y_AE3_MEF).to_csv('../data/processed_data/y_AE3_MEF.csv', index=False)

(32285, 333) (333,)
(32285, 461) (461,)
(32285, 154) (154,)
(32285, 178) (178,)
(32285, 1126)
(11485, 1126)


# Baseline with genes expressed in at least 5% of cells

In [6]:
#Read back the preprocessed AE3_MEF table and its fused family labels from processed_data
AE3_MEF = pd.read_csv('../data/processed_data/AE3_MEF.csv')
y = np.array(pd.read_csv('../data/processed_data/y_AE3_MEF.csv')).squeeze()
print(AE3_MEF.shape, y.shape)

(11485, 1126) (1126,)


In [None]:
#Baseline on all retained genes: build the FamiliesClusters model from the distinct
#labels in y and get its predictions for AE3_MEF / y
model = FamiliesClusters(
    np.unique(y),
    compute_precision,
    True,
)
pred = model.fit_predict(AE3_MEF, y)

#Report the score and the recovery stored on the fitted model
print(
    model.score_,
    model.recovery,
)

# Mutual information maximizer (MIM)

In [None]:
#Define parameters for MIM method
N = np.arange(80,1500,30) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

#Run cross_validation with the MIM selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE3_MEF, FamiliesClusters, compute_precision,True, 5,  MIM, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
#NOTE(review): cross-validation above scores with compute_precision while this final model uses compute_RI — confirm this is intended.
model = FamiliesClusters(np.unique(y),compute_RI,True)
#AE3_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE3_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE3_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE3_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE3_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE3_MEF_interest.csv', '../data/optimized_subsets/AE3_MEFgenes_bestMIM.csv')

# ANOVA F-test

In [None]:
#Define parameters for ANOVA method
N = np.arange(900,2000,10) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'plot': True} 

#Run cross_validation with the ANOVA selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE3_MEF, FamiliesClusters, compute_precision,True, 5,  ANOVA, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
model = FamiliesClusters(np.unique(y),compute_precision,True)
#AE3_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE3_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE3_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))    
print(len(subset))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE3_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE3_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE3_MEF_interest.csv', '../data/optimized_subsets/AE3_MEFgenes_bestANOVA.csv')

# AE4-MEF: AE4, MEF fused

In [7]:
#AE4: expression table indexed by its 'Unnamed: 0' column, plus the matching family labels
AE4 = pd.read_csv('../data/merged_data/AE4.csv').set_index('Unnamed: 0')
y_AE4 = np.array(pd.read_csv('../data/merged_data/y_AE4.csv')).squeeze()
print(AE4.shape, y_AE4.shape)

#BIDDY D0
D0 = pd.read_csv('../data/merged_data/D0.csv').set_index('Unnamed: 0')
y_D0 = np.array(pd.read_csv('../data/merged_data/y_D0.csv')).squeeze()
print(D0.shape, y_D0.shape)

#BIDDY D6
D6 = pd.read_csv('../data/merged_data/D6.csv').set_index('Unnamed: 0')
y_D6 = np.array(pd.read_csv('../data/merged_data/y_D6.csv')).squeeze()
print(D6.shape, y_D6.shape)

#BIDDY D15
D15 = pd.read_csv('../data/merged_data/D15.csv').set_index('Unnamed: 0')
y_D15 = np.array(pd.read_csv('../data/merged_data/y_D15.csv')).squeeze()
print(D15.shape, y_D15.shape)

#Fuse the expression tables side by side, in the order AE4, D0, D6, D15
AE4_MEF = pd.concat([AE4, D0, D6, D15], axis=1)
print(AE4_MEF.shape)

#Fuse the family labels: each label array is shifted in place by the largest label fused so far,
#then appended, so family indices never overlap between data sets.
y_AE4_MEF = y_AE4
for y_next in (y_D0, y_D6, y_D15):
    y_next += y_AE4_MEF.max()
    y_AE4_MEF = np.concatenate((y_AE4_MEF, y_next))

#Keep only the rows selected by filter_norm_data with threshold 0.05
#(the baseline section below describes this as genes expressed in at least 5% of cells)
gene_expressed = filter_norm_data(AE4_MEF, 0.05)
AE4_MEF = AE4_MEF[gene_expressed]
print(AE4_MEF.shape)

#Store the names of the retained genes (the index of the filtered table)
genes_interest = AE4_MEF.index.values
pd.DataFrame(genes_interest).to_csv('../data/processed_data/AE4_MEF_interest.csv', index=False)

#Store the preprocessed table (without its index) and the fused labels
AE4_MEF.to_csv('../data/processed_data/AE4_MEF.csv', index=False)
pd.DataFrame(y_AE4_MEF).to_csv('../data/processed_data/y_AE4_MEF.csv', index=False)

(32285, 1473) (1473,)
(32285, 461) (461,)
(32285, 154) (154,)
(32285, 178) (178,)
(32285, 2266)
(10625, 2266)


# Baseline with genes expressed in at least 5% of cells

In [8]:
#Read back the preprocessed AE4_MEF table and its fused family labels from processed_data
AE4_MEF = pd.read_csv('../data/processed_data/AE4_MEF.csv')
y = np.array(pd.read_csv('../data/processed_data/y_AE4_MEF.csv')).squeeze()
print(AE4_MEF.shape, y.shape)

(10625, 2266) (2266,)


In [None]:
#Baseline on all retained genes: build the FamiliesClusters model from the distinct
#labels in y and get its predictions for AE4_MEF / y
model = FamiliesClusters(
    np.unique(y),
    compute_precision,
    True,
)
pred = model.fit_predict(AE4_MEF, y)

#Report the score and the recovery stored on the fitted model
print(
    model.score_,
    model.recovery,
)

# Mutual information maximizer (MIM)

In [None]:
#Define parameters for MIM method
N = np.arange(80,1500,30) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

#Run cross_validation with the MIM selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE4_MEF, FamiliesClusters, compute_precision,True, 5,  MIM, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
#NOTE(review): cross-validation above scores with compute_precision while this final model uses compute_RI — confirm this is intended.
model = FamiliesClusters(np.unique(y),compute_RI,True)
#AE4_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE4_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE4_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE4_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE4_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE4_MEF_interest.csv', '../data/optimized_subsets/AE4_MEFgenes_bestMIM.csv')

# ANOVA F-test

In [None]:
#Define parameters for ANOVA method
N = np.arange(900,2000,10) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'plot': True} 

#Run cross_validation with the ANOVA selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE4_MEF, FamiliesClusters, compute_precision,True, 5,  ANOVA, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
model = FamiliesClusters(np.unique(y),compute_precision,True)
#AE4_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE4_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE4_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))    
print(len(subset))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE4_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE4_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE4_MEF_interest.csv', '../data/optimized_subsets/AE4_MEFgenes_bestANOVA.csv')

# AE-MEF: AE3, AE4, MEF fused

In [9]:
#AE3: expression table indexed by its 'Unnamed: 0' column, plus the matching family labels
AE3 = pd.read_csv('../data/merged_data/AE3.csv').set_index('Unnamed: 0')
y_AE3 = np.array(pd.read_csv('../data/merged_data/y_AE3.csv')).squeeze()
print(AE3.shape, y_AE3.shape)

#AE4: same layout as AE3
AE4 = pd.read_csv('../data/merged_data/AE4.csv').set_index('Unnamed: 0')
y_AE4 = np.array(pd.read_csv('../data/merged_data/y_AE4.csv')).squeeze()
print(AE4.shape, y_AE4.shape)

#BIDDY D0
D0 = pd.read_csv('../data/merged_data/D0.csv').set_index('Unnamed: 0')
y_D0 = np.array(pd.read_csv('../data/merged_data/y_D0.csv')).squeeze()
print(D0.shape, y_D0.shape)

#BIDDY D6
D6 = pd.read_csv('../data/merged_data/D6.csv').set_index('Unnamed: 0')
y_D6 = np.array(pd.read_csv('../data/merged_data/y_D6.csv')).squeeze()
print(D6.shape, y_D6.shape)

#BIDDY D15
D15 = pd.read_csv('../data/merged_data/D15.csv').set_index('Unnamed: 0')
y_D15 = np.array(pd.read_csv('../data/merged_data/y_D15.csv')).squeeze()
print(D15.shape, y_D15.shape)

#Fuse the expression tables side by side, in the order AE3, AE4, D0, D6, D15
AE_MEF = pd.concat([AE3, AE4, D0, D6, D15], axis=1)
print(AE_MEF.shape)

#Fuse the family labels: each label array is shifted in place by the largest label fused so far,
#then appended, so family indices never overlap between data sets.
y_AE_MEF = y_AE3
for y_next in (y_AE4, y_D0, y_D6, y_D15):
    y_next += y_AE_MEF.max()
    y_AE_MEF = np.concatenate((y_AE_MEF, y_next))

#Keep only the rows selected by filter_norm_data with threshold 0.05
#(the baseline section below describes this as genes expressed in at least 5% of cells)
gene_expressed = filter_norm_data(AE_MEF, 0.05)
AE_MEF = AE_MEF[gene_expressed]
print(AE_MEF.shape)

#Store the names of the retained genes (the index of the filtered table)
genes_interest = AE_MEF.index.values
pd.DataFrame(genes_interest).to_csv('../data/processed_data/AE_MEF_interest.csv', index=False)

#Store the preprocessed table (without its index) and the fused labels
AE_MEF.to_csv('../data/processed_data/AE_MEF.csv', index=False)
pd.DataFrame(y_AE_MEF).to_csv('../data/processed_data/y_AE_MEF.csv', index=False)

(32285, 333) (333,)
(32285, 1473) (1473,)
(32285, 461) (461,)
(32285, 154) (154,)
(32285, 178) (178,)
(32285, 2599)
(10955, 2599)


# Baseline with genes expressed in at least 5% of cells

In [10]:
#Read back the preprocessed AE_MEF table and its fused family labels from processed_data
AE_MEF = pd.read_csv('../data/processed_data/AE_MEF.csv')
y = np.array(pd.read_csv('../data/processed_data/y_AE_MEF.csv')).squeeze()
print(AE_MEF.shape, y.shape)

(10955, 2599) (2599,)


In [None]:
#Baseline on all retained genes: build the FamiliesClusters model from the distinct
#labels in y and get its predictions for AE_MEF / y
model = FamiliesClusters(
    np.unique(y),
    compute_precision,
    True,
)
pred = model.fit_predict(AE_MEF, y)

#Report the score and the recovery stored on the fitted model
print(
    model.score_,
    model.recovery,
)

# Mutual information maximizer (MIM)

In [None]:
#Define parameters for MIM method
N = np.arange(80,1500,30) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'n_neighbors': 3, 'plot': True} 

#Run cross_validation with the MIM selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE_MEF, FamiliesClusters, compute_precision,True, 5,  MIM, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
#NOTE(review): cross-validation above scores with compute_precision while this final model uses compute_RI — confirm this is intended.
model = FamiliesClusters(np.unique(y),compute_RI,True)
#AE_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE_MEF_interest.csv', '../data/optimized_subsets/AE_MEFgenes_bestMIM.csv')

# ANOVA F-test

In [None]:
#Define parameters for ANOVA method
N = np.arange(900,2000,10) #values handed to the selection method under the 'N' key
kwargs = {'Model': FamiliesClusters, 'Scoring': compute_precision,'maximize': True,'N': N, 'plot': True} 

#Run cross_validation with the ANOVA selection method; it returns the selected subset and the training/testing scores.
#NOTE(review): the positional `5` is presumably the number of folds — confirm against crossValidation.
subset, score_training, score_testing = cross_validation(y,AE_MEF, FamiliesClusters, compute_precision,True, 5,  ANOVA, **kwargs)

mean_score_test, std_score_test = np.mean(score_testing), np.std(score_testing)    
print('test', mean_score_test, std_score_test)

#Predict and evaluate on whole data  set
model = FamiliesClusters(np.unique(y),compute_precision,True)
#AE_MEF is a pandas DataFrame (loaded with pd.read_csv), so NumPy-style `AE_MEF[:, subset]` raises;
#positional column selection on a DataFrame needs .iloc.
#NOTE(review): assumes `subset` holds integer positions (or a boolean mask) along the column axis, as the
#original `[:, subset]` indexing implies — confirm the orientation expected by cross_validation/FamiliesClusters.
x_subset = AE_MEF.iloc[:, subset]
pred = model.fit_predict(x_subset,y)

print("TP, FP, ratio, sensitivity, specificity, precision, NPV, FDR, FNR = ", compute_statTP(y,pred))    
print(len(subset))

In [None]:
#Presumably resolves `subset` to gene names using the first file and writes them to the second — verify against get_best_genes_names.
#The gene-name list for this data set is written to 'AE_MEF_interest.csv' by the preprocessing cell of this notebook;
#the previous path 'AE_MEFgenes_interest.csv' is not written by any cell visible here.
get_best_genes_names(subset, '../data/processed_data/AE_MEF_interest.csv', '../data/optimized_subsets/AE_MEFgenes_bestANOVA.csv')