In [1]:
import numpy as np
import pandas as pd
import sklearn
import math 
import random
from pred_score import *

In [2]:
# Seed Python's `random` module and NumPy's global generator with the same
# value so that repeated runs give reproducible results.
SEED = 3
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

# AE3

In [3]:
# Load the preprocessed AE3 table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
AE3_table = np.array(pd.read_csv('../data/processed_data/AE3.csv'))
y = AE3_table[:, -1].astype(int)
AE3 = AE3_table[:, :-1]
AE3.shape

(333, 11894)

 ### Baseline (with all genes)

In [10]:
# Fit FamiliesClusters on the current AE3 matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE3, y)
print(model.score_, model.recovery)

0.42011834319526625 0.6726726726726727


### Baseline (random)

In [13]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of AE3_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(AE3).sample(frac=1)
AE3_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE3_shuffle, y)
print(model.score_, model.recovery)

0.005917159763313609 0.6726726726726727


### Optimized subset (MIM)

In [4]:
# Gene list for AE3 and the MIM-optimized subset to keep. Positions in genes_AE3
# are used below as column indices of AE3, so the two are presumably aligned.
genes_AE3 = np.squeeze(pd.read_csv('../data/processed_data/AE3genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/AE3genes_bestMIM.csv'))

# Collect, in gene_optimized order, every position where the gene occurs.
# np.flatnonzero returns all matches, so a gene listed more than once keeps all
# of its columns (the LK/LSK/LSKmix cells already allow this) instead of making
# int() fail on a multi-element array; a gene with no match adds no column.
# A plain list is built once rather than regrown with np.append on every gene.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_AE3 == gene))
]

# Only keep the optimized genes.
# NOTE: AE3 is overwritten here, so run this cell only once after the load cell.
AE3 = AE3[:, ind_opt_genes]
print(AE3.shape)

# The mask length follows the columns actually kept, which can differ from
# len(gene_optimized) when a gene matches several (or no) positions.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((AE3.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(333, 1010)
1010


In [5]:
# Same FamiliesClusters fit as the baseline, but AE3 now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE3, y)
print(model.score_, model.recovery)

0.6572769953051644 0.8468468468468469


In [15]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=AE3, y=y)
print(model.score_, model.recovery)

(101, 333)
0.7640449438202247 0.7297297297297297


# AE4

In [6]:
# Load the preprocessed AE4 table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
AE4_table = np.array(pd.read_csv('../data/processed_data/AE4.csv'))
y = AE4_table[:, -1].astype(int)
AE4 = AE4_table[:, :-1]
AE4.shape

(1473, 9870)

 ### Baseline (with all genes)

In [18]:
# Fit FamiliesClusters on the current AE4 matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE4, y)
print(model.score_, model.recovery)

0.05642420122365738 0.923285811269518


### Baseline (random)

In [19]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of AE4_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(AE4).sample(frac=1)
AE4_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE4_shuffle, y)
print(model.score_, model.recovery)

0.003399048266485384 0.923285811269518


### Optimized subset (ANOVA)

In [7]:
# Gene list for AE4 and the ANOVA-optimized subset to keep. Positions in
# genes_AE4 are used below as column indices of AE4, so the two are presumably
# aligned.
genes_AE4 = np.squeeze(pd.read_csv('../data/processed_data/AE4genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/AE4genes_bestANOVA.csv'))

# Collect, in gene_optimized order, every position where the gene occurs.
# np.flatnonzero returns all matches, so a gene listed more than once keeps all
# of its columns (the LK/LSK/LSKmix cells already allow this) instead of making
# int() fail on a multi-element array; a gene with no match adds no column.
# A plain list is built once rather than regrown with np.append on every gene.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_AE4 == gene))
]

# Only keep the optimized genes.
# NOTE: AE4 is overwritten here, so run this cell only once after the load cell.
AE4 = AE4[:, ind_opt_genes]
print(AE4.shape)

# The mask length follows the columns actually kept, which can differ from
# len(gene_optimized) when a gene matches several (or no) positions.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((AE4.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(1473, 255)
255


In [8]:
# Same FamiliesClusters fit as the baseline, but AE4 now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(AE4, y)
print(model.score_, model.recovery)

0.18866620594333103 0.9063136456211812


In [22]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=AE4, y=y)
print(model.score_, model.recovery)

(101, 1473)
0.4666666666666667 0.3754243041412084


# D0

In [9]:
# Load the preprocessed D0 table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
D0_table = np.array(pd.read_csv('../data/processed_data/D0.csv'))
y = D0_table[:, -1].astype(int)
D0 = D0_table[:, :-1]
D0.shape

(461, 10310)

 ### Baseline (with all genes)

In [11]:
# Fit FamiliesClusters on the current D0 matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D0, y)
print(model.score_, model.recovery)

0.6686746987951807 0.720173535791757


### Baseline (random)

In [12]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of D0_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(D0).sample(frac=1)
D0_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D0_shuffle, y)
print(model.score_, model.recovery)

0.0 0.720173535791757


### Optimized subset (MIM)

In [13]:
# Gene list for D0 and the MIM-optimized subset to keep. Positions in genes_D0
# are used below as column indices of D0, so the two are presumably aligned.
genes_D0 = np.squeeze(pd.read_csv('../data/processed_data/D0genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D0genes_bestMIM.csv'))

# Collect, in gene_optimized order, every position where the gene occurs.
# np.flatnonzero returns all matches, so a gene listed more than once keeps all
# of its columns (the LK/LSK/LSKmix cells already allow this) instead of making
# int() fail on a multi-element array; a gene with no match adds no column.
# A plain list is built once rather than regrown with np.append on every gene.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_D0 == gene))
]

# Only keep the optimized genes.
# NOTE: D0 is overwritten here, so run this cell only once after the load cell.
D0 = D0[:, ind_opt_genes]
print(D0.shape)

# The mask length follows the columns actually kept, which can differ from
# len(gene_optimized) when a gene matches several (or no) positions.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((D0.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(461, 2480)
2480


In [14]:
# Same FamiliesClusters fit as the baseline, but D0 now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D0, y)
print(model.score_, model.recovery)

0.9444444444444444 0.8590021691973969


In [15]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=D0, y=y)
print(model.score_, model.recovery)

(101, 461)
0.9680851063829787 0.8156182212581344


# D6

In [16]:
# Load the preprocessed D6 table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
D6_table = np.array(pd.read_csv('../data/processed_data/D6.csv'))
y = D6_table[:, -1].astype(int)
D6 = D6_table[:, :-1]
D6.shape

(154, 10372)

 ### Baseline (with all genes)

In [17]:
# Fit FamiliesClusters on the current D6 matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D6, y)
print(model.score_, model.recovery)

0.5961538461538461 0.6753246753246753


### Baseline (random)

In [18]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of D6_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(D6).sample(frac=1)
D6_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D6_shuffle, y)
print(model.score_, model.recovery)

0.038461538461538464 0.6753246753246753


### Optimized subset (ANOVA)

In [19]:
# Gene list for D6 and the ANOVA-optimized subset to keep. Positions in
# genes_D6 are used below as column indices of D6, so the two are presumably
# aligned.
genes_D6 = np.squeeze(pd.read_csv('../data/processed_data/D6genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D6genes_bestANOVA.csv'))

# Collect, in gene_optimized order, every position where the gene occurs.
# np.flatnonzero returns all matches, so a gene listed more than once keeps all
# of its columns (the LK/LSK/LSKmix cells already allow this) instead of making
# int() fail on a multi-element array; a gene with no match adds no column.
# A plain list is built once rather than regrown with np.append on every gene.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_D6 == gene))
]

# Only keep the optimized genes.
# NOTE: D6 is overwritten here, so run this cell only once after the load cell.
D6 = D6[:, ind_opt_genes]
print(D6.shape)

# The mask length follows the columns actually kept, which can differ from
# len(gene_optimized) when a gene matches several (or no) positions.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((D6.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(154, 830)
830


In [20]:
# Same FamiliesClusters fit as the baseline, but D6 now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D6, y)
print(model.score_, model.recovery)

0.9552238805970149 0.8701298701298701


In [21]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=D6, y=y)
print(model.score_, model.recovery)

(101, 154)
0.953125 0.8311688311688312


# D15

In [22]:
# Load the preprocessed D15 table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
D15_table = np.array(pd.read_csv('../data/processed_data/D15.csv'))
y = D15_table[:, -1].astype(int)
D15 = D15_table[:, :-1]
D15.shape

(178, 9977)

 ### Baseline (with all genes)

In [23]:
# Fit FamiliesClusters on the current D15 matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D15, y)
print(model.score_, model.recovery)

0.8714285714285714 0.7865168539325843


### Baseline (random)

In [24]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of D15_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(D15).sample(frac=1)
D15_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D15_shuffle, y)
print(model.score_, model.recovery)

0.014285714285714285 0.7865168539325843


### Optimized subset (ANOVA)

In [25]:
# Gene list for D15 and the ANOVA-optimized subset to keep. Positions in
# genes_D15 are used below as column indices of D15, so the two are presumably
# aligned.
genes_D15 = np.squeeze(pd.read_csv('../data/processed_data/D15genes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/D15genes_bestANOVA.csv'))

# Collect, in gene_optimized order, every position where the gene occurs.
# np.flatnonzero returns all matches, so a gene listed more than once keeps all
# of its columns (the LK/LSK/LSKmix cells already allow this) instead of making
# int() fail on a multi-element array; a gene with no match adds no column.
# A plain list is built once rather than regrown with np.append on every gene.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_D15 == gene))
]

# Only keep the optimized genes.
# NOTE: D15 is overwritten here, so run this cell only once after the load cell.
D15 = D15[:, ind_opt_genes]
print(D15.shape)

# The mask length follows the columns actually kept, which can differ from
# len(gene_optimized) when a gene matches several (or no) positions.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((D15.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(178, 110)
110


In [26]:
# Same FamiliesClusters fit as the baseline, but D15 now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(D15, y)
print(model.score_, model.recovery)

1.0 0.9775280898876404


In [27]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=D15, y=y)
print(model.score_, model.recovery)

(101, 178)
1.0 0.9775280898876404


# LK

In [75]:
# Load the preprocessed LK table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
LK_table = np.array(pd.read_csv('../data/processed_data/LK.csv'))
y = LK_table[:, -1].astype(int)
LK = LK_table[:, :-1]
LK.shape

(610, 8405)

 ### Baseline (with all genes)

In [29]:
# Fit FamiliesClusters on the current LK matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LK, y)
print(model.score_, model.recovery)

0.20603015075376885 0.8131147540983606


### Baseline (random)

In [30]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of LK_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(LK).sample(frac=1)
LK_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LK_shuffle, y)
print(model.score_, model.recovery)

0.0 0.8131147540983606


### Optimized subset (MIM)

In [76]:
# Gene list for LK and the MIM-optimized subset to keep. Positions in genes_LK
# are used below as column indices of LK, so the two are presumably aligned.
genes_LK = np.squeeze(pd.read_csv('../data/processed_data/LKgenes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/LKgenes_bestMIM.csv'))

# Collect, in gene_optimized order, every position where the gene occurs: a
# gene listed several times keeps all of its columns and a gene with no match
# adds none. Building a plain list of ints replaces the per-gene np.append,
# which regrew a float array on every step and left a bare list (with no
# .astype) when nothing matched.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_LK == gene))
]

# Only keep the optimized genes.
# NOTE: LK is overwritten here, so run this cell only once after the load cell.
LK = LK[:, ind_opt_genes]
print(LK.shape)

# The mask has one entry per retained column.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((LK.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(610, 777)
777


In [77]:
# Same FamiliesClusters fit as the baseline, but LK now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LK, y)
print(model.score_, model.recovery)

0.40173410404624277 0.7016393442622951


In [78]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=LK, y=y)
print(model.score_, model.recovery)

(101, 610)
0.6255924170616114 0.4737704918032787


# LSK

In [87]:
# Load the preprocessed LSK table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
LSK_table = np.array(pd.read_csv('../data/processed_data/LSK.csv'))
y = LSK_table[:, -1].astype(int)
LSK = LSK_table[:, :-1]
LSK.shape

(3383, 7570)

 ### Baseline (with all genes)

In [80]:
# Fit FamiliesClusters on the current LSK matrix (all genes, provided this runs
# before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSK, y)
print(model.score_, model.recovery)

0.08357588357588358 0.9269878805793674


### Baseline (random)

In [81]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of LSK_shuffle no longer line up with y. No random_state is passed, so
# the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(LSK).sample(frac=1)
LSK_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSK_shuffle, y)
print(model.score_, model.recovery)

0.0008316008316008316 0.9269878805793674


### Optimized subset (ANOVA)

In [88]:
# Gene list for LSK and the ANOVA-optimized subset to keep. Positions in
# genes_LSK are used below as column indices of LSK, so the two are presumably
# aligned.
genes_LSK = np.squeeze(pd.read_csv('../data/processed_data/LSKgenes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/LSKgenes_bestANOVA.csv'))

# Collect, in gene_optimized order, every position where the gene occurs: a
# gene listed several times keeps all of its columns and a gene with no match
# adds none. Building a plain list of ints replaces the per-gene np.append,
# which regrew a float array on every step and left a bare list (with no
# .astype) when nothing matched.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_LSK == gene))
]

# Only keep the optimized genes.
# NOTE: LSK is overwritten here, so run this cell only once after the load cell.
LSK = LSK[:, ind_opt_genes]
print(LSK.shape)

# The mask has one entry per retained column.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((LSK.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(3383, 418)
418


In [89]:
# Same FamiliesClusters fit as the baseline, but LSK now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSK, y)
print(model.score_, model.recovery)

0.15286041189931351 0.8288501330180313


In [90]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=LSK, y=y)
print(model.score_, model.recovery)

(101, 3383)
0.42971887550200805 0.23558971327224357


# LK/LSK mix

In [98]:
# Load the preprocessed LSKmix table; its last column is read as the integer
# label vector y and the remaining columns become the data matrix.
LSKmix_table = np.array(pd.read_csv('../data/processed_data/LSKmix.csv'))
y = LSKmix_table[:, -1].astype(int)
LSKmix = LSKmix_table[:, :-1]
LSKmix.shape

(1023, 7230)

 ### Baseline (with all genes)

In [92]:
# Fit FamiliesClusters on the current LSKmix matrix (all genes, provided this
# runs before the subset-selection cell) and print its score_ and recovery.
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSKmix, y)
print(model.score_, model.recovery)

0.13649425287356323 0.8651026392961877


### Baseline (random)

In [93]:
# Random baseline: sample(frac=1) returns every row in a random order, so the
# rows of LSKmix_shuffle no longer line up with y. No random_state is passed,
# so the order depends on NumPy's global RNG state at the time this cell runs.
shuffled_rows = pd.DataFrame(LSKmix).sample(frac=1)
LSKmix_shuffle = np.array(shuffled_rows)

family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSKmix_shuffle, y)
print(model.score_, model.recovery)

0.0028735632183908046 0.8651026392961877


### Optimized subset (ANOVA)

In [99]:
# Gene list for LSKmix and the ANOVA-optimized subset to keep. Positions in
# genes_LSKmix are used below as column indices of LSKmix, so the two are
# presumably aligned.
genes_LSKmix = np.squeeze(pd.read_csv('../data/processed_data/LSKmixgenes_interest.csv'))
gene_optimized = np.squeeze(pd.read_csv('../data/optimized_subsets/LSKmixgenes_bestANOVA.csv'))

# Collect, in gene_optimized order, every position where the gene occurs: a
# gene listed several times keeps all of its columns and a gene with no match
# adds none. Building a plain list of ints replaces the per-gene np.append,
# which regrew a float array on every step and left a bare list (with no
# .astype) when nothing matched.
ind_opt_genes = [
    int(pos)
    for gene in gene_optimized
    for pos in np.flatnonzero(np.asarray(genes_LSKmix == gene))
]

# Only keep the optimized genes.
# NOTE: LSKmix is overwritten here, so run this cell only once after the load
# cell.
LSKmix = LSKmix[:, ind_opt_genes]
print(LSKmix.shape)

# The mask has one entry per retained column.
# 101 and 0.25 are passed straight to subsampling_genes (from pred_score).
subset = np.ones((LSKmix.shape[1],))
subsets = subsampling_genes(subset, 101, 0.25)
print(len(subsets[0]))

(1023, 294)
294


In [100]:
# Same FamiliesClusters fit as the baseline, but LSKmix now holds only the
# optimized gene columns (after the selection cell has run).
family_ids = np.unique(y)
model = FamiliesClusters(family_ids, compute_precision, True)
pred = model.fit_predict(LSKmix, y)
print(model.score_, model.recovery)

0.2084507042253521 0.8807429130009775


In [101]:
# Fit EnsemblingHierarchical over the gene subsamples in `subsets`, using
# 'voting' ensembling with a 0.5 voting threshold, then print score_/recovery.
ensemble_options = dict(subsets=subsets, ensembling='voting', threshold_voting=0.5)
model = EnsemblingHierarchical(np.unique(y), compute_precision, True, **ensemble_options)
result = model.fit_predict(X=LSKmix, y=y)
print(model.score_, model.recovery)

(101, 1023)
0.4703196347031963 0.32942326490713586
