In [1]:
# Move one directory up from the current working directory, then query the new location.
# The change is relative, so re-running this cell without restarting the kernel
# moves up one more level each time.
# NOTE(review): presumably needed so the project modules imported below resolve — confirm.
%cd "../"
%pwd

import numpy as np
import matplotlib.pyplot as plt

import io 
import pandas as pd  
import pyreadr

from load_data import *
from pred_score import *
from Filter_FS import *
from Wrapper_FS import *
from hybrid_FS import *

# Fix the seeds of both NumPy's global generator and Python's built-in generator
# so that repeated runs draw the same random numbers.
# `random` is imported explicitly here: the notebook otherwise only has it in
# scope if one of the star-imported project modules happens to re-export it.
import random

np.random.seed(1)
random.seed(1)

/Users/dormann/Documents/GitHub/src


# Baseline with preselected genes for AE4

In [2]:
# Load the preprocessed AE4 table as an array, then split off its last column:
# that column becomes the integer vector y, the remaining columns stay in AE4.
AE4 = np.array(pd.read_csv('../data/processed_data/AE4intron.csv'))
y, AE4 = np.array(AE4[:, -1], dtype=int), AE4[:, :-1]

In [None]:
# Baseline: build the model from the distinct values of y and run
# fit_predict on the whole AE4 matrix.
model = FamiliesClusters(
    np.unique(y),
    compute_precision,
    True,
)
pred = model.fit_predict(AE4, y, 4)

In [None]:
# Shape of the input matrix next to the number of predictions returned.
print(AE4.shape, len(pred))

# Score and recovery attributes exposed by the fitted baseline model.
acc = model.score_
print(acc, model.recovery)

In [None]:
# Summary statistics of the baseline predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Filter methods

# Mutual information maximizer (MIM)

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

# NOTE(review): this call scores with compute_sensitivity, whereas the other
# filter cells in this notebook pass compute_precision — confirm this is intended.
best_subset, best_score = MIM(
    y, AE4, FamiliesClusters, compute_sensitivity, True, N, 3, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by MIM.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# ANOVA

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = ANOVA(
    y, AE4, FamiliesClusters, compute_precision, True, N, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by ANOVA.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Fisher's score

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = fishers_score(
    y, AE4, FamiliesClusters, compute_precision, True, N, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by fishers_score.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Laplacian score

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = laplacian_score(
    y, AE4, FamiliesClusters, compute_precision, True, N, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by laplacian_score.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# ReliefF algorithm

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = reliefF(
    y, AE4, FamiliesClusters, compute_precision, True, N, 5, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by reliefF.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Fast Correlation Based Filter algorithm (FCBF)

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = FCBF(
    y, AE4, FamiliesClusters, compute_precision, True, N, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by FCBF.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Hilbert Schmidt Independence Criterion Lasso (HSIC Lasso)

In [None]:
# Grid 100, 150, ..., 3950 handed to the selector
# (presumably the candidate numbers of features to keep — confirm in Filter_FS).
N = np.arange(100, 4000, 50)

best_subset, best_score = HsicLasso(
    y, AE4, FamiliesClusters, compute_precision, True, N, plot=True
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by HsicLasso.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Wrapper methods

# Simulated annealing

In [None]:
# Wrapper search over the full AE4 matrix; only the returned score is printed here.
# NOTE(review): the meaning of the numeric arguments (200, 0.1, 1) is defined
# in Wrapper_FS — confirm there before tuning.
best_subset, best_score = stimulated_annealing(
    y,
    AE4,
    FamiliesClusters,
    compute_precision,
    True,
    200,
    0.1,
    1,
    True,
)
print(best_score)

In [None]:
# Size of the subset returned by stimulated_annealing.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Genetic feature selection algorithm

In [None]:
# Wrapper search over the full AE4 matrix; only the returned score is printed here.
# NOTE(review): the meaning of the numeric arguments (300, 0.5, 0.2, 50, 3) is
# defined in Wrapper_FS — confirm there before tuning.
best_subset, best_score = genetic_fs(
    y,
    AE4,
    FamiliesClusters,
    compute_precision,
    True,
    300,
    0.5,
    0.2,
    50,
    3,
    True,
)
print(best_score)

In [None]:
# Size of the subset returned by genetic_fs.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Hybrid methods

# Mutual information and simulated annealing

In [None]:
# Hybrid search: a single-value grid (400) is passed where the filter cells pass N.
# NOTE(review): scored with compute_sensitivity, while the evaluation cells
# build their model with compute_precision — confirm this is intended.
best_subset, best_score = MI_stimulated_annealing(
    y,
    AE4,
    FamiliesClusters,
    compute_sensitivity,
    True,
    np.array([400]),
    3,
    800,
    0.1,
    1,
    True,
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by MI_stimulated_annealing.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Mutual information and genetic feature selection

In [None]:
# Hybrid search: a single-value grid (850) is passed where the filter cells pass N.
# NOTE(review): scored with compute_sensitivity, while the evaluation cells
# build their model with compute_precision — confirm this is intended.
best_subset, best_score = MI_genetic_fs(
    y,
    AE4,
    FamiliesClusters,
    compute_sensitivity,
    True,
    np.array([850]),
    3,
    100,
    0.5,
    0.2,
    40,
    3,
    True,
)
print(best_subset, best_score)

In [None]:
# Size of the subset returned by MI_genetic_fs.
print(len(best_subset))

# Rebuild the model and run fit_predict on the selected columns only.
model = FamiliesClusters(np.unique(y), compute_precision, True)
pred = model.fit_predict(AE4[:, best_subset], y)

# Summary statistics of these predictions compared with y.
print(
    "sensitivity, specificity, precision, NPV, FDR, FNR = ",
    compute_statTP(y, pred),
)

# Best method = 