<a href="https://colab.research.google.com/github/abhik-99/MFSGC/blob/master/Univariate_Supervised_Gene_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Univariate Supervised Gene Clustering
**UFSGC** is a method whereby a specific filter algorithm is used to score and select the highest-ranking genes from a gene expression dataset, after which the selected genes are put through SGC for gene augmentation. The resulting augmentation not only increases the class separability of the genes but also their expressions.\
This Augmented gene expression set is now used for classification of cancer from healthy patients.\
The Filter Methods chosen for evaluation are:- 
1. Mutual Information.
2. ReliefF.
3. Chi Sq.
4. Fisher Score.
5. Signal To Noise Ratio (adapted for multi-class datasets).
6. T-Test.
7. Pearson Correlation Coefficient.

This method is used for evaluation of **MFSGC**.\
\
*If you already have Gene Representatives from a previous iteration, you can load them and use them here. Loading can be done using the last two cells of this notebook.*

In [None]:
!pip install -U -q PyDrive
!pip install skfeature-chappers

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from sklearn.feature_selection import chi2
import json

## Loading the Dataset

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from google.colab import files

# 1. Authenticate and create the PyDrive client.
# The Colab user is authenticated first; the resulting application-default
# credentials are then handed to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# 2. Get the file.
# The id below selects which dataset is fetched; swap it for one of the ids
# listed underneath to work on a different dataset.
downloaded = drive.CreateFile({'id':'1oaOATE0D_f8MGPIMJOMXYVt0hBUWNCKV'}) # replace the id with id of file you want to access
# For Leukemia- 1xcL-LT-E_gUqWLlqqeVJP1DVHHpiAGe_
# For Colon - 1AUOto0GhTHW9fX52XSsf9kzYJS5ggv0G
# for Prostate - 13Hf7uGbyJ1sWYo8KDRDL8scm-2Fs9_gd
# For Lung- 1xuLzTWDGUbr4x3Pq1dnJj08MZqBB5I3U
# for Rahc - 1oaOATE0D_f8MGPIMJOMXYVt0hBUWNCKV
# for Raoa - 1d2vhPcT3I7ZFcAGOQYVLGB3Jx_vEMata
# for Rbreast - 1Vf-h8zfVP_twMXivcJJtbWtjThShUHvn
# for SRBCT - 1rO5EEvsoRJl2VVUB3ywKUd3kNiQ24oy3
# for MLL - 1rS7x4x_DhrUzaBhrgKMQH3uIaLJdPgW3
# for Breast - 1enhhyA4u2ByvOjnF81WoHflVNpXtfKpu
# The content is written to the local file 'data.txt', which the next cell reads.
downloaded.GetContentFile('data.txt')

In [None]:
# DATASET is the name of the dataset being used; it is interpolated into the
# names of the files written and read further down.
DATASET="RAHC"

# NEIGHBOURS is passed to reliefF as its k argument.
# For any dataset in which some class has fewer than 10 samples,
# keep it below 10. Example of such a dataset: SRBCT.
NEIGHBOURS = 3 

# p is the number of top genes taken after sorting the filter scores
p = 800

# q is the number of top genes to be taken from each filter after augmentation
# NOTE: q may be lowered later, once the number of clusters per filter is known.
q = 5

# uncomment the line below if using the dataset splitter else leave it commented
#data_df = pd.read_csv("%s_train.csv"%(DATASET),index_col=0)

# uncomment the lines below if using the original dataset
dataset = pd.read_table("data.txt",header=None)
data_df = dataset



# The last column holds the class label; every other column is a gene.
target = data_df.iloc[:,-1]
# Rebuilding from .values gives the feature frame fresh 0..n-1 column labels.
feature = pd.DataFrame(data_df.iloc[:,:-1].values,dtype='float')
m,n = feature.shape
print(m,n)
print(feature.head())
print("Number of classes - ")
classes = np.unique(target)
for x in classes:
  print("Class -",x,"Number of Sampples -", len(np.where(target == x)[0]))

# Min-max scaled copy of the features.
# NOTE(review): feature_norm is not referenced by any later cell shown here.
feature_norm=pd.DataFrame(MinMaxScaler().fit_transform(feature))

In [None]:
#utility function
def plot_feature(feature, target, c = ['r', 'b', 'g', 'y']):
  """
  Draw one scatter + line trace per class for a single feature.

  feature - values of one feature, indexable by an array of positions
  target  - class label of every observation
  c       - colours, one per class in the order returned by np.unique(target)
  """
  import matplotlib.pyplot as plt
  from matplotlib import style
  import numpy as np
  style.use('ggplot')
  for colour_idx, label in enumerate(np.unique(target)):
    members = feature[np.where(target == label)[0]]
    # samples of a class are plotted against 1..len(members), not their row number
    positions = range(1, len(members)+1)
    colour = c[colour_idx]
    plt.scatter(positions, members, color = colour)
    plt.plot(positions, members, color = colour)

## Creating Filter Methods for Scoring and filtering top rated genes
The Filter Methods chosen for evaluation are:-

1. Mutual Information.
2. ReliefF.
3. Chi Sq.
4. Fisher Score.
5. Signal To Noise Ratio (adapted for multi-class datasets).
6. T-Test.
7. Pearson Correlation Coefficient.

In [None]:
#construction of ReliefF function

"""
Given a dataset, the number of random instances to pick from the dataset (repetitions) and
the number of nearest hits/misses to consider in each iteration (k), the function returns the weights of the attributes
of the dataset.
These weights can then be used as the final results of the ReliefF algorithm

Paper-

Marko Robnik-ˇSikonja and Igor Kononenko. Theoretical and empirical analysis of relieff
and rrelieff. Machine learning, 53(1-2):23–69, 2003.

"""

def hit_miss_calculator(target,instance,k = 10, hit = True, c = None, ):
    """
    Collect the row positions closest (by index distance) to `instance` that
    are hits or misses.

    target   - class labels, indexable by integer position
    instance - position of the reference observation
    k        - the search stops once at least k positions were collected
               (one extra may be returned because both directions are
               examined in the same step)
    hit      - True: collect rows with the same class as `instance`;
               False: collect rows of class `c` that differ from the class of
               `instance`
    c        - class to collect when hit is False

    Returns the collected positions sorted ascending. Fewer than k positions
    are returned when the data does not contain enough matching rows.
    """
    m=len(target)
    upper,lower=instance-1,instance+1
    hits=[]

    def wanted(idx):
      # same-class row for a hit, row of class c (and not own class) for a miss
      if hit:
        return target[idx]==target[instance]
      return target[idx]!=target[instance] and target[idx]==c

    # Walk outwards from the instance in both directions. The loop must also
    # end when both directions are exhausted (lower stops at m, it never
    # exceeds it), otherwise it would spin forever when fewer than k matches exist.
    while len(hits)<k and (upper>=0 or lower<m):
      if upper>=0:
        if wanted(upper):
          hits.append(upper)
        upper-=1
      if lower<m:
        if wanted(lower):
          hits.append(lower)
        lower+=1
    hits.sort()
    return hits


def reliefF(feature,target,k=10,repetitions=10, seed = 0):
  """
  ReliefF attribute weights (Robnik-Sikonja & Kononenko, 2003).

  feature     - DataFrame / 2-D array (n_samples, n_features) or a 1-D
                Series / array holding a single feature
  target      - class label of every sample
  k           - number of hits / misses requested from hit_miss_calculator
  repetitions - number of randomly drawn reference instances
  seed        - seed for numpy's global random generator

  Returns a list with one weight per feature.

  Differences to the previous version of this function:
  * diff() is the absolute, range-normalised difference as in the paper;
    signed differences made the weights depend on the direction of the
    deviation rather than on its size.
  * misses of class C are weighted with P(C) / (1 - P(class(R))) as in the
    paper (the ratio used to be inverted).
  * constant features no longer produce 0/0 = NaN weights.
  * labels and values are both accessed by position.
  """
  np.random.seed(seed)

  values=np.asarray(feature,dtype=float)
  if values.ndim==1:
    values=values.reshape(-1,1)
  m,n=values.shape

  labels=np.asarray(target)
  observations=list(range(m))
  classes=np.unique(labels)
  priors={each_class: np.count_nonzero(labels==each_class)/m for each_class in classes}

  weights=np.zeros(n)
  value_range=values.max(axis=0)-values.min(axis=0)
  # a constant feature has only zero differences; dividing by 1 keeps them at 0
  value_range[value_range==0]=1.0
  # m*k is a constant scale factor, it does not influence the ranking
  d=value_range*m*k

  for _ in range(repetitions):
    instance=np.random.choice(observations,1)[0]
    own_class=labels[instance]

    # nearest hits pull the weight down
    for hit in hit_miss_calculator(labels,instance,k):
      weights-=np.abs(values[instance]-values[hit])/d

    # nearest misses of every other class push it up, weighted by class prior
    for each_class in classes:
      if each_class==own_class:
        continue
      class_weight=priors[each_class]/(1-priors[own_class])
      for each_miss in hit_miss_calculator(labels,instance,k,False,each_class):
        weights+=(np.abs(values[instance]-values[each_miss])/d)*class_weight

  return weights.tolist()

In [None]:
#This function discretizes the given features into 3 categories
def discretize_feature(feature):
  """
  Map every value of a feature to one of three bins.

  2 - within half a standard deviation of the mean (inclusive)
  0 - more than half a standard deviation above the mean
  1 - more than half a standard deviation below the mean

  Returns a new numpy array with the dtype of the input. Every entry is
  assigned a bin: a constant feature (std == 0) comes back as all 2 instead
  of keeping its raw values.
  """
  values=np.asarray(feature)
  mean=np.mean(values)
  half_std=np.std(values)/2

  # start with everything in the central bin, then carve out the two tails
  discretized=np.full(values.shape,2,dtype=values.dtype)
  discretized[values>(mean+half_std)]=0#greater than half
  discretized[values<(mean-half_std)]=1#less than half

  return discretized

def Xfreq(x):
  """
  Relative frequency of every distinct value in x.

  x - sized iterable of hashable values

  Returns a dict value -> count / len(x); the frequencies sum to 1.
  An empty input gives an empty dict.
  """
  counts={}
  for e in x:
    # the first occurrence counts as 1 (it used to be recorded as 0)
    counts[e]=counts.get(e,0)+1
  total=len(x)
  return {e: count/total for e, count in counts.items()}

def XYfreq(x,y):
  """
  Joint relative frequency of every (x value, y value) pair.

  Returns a dict keyed by (value of x, value of y) for all combinations of
  the distinct values of x and y; each frequency is rounded to 4 decimals.
  """
  total=len(x)
  y_values=np.unique(y)
  joint={}

  for x_value in np.unique(x):
    # entries of y at the positions where x takes this value
    y_given_x=y[np.where(x==x_value)[0]]
    for y_value in y_values:
      matches=len(np.where(y_given_x==y_value)[0])
      joint[(x_value,y_value)]=round(matches/total,4)

  return joint

def mutual_info(x,y):
  """
  Mutual information I(X;Y) = H(X) + H(Y) - H(X,Y) in bits.

  The marginal distributions come from Xfreq, the joint one from XYfreq.
  """
  def _entropy(probabilities):
    # -sum p*log2(p); zero probabilities are skipped
    total=0
    for prob in probabilities:
      if prob!=0:
        total-=prob*np.log2(prob)
    return total

  marginal_x=Xfreq(x)
  marginal_y=Xfreq(y)
  joint=XYfreq(x,y)

  h_x=_entropy(marginal_x.values())
  h_y=_entropy(marginal_y.values())
  h_xy=_entropy(joint[(e,f)] for e in marginal_x for f in marginal_y)

  return (h_x+h_y-h_xy)

def mutual_info_wrapper(features,target):
  """
  Mutual information between every column of `features` and `target`.

  Each column is discretized with discretize_feature before it is scored.
  Returns a float numpy array with one entry per column, in column order.
  """
  scores=[mutual_info(discretize_feature(features[name]),target) for name in features]
  return np.array(scores,dtype=np.float64)

In [None]:
"""
This cell is used for defining the method for calculating the t-scores
"""

def t_test(df,target):
  """
  Two-sample t statistic of every feature between two classes.

  Input:
  df= Dataframe / 2-D array of features (n_samples,n_features), or a 1-D
      array holding a single feature
  target= Pandas Series/1D Numpy Array containing the class labels (n_samples)

  Output:
  scores= t statistic per feature, in feature order (NOT sorted; p-values are
          discarded). A numpy array for 2-D input, a one-element list for
          1-D input.

  When the labels contain exactly two classes those two are compared (lower
  label first), so label codings other than 0/1 work as well. Otherwise the
  comparison is class 0 against class 1, as before.
  """
  import numpy as np
  from scipy.stats import ttest_ind

  values=np.asarray(df)
  labels=np.asarray(target)

  present=np.unique(labels).tolist()
  if len(present)==2:
    first,second=present
  else:
    first,second=0,1

  scores=ttest_ind(values[labels==first],values[labels==second])[0] #Storing just the t-test scores and discarding the p-values from the result.

  return [scores] if type(scores) != np.ndarray else scores

  

In [None]:
from scipy.sparse import *
def fisher_score(X, y):
    # Imports are local to the function; construct_W comes from the
    # third-party skfeature package. Because statements precede it, the long
    # string below is a plain expression, not the function's __doc__.
    import numpy as np
    
    from skfeature.utility.construct_W import construct_W
    """
    This function implements the fisher score feature selection, steps are as follows:
    1. Construct the affinity matrix W in fisher score way
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        fisher score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012.
    """

    # Construct weight matrix W in a fisherScore way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W(X, **kwargs)

    # build the diagonal D matrix from affinity matrix W
    # D starts out as the column vector of row sums of W
    D = np.array(W.sum(axis=1))
    # W itself is used in place of L = D - W; the "1 - ..." in lap_score below
    # accounts for that, since fr'(D - W)fr / fr'D fr = 1 - fr'W fr / fr'D fr
    L = W
    # tmp holds the D-weighted column sums of X
    tmp = np.dot(np.transpose(D), X)
    # from here on D is the sparse diagonal matrix
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    # t1 = D-weighted X, t2 = W-weighted X (dense matrices via todense())
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid the denominator of Lr to be 0
    # near-zero entries are replaced with a large constant before dividing
    D_prime[D_prime < 1e-12] = 10000
    # the [0, :] indexing relies on the result being a 2-D, single-row object
    lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]

    # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0/lap_score - 1
    return np.transpose(score)


In [None]:

#Pearson correlation
def pearson_corr(feature,targetClass):
  """
  Absolute Pearson correlation between every column of `feature` and the
  class labels.

  feature     - DataFrame (n_samples, n_features)
  targetClass - class label of every sample

  Returns a list with one coefficient per column, in column order. A
  coefficient that evaluates to NaN (e.g. for a constant column) is
  reported as 0.
  """
  import numpy as np
  coef=[]
  for column in feature.columns:
    r=np.abs(np.corrcoef(feature[column].values,targetClass)[0,1])
    coef.append(r if not np.isnan(r) else 0)
  return coef


In [None]:
#signal to noise ratio
#using weighted one-vs-all strategy for multi-class data
def signaltonoise(feature, target, axis = 0, ddof = 0):
  """
  Signal-to-noise ratio of every feature with respect to the class labels.

  feature - DataFrame / 2-D array (n_samples, n_features), or a 1-D
            Series / array holding a single feature
  target  - class label of every sample
  axis    - axis along which mean and standard deviation are taken
  ddof    - delta degrees of freedom for the standard deviation

  Up to two classes: (mean_1 - mean_2) / (std_1 + std_2), classes taken in
  the order returned by np.unique.
  More classes: one-vs-rest ratio per class, weighted by the share of
  samples in that class, summed over the classes.

  Returns a numpy array with one (signed) score per feature.
  """
  import numpy as np
  import pandas as pd

  # Everything below indexes with .iloc, so any other input (including a 1-D
  # array or Series) is first turned into a DataFrame; a 1-D input becomes a
  # single column.
  if not isinstance(feature, pd.DataFrame):
    feature = pd.DataFrame(np.asarray(feature))
  target = np.asarray(target)

  classes = np.unique(target)
  row, _ = feature.shape
  if len(classes) <= 2:
    m = None
    std = 0
    for each in classes:
      idx = np.where(target == each)[0]
      class_rows = feature.iloc[idx, :]
      #convenient way of doing m1-m2
      if m is None:
        m = class_rows.mean(axis)
      else:
        m = m - class_rows.mean(axis)

      #sd1+sd2
      std += class_rows.std(axis = axis, ddof = ddof)

    return np.asanyarray(m/std)

  else:
    snr_scores = [] #for storing the weighted scores
    #using the one vs all strategy for each class
    for each in classes:
      idx = np.where(target == each)[0]
      idxn = np.where(target != each)[0]
      inside, outside = feature.iloc[idx, :], feature.iloc[idxn, :]
      m = inside.mean(axis) - outside.mean(axis)
      std = inside.std(axis = axis, ddof = ddof) + outside.std(axis = axis, ddof = ddof)
      snr_scores.append((m/std) * len(idx)/row) #weighted snr

    return np.asanyarray(snr_scores).sum(axis = axis)

In [None]:
def feature_ranking(score):
    """
    Return the feature indices ordered from the highest to the lowest score.

    The scores are arg-sorted ascending along the first axis and the result
    is reversed, so the first index belongs to the best-scoring feature.
    """
    ascending = np.argsort(score, axis=0)
    return np.flip(ascending, axis=0)

In [None]:

# Score every gene of the raw feature matrix with each of the seven filters.
relief_score=reliefF(feature,target,NEIGHBOURS)

mutual_inf=mutual_info_wrapper(feature,target)

# the features are min-max scaled before they are passed to chi2
mms=MinMaxScaler()
nfeature=mms.fit_transform(feature)
chi_score,p_val=chi2(nfeature,target)

p_corr = pearson_corr(feature, target)

# fisher_score is given the plain numpy values, not the DataFrame
f_score = fisher_score(feature.values, target)

tt_score = t_test(feature, target)

snr_score = signaltonoise(feature, target)

In [None]:
#The Features are sorted as per their scores (highest first) and only the
#top p indices of every filter are kept
sorted_relief = feature_ranking(relief_score)[:p]
sorted_mi = feature_ranking(mutual_inf)[:p]
sorted_chi = feature_ranking(chi_score)[:p]
sorted_pc = feature_ranking(p_corr)[:p]
sorted_fs = feature_ranking(f_score)[:p]
sorted_tt = feature_ranking(tt_score)[:p]
sorted_snr = feature_ranking(snr_score)[:p]

In [None]:
#Can Skip this Cell
#It only prints the top-p gene indices selected by every filter.

print("Features after sorting -")
print("\nSorted MI -",sorted_mi)
print("\nSorted Relief -",sorted_relief)
print("\nSorted Chi -",sorted_chi)
print("\nSorted Pearson Corr -",sorted_pc)
print("\nSorted Fisher Score -",sorted_fs)
print("\nSorted T-test -",sorted_tt)
print("\nSorted SNR - ", sorted_snr)

## Supervised Gene Clustering
The below cells are used for facilitating the SGC Method of Augmentation

In [None]:
def score(a,p,target):
  """
  Score a single (augmented) gene vector with the filter selected by p.

  a      - 1-D numpy array with the values of the gene
  p      - filter code: 1 mutual information, 2 reliefF, 3 chi square,
           4 Pearson correlation, 5 Fisher score, 6 t-test,
           7 signal to noise ratio
  target - class label of every observation

  Returns whatever the selected filter returns for a one-feature input;
  None for any other value of p.
  """
  if p==1:
    as_frame=pd.DataFrame(a.reshape(-1,1))
    return mutual_info_wrapper(as_frame,target)

  elif p==2:
    # reliefF is run with NEIGHBOURS neighbours and 2 repetitions
    single=pd.DataFrame()
    single[0]=a
    return reliefF(single,target,NEIGHBOURS,2)

  elif p==3:
    from sklearn.preprocessing import MinMaxScaler
    # the values are min-max scaled before they are passed to chi2
    scaled=MinMaxScaler().fit_transform(a.reshape(-1,1))
    return chi2(scaled,target)[0]

  elif p==4:
    as_frame=pd.DataFrame(a.reshape(-1, 1))
    return pearson_corr(as_frame, target)

  elif p==5:
    return fisher_score(a.reshape(-1,1), target)

  elif p==6:
    return t_test(a, target)

  elif p==7:
    as_frame=pd.DataFrame(a.reshape(-1,1))
    return signaltonoise(as_frame, target)

In [None]:
def get_clusters(genes,features,p,target):
  """
  Greedy supervised gene clustering.

  The first remaining gene seeds a cluster. Every other remaining gene is
  tried as seed+gene and seed-gene; if the better of the two scores higher
  than the current representative, the gene joins the cluster, the
  representative is replaced by that sum/difference and the gene is removed
  from the pool. This repeats until no genes are left.

  genes - list of the selected genes. These are just the gene names (column
          labels); their actual values are taken from the features dataframe
  features - the dataframe which contains the values of the genes
  p - this denotes the type of score function, passed on to score():
      1- mutual information, 2- reliefF, 3- chi square test,
      4- Pearson correlation, 5- Fisher score, 6- t-test,
      7- signal to noise ratio
  target - target is a pandas series of target classes for each observation

  Returns (clusters, cluster_gene):
  clusters     - dict: str(seed gene) -> list of members, each written as
                 the gene name followed by "+" or "-" (how it was combined)
  cluster_gene - dict: seed gene -> value vector of the final representative
  """
  clusters={}
  cluster_gene={}
  # x counts clusters, y counts candidate evaluations; both are only incremented
  x,y=0,0
  genes_copy_1=np.copy(genes)
  while(len(genes_copy_1)>0):
    # print("Starting New Iteration with", len(genes_copy_1),"number of genes!")
    # genes_copy_2 is the candidate list for this cluster,
    # genes_copy_1 the pool of genes not assigned to any cluster yet
    genes_copy_2=np.copy(genes_copy_1)
    r_gene=genes_copy_2[0]
    r_gene_values=features[r_gene].values

    clusters[str(r_gene)]=[]
    
    # the seed is removed from both lists
    genes_copy_2=np.delete(genes_copy_2,0)
    genes_copy_1=np.delete(genes_copy_1,0)
    
    
    
    # score() returns a sequence; its first element is the score of the gene
    r_score=score(r_gene_values,p,target)[0]
    
    # print("\nCluster number=",len(clusters))
    # print("First feature =========================j1=",r_gene,"\n")
    x+=1
    # print("Intial Relevance Score",r_score)

    while(len(genes_copy_2)>0):
      
      gs=genes_copy_2[0]
      gene=features[gs].values

      y+=1      
      
      a_plus=np.add(r_gene_values,gene,dtype='float64') #creating A+
      a_minus=np.subtract(r_gene_values,gene,dtype='float64') #Creating A-

      a_plus_score=score(a_plus,p,target)[0]
      a_minus_score=score(a_minus,p,target)[0]
      
      # on a tie the A- score is taken as new_score, but the A+ branch below
      # is the one that fires
      new_score=a_plus_score if a_plus_score>a_minus_score else a_minus_score
      # print("Gene",gs,"+ Score",a_plus_score,"- Score",a_minus_score)

      # the candidate is accepted only if it strictly improves the score
      if new_score>r_score:

        if a_plus_score==new_score:

          # print("Gene Under Consideration",gs)
          # print("Initial Relevance",r_score,"Final Relevance",a_plus_score,r_score<a_plus_score)

          clusters[str(r_gene)].append(str(gs)+"+")
          r_gene_values=a_plus[:]
          r_score=a_plus_score

          # print("cluster member = +",gs,"\tRelevance Changed to",r_score)

        elif a_minus_score==new_score:

          # print("Gene Under Consideration",gs)
          # print("Initial Relevance",r_score,"Final Relevance",a_minus_score,r_score<a_minus_score)
          
          clusters[str(r_gene)].append(str(gs)+"-")
          r_gene_values=a_minus[:]
          r_score=a_minus_score

        #   print("cluster member = -",gs,"\tRelevance Changed to",r_score)
        # print("Gene",gs,"selected!",np.where(genes_copy_1 == gs))
        # an accepted gene cannot seed or join another cluster
        genes_copy_1 = np.delete(genes_copy_1, np.where(genes_copy_1 == gs))      
      genes_copy_2=np.delete(genes_copy_2,0)
    
    # for each in clusters[str(r_gene)]:
    #     genes_copy_1=np.delete(genes_copy_1,np.where(genes_copy_1==each))
    cluster_gene[r_gene]=r_gene_values

  #   print("\nFinal Relevance Score",r_score)
  print("Clusters formed! Returning Clusters and Gene Representatives")
  return clusters,cluster_gene

In [None]:
# Cluster the top-p genes of the first three filters; the third argument is
# the filter code understood by score() (1 MI, 2 reliefF, 3 chi square).
mi_cluster, gene_repre_1 = get_clusters(sorted_mi, feature, 1, target)
relief_cluster ,gene_repre_2 = get_clusters(sorted_relief, feature, 2, target)
chi_cluster, gene_repre_3 = get_clusters(sorted_chi, feature, 3, target)

In [None]:
# Cluster the top-p genes of the remaining filters
# (4 Pearson, 5 Fisher score, 6 t-test, 7 signal to noise ratio).
pc_cluster, gene_repre_4 = get_clusters(sorted_pc, feature, 4, target)
fs_cluster, gene_repre_5 = get_clusters(sorted_fs, feature, 5, target)
tt_cluster, gene_repre_6 = get_clusters(sorted_tt, feature, 6, target)
snr_cluster, gene_repre_7 = get_clusters(sorted_snr, feature, 7, target)

In [None]:
# Report how many clusters (= gene representatives) every filter produced.
print("Number of MI Clusters formed -",len(mi_cluster))
print("Number of ReliefF Clusters formed -",len(relief_cluster))
print("Number of ChiSq. Clusters formed -",len(chi_cluster))
print("Number of Pearson Clusters formed -",len(pc_cluster))
print("Number of Fisher Score Clusters formed -",len(fs_cluster))
print("Number of T-Test Clusters formed -",len(tt_cluster))
print("Number of SNR Clusters formed -", len(snr_cluster))

In [None]:
# q cannot exceed the smallest number of clusters produced by any filter.
# NOTE: this overwrites the configured q, and q is part of the file names
# used by the save/load cells below.
qmin = min([len(mi_cluster), len(relief_cluster), len(chi_cluster), len(pc_cluster), len(fs_cluster), len(tt_cluster), len(snr_cluster)])
q = q if q <= qmin else qmin
print("Choosing top %s Augmented Genes from each cluster"%(q))

In [None]:
# Turn every {seed gene: value vector} dict into a DataFrame with one column
# per gene representative. The names are rebound in place, so re-running this
# cell wraps the existing DataFrames again.
gene_repre_1 = pd.DataFrame(gene_repre_1)
gene_repre_2 = pd.DataFrame(gene_repre_2)
gene_repre_3 = pd.DataFrame(gene_repre_3)
gene_repre_4 = pd.DataFrame(gene_repre_4)
gene_repre_5 = pd.DataFrame(gene_repre_5)
gene_repre_6 = pd.DataFrame(gene_repre_6)
gene_repre_7 = pd.DataFrame(gene_repre_7)

## Saving the Gene Representatives and Clusters formed from **SGC**

In [None]:
#Saving Gene Representatives
#One CSV per filter (1 MI, 2 reliefF, 3 chi square, 4 Pearson, 5 Fisher,
#6 t-test, 7 SNR); the row index is not written.
gene_repre_1.to_csv("%s_p%s_q%sRepresentative_Genes_1.csv"%(DATASET, p, q),index=False)
gene_repre_2.to_csv("%s_p%s_q%sRepresentative_Genes_2.csv"%(DATASET, p, q),index=False)
gene_repre_3.to_csv("%s_p%s_q%sRepresentative_Genes_3.csv"%(DATASET, p, q),index=False)
gene_repre_4.to_csv("%s_p%s_q%sRepresentative_Genes_4.csv"%(DATASET, p, q),index=False)
gene_repre_5.to_csv("%s_p%s_q%sRepresentative_Genes_5.csv"%(DATASET, p, q),index=False)
gene_repre_6.to_csv("%s_p%s_q%sRepresentative_Genes_6.csv"%(DATASET, p, q),index=False)
gene_repre_7.to_csv("%s_p%s_q%sRepresentative_Genes_7.csv"%(DATASET, p, q),index=False)

In [None]:
"""
Saving the clusters to JSON files, this preserves the gene selection sequence
"""
# One JSON file per filter; the file name carries DATASET, p and q.
with open('%s_p%s_%smi_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(mi_cluster, fp)

with open('%s_p%s_%srelief_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(relief_cluster, fp)


with open('%s_p%s_%schi_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(chi_cluster, fp)

with open('%s_p%s_%spc_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(pc_cluster, fp)

with open('%s_p%s_%sfs_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(fs_cluster, fp)

with open('%s_p%s_%stt_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(tt_cluster, fp)

with open('%s_p%s_%ssnr_cluster.json'%(DATASET, p, q), 'w') as fp:
    json.dump(snr_cluster, fp)

In [None]:
def sort_keys(scores,gene_repre,target,flag=True):
  """
  Column labels of `gene_repre` ordered from the highest to the lowest score.

  scores     - one score per column of gene_repre, in column order
  gene_repre - DataFrame of gene representatives
  target, flag - accepted but not used

  Returns a list of column labels; labels with equal scores keep their
  column order.
  """
  score_by_key={}
  for position, key in enumerate(gene_repre.columns):
    score_by_key[key]=scores[position]
  return sorted(score_by_key, key=score_by_key.get, reverse=True)

In [None]:
"""
feature_ranking cannot be used here because it sorts and returns the indices 
from 0-1. They need to be sorted using a different function
"""
# Re-score the gene representatives of every filter with that same filter and
# keep the labels of the q best ones.
sorted_mi_keys=sort_keys(mutual_info_wrapper(gene_repre_1,target),gene_repre_1,target,True)[:q]

# reliefF is run with 5 repetitions here
sorted_relief_keys=sort_keys(reliefF(gene_repre_2,target,k=NEIGHBOURS,repetitions=5),gene_repre_2,target,True)[:q]

# the representatives are min-max scaled before they are passed to chi2;
# note that mms, nfeature, chi_score and p_val from the raw-gene scoring cell
# are overwritten here
mms=MinMaxScaler()
nfeature=mms.fit_transform(gene_repre_3)
chi_score,p_val=chi2(nfeature,target)
sorted_chi_keys=sort_keys(chi_score,gene_repre_3,target,False)[:q]

sorted_pc_keys=sort_keys(pearson_corr(gene_repre_4,target),gene_repre_4,target,True)[:q]

sorted_fs_keys=sort_keys(fisher_score(gene_repre_5.values,target),gene_repre_5,target,True)[:q]

sorted_tt_keys=sort_keys(t_test(gene_repre_6,target),gene_repre_6,target,True)[:q]

sorted_snr_keys = sort_keys(signaltonoise(gene_repre_7, target), gene_repre_7, target, True)[:q]

In [None]:
# Show the q best-scoring gene representatives of every filter.
print("MI cluster after sorting - ",sorted_mi_keys)
print("Relief cluster after sorting - ",sorted_relief_keys)
print("Chi cluster after sorting - ",sorted_chi_keys)
print("Pearson cluster after sorting - ",sorted_pc_keys)
print("Fisher cluster after sorting - ",sorted_fs_keys)
print("T-Test cluster after sorting - ",sorted_tt_keys)
print("SNR cluster after sorting - ",sorted_snr_keys)

## Testing Classification of the Augmented Genes.
Here the classification accuracy is tested using **KNN, Decision Tree, Naive Bayes** and **SVM** as well as the **Ensemble** of them.\
\
Top i (where i ranges from 1 to q) are chosen from each augmented dataset of filters in each iteration. This dataset is used for classification.

In [None]:
#creating a Dataframe for containing the augmented gene keys
#one column per filter, rows = the ranked labels of its q best representatives
aug_df_keys = pd.DataFrame({"MI":sorted_mi_keys, "ReliefF":sorted_relief_keys, 
                     "Chi Sq":sorted_chi_keys, "Pearson":sorted_pc_keys, 
                     "Fisher":sorted_fs_keys, "tTest":sorted_tt_keys, 
                     "SNR":sorted_snr_keys})

#filter name -> DataFrame with all gene representatives of that filter
aug_df_dict = {"MI":gene_repre_1, "ReliefF":gene_repre_2, 
                     "Chi Sq":gene_repre_3, "Pearson":gene_repre_4, 
                     "Fisher":gene_repre_5, "tTest":gene_repre_6, 
                     "SNR":gene_repre_7}

# print(aug_df_keys.head())
# print(aug_df_dict)

In [None]:
# Cross-validation scheme and the four base classifiers of the ensemble.
LOOCV=LeaveOneOut()
# KNN uses floor(sqrt(number of samples)) neighbours
data_KNN=KNeighborsClassifier(n_neighbors= int(feature.shape[0] ** 0.5))
data_SVM=SVC(kernel='rbf',gamma='scale')
data_NB=GaussianNB()
data_Tree= DecisionTreeClassifier()
rows=feature.shape[0]
# column order of the accuracy matrix; must match class_list in the evaluation loop
classifiers=["NB","KNN","Tree","SVM"]

In [None]:
#Iterating over filters
#For every filter the LOOCV accuracy of NB, KNN, Tree, SVM and their majority
#vote is measured on the top 1..q augmented genes and written to a CSV file.
for filter_name in aug_df_keys.columns:  
  acc_matrix = pd.DataFrame()
  # labels of the q best representatives of this filter, best first
  ranked_keys = aug_df_keys[filter_name].tolist()
  for i in range(1,q+1):
    """
    Make a dataframe out of i keys from the gene representatives obtained from
    augmenting the chosen filter.
    Than use LOOCV to measure accuracy on Train Dataset.
    """
    acc=0
    individual_acc = np.zeros(4)
    # Select the i best-ranked representatives by label. Taking the first i
    # columns instead would ignore the ranking stored in aug_df_keys.
    cluster_df = aug_df_dict[filter_name].loc[:, ranked_keys[:i]]

    # print(cluster_df.shape)

    for train_index,test_index in LOOCV.split(cluster_df):
      """
      Data is divided into train-test splits and then polling method is used 
      to find the classification results (ensemble of KNN,SVM,NB,Decision Tree)
      """
      train_data,train_labels=cluster_df.iloc[train_index,:],target[train_index]
      test_data,test_labels=cluster_df.iloc[test_index,:],target[test_index].values.tolist()[0]
      data_KNN.fit(train_data,train_labels)
      data_SVM.fit(train_data,train_labels)
      data_NB.fit(train_data,train_labels)
      data_Tree.fit(train_data,train_labels)

      # same order as the `classifiers` column names
      class_list = [data_NB, data_KNN, data_Tree, data_SVM]
      results=[]

      #getting individual results
      for x in range(4):
        tem_result = class_list[x].predict(test_data)[0]
        if tem_result == test_labels:
          individual_acc[x]+=1
        results.append(tem_result)
      polling_result=0
      max_freq=0

      #getting ensemble results
      #majority vote; on a tie the prediction that appears first in `results` wins
      for x in results:
        freq=results.count(x)
        if freq>max_freq:
          max_freq=freq
          polling_result=x
      if polling_result == test_labels:
        acc+=1

    # hit counts -> accuracies; the ensemble accuracy is appended as 5th entry
    individual_acc = np.round(individual_acc/cluster_df.shape[0],4)
    individual_acc = np.append(individual_acc, np.round(acc/cluster_df.shape[0],4))
    # print(individual_acc)

    acc_matrix[i] = individual_acc
  # rows = number of genes used, columns = classifiers
  acc_matrix = acc_matrix.T

  acc_matrix.columns = classifiers[:]+['Ensemble']

  print("\nFilter:-",filter_name, "\n",acc_matrix)

  acc_matrix.to_csv("Uni-SGC-%s_%s_p%s_q%s_Accuracy_Matrix.csv"%(DATASET, filter_name, p, q))

In [None]:
# Download the accuracy matrix of every filter through the Colab `files` helper.
for filter_name in aug_df_keys.columns:
  files.download("Uni-SGC-%s_%s_p%s_q%s_Accuracy_Matrix.csv"%(DATASET, filter_name, p, q))

## Loading Gene Representatives and Clusters
The below cells can be run to load gene representatives and clusters if you already have them prepared.

In [None]:
"""
Loading the Cluster JSON files from memory
"""

# Every file is read back into the variable it was written from. The relief
# and chi files used to be loaded into each other's variable.
with open('%s_p%s_%smi_cluster.json'%(DATASET, p, q), 'r') as fp:
  mi_cluster=json.load(fp)

with open('%s_p%s_%srelief_cluster.json'%(DATASET, p, q), 'r') as fp:
  relief_cluster=json.load(fp)


with open('%s_p%s_%schi_cluster.json'%(DATASET, p, q), 'r') as fp:
  chi_cluster=json.load(fp)

with open('%s_p%s_%spc_cluster.json'%(DATASET, p, q), 'r') as fp:
  pc_cluster=json.load(fp)

with open('%s_p%s_%sfs_cluster.json'%(DATASET, p, q), 'r') as fp:
  fs_cluster=json.load(fp)

with open('%s_p%s_%stt_cluster.json'%(DATASET, p, q), 'r') as fp:
  tt_cluster=json.load(fp)

with open('%s_p%s_%ssnr_cluster.json'%(DATASET, p, q), 'r') as fp:
  snr_cluster=json.load(fp)

In [None]:
"""
Loading Representative Genes from Memory
"""
# The file names must match the DATASET, p and q values used when saving.
# NOTE(review): read_csv presumably yields string column labels, whereas the
# freshly computed representatives are keyed by the gene labels themselves -
# confirm before mixing loaded and computed frames.
gene_repre_1 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_1.csv"%(DATASET, p, q),index_col = None)
gene_repre_2 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_2.csv"%(DATASET, p, q),index_col = None)
gene_repre_3 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_3.csv"%(DATASET, p, q),index_col = None)
gene_repre_4 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_4.csv"%(DATASET, p, q),index_col = None)
gene_repre_5 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_5.csv"%(DATASET, p, q),index_col = None)
gene_repre_6 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_6.csv"%(DATASET, p, q),index_col = None)
gene_repre_7 = pd.read_csv("%s_p%s_q%sRepresentative_Genes_7.csv"%(DATASET, p, q),index_col = None)