In [73]:
import numpy as np
import pandas as pd
import os
import argparse
from tqdm import tqdm
import logging

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.feature_selection import SelectKBest, SelectPercentile, SelectFromModel

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import metrics

logging.basicConfig(level = logging.INFO)

In [217]:
class Dataset:
    """Read, split and standardize one of the supported gene-expression datasets.

    Supported ``dataset_name`` values are ``'colon'`` and ``'leukemia'``; the
    files are looked up inside ``os.path.join(data_dir, dataset_name)``.

    After construction the instance exposes:

    * ``features`` / ``target`` -- the full sample matrix and label vector as
      assembled by :meth:`read_data` (these are NOT standardized),
    * ``X_train`` / ``X_test`` -- the train/test matrices, standardized with
      statistics fitted on the training part only,
    * ``Y_train`` / ``Y_test`` -- the matching 0/1 label arrays.

    Raises:
        ValueError: if ``dataset_name`` is not one of the supported names.
    """

    # Number of leading entries of the leukemia scale-factor and sample tables
    # that belong to the training split; the remaining entries are the test split.
    _LEUKEMIA_N_TRAIN = 38

    def __init__(self, data_dir, dataset_name):
        self.path = os.path.join(data_dir, dataset_name)
        self.dataset_name = dataset_name

        # Compare by value. `is` checks object identity, which only happens to
        # work for interned literals and fails for equal strings built at runtime.
        if dataset_name == 'colon':
            self.gene_expression_values_file = "gene_values.txt"
            self.labels_file = "labels_for_each_tissue.txt"
        elif dataset_name == 'leukemia':
            self.scale_file = 'rescale_factors.txt'
            self.samples_file = "table_ALL_AML_samples.txt"
            self.train_file = "train.tsv"
            self.test_file = "test.tsv"
        else:
            # Fail early with a clear message instead of an AttributeError later on.
            raise ValueError(
                "Unsupported dataset %r; expected 'colon' or 'leukemia'" % (dataset_name,)
            )

        self.read_data()
        self.transform_data()

    def read_data(self):
        """Load the raw files and populate features/target and the train/test parts.

        Raises:
            ValueError: if ``self.dataset_name`` is not a supported dataset.
        """
        logging.info("Reading Dataset %s", self.dataset_name)

        if self.dataset_name == 'colon':
            self._read_colon()
        elif self.dataset_name == 'leukemia':
            self._read_leukemia()
        else:
            raise ValueError(
                "Unsupported dataset %r; expected 'colon' or 'leukemia'" % (self.dataset_name,)
            )

        logging.info("Reading data completed. The train dataset size is %s", self.X_train.shape)

    def _read_colon(self):
        """Parse the colon files, binarize the labels and make a random 80/20 split."""
        with open(os.path.join(self.path, self.gene_expression_values_file), 'r') as f:
            gene_lines = [line.strip() for line in tqdm(f.readlines())]
        # One non-empty line per gene; blank separator lines are ignored.
        # split() without arguments tolerates runs of whitespace between values.
        expressions = [line.split() for line in gene_lines if line]

        with open(os.path.join(self.path, self.labels_file), 'r') as f:
            label_lines = [line.strip() for line in tqdm(f.readlines())]
        # Skip blank lines so a trailing newline does not break int().
        labels = np.array([int(line) for line in label_lines if line])

        # Rows of the file are genes; transpose so that rows are samples.
        self.features = np.array(expressions, dtype=np.float64).T
        # Strictly positive labels become 1, everything else becomes 0.
        self.target = (labels > 0).astype(labels.dtype)
        self.split_data(split_perc = 0.2)

    def _read_leukemia(self):
        """Parse the leukemia files, which ship with a fixed train/test partition."""
        n_train = self._LEUKEMIA_N_TRAIN

        with open(os.path.join(self.path, self.scale_file), "r") as f:
            rows = [line.split() for line in f if line.strip()]
        # The second whitespace-separated token of each row is the scale factor.
        scale_factors = np.array([float(row[1]) for row in rows])
        train_scale_factors = scale_factors[:n_train]
        test_scale_factors = scale_factors[n_train:]

        with open(os.path.join(self.path, self.samples_file), "r") as f:
            # Third tab-separated column carries the class; 'ALL' -> 1, otherwise 0.
            labels = np.array([
                1 if line.split("\t")[2].strip() == 'ALL' else 0
                for line in f
                if line.strip()
            ])
        # Arrays (not lists) so the label containers match the colon dataset.
        self.Y_train = labels[:n_train]
        self.Y_test = labels[n_train:]

        train_data = pd.read_csv(os.path.join(self.path, self.train_file), sep="\t")
        train_data = np.array(train_data).T
        # One scale factor per sample (row), broadcast across all columns.
        self.X_train = train_data * train_scale_factors[:, np.newaxis]

        test_data = pd.read_csv(os.path.join(self.path, self.test_file), sep="\t")
        test_data = np.array(test_data).T
        self.X_test = test_data * test_scale_factors[:, np.newaxis]

        self.features = np.vstack((self.X_train, self.X_test))
        self.target = np.append(self.Y_train, self.Y_test)

    def split_data(self, split_perc = 0.2):
        """Split ``features``/``target`` into train and test parts.

        Args:
            split_perc: fraction of the samples assigned to the test part.
        """
        logging.info("Splitting the dataset into train and test sets with a split percentage of %s", split_perc)

        # Fixed seed keeps the partition identical between runs.
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            self.features, self.target, test_size = split_perc, random_state = 1405
        )

        logging.info("Splitting is completed. The dimensions of the train dataset are %s", self.X_train.shape)

    def transform_data(self):
        """Standardize ``X_train`` and ``X_test`` using statistics of ``X_train`` only."""
        logging.info("Standardizing the data to have to zero mean and one variance")

        # Fit on the training part only so no test information leaks into the scaler.
        standard_scaler = StandardScaler()
        standard_scaler.fit(self.X_train)
        self.X_train = standard_scaler.transform(self.X_train)
        self.X_test = standard_scaler.transform(self.X_test)

        logging.info("Standardizing is completed.")

In [218]:
# Constructing the Dataset reads the colon files, splits them and standardizes the result.
colon_data = Dataset(
    data_dir="/home/avinash/UIUC/CS466/cancer-classification/Data",
    dataset_name='colon',
)

INFO:root:Reading Dataset colon
100%|██████████| 3998/3998 [00:00<00:00, 231575.26it/s]
100%|██████████| 62/62 [00:00<00:00, 364211.27it/s]
INFO:root:Splitting the dataset into train and test sets with a split percentage of 0.2
INFO:root:Splitting is completed. The dimensions of the train dataset are (49, 2000)
INFO:root:Reading data completed. The train dataset size is (49, 2000)
INFO:root:Standardizing the data to have to zero mean and one variance
INFO:root:Standardizing is completed.


In [219]:
# Constructing the Dataset reads the leukemia files (pre-defined train/test parts) and standardizes them.
leukemia_data = Dataset(
    data_dir="/home/avinash/UIUC/CS466/cancer-classification/Data",
    dataset_name="leukemia",
)

INFO:root:Reading Dataset leukemia
INFO:root:Reading data completed. The train dataset size is (38, 7129)
INFO:root:Standardizing the data to have to zero mean and one variance
INFO:root:Standardizing is completed.


In [186]:
class ModelUtilities:
    """Feature-selection and model-sweep helpers bound to one train/test split."""

    def __init__(self, X_train, Y_train, X_test, Y_test):
        """Store the train/test matrices and label vectors used by the helpers."""
        self.X_train = X_train
        self.Y_train = Y_train
        self.X_test = X_test
        self.Y_test = Y_test

    def get_important_features(self, method, number_of_features):
        """Return the column indices of the selected features.

        Args:
            method: name of the selection strategy; only ``'select_k_best'``
                (SelectKBest scored with ``f_classif``) is implemented.
            number_of_features: how many features to keep.

        Returns:
            Array of selected column indices, fitted on the training data only.

        Raises:
            ValueError: if ``method`` is not a known strategy.
        """
        # Compare by value; `is` tests identity and is unreliable for strings.
        if method == 'select_k_best':
            selector = SelectKBest(k=number_of_features, score_func=f_classif)
            selector.fit(self.X_train, self.Y_train)
            return selector.get_support(indices=True)
        # An unknown name used to yield None, which callers would silently treat
        # as "use every feature"; make the mistake visible instead.
        raise ValueError("Unknown feature selection method: %r" % (method,))

    def test_for_all(self, feature_counts = range(5, 51), ks = range(1, 21)):
        """Sweep k-NN test accuracy over feature counts and neighbour counts.

        Args:
            feature_counts: iterable of feature-set sizes to try.
            ks: iterable of ``n_neighbors`` values to try for each size.

        Returns:
            ``defaultdict(list)`` mapping each feature count to the list of test
            accuracies, in the order of ``ks``.
        """
        from collections import defaultdict

        neighbor_counts = list(ks)  # materialize so it can be re-iterated per feature count
        scores = defaultdict(list)
        for nf in feature_counts:
            # The selection does not depend on k, so compute it once per size.
            best_features = self.get_important_features('select_k_best', nf)
            for k in neighbor_counts:
                model = build_nearest_neighbor_model(self.X_train, self.Y_train, k, best_features)
                _, test_acc = calculate_accuracies(
                    self.X_train, self.Y_train, self.X_test, self.Y_test, model, best_features
                )
                scores[nf].append(test_acc)
        return scores

In [187]:
def build_nearest_neighbor_model(X_train, Y_train, k = 5, feature_indices = None):
    """Fit and return a k-nearest-neighbours classifier.

    Args:
        X_train: training matrix, indexed as ``X_train[:, feature_indices]``.
        Y_train: training labels.
        k: number of neighbours passed as ``n_neighbors``.
        feature_indices: optional column selection; ``None`` keeps every column.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    training_matrix = X_train if feature_indices is None else X_train[:, feature_indices]

    classifier = KNeighborsClassifier(n_neighbors=k)
    classifier.fit(training_matrix, Y_train)
    return classifier

In [188]:
def build_svm_model(X_train, Y_train, feature_indices = None, kernel = 'linear'):
    """Fit and return a support-vector classifier.

    Args:
        X_train: training matrix, indexed as ``X_train[:, feature_indices]``.
        Y_train: training labels.
        feature_indices: optional column selection; ``None`` keeps every column.
        kernel: kernel name forwarded to ``svm.SVC``.

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    selected = X_train
    if feature_indices is not None:
        selected = selected[:, feature_indices]

    # Fixed hyper-parameters; the seed keeps repeated fits identical.
    svc_options = dict(C=2.0, kernel=kernel, gamma=0.001, random_state=1405)
    classifier = svm.SVC(**svc_options)
    classifier.fit(selected, Y_train)
    return classifier

In [189]:
def build_naive_bayes_model(X_train, Y_train, feature_indices = None):
    """Fit and return a Gaussian naive Bayes classifier.

    Args:
        X_train: training matrix, indexed as ``X_train[:, feature_indices]``.
        Y_train: training labels.
        feature_indices: optional column selection; ``None`` keeps every column.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    use_all_columns = feature_indices is None
    inputs = X_train if use_all_columns else X_train[:, feature_indices]

    nb_classifier = GaussianNB()
    nb_classifier.fit(inputs, Y_train)
    return nb_classifier

In [190]:
def build_random_forest_model(X_train, Y_train, feature_indices = None, n_trees = 50, random_state = 1405):
    """Fit and return a random-forest classifier.

    Args:
        X_train: training matrix, indexed as ``X_train[:, feature_indices]``.
        Y_train: training labels.
        feature_indices: optional column selection; ``None`` keeps every column.
        n_trees: number of trees, forwarded as ``n_estimators``.
        random_state: seed for the forest. Defaults to 1405, the seed used by the
            other seeded estimators and the train/test split in this notebook, so
            repeated runs give the same forest. Pass ``None`` for an unseeded fit.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    if feature_indices is not None:
        X_train = X_train[:, feature_indices]

    # Without a seed every run grows a different forest and the reported
    # accuracies cannot be reproduced.
    model = RandomForestClassifier(n_estimators=n_trees, random_state=random_state)
    model.fit(X_train, Y_train)

    return model

In [191]:
def build_logistic_regression_model(X_train, Y_train, feature_indices = None):
    """Fit and return a logistic-regression classifier.

    Args:
        X_train: training matrix, indexed as ``X_train[:, feature_indices]``.
        Y_train: training labels.
        feature_indices: optional column selection; ``None`` keeps every column.

    Returns:
        The fitted ``LogisticRegression`` instance (lbfgs solver, seed 1405).
    """
    design_matrix = X_train
    if feature_indices is not None:
        design_matrix = X_train[:, feature_indices]

    estimator = LogisticRegression(solver='lbfgs', random_state=1405)
    estimator.fit(design_matrix, Y_train)
    return estimator

In [192]:
def calculate_accuracies(X_train, Y_train, X_test, Y_test, model, feature_indices = None):
    """Score a fitted model on the training and the test data.

    Args:
        X_train, Y_train: training matrix and labels.
        X_test, Y_test: test matrix and labels.
        model: fitted estimator exposing ``predict``.
        feature_indices: optional column selection applied to both matrices;
            it has to match the columns the model was fitted on.

    Returns:
        Tuple ``(train_accuracy, test_accuracy)``.
    """
    if feature_indices is not None:
        X_train, X_test = X_train[:, feature_indices], X_test[:, feature_indices]

    # Predict on both parts first, then score them, train before test.
    predicted_train, predicted_test = (model.predict(part) for part in (X_train, X_test))

    return (
        metrics.accuracy_score(y_pred=predicted_train, y_true=Y_train),
        metrics.accuracy_score(y_pred=predicted_test, y_true=Y_test),
    )

In [193]:
def create_knn_ensemble(X_train, Y_train, ks = [3, 5, 7], feature_selections = ['select_k_best', 'select_k_best'], feature_indices = None):
    """Fit one k-nearest-neighbours model per value in ``ks``.

    Args:
        X_train: training matrix.
        Y_train: training labels.
        ks: neighbour counts; one model is fitted for each entry (never mutated).
        feature_selections: accepted for backward compatibility; currently not
            used by this function.
        feature_indices: optional column selection shared by every model;
            ``None`` keeps every column. Previously this name was read without
            being defined, which raised ``NameError`` unless a global of that
            name happened to exist in the kernel.

    Returns:
        List of fitted models, in the order of ``ks``.
    """
    return [
        build_nearest_neighbor_model(X_train, Y_train, k, feature_indices)
        for k in ks
    ]

In [225]:
# Baseline: k-NN with the default k on every leukemia feature, then (train, test) accuracy.
model = build_nearest_neighbor_model(
    X_train=leukemia_data.X_train,
    Y_train=leukemia_data.Y_train,
)
calculate_accuracies(
    X_train=leukemia_data.X_train,
    Y_train=leukemia_data.Y_train,
    X_test=leukemia_data.X_test,
    Y_test=leukemia_data.Y_test,
    model=model,
)

(0.8947368421052632, 0.7352941176470589)

In [226]:
# NOTE: `features` is assembled in Dataset.read_data, before transform_data runs,
# so these are the non-standardized values (unlike X_train / X_test).
cross_val_score(
    estimator=model,
    X=leukemia_data.features,
    y=leukemia_data.target,
    cv=5,
)

array([0.86666667, 0.86666667, 1.        , 0.71428571, 0.71428571])