In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
# import cv2
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if torch.backends.mps.is_available():
    device = torch.device("mps")

# Load Data

In [2]:
seed = 42

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [4]:
label_dict = {
    # Controls
    'n': 0,
    # Chirrhosis
    'cirrhosis': 1,
    # Colorectal Cancer
    'cancer': 1, 'small_adenoma': 0,
    # IBD
    'ibd_ulcerative_colitis': 1, 'ibd_crohn_disease': 1,
    # T2D and WT2D
    't2d': 1,
    # Obesity
    'leaness': 0, 'obesity': 1,
}
def loadData(data_dir, filename, dtype=None):
    feature_string = ''
    if filename.split('_')[0] == 'abundance':
        feature_string = "k__"
    if filename.split('_')[0] == 'marker':
        feature_string = "gi|"
    # read file
    filename = data_dir + filename
    if not os.path.isfile(filename):
        print("FileNotFoundError: File {} does not exist".format(filename))
        exit()
    raw = pd.read_csv(filename, sep='\t', index_col=0, header=None)

    # select rows having feature index identifier string
    X = raw.loc[raw.index.str.contains(feature_string, regex=False)].T

    # get class labels
    Y = raw.loc['disease']
    Y = Y.replace(label_dict)

    # train and test split
    X_train, X_test, y_train, y_test = train_test_split(X.values.astype(dtype), Y.values.astype('int'), test_size=0.2, random_state=seed, stratify=Y.values)
    print("Train data shape: ", X_train.shape)
    print("Test data shape: ", X_test.shape)
    return X_train, X_test, y_train, y_test

data_dir = '../data/marker/'
data_string = 'marker_Cirrhosis.txt'
X_train, X_test, y_train, y_test = loadData(data_dir, data_string, dtype='float32')

  raw = pd.read_csv(filename, sep='\t', index_col=0, header=None)


Train data shape:  (185, 120553)
Test data shape:  (47, 120553)


# Classification by using the raw data

In [5]:
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, f1_score
def get_metrics(clf, is_svm=False):
    y_true, y_pred = y_test, clf.predict(X_test)
    y_prob = 0
    if is_svm:
        y_prob = clf.decision_function(X_test)
    else:
        y_prob = clf.predict_proba(X_test)
    # Performance Metrics: AUC, ACC, Recall, Precision, F1_score
    metrics = {
    'ACC': accuracy_score(y_true, y_pred),
    'Recall': recall_score(y_true, y_pred),
    'Precision': precision_score(y_true, y_pred),
    'F1_score': f1_score(y_true, y_pred),
    }
    if not is_svm:
        metrics['AUC'] = roc_auc_score(y_true, y_prob[:, 1])
    else:
        metrics['AUC'] = roc_auc_score(y_true, y_prob)
    return metrics

In [6]:
# SVM
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)
print(get_metrics(clf, is_svm=True))

{'ACC': 0.9148936170212766, 'Recall': 0.875, 'Precision': 0.9545454545454546, 'F1_score': 0.9130434782608695, 'AUC': 0.9836956521739131}


In [7]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
print(get_metrics(clf))

{'ACC': 0.9148936170212766, 'Recall': 0.8333333333333334, 'Precision': 1.0, 'F1_score': 0.9090909090909091, 'AUC': 0.9864130434782609}


In [8]:
# Multi-layer Perceptron
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=1000)
clf.fit(X_train, y_train)
print(get_metrics(clf))

{'ACC': 0.8297872340425532, 'Recall': 1.0, 'Precision': 0.75, 'F1_score': 0.8571428571428571, 'AUC': 0.9909420289855073}
