In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the project folder is reachable.
drive.mount('/content/drive')

# Project working directory on Drive.
# NOTE(review): "baeslines" looks like a misspelling of "baselines" — confirm it
# matches the actual folder name on Drive before changing it.
path = "/content/drive/My Drive/GAN_for_Neural_Graph/baeslines"

# Make every relative path used below resolve against the project directory,
# then display its contents as the cell output.
os.chdir(path)
os.listdir(path)

# Config

In [None]:
# Root directory of the dataset (default `data_root_directory` of load_data).
DATA_dir = "Dataset"
# CSV read by load_data; it is accessed through the columns Id, Gender, Age and label_id.
left_table_file = 'left_table.csv'
# Sub-directory of DATA_dir holding one "<Id>.csv" matrix per table row.
matrices_dir = 'FC_norm'
# Pickle file the train/val/test arrays are written to and read back from.
pickle_path = "ABIDE.p"
# Not referenced in the visible part of the notebook.
upsampled_pickle_path = "ABIDE_upsampled.p"
# JSON file that receives the ids assigned to each split.
json_path = "split_ids.json"

# Threshold passed to preprocess and get_backbone_graph.
weight_threshold = 0.1
# Number of shuffled samples in the training and validation splits;
# shuffle_data puts everything after train_size + val_size into the test split.
train_size = 700
val_size = 183

# Process Data

In [None]:
import os, sys
from tqdm import tqdm
import pickle
import json
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from keras.utils import to_categorical

# np.set_printoptions(threshold=sys.maxsize)
# Lists of every value a left-table column can take, in the order used by
# map_to_onehot to pick the hot position.
# LABEL_LIST and INS_LIST are not used by the visible code (labels are encoded
# from the 'label_id' column instead); GENDER_LIST is used by load_data.
LABEL_LIST = ["Control", "Autism", "Aspergers"]
INS_LIST = ['YALE', 'KKI', 'UCLA_1', 'UCLA_2', 'PITT', 'OLIN', 'SDSU', 'TRINITY', 'UM_1', 'UM_2', 'USM', 'CMU', 'LEUVEN_1', 'LEUVEN_2', 'NYU', 'MAX_MUN', 'CALTECH', 'SBL']
# NOTE(review): presumably the dataset's numeric sex codes — confirm which code is which.
GENDER_LIST = [1, 2]


def map_to_onehot(value, all_values):
    """
    Encode a left-table entry as a one-hot list of floats.
    value: the content of one table cell, e.g. "Autism"
    all_values: every value the column can take, e.g. LABEL_LIST; the position
            of `value` in this list is the position of the 1.
    Raises ValueError when `value` is not contained in `all_values`.
    """
    hot_position = all_values.index(value)
    return [1. if position == hot_position else 0.
            for position in range(len(all_values))]

def preprocess(A, threshold):
    """
    Turn one input matrix into an (adjacency, features) pair.
    A: adjacency matrix; it is returned unchanged as the first element
    threshold: accepted for interface compatibility but currently unused --
            the binarisation of edge weights into connectivity is disabled,
            so both returned matrices hold the raw edge weights.
    Returns (A, X) where X is an independent copy of A.
    """
    return A, A.copy()

def get_backbone_graph(graphs, threshold):
    """
    Build the backbone graph from the graphs of the training set.
    graphs: array indexed as [sample, channel, row, col]; channel 0 is averaged
    threshold: entries of the mean matrix strictly above it become 1.,
            entries at or below it become 0.
    Returns a new binary matrix; `graphs` is not modified.
    """
    mean_adjacency = graphs[:, 0, :, :].mean(axis=0)
    backbone = mean_adjacency.copy()
    # Both masks are taken from the untouched mean, so their order is irrelevant.
    backbone[mean_adjacency <= threshold] = 0.
    backbone[mean_adjacency > threshold] = 1.
    return backbone

def set_backbone_graph(graphs, A_backbone):
    """
    Overwrite channel 0 of every graph with the shared backbone matrix.
    graphs: array indexed as [sample, channel, row, col]; modified in place
    A_backbone: 2-D matrix written into channel 0 of each sample
    Returns the same `graphs` array that was passed in.
    """
    # A leading length-1 axis lets numpy broadcast the backbone over all samples.
    graphs[:, 0, :, :] = A_backbone[np.newaxis, :, :]
    return graphs

def convert_to_model_input(graphs):
    """
    Separate the stacked graph tensor into its two channels.
    graphs: array indexed as [sample, channel, row, col]
    Returns (A, X): channel 0 and channel 1 of every sample.
    """
    adjacency = graphs[:, 0, :, :]
    features = graphs[:, 1, :, :]
    return adjacency, features

def load_data(data_root_directory=DATA_dir, left_table_file=left_table_file, matrix_directory=matrices_dir):
    """
    Load data from files that are generated by converter.m
    data_root_directory: the root directory where all data files reside
    left_table_file: the file name of the left half of the original table
    matrix_directory: the directory which contains all csv files of matrices,
                file names are the Id entries of their corresponding rows

    Returns (input_ids, input_graphs, input_genders, input_ages, input_Y):
        input_ids: the 'Id' entries converted to strings
        input_graphs: the (A, X) pairs returned by preprocess, stacked per row
        input_genders: 'Gender' entries one-hot encoded against GENDER_LIST
        input_ages: 'Age' entries as floats, divided by 100
        input_Y: 'label_id' entries one-hot encoded over 3 classes
    """
    left_table = pd.read_csv(os.path.join(data_root_directory, left_table_file))
    print("Left table of shape", left_table.shape, "has been loaded!")
    print("Loading graphs...")
    matrices = []
    genders = []
    ages = []
    ids = []
    # Iterate the columns positionally so the loop does not depend on the
    # table having a default 0..N-1 index.
    table_rows = zip(left_table['Id'], left_table['Gender'], left_table['Age'])
    for raw_id, gender, age in tqdm(table_rows, total=left_table.shape[0]):
        subject_id = str(raw_id)
        ids.append(subject_id)
        # Read left table
        genders.append(map_to_onehot(gender, GENDER_LIST))
        ages.append(float(age))
        # Read adjacency matrix. Passing the path (not an open file object)
        # lets numpy open and close the file itself, so no handle is leaked.
        mtx_path = os.path.join(data_root_directory, matrix_directory, subject_id + ".csv")
        A = np.loadtxt(mtx_path, delimiter=",")
        matrices.append(preprocess(A, weight_threshold))

    # get label
    labels = to_categorical(left_table['label_id'].tolist(), 3)

    # result
    input_ids = np.array(ids)
    input_graphs = np.array(matrices)
    input_genders = np.array(genders)
    input_ages = np.array(ages)
    # Scale ages down by a factor of 100.
    input_ages /= 100.
    input_Y = np.array(labels)

    return input_ids, input_graphs, input_genders, input_ages, input_Y

def shuffle_data(input_ids, input_graphs, input_genders, input_ages, input_Y, random_state=None):
    """
    Shuffle all inputs consistently and cut them into train/val/test splits.

    The first `train_size` shuffled samples form the training split, the next
    `val_size` the validation split and everything after that the test split
    (`train_size` and `val_size` come from the config cell).

    input_ids, input_graphs, input_genders, input_ages, input_Y: arrays with
            one entry per sample along the first axis
    random_state: forwarded to sklearn.utils.shuffle; pass an int to make the
            split reproducible. The default None keeps the previous unseeded
            behaviour.

    Returns the train/val/test triples for graphs, genders, ages and Y (in that
    order, 12 arrays) followed by `split_ids`, a dict mapping 'train', 'val'
    and 'test' to the list of ids in that split.
    """
    input_ids, input_graphs, input_genders, input_ages, input_Y = shuffle(
        input_ids, input_graphs, input_genders, input_ages, input_Y,
        random_state=random_state)

    val_end = train_size + val_size

    def _three_way(samples):
        # Cut along the first axis at the train / val / test boundaries.
        return samples[:train_size], samples[train_size:val_end], samples[val_end:]

    train_ids, val_ids, test_ids = _three_way(input_ids)
    split_ids = {'train': list(train_ids),
                 'val': list(val_ids),
                 'test': list(test_ids)}

    train_graphs, val_graphs, test_graphs = _three_way(input_graphs)
    train_genders, val_genders, test_genders = _three_way(input_genders)
    train_ages, val_ages, test_ages = _three_way(input_ages)
    train_Y, val_Y, test_Y = _three_way(input_Y)

    return  train_graphs, val_graphs, test_graphs, \
            train_genders, val_genders, test_genders, \
            train_ages, val_ages, test_ages, \
            train_Y, val_Y, test_Y, \
            split_ids



# Read the left table and every per-subject matrix using the default paths from the config cell.
input_ids, input_graphs, input_genders, input_ages, input_Y = load_data()

# Shuffle and Save

In [None]:
# del json
import json

# Shuffle the loaded samples and cut them into train / validation / test splits.
train_graphs, val_graphs, test_graphs, \
        train_genders, val_genders, test_genders, \
        train_ages, val_ages, test_ages, \
        train_Y, val_Y, test_Y, split_ids = shuffle_data(input_ids, input_graphs, input_genders, input_ages, input_Y)

# The backbone graph is derived from the training split only and then written
# into channel 0 of every split.
A_backbone = get_backbone_graph(train_graphs, weight_threshold)
train_graphs = set_backbone_graph(train_graphs, A_backbone)
val_graphs = set_backbone_graph(val_graphs, A_backbone)
test_graphs = set_backbone_graph(test_graphs, A_backbone)

datasets = train_graphs, val_graphs, test_graphs, \
        train_genders, val_genders, test_genders, \
        train_ages, val_ages, test_ages, \
        train_Y, val_Y, test_Y

# Context managers guarantee the files are flushed and closed.
with open(pickle_path, "wb") as pickle_file:
    pickle.dump(datasets, pickle_file)

# Keep the serialised text under its own name: rebinding `json` would replace
# the module with a string and break this cell on the next run.
split_ids_json = json.dumps(split_ids)
with open(json_path, "w") as file:
    file.write(split_ids_json)

# Read the pickle back to verify the round trip.
with open(pickle_path, "rb") as pickle_file:
    train_graphs, val_graphs, test_graphs, \
            train_genders, val_genders, test_genders, \
            train_ages, val_ages, test_ages, \
            train_Y, val_Y, test_Y = pickle.load(pickle_file)
print("[Training]   Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    train_graphs.shape, train_genders.shape, train_ages.shape, train_Y.shape)
print("[Validation] Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    val_graphs.shape, val_genders.shape, val_ages.shape, val_Y.shape)
print("[Test]       Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    test_graphs.shape, test_genders.shape, test_ages.shape, test_Y.shape)

# Y is one-hot, so the column sums are the per-class sample counts.
print("[Training]   Class distribution", np.sum(train_Y, axis = 0))
print("[Validation] Class distribution", np.sum(val_Y, axis = 0))
print("[Test]       Class distribution", np.sum(test_Y, axis = 0))

# Load Data

In [None]:
import pickle, datetime
import numpy as np
import  tensorflow as tf
import time
import os, sys
from sklearn import linear_model
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import metrics

def get_svm_data(graphs, Y):
    """
    Build flat feature vectors and integer class ids for the classifiers.
    graphs: array indexed as [sample, channel, row, col]; channel 1 is used
    Y: one-hot label matrix with one row per sample
    Returns (flatten_H, classes): channel 1 of every sample flattened to one
    row, and the index of the largest entry of each row of Y.
    """
    node_features = graphs[:, 1, :, :]
    sample_count = node_features.shape[0]
    flat_features = node_features.reshape((sample_count, -1))
    class_ids = np.argmax(Y, axis=1)
    return flat_features, class_ids

# Load the splits written by the "Shuffle and Save" cell.
# pickle can execute arbitrary code while loading: only open files this
# notebook produced itself. The context manager closes the file afterwards.
with open(pickle_path, "rb") as pickle_file:
    train_graphs, val_graphs, test_graphs, \
            train_genders, val_genders, test_genders, \
            train_ages, val_ages, test_ages, \
            train_Y, val_Y, test_Y = pickle.load(pickle_file)
print("[Training]   Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    train_graphs.shape, train_genders.shape, train_ages.shape, train_Y.shape)
print("[Validation] Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    val_graphs.shape, val_genders.shape, val_ages.shape, val_Y.shape)
print("[Test]       Graph shape, Gender shape, Ages shape, Y shape: \n\t", \
    test_graphs.shape, test_genders.shape, test_ages.shape, test_Y.shape)

# Feature matrix per split: flattened channel-1 matrix, one-hot gender, and
# age as a trailing single column.
train_flatten_H, train_classes = get_svm_data(train_graphs, train_Y)
train_X = np.concatenate((train_flatten_H, train_genders, np.expand_dims(train_ages, axis=-1)), axis = 1)
val_flatten_H, val_classes = get_svm_data(val_graphs, val_Y)
val_X = np.concatenate((val_flatten_H, val_genders, np.expand_dims(val_ages, axis=-1)), axis = 1)
test_flatten_H, test_classes = get_svm_data(test_graphs, test_Y)
test_X = np.concatenate((test_flatten_H, test_genders, np.expand_dims(test_ages, axis=-1)), axis = 1)

# Train logistic regression

In [None]:
# Fit the logistic-regression baseline on the training features
# (solver capped at 1000 iterations); fit() returns the fitted estimator.
print('Training....')
clf_l = linear_model.LogisticRegression(max_iter=1000).fit(train_X, train_classes)
print('Finished')

# Test logistic regression

In [None]:
labels_test = test_classes

# Evaluate logistic regression
labels_pred = clf_l.predict(test_X)
correct = np.sum(labels_pred==labels_test)
# `.shape` is a tuple; index it so the accuracies are plain scalars instead of
# 1-element arrays (formatting those with %.3f relies on deprecated behaviour).
num_test = labels_pred.shape[0]
three_c_acc = correct / num_test * 100

# classify 2 as 1
labels_2_test, labels_2_pred = labels_test.copy(), labels_pred.copy()
labels_2_test[labels_2_test==2] = 1
labels_2_pred[labels_2_pred==2] = 1

# calculate the accuracy
correct = np.sum(labels_2_test==labels_2_pred)
two_c_acc = correct / num_test * 100
print('three class acc: %.3f, two class acc: %.3f' % (three_c_acc, two_c_acc))
print("="*100)

# three class confusion metrics
target_names = ['class 0', 'class 1', 'class 2']
print('three class:')

# calculate precision, recall, f1 score
print(classification_report(labels_test, labels_pred, target_names=target_names, digits=4))

# calculate AUC
# NOTE(review): the AUC is computed from hard one-hot predictions, not from
# class scores; use predict_proba if a ranking-based AUC is intended.
pred_y = np.zeros((labels_pred.shape[0], 3))
pred_y[np.arange(labels_pred.shape[0]), labels_pred] = 1
auc = sklearn.metrics.roc_auc_score(test_Y, pred_y)
print('AUC', auc)
print("="*100)

# two class confusion metrics
target_names = ['class 0', 'class 1']
print('two class:')
# calculate precision, recall, f1 score
print(classification_report(labels_2_test, labels_2_pred, target_names=target_names, digits=4))

# calculate specificity
# Pin the label set so the matrix is always 2x2 and unpacks into four values,
# even if one class is absent from both the targets and the predictions.
tn, fp, fn, tp = confusion_matrix(labels_2_test, labels_2_pred, labels=[0, 1]).ravel()
specificity = tn / (tn+fp)
print('specificity:', specificity)

# calculate AUC
pred_y = np.zeros((labels_2_pred.shape[0], 2))
pred_y[np.arange(labels_2_pred.shape[0]), labels_2_pred] = 1
auc = sklearn.metrics.roc_auc_score(labels_2_test, pred_y[:, 1])
print('AUC', auc)
print("="*100)

# Train SVM

In [None]:
from sklearn import svm

# Fit a support-vector classifier with scikit-learn's default settings on the
# training features; fit() returns the fitted estimator.
print('Training....')
clf = svm.SVC().fit(train_X, train_classes)
print('Finished')

# Test SVM

In [None]:
labels_test = test_classes

# Evaluate SVM
labels_pred = clf.predict(test_X)
correct = np.sum(labels_pred==labels_test)
# `.shape` is a tuple; index it so the accuracies are plain scalars instead of
# 1-element arrays (formatting those with %.3f relies on deprecated behaviour).
num_test = labels_pred.shape[0]
three_c_acc = correct / num_test * 100

# classify 2 as 1
labels_2_test, labels_2_pred = labels_test.copy(), labels_pred.copy()
labels_2_test[labels_2_test==2] = 1
labels_2_pred[labels_2_pred==2] = 1

# calculate the accuracy
correct = np.sum(labels_2_test==labels_2_pred)
two_c_acc = correct / num_test * 100
print('three class acc: %.3f, two class acc: %.3f' % (three_c_acc, two_c_acc))
print("="*100)

# three class confusion metrics
target_names = ['class 0', 'class 1', 'class 2']
print('three class:')

# calculate precision, recall, f1 score
print(classification_report(labels_test, labels_pred, target_names=target_names, digits=4))

# calculate AUC
# NOTE(review): the AUC is computed from hard one-hot predictions, not from
# class scores; use decision_function if a ranking-based AUC is intended.
pred_y = np.zeros((labels_pred.shape[0], 3))
pred_y[np.arange(labels_pred.shape[0]), labels_pred] = 1
auc = sklearn.metrics.roc_auc_score(test_Y, pred_y)
print('AUC', auc)
print("="*100)

# two class confusion metrics
target_names = ['class 0', 'class 1']
print('two class:')
# calculate precision, recall, f1 score
print(classification_report(labels_2_test, labels_2_pred, target_names=target_names, digits=4))

# calculate specificity
# Pin the label set so the matrix is always 2x2 and unpacks into four values,
# even if one class is absent from both the targets and the predictions.
tn, fp, fn, tp = confusion_matrix(labels_2_test, labels_2_pred, labels=[0, 1]).ravel()
specificity = tn / (tn+fp)
print('specificity:', specificity)

# calculate AUC
pred_y = np.zeros((labels_2_pred.shape[0], 2))
pred_y[np.arange(labels_2_pred.shape[0]), labels_2_pred] = 1
auc = sklearn.metrics.roc_auc_score(labels_2_test, pred_y[:, 1])
print('AUC', auc)
print("="*100)