In [97]:
!ls /home/harold_triedman/elvo-analysis/data

from_luke  labels.csv  multichannel  normal  preprocessed


In [108]:
import os
import pathlib
import typing
import csv
from scipy.ndimage.interpolation import zoom
import pandas as pd
import random
import numpy as np
from sklearn import svm

In [109]:
def load_data(data_dir: str) -> typing.Dict[str, np.ndarray]:
    """Returns a dictionary which maps patient ids
    to patient pixel data."""
    data_dict = {}
    for filename in os.listdir(data_dir):
        patient_id = filename[:-4] # remove .npy extension
        matrix = np.load(pathlib.Path(data_dir) / filename)
        data_dict[patient_id] = np.ravel(matrix)
    return data_dict

In [110]:
train_data = load_data('/home/harold_triedman/elvo-analysis/data/preprocessed/normal/training/')
train_labels = {}
with open('/home/harold_triedman/elvo-analysis/data/preprocessed/train.csv', 'r') as pos_file:
    reader = csv.reader(pos_file, delimiter=',')
    for row in reader:
        print(row)
        if row[1] != '0':
            train_labels[row[1]] = int(row[2])

['', '0', '1']
['0', 'IAUKV5R644JZFD55', '1']
['1', 'OI623KTR79Z90BG1', '0']
['2', 'P64MELY8IJSLJ1WW', '0']
['3', '0MTDDGCF20DKOR59', '0']
['4', 'VKMYOQD5A64UFEED', '0']
['5', 'DVYKX4XMBHD57MZJ', '0']
['6', 'TFCLOHN4ROEYF6QY', '0']
['7', 'YDJX25T27CYATQ25', '1']
['8', '43ERS29KX3LPF334', '1']
['9', 'PAIDKQGSUI0KZ54S', '0']
['10', 'PKDK1R3JGEDSY9T2', '0']
['11', 'NMHOGEKTK7LT57ZI', '1']
['12', 'DHG4RIY13DFISJ35', '1']
['13', '27QV9Z81JZB09QPH', '0']
['14', 'QPDX2K3DS7IS5QNM', '1']
['15', 'RMID4XEKTVEYCSGK', '0']
['16', 'PYMCG60E4FESH559', '0']
['17', 'G8EI396DZCRABG20', '0']
['18', 'A56W94SJCRL9BR9P', '0']
['19', 'MSS82V0V6T9P13CV', '0']
['20', 'AJHYOJIG3DZLSIKI', '0']
['21', 'WKKADQ0Q6519IXWY', '1']
['22', 'FYSSUUSC5XYNLUQW', '0']
['23', 'XZUUUC426GUXSCGJ', '1']
['24', 'M8PXGLRLDQUQ9497', '0']
['25', 'K3C28ASFAF7ZLEIH', '1']
['26', '6GA622A55T3AOGTL', '1']
['27', 'UUKSYXBYQQZATBP5', '1']
['28', '9DWQA9BENM1XBI71', '1']
['29', 'QIA74LHB1XWMP523', '1']
['30', 'WHNAL8GW8TJ8Y4AK', '1']
['3

In [111]:
print(train_labels, list(train_data.keys()))

{'IAUKV5R644JZFD55': 1, 'OI623KTR79Z90BG1': 0, 'P64MELY8IJSLJ1WW': 0, '0MTDDGCF20DKOR59': 0, 'VKMYOQD5A64UFEED': 0, 'DVYKX4XMBHD57MZJ': 0, 'TFCLOHN4ROEYF6QY': 0, 'YDJX25T27CYATQ25': 1, '43ERS29KX3LPF334': 1, 'PAIDKQGSUI0KZ54S': 0, 'PKDK1R3JGEDSY9T2': 0, 'NMHOGEKTK7LT57ZI': 1, 'DHG4RIY13DFISJ35': 1, '27QV9Z81JZB09QPH': 0, 'QPDX2K3DS7IS5QNM': 1, 'RMID4XEKTVEYCSGK': 0, 'PYMCG60E4FESH559': 0, 'G8EI396DZCRABG20': 0, 'A56W94SJCRL9BR9P': 0, 'MSS82V0V6T9P13CV': 0, 'AJHYOJIG3DZLSIKI': 0, 'WKKADQ0Q6519IXWY': 1, 'FYSSUUSC5XYNLUQW': 0, 'XZUUUC426GUXSCGJ': 1, 'M8PXGLRLDQUQ9497': 0, 'K3C28ASFAF7ZLEIH': 1, '6GA622A55T3AOGTL': 1, 'UUKSYXBYQQZATBP5': 1, '9DWQA9BENM1XBI71': 1, 'QIA74LHB1XWMP523': 1, 'WHNAL8GW8TJ8Y4AK': 1, 'GTSAGN2MRE5D0AZI': 0, 'Z3AINLH4Y07ITBRR': 1, 'YSEPTJX4TY1WQY1T': 1, 'PBR2STY62FZ3MQHZ': 0, '9UFGGHS7R1O40ZU9': 0, 'FPTFJD4JZA7ZKYQJ': 1, 'XPBRBUB6YYHIWHVO': 0, 'CF7JC6FO4N0S96KN': 0, 'RYNCGNVRZH2LB7BN': 0, 'SP73CHWDY57ND02Z': 1, 'UO04YVZ0X59PBH6A': 0, '9AF84HK0K5CG471Y': 1, 'S6YJ44EEB

In [112]:
def as_numpy_arrays(data, labels) -> (np.ndarray, np.ndarray):
    shuffled_ids = list(data.keys())
    random.shuffle(shuffled_ids)
    X_list = []
    y_list = []
    for id_ in shuffled_ids:
        X_list.append(data[id_])
        y_list.append(labels[id_])
    print(len(X_list))
    print(len(y_list))
    return np.stack(X_list), np.stack(y_list)

In [113]:
X_train, y_train = as_numpy_arrays(train_data, train_labels)

932
932


In [114]:
print(X_train.shape, y_train.shape)

(932, 48400) (932,)


In [115]:
clf = svm.SVC()

In [116]:
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [117]:
test_data = load_data('/home/harold_triedman/elvo-analysis/data/preprocessed/normal/validation/')
test_labels = {}
with open('/home/harold_triedman/elvo-analysis/data/preprocessed/val.csv', 'r') as pos_file:
    reader = csv.reader(pos_file, delimiter=',')
    for row in reader:
        if row[1] != '0':
            test_labels[row[1]] = int(row[2])

In [118]:
X_test, y_test = as_numpy_arrays(test_data, test_labels)

104
104


In [120]:
clf.score(X_test, y_test)

0.5384615384615384

In [123]:
from sklearn.model_selection import cross_validate

In [124]:
scoring = ['accuracy', 'roc_auc']
cross_validate(svm.SVC(), X_train, y_train, scoring=scoring)



{'fit_time': array([30.44605875, 30.17047691, 30.29144096]),
 'score_time': array([30.19771862, 30.05388308, 30.04403853]),
 'test_accuracy': array([0.50320513, 0.50322581, 0.50645161]),
 'test_roc_auc': array([0.49997945, 0.49675284, 0.50318471]),
 'train_accuracy': array([1.        , 0.99839228, 0.99517685]),
 'train_roc_auc': array([1.        , 0.99999483, 0.99995347])}