In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np
import torch
import torch.utils.data as data_utils
from sklearn.impute import SimpleImputer

In [3]:
# Move into the cleaned-data directory so the CSV can be opened by its bare file name.
os.chdir('/data/datasets/topcat/py_cleaned_data')
df = pd.read_csv('TOPCAT_final_2_25_2020.csv')

# Tally how many rows carry each distinct value of the `death` column;
# as the cell's last expression, the (values, counts) pair is displayed.
np.unique(df['death'].to_numpy(), return_counts=True)

(array([0., 1.]), array([1380,  387]))

In [7]:
# List every column header of the loaded frame, one name per line.
for column_name in df.columns.tolist():
    print(column_name)

Unnamed: 0
ID
BNP_YN
BNP_VAL
age_entry
GENDER
RACE_WHITE
RACE_BLACK
RACE_ASIAN
RACE_OTHER
ETHNICITY
DYSP_CUR
DYSP_YR
ORT_CUR
ORT_YR
DOE_CUR
DOE_YR
RALES_CUR
RALES_YR
JVP_CUR
JVP_YR
EDEMA_CUR
EDEMA_YR
EF
visit_dt1_hf
CHF_HOSP
chfdc_dt3
MI
mi_dt3
STROKE
stroke_dt3
CABG
cabg_dt3
PCI
pci_dt3
ANGINA
COPD
ASTHMA
HTN
PAD
DYSLIPID
ICD
PACEMAKER
AFIB
THYR_HPR
THYR_HYPO
DM
DM_DUR_YR
INSULIN
ORAL
DIET
TREAT_OTH
treat_sp_cat
DM_RETINO
DM_NEPHRO
DM_NEURO
SMOKE_YRS
SMOKE_EVER
QUIT_YRS
alcohol4_cat
HEAVY_WK
HEAVY_MIN
MED_WK
MED_MIN
LIGHT_WK
LIGHT_MIN
metsperweek
cooking_salt_score
nyha_class_cat
HR
SBP
DBP
gfr
NA_mmolL
K_mmolL
CL_mmolL
CO2_mmolL
BUN_mgdL
GLUCOSE_mgdL
WBC_kuL
HB_gdL
PLT_kuL
ALT_UL
ALP_UL
AST_UL
TBILI_mgdL
ALB_gdL
urine_val_mgg
PROTEINURIA
QRS_DUR
ECG_AFIB
ECG_BBB2
ECG_VPR
ECG_Q
ECG_LVH
drug
death
cvd_death
time_death
anyhosp
time_anyhosp
hfhosp
time_hfhosp
abortedca
time_abortedca
mi
time_mi
stroke
time_stroke
primary_ep
time_primary_ep
BMI
GLUCOSE_INDICATOR
cigpacksperday
mr_mod
lvs


In [2]:
def TOPCAT_TrainTest_loader(device_num, batch_size, test_size=0.5, random_state=None,
                            data_dir='/data/datasets/topcat/py_cleaned_data'):
    """Build shuffled train/test DataLoaders for predicting `cvd_death` from the TOPCAT CSV.

    The CSV is read from `data_dir`, every outcome column is removed from the
    feature matrix, the rows are split into train and test parts, missing
    feature values are mean-imputed, and both parts are wrapped in
    `TensorDataset` / `DataLoader` objects.

    Parameters
    ----------
    device_num : int, str, torch.device or None
        CUDA device passed to `Tensor.cuda()`. Any falsy value (None, 0, False)
        keeps the tensors on the CPU.
        NOTE(review): because of this truthiness check the integer 0 does NOT
        select CUDA device 0. The check is kept as-is for backward
        compatibility -- confirm with callers before switching to an explicit
        `is not None` test.
    batch_size : int
        Batch size used by both loaders.
    test_size : float or int, default 0.5
        Forwarded to `train_test_split`; the default matches the previous
        hard-coded value.
    random_state : int or None, default None
        Forwarded to `train_test_split`. Pass an int for a reproducible split;
        None keeps the previous (unseeded) behaviour.
    data_dir : str, default '/data/datasets/topcat/py_cleaned_data'
        Directory that contains 'TOPCAT_final_2_25_2020.csv'.

    Returns
    -------
    train_loader : torch.utils.data.DataLoader
    test_loader : torch.utils.data.DataLoader
    input_size : int
        Number of feature columns in the imputed training matrix.
    num_classes : int
        Number of distinct values found in the `cvd_death` column.

    Notes
    -----
    Features are always float32 and targets always int64 (`torch.long`),
    regardless of whether the tensors live on the CPU or on a GPU.
    """
    # Every outcome / follow-up-time column is excluded from the features so
    # that no outcome information is available to the model as an input.
    outcome_cols = [
        'death',
        'cvd_death',
        'time_death',
        'anyhosp',
        'time_anyhosp',
        'hfhosp',
        'time_hfhosp',
        'abortedca',
        'time_abortedca',
        'mi',
        'time_mi',
        'stroke',
        'time_stroke',
        'primary_ep',
        'time_primary_ep',
    ]

    # Side effect kept from the original implementation: the process working
    # directory is changed, since existing callers may rely on it afterwards.
    os.chdir(data_dir)
    df = pd.read_csv('TOPCAT_final_2_25_2020.csv')

    # NOTE(review): 'Unnamed: 0' and 'ID' stay in the feature matrix; they look
    # like row / patient identifiers -- confirm whether they should be dropped.
    df_X = df.drop(columns=outcome_cols)
    df_y = df['cvd_death']

    x_data = df_X.values
    y_data = df_y.values

    # Split BEFORE imputing: the column means must come from the training rows
    # only, otherwise statistics of the test rows leak into the training data.
    X_train, X_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=test_size, random_state=random_state
    )

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train = imp.fit_transform(X_train)
    X_test = imp.transform(X_test)

    # Same dtypes on every device: float32 inputs and integer class labels.
    # (Previously the CPU path returned float64 features and un-cast targets.)
    X_train = torch.as_tensor(X_train, dtype=torch.float32)
    X_test = torch.as_tensor(X_test, dtype=torch.float32)
    y_train = torch.as_tensor(y_train).to(torch.long)
    y_test = torch.as_tensor(y_test).to(torch.long)

    # Data on CPU or GPU (falsy device_num -> CPU, see docstring).
    if device_num:
        X_train, y_train = X_train.cuda(device_num), y_train.cuda(device_num)
        X_test, y_test = X_test.cuda(device_num), y_test.cuda(device_num)

    train = data_utils.TensorDataset(X_train, y_train)
    train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle=True)

    test = data_utils.TensorDataset(X_test, y_test)
    test_loader = data_utils.DataLoader(test, batch_size=batch_size, shuffle=True)

    # Taken from the imputed training matrix because the imputer may discard
    # columns that had no observed value when it was fitted.
    input_size = X_train.shape[1]
    num_classes = np.unique(y_data).size

    return train_loader, test_loader, input_size, num_classes