In [1]:
import sys
import os
import numpy as np
from str2bool import str2bool
from rpy2.robjects.packages import STAP
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri, numpy2ri
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects.packages import importr

# Switch on rpy2's automatic pandas and numpy conversion for the rest of the
# session. These are global side effects, so this cell must run before any
# R object is created or read below.
pandas2ri.activate()
numpy2ri.activate()



In [2]:
# Run configuration. The values below are hard-coded stand-ins for the
# command arguments: include_lab, include_ethdon, lag, eq_train_ratio.

# Locations / run labels
home_dir = '/home/diabetes_prediction'  # folder holding features.R, R/functions.R and the .rds inputs
visit_type = "first"
output = "temp"
post2000 = True

# Feature-group switches
include_lab = str2bool("T")  # add the lab feature group?
include_ethdon = str2bool("T")  # add the ethnicity + donor feature groups?

# Training options
eq_train_ratio = str2bool("T")  # train on an equal case:control ratio?
lag = int("1")  # how many lagged copies of each time-dependent feature to use

In [3]:
# Get features based on these inputs.
# features.R is loaded as an in-memory R package exposing `features`
# (indexed by group name: "clin", "lab", "eth", "don") and `timedep_features`.
with open(os.path.join(home_dir, 'features.R'), 'r') as f:
    string = f.read()
features_file = STAP(string, "features")

# Clinical features are always used; the other groups are opt-in.
features_to_use = features_file.features.rx2("clin")
if include_lab:
    features_to_use = features_to_use + features_file.features.rx2("lab")

if include_ethdon:
    features_to_use = features_to_use + features_file.features.rx2("eth") + features_file.features.rx2("don")

# Split the selected features into those listed in `timedep_features`
# and the remaining covariates.
timedep_cols = np.intersect1d(features_to_use, features_file.timedep_features)
cov_cols = np.setdiff1d(features_to_use, timedep_cols)

if eq_train_ratio:
    eq_cases_train_cols = np.array(["TRR_ID", "is_diab"])
else:
    # np.array() with no argument raises TypeError, so build an explicit
    # empty string array to keep the same type as the other branch.
    eq_cases_train_cols = np.array([], dtype=str)

In [21]:
# Load both formatted tables from their RDS files using R's readRDS
readRDS = robjects.r['readRDS']
tx_li_study, txf_li_study = (
    readRDS(os.path.join(home_dir, rds_name))
    for rds_name in ('tx_li_formatted.rds', 'txf_li_formatted.rds')
)

In [24]:
# Combine the two tables with the R helper combine_tx_txf from R/functions.R
functions_path = os.path.join(home_dir, 'R', 'functions.R')
with open(functions_path, 'r') as f:
    string = f.read()
functions = STAP(string, "functions")

# "age" is removed from the covariate list passed to the R helper
cov_cols_without_age = np.setdiff1d(cov_cols, "age")
merged = functions.combine_tx_txf(
    tx_li_study,
    txf_li_study,
    cov_cols_without_age,
    timedep_cols,
    lag,
)



Attaching package: ‘data.table’



    between, first, last




Attaching package: ‘data.table’



    between, first, last




In [8]:
# Keep only rows where time_next_followup exceeds time_since_transplant
# (filtered on the R side via dplyr), then convert the result to pandas.
merged_filtered = DataFrame(merged).filter('time_next_followup > time_since_transplant')
df = pandas2ri.ri2py_dataframe(merged_filtered)

In [None]:
# Preview only the first rows: displaying the whole frame bloats the saved
# notebook and can expose row-level records in the stored output.
df.head()

In [9]:
# Prep data for model training
# Model columns: time-dependent features, then covariates, then for every
# k in 1..lag a "<name>_<k>" column per time-dependent feature.
cols = np.concatenate((timedep_cols, cov_cols))
for lag_idx in range(1, lag + 1):  # empty range when lag <= 0, so nothing is appended
    lagged_names = ['{}_{}'.format(name, lag_idx) for name in timedep_cols]
    cols = np.append(cols, lagged_names)

# Every column listed here must be non-missing for a row to be kept.
leading_cols = ['transplant_year', 'TRR_ID', 'age']
trailing_cols = ['is_diab', 'time_since_transplant',
                 'time_next_followup', 'time_to_diab',
                 'diab_time_since_tx', 'diab_in_1_year',
                 'diab_now']
subset_cols = np.concatenate((leading_cols, cols, trailing_cols))
df = df.dropna(subset=subset_cols)

# Split on transplant year (2011 onward -> test); both parts also require
# a non-negative time_to_diab.
tx_year = df.transplant_year.astype(int)
nonneg_time_to_diab = df.time_to_diab >= 0
df_test = df[(tx_year >= 2011) & nonneg_time_to_diab]
df_nontest = df[(tx_year < 2011) & nonneg_time_to_diab]


In [10]:
# Assign cross-validation folds for the non-test data.
num_folds = 5
fold_seed = 42  # arbitrary fixed value; change it to draw a different fold assignment

# One is_diab label per distinct (TRR_ID, is_diab) pair.
nontest_y = df_nontest.drop_duplicates(subset=['TRR_ID', 'is_diab']).is_diab
caret = importr('caret')

# createFolds draws from R's random number generator. Seed it first so that
# Restart Kernel -> Run All reproduces the same folds.
robjects.r['set.seed'](fold_seed)
# Third positional argument is caret's `list`; False asks for a plain vector
# instead of an R list.
folds = caret.createFolds(nontest_y, num_folds, False)

In [11]:
# Bundle the test/train frames, the model column names, the equal-ratio
# key columns and the fold assignment into a single dictionary.
data = dict(
    test=df_test,
    train=df_nontest,
    cols=cols,
    eq_cases_train_cols=eq_cases_train_cols,
    folds=folds,
)

In [12]:
import pickle

# Save data in case kernel got restarted.
# The context manager flushes and closes the file as soon as the dump
# finishes; passing a bare open(...) left the handle open.
with open("temp.pkl", "wb") as pkl_file:
    pickle.dump(data, pkl_file)