In [87]:
import datetime
import json
import random

import trustedanalytics as ia

# Connect this client to the trustedanalytics server (prints the connection status).
ia.connect()

Already connected.  This client instance connected to server http://localhost:9099/v1 (version=0.4.3-201511049238) as user test_api_key_1 at 2015-11-06 19:27:15.944395.


## Set up frame...

In [88]:
def add_ids(row):
    """Pull the patient and visit identifiers out of a row's raw JSON.

    ``row[0]`` is parsed as JSON. Returns ``(patid, visid)`` taken from the
    'PATID' and 'VISID' keys, with ``None`` for any key that is absent.
    """
    record = json.loads(row[0])
    return tuple(record[key] if key in record else None
                 for key in ('PATID', 'VISID'))

def add_height(row):
    """Return the 'HEIGHT_INCHES' value from the row's raw JSON.

    ``row[0]`` is parsed as JSON; ``0.0`` is returned when the key is absent.
    """
    record = json.loads(row[0])
    if 'HEIGHT_INCHES' not in record:
        return 0.0
    return record['HEIGHT_INCHES']

def add_age(row):
    """Return the 'AGE' value from the row's raw JSON, or 0 when the key is absent."""
    record = json.loads(row[0])
    if 'AGE' not in record:
        return 0
    return record['AGE']

def add_weight(row):
    """Return the patient's weight in pounds as an ``ia.float64``.

    ``row[0]`` is parsed as JSON and its 'WEIGHT_LBS' value is converted with
    ``ia.float64``. A default of 150.0 is returned when the key is absent or
    the value cannot be converted.
    """
    default_weight_lbs = 150.0
    my_json = json.loads(row[0])
    raw_weight = my_json['WEIGHT_LBS'] if 'WEIGHT_LBS' in my_json else default_weight_lbs
    try:
        return ia.float64(raw_weight)
    except Exception:
        # Best-effort conversion: any unconvertible value falls back to the
        # default. ``Exception`` (rather than a bare ``except``) lets
        # KeyboardInterrupt and SystemExit propagate.
        return default_weight_lbs

def add_marital_status(row):
    """Return the 'MARITAL_STATUS' value from the row's raw JSON, or None when absent."""
    record = json.loads(row[0])
    if 'MARITAL_STATUS' not in record:
        return None
    return record['MARITAL_STATUS']

def add_death_flag(row):
    """Return the 'DEATH_FLAG' value from the row's raw JSON, or None when absent."""
    record = json.loads(row[0])
    if 'DEATH_FLAG' not in record:
        return None
    return record['DEATH_FLAG']

def add_meds(row):
    """Return all medication names for a visit as one '|'-separated string.

    ``row[0]`` is parsed as JSON. The lists under 'MED_ORDER_NAMEs' and
    'DISCHARGE_MED_ORDER_NAMEs' are combined (in-visit orders first) and each
    entry is converted with ``str`` before joining. A missing key is
    represented by the placeholder ``[None]``, so a row with neither key
    yields the string 'None'.
    """
    my_json = json.loads(row[0])
    med_orders = my_json['MED_ORDER_NAMEs'] if 'MED_ORDER_NAMEs' in my_json else [None]
    discharge_med_orders = (my_json['DISCHARGE_MED_ORDER_NAMEs']
                            if 'DISCHARGE_MED_ORDER_NAMEs' in my_json else [None])
    if med_orders == [None]:
        meds = discharge_med_orders
    elif discharge_med_orders == [None]:
        meds = med_orders
    else:
        meds = med_orders + discharge_med_orders
    # Join the merged list: joining only med_orders would drop the discharge
    # medications from the result.
    return '|'.join(str(i) for i in meds)

In [None]:
# Load the stored inpatient frame and start fresh on a working copy...
tutorial_inpat = ia.get_frame("tutorial_inpat_frame")
tutorial_inpat_model = tutorial_inpat.copy()
# Drop every column except the raw JSON column 'data_lines'; the features below are re-extracted from it.
tutorial_inpat_model.drop_columns([i for i in tutorial_inpat_model.column_names if i != 'data_lines'])
# Identifier columns, then EVENT_ID built by concatenating the PATID and VISID strings.
tutorial_inpat_model.add_columns(add_ids, [("PATID", str), ("VISID", str)])
tutorial_inpat_model.add_columns(lambda row: str(row["PATID"]) + str(row["VISID"]), ("EVENT_ID", str))
# One add_columns call per feature, each using the matching add_* helper defined above.
tutorial_inpat_model.add_columns(add_height, ("HEIGHT_INCHES", ia.float64))
tutorial_inpat_model.add_columns(add_marital_status, ("MARITAL_STATUS", str))
tutorial_inpat_model.add_columns(add_weight, ("WEIGHT_LBS", ia.float64))
tutorial_inpat_model.add_columns(add_death_flag, ("DEATH_FLAG", str))
tutorial_inpat_model.add_columns(add_age, ("AGE", ia.float64))
tutorial_inpat_model.add_columns(add_meds, ("MEDS", str))

In [None]:
def add_admit_date(row):
    """Extract the admit date for a patient from the row's raw JSON.

    ``row[0]`` is parsed as JSON and ``ADM_DATE['$date'] / 1000`` is returned
    (presumably epoch milliseconds converted to seconds -- confirm against the
    data source). Returns ``None`` when the field is missing or is not in that
    nested, numeric form.
    """
    my_json = json.loads(row[0])
    try:
        return my_json['ADM_DATE']['$date'] / 1000
    except (KeyError, TypeError):
        # Missing key, ADM_DATE that is not a mapping, or a non-numeric $date:
        # treat the admit date as unknown.
        return None

def add_discharge_date(row):
    """Extract the discharge date of a patient from the row's raw JSON.

    ``row[0]`` is parsed as JSON and ``DISCHARGE_DATE['$date'] / 1000`` is
    returned (presumably epoch milliseconds converted to seconds -- confirm
    against the data source). Returns ``None`` when the field is missing or is
    not in that nested, numeric form.
    """
    my_json = json.loads(row[0])
    try:
        return my_json['DISCHARGE_DATE']['$date'] / 1000
    except (KeyError, TypeError):
        # Missing key, DISCHARGE_DATE that is not a mapping, or a non-numeric
        # $date: treat the discharge date as unknown.
        return None

In [None]:
# Add admit and discharge date columns to our data frame...
# NOTE(review): the helpers return a number (or None), but both columns are declared as str;
# get_gold_standard later converts the stored values back with float().
tutorial_inpat_model.add_columns(add_admit_date, ("ADM_DATE", str))
tutorial_inpat_model.add_columns(add_discharge_date, ("DISCHARGE_DATE", str))

In [None]:
# Preview the extracted columns, leaving out the raw 'data_lines' JSON column.
tutorial_inpat_model.inspect(columns=[i for i in tutorial_inpat_model.column_names if i != 'data_lines'])

#### Filter out missing ids...

In [None]:
# A slightly different syntax for filtering that I find convenient...
# Sometimes dropping data_lines is necessary to do a filter. 
# In these situations, make sure you're done extracting features from the original json...
def filter_patid_nones(row):
    """Row predicate: True when the row's 'PATID' value is not None."""
    patid = row['PATID']
    return patid is not None

def filter_visid_nones(row):
    """Row predicate: True when the row's 'VISID' value is not None."""
    visid = row['VISID']
    return visid is not None

# Filter out any rows without patid or visid...
# 'data_lines' is dropped first (see the note at the top of this cell), so all JSON
# feature extraction must already be finished by this point.
tutorial_inpat_model.drop_columns('data_lines')
# Both filters modify tutorial_inpat_model in place, using the predicates defined above.
tutorial_inpat_model.filter(filter_patid_nones)
tutorial_inpat_model.filter(filter_visid_nones)

# Let's D.A.

In [None]:
# Create a working copy...
lts = tutorial_inpat_model.copy()

# Flatten on the separating character ('|' is the delimiter add_meds used to join the names)...
lts.flatten_column("MEDS", "|")

# We need to create a table of ID x MED x Count...
lts_lda_input = lts.group_by(["VISID", "MEDS"], ia.agg.count)
# Drop any frame left over from an earlier run before re-using the name...
if "lts_lda_input_renamed" in ia.get_frame_names():
    ia.drop_frames("lts_lda_input_renamed")
lts_lda_input_renamed = lts_lda_input.copy(name = "lts_lda_input_renamed")

In [None]:
# Set the parameters for the LDA model...
nTopics = 20
max_iterations = 2

# Clean up any old attempts...
# NOTE(review): unlike the frame clean-up in the previous cell, this drop is not guarded by an
# existence check -- confirm drop_models tolerates a missing model on a fresh server.
ia.drop_models(['lts_lda_model'])

# Create LDA model..
lda_model =  ia.LdaModel(name='lts_lda_model')
# Train on the VISID x MEDS x count table (presumably document, word and word-count columns -- confirm).
# Despite its name, lts_lda_model holds the value returned by train(), not the model object.
lts_lda_model = lda_model.train(lts_lda_input_renamed, "VISID", "MEDS", 'count', num_topics=nTopics, max_iterations=max_iterations)

# We can extract the per-id topic distributions...
doc_mixes = lts_lda_model['topics_given_doc']

In [None]:
# Clean up old approaches...
ia.drop_frames(['lts_lda_feature_frame'])

# Copy the document x topic frame...
lts_lda_feature_frame = doc_mixes.copy(name='lts_lda_feature_frame')

# Create schema: one float64 column per topic, named topic_1 .. topic_<nTopics>...
# NOTE(review): under Python 3 map() returns an iterator rather than a list -- confirm
# add_columns accepts that before porting.
schema = map(lambda i: ('topic' + "_" + str(i), ia.float64), range(1, nTopics + 1))

# Create features frame: spread the 'topic_probabilities' value across the topic_* columns,
# then drop the original column...
lts_lda_feature_frame.add_columns(lambda row: (row['topic_probabilities']), schema)
lts_lda_feature_frame.drop_columns(['topic_probabilities'])

In [None]:
# Preview the per-topic feature columns.
lts_lda_feature_frame.inspect()

In [None]:
# Attach the topic features to the patient-level features by joining on VISID.
tutorial_inpat_model_lda = tutorial_inpat_model.join(lts_lda_feature_frame, 'VISID')

In [None]:
# Preview the joined frame; the 'data_lines' exclusion is a no-op if that column was already dropped.
tutorial_inpat_model_lda.inspect(columns=[i for i in tutorial_inpat_model_lda.column_names if i != 'data_lines'])

# Get gold-standard labels...

In [None]:
def get_gold_standard(row):
    """
    Function to compute nDays between admit and discharge.
    Returns GS label.

    ``row["ADM_DATE"]`` and ``row["DISCHARGE_DATE"]`` are read as epoch
    seconds (anything ``float()`` accepts). Returns "POSITIVE" when discharge
    falls 30 whole days or fewer after admission, otherwise "NEGATIVE".

    Raises TypeError or ValueError when either value cannot be converted
    with ``float()``.
    """
    discharge_seconds = float(row["DISCHARGE_DATE"])
    admit_seconds = float(row["ADM_DATE"])
    # Work on the raw epoch difference (discharge - admit) so the day count is
    # non-negative for a normal visit and is unaffected by the local timezone.
    # The whole-day count is used as-is; taking it modulo 60 would mislabel
    # both short and long stays.
    elapsed = datetime.timedelta(seconds=discharge_seconds - admit_seconds)
    if elapsed.days <= 30:
        return "POSITIVE"
    return "NEGATIVE"

# Add the gold-standard label as a string column named RF30.
tutorial_inpat_model_lda.add_columns(get_gold_standard, ("RF30", str))

In [None]:
# Show the first 10 rows of the date and label columns.
# NOTE(review): no 'RF90' column is created in the cells shown; the list filter simply skips it.
tutorial_inpat_model_lda.inspect(n=10, columns=[i for i in tutorial_inpat_model_lda.column_names if i in ['ADM_DATE', 'DISCHARGE_DATE', "RF30", "RF90"]])

In [None]:
# Count rows per RF30 label.
rf30_count = tutorial_inpat_model_lda.group_by('RF30', ia.agg.count)

In [None]:
# Display the per-label row counts.
rf30_count.inspect()

## Demo: Preparing a column for a classifier/algorithm--feature encoding train/test!

#### Demo: Encoding features

In [None]:
# Let's demonstrate feature encoding with a non-numeric column, the MARITAL_STATUS feature...
tutorial_inpat_model_lda.inspect(columns="MARITAL_STATUS")

In [None]:
# Let's generate a summary table of the possible MARITAL_STATUS values and how often each occurs...
tutorial_inpat_model_lda_count = tutorial_inpat_model_lda.group_by("MARITAL_STATUS", ia.agg.count)

In [None]:
# Display the value counts.
tutorial_inpat_model_lda_count.inspect()

In [None]:
def numericalize_udf(in_val, dx):
    """Look up the integer code for a categorical value.

    Returns ``dx[in_val]``, or ``0`` when the lookup fails: the value is not a
    key of ``dx``, the value is unhashable, or ``dx`` does not support
    subscripting.
    """
    try:
        return dx[in_val]
    except (KeyError, TypeError):
        # Unknown or unusable values all share the fallback code 0.
        return 0
    
# ia.drop_frames("numerical_test")
# numerical_test = tutorial_inpat.copy(name='numerical_test')
# numerical_test_count = numerical_test.group_by('MARITAL_STATUS', ia.agg.count)

# Build an integer code for each listed categorical column and append it as <col>_INT.
for col in ["MARITAL_STATUS"]:
    new_column_name = col + "_INT"
    # NOTE(review): download() is called without n, so it may fetch only a default-sized
    # sample of rows rather than the whole frame -- confirm.
    f = tutorial_inpat_model_lda.download()
    # presumably f is a pandas DataFrame, which makes d a {row index: value} mapping -- TODO confirm
    d = f.to_dict()[col]
    # Invert to {value: row index}; when a value repeats, the last index seen wins.
    # NOTE(review): iteritems() exists only on Python 2 dicts.
    rev_dx = dict((v, k) for k, v in d.iteritems())
    
    # Values that are not keys of rev_dx are encoded as 0 by numericalize_udf.
    tutorial_inpat_model_lda.add_columns(lambda row: numericalize_udf(row[col], rev_dx), (new_column_name, ia.int32))

In [None]:
# Preview the frame, now including the MARITAL_STATUS_INT column.
tutorial_inpat_model_lda.inspect()

### Partition into train/test

In [None]:
# Split at the patient level: shuffle the distinct PATIDs and hold out a fixed fraction for testing.
SPLIT_SEED = 42        # fixed so the train/test split is reproducible across runs
TEST_FRACTION = 0.10   # share of distinct patients assigned to the test set

PATIDS = tutorial_inpat_model_lda.group_by("PATID", ia.agg.count)
PATIDSdl = PATIDS.download(n=PATIDS.row_count, columns='PATID')
# Sort before shuffling so the seeded shuffle starts from a stable order.
unique_patids = sorted(set(PATIDSdl['PATID']))
# Use a private generator so the global random state is left alone.
random.Random(SPLIT_SEED).shuffle(unique_patids)
n_test = int(len(unique_patids) * TEST_FRACTION)
test_patids = unique_patids[:n_test]
# The ids are unique, so the remainder of the shuffled list is exactly the non-test ids.
train_patids = unique_patids[n_test:]


def label_train_test(row, test_list=test_patids, train_list=train_patids):
    """Label a row "TEST" or "TRAIN" according to its patient id.

    Returns "TEST" when ``row['PATID']`` is in ``test_list`` and "TRAIN"
    otherwise. ``train_list`` is accepted for backward compatibility but is
    not consulted: every id outside ``test_list`` is treated as training data.
    """
    # Use the parameter rather than the module-level name, so the list bound
    # at definition time (or passed by the caller) is the one checked.
    if row['PATID'] in test_list:
        return "TEST"
    return "TRAIN"
    

# Tag every row with its partition ("TRAIN" or "TEST") in a new DATASET column.
tutorial_inpat_model_lda.add_columns(label_train_test, ("DATASET", str))

In [None]:
# Number of distinct patients in the training partition.
len(train_patids)

In [None]:
# Number of distinct patients in the test partition.
len(test_patids)

In [None]:
# Preview the frame with the new DATASET column (the 'data_lines' exclusion is kept for safety).
tutorial_inpat_model_lda.inspect(columns=[i for i in tutorial_inpat_model_lda.column_names if i != 'data_lines'])

## Train Random Forest

In [None]:
# Build separate train and test frames: copy the labelled frame, then filter each copy
# in place on its DATASET value.
trainframe = tutorial_inpat_model_lda.copy()
trainframe.filter(lambda row: row['DATASET'] == "TRAIN")
testframe = tutorial_inpat_model_lda.copy()
testframe.filter(lambda row: row['DATASET'] == "TEST")

In [None]:
# List the columns available as candidate features in the training frame.
# The frame created in the previous cell is `trainframe`; no `train` variable is defined.
trainframe.column_names

In [None]:
# Create a random forest classifier model object, passing 'rf_model_tutorial2' as its first argument.
rf_model_tutorial = ia.RandomForestClassifierModel('rf_model_tutorial2')

In [None]:
# Display the model object created in the previous cell.
rf_model_tutorial

In [None]:
def rf30_str(row):
    """Encode the RF30 label as a number: 1 for "POSITIVE", 0 for anything else."""
    return 1 if row['RF30'] == "POSITIVE" else 0

# Add the numeric label column to both partitions.
# Despite its name, RF30STR holds the 1/0 value returned by rf30_str, stored as float32.
trainframe.add_columns(rf30_str, ("RF30STR", ia.float32))
testframe.add_columns(rf30_str, ("RF30STR", ia.float32))

In [None]:
# Train the random forest on the training partition, using two of the LDA topic
# columns as features and the numeric RF30STR column as the label.
# The seed is written in decimal: a literal with a leading zero (01001000) is read
# as octal by Python 2 (value 262656) and is a SyntaxError under Python 3.
rf_model_tutorial_results = rf_model_tutorial.train(frame=trainframe,
                                                    label_column="RF30STR",
                                                    observation_columns=["topic_1", "topic_2"],
                                                    num_classes=2,
                                                    impurity='gini',
                                                    max_depth=4,
                                                    seed=262656
                                                   )

In [None]:
# Test the model on the held-out frame, using RF30STR as the label column.
# NOTE(review): no observation columns are passed here -- confirm test() re-uses the ones given to train().
x = rf_model_tutorial.test(testframe, "RF30STR")

In [None]:
# Display the value returned by rf_model_tutorial.test().
x

In [None]:
# Publish the trained random forest model.
# publish() is called on rf_model_tutorial because no `m` is defined in any cell shown.
x = rf_model_tutorial.publish()