![TAP](https://avatars2.githubusercontent.com/u/13385739?v=3&s=200 "TAP")
# Module 3: Feature Generation & Encoding
This module will give you an overview of how to create feature columns from the raw data, and how to encode them in a way that is understandable to our classifiers. And to our llama.

#### Import the necessary libraries and connect to the server:

In [None]:
import re
import ast
import json
import string
import random
import trustedanalytics as ia
from clean_drugs import *

# Connect to the server before working with any frames.
ia.connect()

# Seed the random number generator so that runs are repeatable.
# random.seed has to be *called*: assigning a number to it replaces the
# function and leaves the generator unseeded. 0o1001000 is the octal value
# that the legacy spelling 01001000 denotes, written in a form accepted by
# both Python 2.6+ and Python 3.
random.seed(0o1001000)

## Demo: Mean-center the heights of patients

In [None]:
# Fetch the saved frame and strip it back down to the raw 'data_lines' column...
tutorial_inpat = ia.get_frame("tutorial_inpat_frame")
columns_to_discard = [name for name in tutorial_inpat.column_names if name != 'data_lines']
tutorial_inpat.drop_columns(columns_to_discard)

In [None]:
def add_ids(row):
    """Return the (PATID, VISID) pair stored in the row's JSON record.

    The first field of the row holds the JSON text. An id whose key is
    not present in the record comes back as None.
    """
    record = json.loads(row[0])
    return tuple(record[key] if key in record else None
                 for key in ('PATID', 'VISID'))

# Attach the patient and visit ids, then glue the two together into one
# EVENT_ID string per row.
# NOTE(review): the ids are concatenated with no separator, so ("1", "23")
# and ("12", "3") would give the same EVENT_ID -- confirm ids are fixed width.
tutorial_inpat.add_columns(add_ids, [("PATID", str), ("VISID", str)])
tutorial_inpat.add_columns(
    lambda row: "".join((str(row["PATID"]), str(row["VISID"]))),
    ("EVENT_ID", str))

In [None]:
def add_height(row):
    """Return the height in inches recorded in the row's JSON record.

    The first field of the row holds the JSON text. When the record has no
    'HEIGHT_INCHES' key, 0.0 is returned in its place.
    """
    record = json.loads(row[0])
    if 'HEIGHT_INCHES' not in record:
        return 0.0
    return record['HEIGHT_INCHES']

# Add a column of patient height, computed per row by add_height and
# declared as float64...
tutorial_inpat.add_columns(add_height, ("HEIGHT_INCHES", ia.float64))

## Demo: Imputation

In [None]:
# Replacement value for each column to be imputed, keyed by column name.
default_values = {"HEIGHT_INCHES": 66.0}

def impute_with_constants(frame, column_default_value_dict):
    """
    Perform imputation by a constant on a set of columns.

    Input takes a dictionary matching column names to imputation values.
    A cell counts as missing, and is replaced by its column's constant,
    when it is None or equal to 0.0 (the value add_height stores for a
    record with no height).

    Each imputed column is rebuilt under a temporary "<name>_new" name,
    the original column is dropped, and the copy is renamed back.

    This operation is in-place and modifies the incoming dataframe.

    Parameters:
        frame: frame object providing schema, add_columns, drop_columns
            and rename_columns.
        column_default_value_dict: dict of {column name: replacement value}.

    Returns:
        The same frame object that was passed in.

    Raises:
        KeyError: if a column name is not part of the frame's schema.
    """
    # Take one concrete list of the keys so that the new-column schema and
    # the per-row values are guaranteed to line up.
    columns_to_impute = list(column_default_value_dict)
    if not columns_to_impute:
        # Nothing to impute; leave the frame untouched.
        return frame

    old_schema_dict = dict(frame.schema)

    new_columns_schema = [(c + "_new", old_schema_dict[c]) for c in columns_to_impute]
    rename_dict = dict((c + "_new", c) for c in columns_to_impute)

    def impute_row(row):
        """Return the imputed values for one row, in columns_to_impute order."""
        values = []
        for col in columns_to_impute:
            value = row[col]
            is_missing = value is None or value == 0.0
            values.append(column_default_value_dict[col] if is_missing else value)
        # A real list (not a lazy map object) is returned on every Python version.
        return values

    frame.add_columns(impute_row, new_columns_schema)
    frame.drop_columns(columns_to_impute)
    frame.rename_columns(rename_dict)
    return frame

In [None]:
# impute_with_constants works in place and returns the frame it was given,
# so tutorial_inpat_impute and tutorial_inpat refer to the same frame.
tutorial_inpat_impute = impute_with_constants(tutorial_inpat, default_values)

#### Let's examine the output of the _column_summary_statistics_ function:

In [None]:
# Summary statistics of the height column on the frame returned by the imputation.
tutorial_inpat_impute.column_summary_statistics("HEIGHT_INCHES")

In [None]:
# The same statistics, requested through the tutorial_inpat name.
tutorial_inpat.column_summary_statistics('HEIGHT_INCHES')

In [None]:
# column_summary_statistics is a member function of data frames;
# it is given the name of the column to summarise...
tutorial_inpat.column_summary_statistics('HEIGHT_INCHES')

In [None]:
# The result of column_summary_statistics is a dictionary.
# We can extract just the bits we need -- here the 'mean' entry for the
# height column, kept in 'avg' for the mean-centering function.
avg = tutorial_inpat.column_summary_statistics('HEIGHT_INCHES')['mean']

In [None]:
def add_mean_centered_height(row, avg=avg):
    """
    Function to mean-center the heights of our data set.

    Mean-centering subtracts the column mean from every value, so the
    resulting column has a mean of zero.

    Assumes the average has already been computed and is stored in the
    variable 'avg'. The value is bound as a default argument when this
    function is defined, so the function carries it along.

    Parameters:
        row: frame row with a 'HEIGHT_INCHES' field.
        avg: mean of the HEIGHT_INCHES column.

    Returns:
        The height minus avg, or None when the row has no height.
    """
    height = row['HEIGHT_INCHES']
    if height is None:
        # No height recorded: there is nothing to center.
        return None
    # Centering is a subtraction. A ratio such as avg / height would not
    # center the data and would raise on a height of zero.
    return height - avg

# Add a column of mean-centered patient heights, computed per row by
# add_mean_centered_height and declared as float64...
tutorial_inpat.add_columns(add_mean_centered_height, ("MEAN_CENTERED_HEIGHT_INCHES", ia.float64))

## Demo: Add medication lists and clean up text features

In [None]:
def add_meds(row):
    """Return every medication name in the row's record as one '|'-joined string.

    The first field of the row holds the JSON text. Names are taken from
    the 'MED_ORDER_NAMEs' list followed by the 'DISCHARGE_MED_ORDER_NAMEs'
    list. A list that is absent (or JSON null) is skipped. When both are
    absent the result is the string 'None', which is what str() yields for
    the [None] placeholder.
    """
    my_json = json.loads(row[0])
    missing = [None]

    med_orders = my_json['MED_ORDER_NAMEs'] if 'MED_ORDER_NAMEs' in my_json else None
    if med_orders is None:
        med_orders = missing
    discharge_med_orders = (my_json['DISCHARGE_MED_ORDER_NAMEs']
                            if 'DISCHARGE_MED_ORDER_NAMEs' in my_json else None)
    if discharge_med_orders is None:
        discharge_med_orders = missing

    if med_orders == missing:
        meds = discharge_med_orders
    elif discharge_med_orders == missing:
        meds = med_orders
    else:
        meds = med_orders + discharge_med_orders

    # Join the combined list: joining med_orders alone would silently drop
    # every discharge medication.
    return '|'.join([str(i) for i in meds])

In [None]:
# Add column of medications for each patient...
# The MEDS column is dropped before being added -- presumably so this cell
# can be re-run on a frame that already has it.
# NOTE(review): confirm drop_columns tolerates a frame with no MEDS column yet.
tutorial_inpat.drop_columns("MEDS")
tutorial_inpat.add_columns(add_meds, ("MEDS", str))

## Let's build a massive text-cleaning procedure...

In [None]:
# Any run of characters other than ASCII letters.
pattern = re.compile('[^a-zA-Z]+')
# Unit words that carry no information about the medication itself.
stopwords = ['mg', 'ml', 'units', 'gram', 'mcg']

# Dosage-form phrases to delete from medication names. Each pattern gets
# its own name: binding two patterns to one name discards the first.
re_capsule = re.compile(r'capsules?')
re_oraltablet = re.compile(r'oral\stablets?')
re_oraltab = re.compile(r'oral\stabs?')
re_oralliquid = re.compile(r'oral\sliquid?')
re_oraldose = re.compile(r'oral\sdose?')
re_oralsuspension = re.compile(r'oral\ssuspensions?')
re_suspension = re.compile(r'suspensions?')
re_tablet = re.compile(r'tablets?')
re_intrathecal = re.compile(r'intrathecall?y?')

# The patterns are applied in list order, so a longer phrase is listed
# before any shorter phrase it contains ('oral tablets' before 'oral tabs',
# 'oral suspension' before 'suspension'). Otherwise the short pattern would
# leave a fragment such as 'lets' behind.
STOPS = [
    re_capsule,
    re_oraltablet,
    re_oraltab,
    re_oralliquid,
    re_oraldose,
    re_oralsuspension,
    re_suspension,
    re_tablet,
    re_intrathecal,
]

def to_clean_doc(text, delimiter='|'):
    """Turn free text into a delimiter-joined string of cleaned tokens.

    Runs of non-letters become single spaces, the text is lower-cased and
    split on whitespace. Tokens that are in 'stopwords' or are two
    characters or shorter are discarded, every remaining token is passed
    through preprocess(), and tokens that come back empty are dropped
    before joining with 'delimiter'.
    """
    words = pattern.sub(' ', text).lower().split()
    processed = [preprocess(word) for word in words
                 if word not in stopwords and len(word) > 2]
    return delimiter.join([word for word in processed if len(word) > 0])

def clean_preprocess(med):
    """Normalise one raw medication string to a short canonical name.

    The string is lower-cased, every pattern in STOPS is deleted from it,
    and then a long series of regex rules rewrites recognised entries to a
    fixed name. The rules run strictly in order and each one sees the
    result of the previous one, so a later rule can overwrite what an
    earlier rule produced. Entries starting with a word on the exclusion
    list are blanked, and finally frequency/formulation words, digits and
    underscores are stripped.

    Returns the cleaned string, which may be empty.
    """
    med = med.lower()
    # Delete every phrase matched by the module-level STOPS patterns.
    for stop in STOPS:
        med = re.sub(stop, '', med)
    # --- Canonical-name rules; most are anchored at the start of the string. ---
    if len(re.findall(r'^(admission\slabs?|abdomen|abdominal)', med)) != 0:
        med = ''
    if len(re.findall(r'^(accucheck|accuchek|accu\schek|accu\scheck)', med)) != 0:
        med = 'accucheck'
    if len(re.findall(r'^glucose\stest\sstrips?', med)) != 0:
        med = 'glucose test strips'
    if len(re.findall(r'^insulin\ssliding\sscale', med)) != 0:
        med = 'insulin sliding scale'
    # Not anchored: 'adm override' anywhere in the string blanks the entry.
    if len(re.findall(r'(adm override)', med)) != 0:
        med = ''
    if len(re.findall(r'^insulin\ssyringes', med)) != 0:
        med = 'insulin syringes'
    if len(re.findall(r'^insulin.*?daily', med)) != 0:
        med = 'insulin daily'
    if len(re.findall(r'^lantus', med)) != 0:
        med = 'lantus'
    # The only if/elif pair: the more specific codeine combination is tested first.
    if len(re.findall(r'^acetaminophen\scodeine', med)) != 0:
        med = 'acetaminophen codeine'
    elif len(re.findall(r'^acetaminophen', med)) != 0:
        med = 'acetaminophen'
    # 'advair' and 'advil' are matched anywhere in the string, not just at the start.
    if len(re.findall(r'advair', med)) != 0:
        med = 'advair'
    if len(re.findall(r'advil', med)) != 0:
        med = 'advil'
    if len(re.findall(r'^afrin', med)) != 0:
        med = 'afrin'
    if len(re.findall(r'^albuterol', med)) != 0:
        med = 'albuterol'
    # The trailing space in 'zofran ' is removed by the final strip().
    if len(re.findall(r'^zofran', med)) != 0:
        med = 'zofran '
    if len(re.findall(r'^zinc\soxide', med)) != 0:
        med = 'zinc oxide'
    if len(re.findall(r'^zinc\ssulfate', med)) != 0:
        med = 'zinc sulfate'
    if len(re.findall(r'^zinc\ssupplement', med)) != 0:
        med = 'zinc supplement'
    if len(re.findall(r'^zantac', med)) != 0:
        med = 'zantac'
    if len(re.findall(r'^xenaderm', med)) != 0:
        med = 'xenaderm'
    # NOTE(review): 'wound care' starts with 'wound', which is on the
    # exclusion list further down, so this value is blanked again there.
    if len(re.findall(r'^wound\scare', med)) != 0:
        med = 'wound care'
    if len(re.findall(r'^chloraseptic', med)) != 0:
        med = 'chloraseptic'
    if len(re.findall(r'^cholecalciferol', med)) != 0:
        med = 'cholecalciferol'
    if len(re.findall(r'^amphotericin', med)) != 0:
        med = 'amphotericin'
    if len(re.findall(r'^artificial\stears?', med)) != 0:
        med = 'artificial tears'
    if len(re.findall(r'^warfarin', med)) != 0:
        med = 'warfarin'
    if len(re.findall(r'^prednisone', med)) != 0:
        med = 'prednisone'
    if len(re.findall(r'^percocet', med)) != 0:
        med = 'percocet'
    if len(re.findall(r'^benadryl', med)) != 0:
        med = 'benadryl'
    # 'vancomycin' also matches '^vanco', so the next rule turns it into 'vanco'.
    if len(re.findall(r'^vancomycin', med)) != 0:
        med = 'vancomycin'
    if len(re.findall(r'^vanco', med)) != 0:
        med = 'vanco'
    if len(re.findall(r'^bisacodyl', med)) != 0:
        med = 'bisacodyl'
    if len(re.findall(r'^nitroglycerin', med)) != 0:
        med = 'nitroglycerin'
    if len(re.findall(r'^acyclovir\soral', med)) != 0:
        med = 'acyclovir'
    # NOTE(review): this rule is anchored at the END of the string and emits
    # 'acylovir', spelled differently from 'acyclovir' above -- confirm intended.
    if len(re.findall(r'(acylovir|acycl)$', med)) != 0:
        med = 'acylovir'
    if len(re.findall(r'^alpraz', med)) != 0:
        med = 'alprazolam'
    if len(re.findall(r'^(alprazolam)', med)) != 0:
        med = 'alprazolam'
    if len(re.findall(r'^ambien', med)) != 0:
        med = 'ambien'
    if len(re.findall(r'^carnation', med)) != 0:
        med = 'carnation instant breakfast'
    if len(re.findall(r'^cepacol', med)) != 0:
        med = 'cepacol'
    if len(re.findall(r'^coumadin', med)) != 0:
        med = 'coumadin'
    if len(re.findall(r'^ensure', med)) != 0:
        med = 'ensure'
    if len(re.findall(r'^fluticasone', med)) != 0:
        med = 'fluticasone'
    if len(re.findall(r'^hydrocortisone', med)) != 0:
        med = 'hydrocortisone'
    # This also matches the 'insulin sliding scale', 'insulin syringes' and
    # 'insulin daily' values set above, collapsing them all to 'insulin'.
    if len(re.findall(r'^(insulin|novolog)', med)) != 0:
        med = 'insulin'
    if len(re.findall(r'^multivitamin', med)) != 0:
        med = 'multivitamin'
    if len(re.findall(r'^normal\ssaline', med)) != 0:
        med = 'normal saline'
    if len(re.findall(r'^oxygen', med)) != 0:
        med = 'oxygen'
    if len(re.findall(r'^calcium\sacetate', med)) != 0:
        med = 'calcium acetate'
    if len(re.findall(r'^calcium\scarbonate', med)) != 0:
        med = 'calcium carbonate'
    if len(re.findall(r'^calcium\schloride', med)) != 0:
        med = 'calcium chloride'
    if len(re.findall(r'^calcium\smagnesium', med)) != 0:
        med = 'calcium magnesium'
    if len(re.findall(r'^aspirin', med)) != 0:
        med = 'aspirin'
    if len(re.findall(r'^sodium\schloride', med)) != 0:
        med = 'sodium chloride'
    if len(re.findall(r'^ferrous\ssulfate', med)) != 0:
        med = 'ferrous sulfate'
    # NOTE(review): '^motel' / 'motelukast' is presumably meant to be
    # montelukast -- confirm the pattern and the spelling.
    if len(re.findall(r'^motel', med)) != 0:
        med = 'motelukast'

    # Exclusion list: blank any entry that STARTS with one of these words or phrases.
    # NOTE(review): 'would\sscare' looks like a typo (perhaps 'wound care') -- confirm.
    if len(re.findall(r'^(picc|comprehensive\smetabolic\spanel|blood|with|work|wound|whom|wheelchair|(rolling\s)?walker|water|cervical|check|chem|chest|commode|complete|continue|continuous|contour|contrast|dexamethasone|diabetes|diabetic|diagnosis|dressing|emergency|emergencies|gauze|have|head|home|hospice|hospital|interventional|investigational|laboratory|labs?\s|labwork|lateral|needle|panel|patient|physical\stherapy|please|portable|questions?|saline|script|scan|sliding|sling|speech|supplemental\soxygen|this|total|touch|tube|twice\sweekly|upright|will\sneed|with\sdiff|would\sscare|chest\sxray|draw|outpatient|basic\smetabolic\spanel|visiting\snurse|needs?\s|stat\slabs?|should\shave|brain|alcohol|alchol)', med)) != 0:
        med = ''
    # Remove a frequency word from the very start, then from the very end.
    med = re.sub(r'^(daily|week?ly|monthly|days?)', '', med)
    med = re.sub(r'(daily|week?ly|monthly|days?)$', '', med)
    # Successive passes that each remove one trailing word or phrase from
    # their own list (all four patterns are anchored with '$').
    med = re.sub(r'(home\sdose|ivbp|ivpb|let(\s(once|twice))?|liquid|disp)$', '', med)
    med = re.sub(r'(caps|hours|solution|(piggyback\s)?every.*?|every\shours|powder\sinjection|intravenous\s?(injection|solution)?|oral|tabs|topical\s?(cream|topical\scream|ointment)|(let)?\sstop\sdate)$', '', med)
    # The group below ends with an empty alternative ('|)'), which can also
    # match the empty string at the end; replacing that with '' changes nothing.
    med = re.sub(r'(extended\srelease|extended\srelease\slet\sextended\srelease|let|oral|oral\sconcentrate|orally\sdisintegrating\s(once|twice)\shome|)$', '', med)
    med = re.sub(r'(compounding\spowder|home\smedication|intrathecal|le|let|let\sthree\stimes|oral|oral\sliquid|orally(\smorning|evening)(\sorally\s(morning|evening))?)$', '', med)
    # Final tidy-up: collapse whitespace, turn ' -' into a space, replace
    # everything from the first digit onward with a space, replace ' ivpb'
    # with a space, turn underscores into spaces and trim both ends.
    med = re.sub(r'\s+', ' ', med)
    med = re.sub(r' \-', ' ', med)
    med = re.sub(r'\d+.*?$', ' ', med)
    med = re.sub(r'\sivpb', ' ', med)
    med = re.sub(r'_', ' ', med)
    med = med.strip()
    return med

## Ok, back to the action!

In [None]:
def add_clean_meds(row):
    """Clean every medication name held in the row's 'MEDS' string.

    The value is split on '|', each piece is stripped of surrounding
    whitespace and passed to clean_preprocess(), and the results are joined
    back together with '|'. A name that cleans down to an empty string
    stays in the output as an empty field.
    """
    raw_names = row['MEDS'].split('|')
    return "|".join([clean_preprocess(med=name.strip()) for name in raw_names])

In [None]:
# Add a column of cleaned-up medications for each patient...
# The CLEAN_MEDS column is dropped before being added -- presumably so this
# cell can be re-run on a frame that already has it.
# NOTE(review): confirm drop_columns tolerates a frame with no CLEAN_MEDS column yet.
tutorial_inpat.drop_columns("CLEAN_MEDS")
tutorial_inpat.add_columns(add_clean_meds, ("CLEAN_MEDS", str))

In [None]:
# List the frame's column names to check what has been added so far.
tutorial_inpat.column_names

In [None]:
# Pull a single row out of the frame for a quick look.
tmp_meds_test = tutorial_inpat.take(1)

In [None]:
# Show the first row returned by take(), leaving out its first field
# (presumably the raw 'data_lines' JSON -- verify against column_names).
tmp_meds_test[0][1:]