# Final Project: Admission Prediction from NHAMCS
## Initial Random Forest model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/18/2020 
  
In this notebook creates and analyzes random forest model for predicting hospital admission in the NHAMCS database.  For this initial model, categorical variables will be represented with one-hot encoding and the reason for visit (RFV) variables will be ignored.  The RFV variables will be ignored because there are hundreds of different potential values and we have not yet developed a way to categorize them yet.  Binary variables were previously converted to 0/1.  

## Configuration

In [99]:
# preferences
scale_data = True        # should data be scaled
weight_outcome = True    # use weights to handle class imbalance
reg_param = 0            # regularization of LR (0=not regularizatoin)
elas_param = 0           # elastic net (0=Ridge,1=Lasso)
reduced_vars = False     # where to use a reduced variable 
SEED = 314               # seed

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

In [9]:
os.getcwd()

'/sfs/qumulo/qhome/ad2ew/ds5559/DS5559-Project/code'

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [121]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint


In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [78]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed.2007-2017")

## Transform data

In [83]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")
simodel = rsi.fit(NHAMCS)
NHAMCS = simodel.transform(NHAMCS)

In [84]:
# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')
NHAMCS = rohe.transform(NHAMCS)

In [85]:
# assemble vector
if reduced_vars:
    va = VectorAssembler(inputCols=['AGEYEAR','PULSE','TEMPF','COPD'], 
                         outputCol="features") 
else:    
    va = VectorAssembler(inputCols=["AGEYEAR","RESONE",'SEXMALE','ARRTIMEMIN','YEAR','PULSE','TEMPF', \
                                'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE','ALZHD','ASTHMA','CAD','CANCER', \
                                'CEBVD','CHF','CKD','COPD','DEPRN','DIABTYP0','DIABTYP1','DIABTYP2','EDHIV', \
                                'ESRD','ETOHAB','HPE','HTN','HYPLIPID','OBESITY','OSA','OSTPRSIS','SUBSTAB', \
                                'NOCHRON','TOTCHRON','INJURY','INJURY72'], 
                         outputCol="features")  
    
NHAMCS = va.setHandleInvalid("skip").transform(NHAMCS)

In [88]:
# scale features using MaxAbs
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(NHAMCS)
NHAMCS = scalerModel.transform(NHAMCS)

# determine feature column
if scale_data:
    features_col = "features"
else:
    features_col = "scaledFeatures"

## Balance Data by Downsampling

In [97]:
def downsample(df, target, positive_label, negative_label):
    """
    df              spark dataframe
    target          str, target variable
    positive_label  int, value of positive label
    negative_label  int, value of negative label
    
    """

    ### ENTER CODE HERE
    positives = df.filter(df[target] == positive_label)
    num_positives = positives.count()
    negatives =  df.filter(df[target] == negative_label)
    num_negatives = negatives.count()
    
    if (num_positives > num_negatives): # downsample positives
        sampled_df = positives.sample(withReplacement=False, fraction=num_negatives/num_positives, seed=SEED)
        df_b = sampled_df.union(negatives)
    elif (num_negatives > num_positives): # downsample negatives
        sampled_df = negatives.sample(withReplacement=False, fraction=num_positives/num_negatives, seed=SEED)
        df_b = sampled_df.union(positives)

    return df_b

In [183]:
NHAMCS_small = downsample(NHAMCS, 'ADM_OUTCOME', 1, 0)
NHAMCS_small.filter(NHAMCS_small['ADM_OUTCOME'] == 1).count()

779

In [129]:
dataRDD = NHAMCS.select(col("ADM_OUTCOME"), col("scaledFeatures")).rdd.map(tuple)
dataRDD_small = NHAMCS_small.select(col("ADM_OUTCOME"), col("scaledFeatures")).rdd.map(tuple)


In [199]:
# map label to binary values, then convert to LabeledPoint
lp = dataRDD.map(lambda row:(row[0], Vectors.dense(row[1])))    \
                    .map(lambda row: LabeledPoint(row[0], row[1]))

lp_small = dataRDD_small.map(lambda row:(row[0], Vectors.dense(row[1])))    \
                    .map(lambda row: LabeledPoint(row[0], row[1]))

In [210]:
#found which features were categorical -- plug into categoricalFeaturesInfo in model
# remove 40th feature
dense_vector = [37.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,824.0,2014.0,85.0,98.0,16.0,123.0,67.0,100.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0]
categorical_list = [str(i)+ ":2" for i in range(len(dense_vector)) if (int(dense_vector[i])== 0) |(int(dense_vector[i])== 1)] 

str(categorical_list).replace("'","")


'[1:2, 2:2, 3:2, 4:2, 5:2, 6:2, 7:2, 17:2, 18:2, 19:2, 20:2, 21:2, 22:2, 23:2, 24:2, 25:2, 26:2, 27:2, 28:2, 29:2, 30:2, 31:2, 32:2, 33:2, 34:2, 35:2, 36:2, 37:2, 38:2, 39:2, 40:2, 41:2, 42:2]'

## Train and test model

In [207]:
# split into training and testing set
training, testing = lp.randomSplit([0.8, 0.2], SEED) #using non-downsampled data -- 93% accuracy
training, testing = lp_small.randomSplit([0.8, 0.2], SEED) #using downsampled data -- got about 70% accuracy

In [208]:

categoricalFeaturesInfo={1:2, 2:2, 3:2, 4:2, 5:2, 6:2, 7:2, 17:2, 18:2, 19:2, 20:2, 21:2, 22:2, 23:2, 24:2, 25:2, 26:2, 27:2, 28:2, 29:2, 30:2, 31:2, 32:2, 33:2, 34:2, 35:2, 36:2, 37:2, 38:2, 39:2, 41:2, 42:2}


# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
                                     numTrees=100, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=5, maxBins=32)

In [209]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testing.count())
print('Test Error = ' + str(testErr))

Test Error = 0.0706980880391285


## Evaluate model

In [16]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME")

print("Number of negatives: ", predict_test.where('prediction == 0').count())
print("Number of positives: ", predict_test.where('prediction == 1').count())

print("\nThe area under ROC for train set is", evaluator.evaluate(predict_train))
print("The area under ROC for test set is", evaluator.evaluate(predict_test))

Number of negatives:  1513
Number of positives:  691

The area under ROC for train set is 0.7341131404078137
The area under ROC for test set is 0.7540790183387306
