# Final Project: Admission Prediction from NHAMCS
## Initial Random Forest model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/18/2020 
  
In this notebook creates and analyzes random forest model for predicting hospital admission in the NHAMCS database.  For this initial model, categorical variables will be represented with one-hot encoding and the reason for visit (RFV) variables will be ignored.  The RFV variables will be ignored because there are hundreds of different potential values and we have not yet developed a way to categorize them yet.  Binary variables were previously converted to 0/1.  

## Configuration

In [1]:
# preferences
scale_data = True        # should data be scaled
weight_outcome = True    # use weights to handle class imbalance
reg_param = 0            # regularization of LR (0=not regularizatoin)
elas_param = 0           # elastic net (0=Ridge,1=Lasso)
reduced_vars = False     # where to use a reduced variable 
SEED = 314               # seed

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

In [9]:
os.getcwd()

'/sfs/qumulo/qhome/ad2ew/ds5559/DS5559-Project/code'

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [57]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression,RandomForestClassifier 
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator


In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [65]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## Transform data

In [66]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")
simodel = rsi.fit(NHAMCS)
NHAMCS = simodel.transform(NHAMCS)

In [67]:
# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')
NHAMCS = rohe.transform(NHAMCS)

In [69]:
# assemble vector
if reduced_vars:
    va = VectorAssembler(inputCols=['AGEYEAR','PULSE','TEMPF','COPD'], 
                         outputCol="features") 
else:    
    va = VectorAssembler(inputCols=["AGEYEAR","RESONE",'SEXMALE','ARRTIMEMIN','YEAR','PULSE','TEMPF', \
                                'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE','ALZHD','ASTHMA','CAD','CANCER', \
                                'CEBVD','CHF','CKD','COPD','DEPRN','DIABTYP0','DIABTYP1','DIABTYP2','EDHIV', \
                                'ESRD','ETOHAB','HPE','HTN','HYPLIPID','OBESITY','OSA','OSTPRSIS','SUBSTAB', \
                                'NOCHRON','TOTCHRON','INJURY','INJURY72', 'RFV1_admit_rate'], 
                         outputCol="features")  
    
NHAMCS = va.setHandleInvalid("skip").transform(NHAMCS)

In [70]:
# scale features using MaxAbs
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(NHAMCS)
NHAMCS = scalerModel.transform(NHAMCS)

# determine feature column
if scale_data:
    features_col = "features"
else:
    features_col = "scaledFeatures"

In [62]:
NHAMCS.take(2)

[Row(RFV1='Chest pain', AGEYEAR=38, AGER='25-44 years', SEXMALE=0, RESIDNCE='Private residence', ARRTIMEMIN=508, YEAR=2014, PULSE=105, TEMPF=98, RESPR=16, BPSYS=135, BPDIAS=67, POPCT=97, PAINSCALE=2, ALZHD=0, ASTHMA=0, CAD=0, CANCER=0, CEBVD=0, CHF=0, CKD=0, COPD=0, DEPRN=0, DIABTYP0=0, DIABTYP1=0, DIABTYP2=0, EDHIV=0, ESRD=0, ETOHAB=0, HPE=0, HTN=0, HYPLIPID=0, OBESITY=0, OSA=0, OSTPRSIS=0, SUBSTAB=0, NOCHRON=1, TOTCHRON=0, RFV2='Arm pain, ache, soreness, discomfort', RFV3='Blank', RFV4='Blank', RFV5='Blank', INJURY=0, INJURY72=0, ADM_OUTCOME=0, n=10326, admit=3123, RFV1_admit_rate=0.30244044160371875, RESINDEX=0.0, RESONE=SparseVector(6, {0: 1.0}), features=SparseVector(43, {0: 38.0, 1: 1.0, 8: 508.0, 9: 2014.0, 10: 105.0, 11: 98.0, 12: 16.0, 13: 135.0, 14: 67.0, 15: 97.0, 16: 2.0, 39: 1.0}), scaledFeatures=SparseVector(43, {0: 0.4086, 1: 1.0, 8: 0.353, 9: 0.9985, 10: 0.4303, 11: 0.8991, 12: 0.1067, 13: 0.4704, 14: 0.3564, 15: 0.97, 16: 0.2, 39: 1.0})),
 Row(RFV1='Chest pain', AGEYEA

## Balance Data by Downsampling

In [32]:
def downsample(df, target, positive_label, negative_label):
    """
    df              spark dataframe
    target          str, target variable
    positive_label  int, value of positive label
    negative_label  int, value of negative label
    
    """

    ### ENTER CODE HERE
    positives = df.filter(df[target] == positive_label)
    num_positives = positives.count()
    negatives =  df.filter(df[target] == negative_label)
    num_negatives = negatives.count()
    
    if (num_positives > num_negatives): # downsample positives
        sampled_df = positives.sample(withReplacement=False, fraction=num_negatives/num_positives, seed=SEED)
        df_b = sampled_df.union(negatives)
    elif (num_negatives > num_positives): # downsample negatives
        sampled_df = negatives.sample(withReplacement=False, fraction=num_positives/num_negatives, seed=SEED)
        df_b = sampled_df.union(positives)

    return df_b

In [71]:
NHAMCS_small = downsample(NHAMCS, 'ADM_OUTCOME', 1, 0)
NHAMCS_small.filter(NHAMCS_small['ADM_OUTCOME'] == 1).count()

9308

In [82]:
NHAMCS_small.take(1)

[Row(RFV1='Chest pain', AGEYEAR=53, AGER='45-64 years', SEXMALE=0, RESIDNCE='Private residence', ARRTIMEMIN=933, YEAR=2014, PULSE=76, TEMPF=97, RESPR=16, BPSYS=141, BPDIAS=63, POPCT=100, PAINSCALE=7, ALZHD=0, ASTHMA=0, CAD=0, CANCER=0, CEBVD=0, CHF=0, CKD=0, COPD=0, DEPRN=0, DIABTYP0=0, DIABTYP1=0, DIABTYP2=0, EDHIV=0, ESRD=0, ETOHAB=0, HPE=0, HTN=0, HYPLIPID=0, OBESITY=0, OSA=0, OSTPRSIS=0, SUBSTAB=0, NOCHRON=1, TOTCHRON=0, RFV2='Blank', RFV3='Blank', RFV4='Blank', RFV5='Blank', INJURY=0, INJURY72=0, ADM_OUTCOME=0, n=10326, admit=3123, RFV1_admit_rate=0.30244044160371875, RESINDEX=0.0, RESONE=SparseVector(6, {0: 1.0}), features=SparseVector(44, {0: 53.0, 1: 1.0, 8: 933.0, 9: 2014.0, 10: 76.0, 11: 97.0, 12: 16.0, 13: 141.0, 14: 63.0, 15: 100.0, 16: 7.0, 39: 1.0, 43: 0.3024}), scaledFeatures=SparseVector(44, {0: 0.5699, 1: 1.0, 8: 0.6484, 9: 0.9985, 10: 0.3115, 11: 0.8899, 12: 0.1067, 13: 0.4913, 14: 0.3351, 15: 1.0, 16: 0.7, 39: 1.0, 43: 0.336}))]

In [72]:
dataRDD = NHAMCS.select(col("ADM_OUTCOME"), col("scaledFeatures")).rdd.map(tuple)
dataRDD_small = NHAMCS_small.select(col("ADM_OUTCOME"), col("scaledFeatures")).rdd.map(tuple)


In [73]:
# map label to binary values, then convert to LabeledPoint
lp = dataRDD.map(lambda row:(row[0], Vectors.dense(row[1])))    \
                    .map(lambda row: LabeledPoint(row[0], row[1]))

lp_small = dataRDD_small.map(lambda row:(row[0], Vectors.dense(row[1])))    \
                    .map(lambda row: LabeledPoint(row[0], row[1]))

## Train and test model

In [None]:

k_min = 2 # 2 in inclusive
k_max = 10 # 2i is exlusive. will fit till 20

max_iter = 1000
seed = 42   

kmeans_input_features = 'scaledFeatures'
kmeans_prediction_features = 'prediction'

# scaler = StandardScaler(inputCol=standard_scaler_inpt_features, outputCol=kmeans_input_features, withStd=True, withMean=True)
# scaler_model = scaler.fit(NHAMCS_small)
# scaled_data = scaler_model.transform(NHAMCS_small)

wsse_collect_spark = []

for i in range(k_min, k_max):
    km = KMeans(featuresCol=kmeans_input_features, predictionCol=kmeans_prediction_features,
                        k=i, maxIter=max_iter, seed=seed)
    km_fit = km.fit(NHAMCS_small)
    wsse_spark = km_fit.computeCost(NHAMCS_small)
    wsse_collect_spark .append(wsse_spark)
    print('For k = {i:03d} WSSE is {wsse:10f}'.format(i=i, wsse=wsse_spark))

For k = 002 WSSE is 46594.069088
For k = 003 WSSE is 42873.358477
For k = 004 WSSE is 40083.133899
For k = 005 WSSE is 38615.954167
For k = 006 WSSE is 37140.380316


In [58]:
# split into training and testing set
#training, testing = lp.randomSplit([0.8, 0.2], SEED) #using non-downsampled data -- 93% accuracy
training, testing = lp_small.randomSplit([0.8, 0.2], SEED) #using downsampled data -- got about 70% accuracy

In [59]:

categoricalFeaturesInfo={1:2, 2:2, 3:2, 4:2, 5:2, 6:2, 7:2, 17:2, 18:2, 19:2, 20:2, 21:2, 22:2, 23:2, 24:2, 25:2, 26:2, 27:2, 28:2, 29:2, 30:2, 31:2, 32:2, 33:2, 34:2, 35:2, 36:2, 37:2, 38:2, 39:2, 41:2, 42:2}


# Train a RandomForest model.
#  Empty categoricalFeaturesInfo indicates all features are continuous.
#  Setting featureSubsetStrategy="auto" lets the algorithm choose.
model = RandomForest.trainClassifier(training, numClasses=2, categoricalFeaturesInfo=categoricalFeaturesInfo,
                                     numTrees=200, featureSubsetStrategy="auto",
                                     impurity='gini', maxDepth=15, maxBins=32)

In [60]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testing.map(lambda x: x.features))
labelsAndPredictions = testing.map(lambda lp: lp.label).zip(predictions)
testErr = labelsAndPredictions.filter(
    lambda lp: lp[0] != lp[1]).count() / float(testing.count())
print('Test Error = ' + str(testErr))

Test Error = 0.07025344597598933


In [30]:
confusion_metrics = MulticlassMetrics(labelsAndPredictions)
print('confusion matrix {}'.format(confusion_metrics.confusionMatrix().toArray()))

confusion matrix [[138.  54.]
 [ 29.  97.]]


In [61]:
def confusion(labelsAndPreds):
    # Additionally, compute and show the confusion matrix.
    tp = labelsAndPreds.filter(lambda pl: (pl[0] == 1.0) & (pl[1] == 1.0)).count()
    fp = labelsAndPreds.filter(lambda pl: (pl[0] == 0.0) & (pl[1] == 1.0)).count()
    tn = labelsAndPreds.filter(lambda pl: (pl[0] == 0.0) & (pl[1] == 0.0)).count()
    fn = labelsAndPreds.filter(lambda pl: (pl[0] == 1.0) & (pl[1] == 0.0)).count()
    print("\nConfusion Matrix:")
    print("\tLabel:")
    print("Pred\t 0:\t 1:")
    print('0:\t',tn,'\t',fn)
    print('1:\t',fp, '\t',tp,)
    print("\nConfusion Matrix:")
    print('tn:',tn,' fn:',fn)
    print('fp:',fp, '  tp:',tp,)


confusion(labelsAndPredictions)


Confusion Matrix:
	Label:
Pred	 0:	 1:
0:	 2090 	 158
1:	 0 	 1

Confusion Matrix:
tn: 2090  fn: 158
fp: 0   tp: 1


## Evaluate model

In [47]:
# evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME")

# print("Number of negatives: ", predict_test.where('prediction == 0').count())
# print("Number of positives: ", predict_test.where('prediction == 1').count())

# print("\nThe area under ROC for train set is", evaluator.evaluate(predict_train))
# print("The area under ROC for test set is", evaluator.evaluate(predict_test))

In [63]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics


metrics = BinaryClassificationMetrics(labelsAndPredictions)
print("Area under Precision/Recall (PR) curve: %.f" % (metrics.areaUnderPR * 100))
print("Area under Receiver Operating Characteristic (ROC) curve: %.3f" % (metrics.areaUnderROC * 100))


Area under Precision/Recall (PR) curve: 1
Area under Receiver Operating Characteristic (ROC) curve: 96.486
