# Final Project: Admission Prediction from NHAMCS
## Initial LR model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/28/2020 
  
In this notebook creates and analyzes our lasso and ridge regression models.  The regularization parameters are found using cross-validation.

## Configuration

In [1]:
# preferences
down_sample = True             # use downsampling to handle class imbalance
scaling = False              # scale data

# variables to include
pred_totchron = True         # total chronic disease
pred_resid = True            # place of residence
pred_comorbid = True         # place of residence
pred_RFV = True              # historical admit rate based on RFV
pred_vitals = True           # vital signs (heart reate, blood pressure, etc.)
pred_arrival = True          # arrival time and year
pred_injury = True           # visits associated with injuries 

# seed for random split
SEED = 314

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [5]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator

In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [7]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## Select variables

In [8]:
# variables to include
predictors = ['AGEYEAR','SEXMALE']                    # all models have age and sex

if pred_totchron: predictors += ['TOTCHRON']          # total chronic conditions

if pred_comorbid: predictors += ['ALZHD','ASTHMA',    # preexisting health conditions
        'CAD','CANCER', 'CEBVD','CHF','CKD','COPD',
        'DEPRN','DIABTYP0','DIABTYP1','DIABTYP2',
        'EDHIV','ESRD','ETOHAB','HPE','HTN',
        'HYPLIPID','OBESITY','OSA','OSTPRSIS',
        'SUBSTAB','NOCHRON']

if pred_vitals: predictors += ['PULSE','TEMPF',        # vital signs
        'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE']
    
if pred_RFV: predictors += ['RFV1_admit_rate']         # historical admission rate based on RFV
    
if pred_arrival: predictors += ['ARRTIMEMIN','YEAR']   # arrival time and year

if pred_injury: predictors += ['INJURY','INJURY72']    # visits associated with injuries 
    
if pred_resid: predictors += ['RESONE']                # residence

In [9]:
# for crossvalidation for some reason we need to call the output 'label'
NHAMCS = NHAMCS.withColumnRenamed("ADM_OUTCOME", "label")

## Downsample data

In [10]:
def downsample(df, target, positive_label, negative_label):
    """
    df              spark dataframe
    target          str, target variable
    positive_label  int, value of positive label
    negative_label  int, value of negative label
    
    """

    positives = df.filter(df[target] == positive_label)
    num_positives = positives.count()
    negatives =  df.filter(df[target] == negative_label)
    num_negatives = negatives.count()
    
    if (num_positives > num_negatives): # downsample positives
        sampled_df = positives.sample(withReplacement=False, fraction=num_negatives/num_positives, seed=SEED)
        df_b = sampled_df.union(negatives)
    elif (num_negatives > num_positives): # downsample negatives
        sampled_df = negatives.sample(withReplacement=False, fraction=num_positives/num_negatives, seed=SEED)
        df_b = sampled_df.union(positives)

    return df_b

In [11]:
NHAMCS.count()

81081

## Split data and calculate class weighting

In [12]:
# split into training and testing set
training, testing = NHAMCS.randomSplit([0.8, 0.2], SEED)

In [13]:
if down_sample == True:
    print("Downsampling")
    training = downsample(training,'label', 1, 0 )

Downsampling


## Create Pipeline

In [14]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")

# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')

# assemble vector
va = VectorAssembler(inputCols=predictors, outputCol="features", handleInvalid='skip')  

# scaler for data
scaler = MaxAbsScaler(inputCol="features", outputCol="scaledFeatures")

# select whether to use scaled features
if scaling == True:
    featureCol = 'scaledFeatures'
else:
    featureCol = 'features'
    
# set up model
rf = RandomForestClassifier(labelCol='label',featuresCol='scaledFeatures', numTrees=100)

# Build the pipeline
pipeline = Pipeline(stages=[rsi, rohe, va, scaler, rf])

In [15]:
# set up parameters for cross validation
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, [50,100,150,200]) \
    .addGrid(rf.maxDepth, [5,10]) \
    .build()

# set up  for cross validator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5,
                          seed = SEED)  

## Train and test model

In [16]:
%%time
# Fit the pipeline
cvModel = crossval.fit(training)

CPU times: user 2.92 s, sys: 617 ms, total: 3.53 s
Wall time: 16min 32s


In [17]:
# extract LR model from cross validator
cvModel.avgMetrics

[0.8078129762700031,
 0.8256433796107218,
 0.8085120109955183,
 0.8269801904482001,
 0.808154573590713,
 0.82700789190938,
 0.8098880954990817,
 0.8274814915739511]

In [18]:
lrBest = cvModel.bestModel.stages[-1]

In [19]:
# Make a predictions
pred_train = cvModel.transform(training)
pred_test = cvModel.transform(testing)

## Evaluate model

In [20]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

# compute confusion matrix
tp = pred_test.where('prediction == 1 and label==1').count() 
fp = pred_test.where('prediction == 1 and label==0').count() 
tn = pred_test.where('prediction == 0 and label==0').count() 
fn = pred_test.where('prediction == 0 and label==1').count() 

acc = (tp+tn)/(tp+fp+tn+fn)
prec = tp / (tp+fp)
recall = tp / (tp+fn)
spec = tn / (tn + fn)
f1 = 2 * (prec * recall) / (prec + recall)


print("\nConfusion Matrix:")
print('tn:',tn,' fn:',fn)
print('fp:',fp, '  tp:',tp,)  

print('\nPredicted positive:', tp+fp)
print('Predicted negitive:', tn+fn)

print('\nAccuracy', acc)

print("\nPrecision:", prec)
print("Recall:", recall)
print("F1 score:", f1)

print("\nSensitivity:", recall)
print("Specificity:", spec)

print("\nThe area under ROC for train set is", evaluator.setMetricName("areaUnderROC").evaluate(pred_train))
print("The area under ROC for test set is", evaluator.evaluate(pred_test))


Confusion Matrix:
tn: 10939  fn: 474
fp: 3443   tp: 1385

Predicted positive: 4828
Predicted negitive: 11413

Accuracy 0.7588202696878271

Precision: 0.2868682684341342
Recall: 0.7450242065626681
F1 score: 0.4142365784357709

Sensitivity: 0.7450242065626681
Specificity: 0.9584684132130027

The area under ROC for train set is 0.8848930696689579
The area under ROC for test set is 0.8341749657336401
