# Final Project: Admission Prediction from NHAMCS
## Initial LR model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/28/2020 
  
In this notebook creates and analyzes a linear support vector machine model.  The regularization parameters are found using cross-validation.

## Configuration

In [1]:
# preferences
weight_outcome = True        # use weights to handle class imbalance
scaling = True               # scale data
data_reduction = 1           # fraction of data to use (set to 1.0 for all data)

# variables to include
pred_totchron = True         # total chronic disease
pred_resid = True            # place of residence
pred_comorbid = True         # place of residence
pred_RFV = True              # historical admit rate based on RFV
pred_vitals = True           # vital signs (heart reate, blood pressure, etc.)
pred_arrival = True          # arrival time and year
pred_injury = True           # visits associated with injuries 

# ridge and lasso regression settings
max_iter = 100               # maximum iterations

# seed for random split
SEED = 314

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [5]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator

In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [7]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## Reduce data if needed
Due to the time needed to run these models, tuning might be performed using a reduced data set.

In [8]:
if data_reduction < 1:
    NHAMCS = NHAMCS.sample(withReplacement=False, fraction=data_reduction, seed=SEED)

In [9]:
NHAMCS.count()

81081

## Select variables

In [10]:
# variables to include
predictors = ['AGEYEAR','SEXMALE']                    # all models have age and sex

if pred_totchron: predictors += ['TOTCHRON']          # total chronic conditions

if pred_comorbid: predictors += ['ALZHD','ASTHMA',    # preexisting health conditions
        'CAD','CANCER', 'CEBVD','CHF','CKD','COPD',
        'DEPRN','DIABTYP0','DIABTYP1','DIABTYP2',
        'EDHIV','ESRD','ETOHAB','HPE','HTN',
        'HYPLIPID','OBESITY','OSA','OSTPRSIS',
        'SUBSTAB','NOCHRON']

if pred_vitals: predictors += ['PULSE','TEMPF',        # vital signs
        'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE']
    
if pred_RFV: predictors += ['RFV1_admit_rate']         # historical admission rate based on RFV
    
if pred_arrival: predictors += ['ARRTIMEMIN','YEAR']   # arrival time and year

if pred_injury: predictors += ['INJURY','INJURY72']    # visits associated with injuries 
    
if pred_resid: predictors += ['RESONE']                # residence

In [11]:
# for crossvalidation for some reason we need to call the output 'label'
NHAMCS = NHAMCS.withColumnRenamed("ADM_OUTCOME", "label")

## Split data and calculate class weighting

In [12]:
# split into training and testing set
training, testing = NHAMCS.randomSplit([0.8, 0.2], SEED)

In [13]:
# handle class imbalance

# calculate balance ratio
balRatio = training.select("label").where('label == 0').count() / training.count()

# add weights
training = training.withColumn("classWeights", when(training.label == 1,balRatio).otherwise(1-balRatio))

## Create Pipeline

In [14]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")

# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')

# assemble vector
va = VectorAssembler(inputCols=predictors, outputCol="features", handleInvalid='skip')  

# scaler for data
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# select whether to use scaled features
if scaling == True:
    featureCol = 'scaledFeatures'
else:
    featureCol = 'features'
    
print(featureCol)

# set up model
if weight_outcome:
    print("Using outcome weights")
    svc = LinearSVC(featuresCol=featureCol, labelCol="label", weightCol="classWeights", \
                              maxIter=max_iter, standardization=False)
else:
    svc = LinearSVC(featuresCol=featureCol, labelCol="label", \
                               maxIter=max_iter, standardization=False)

# Build the pipeline
pipeline = Pipeline(stages=[rsi, rohe, va, scaler, svc])

scaledFeatures
Using outcome weights


In [15]:
# set up parameters for cross validation
paramGrid = ParamGridBuilder() \
    .addGrid(svc.regParam, [0,0.0001, 0.001, 0.01, 0.1]) \
    .build()

# set up  for cross validator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5,
                          seed = SEED)  

## Train and test model

In [16]:
%%time
# Fit the pipeline)
cvModel = crossval.fit(training)

CPU times: user 3.26 s, sys: 876 ms, total: 4.13 s
Wall time: 47min 41s


In [17]:
# extract LR model from cross validator
cvModel.avgMetrics

[0.8232845115107741,
 0.8126078444415568,
 0.823140437009303,
 0.8231145991998885,
 0.8220104244865923]

In [18]:
lrBest = cvModel.bestModel.stages[-1]

In [19]:
# Make a predictions
pred_train = cvModel.transform(training)
pred_test = cvModel.transform(testing)

## Evaluate model

In [20]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

# compute confusion matrix
tp = pred_test.where('prediction == 1 and label==1').count() 
fp = pred_test.where('prediction == 1 and label==0').count() 
tn = pred_test.where('prediction == 0 and label==0').count() 
fn = pred_test.where('prediction == 0 and label==1').count() 

acc = (tp+tn)/(tp+fp+tn+fn)
prec = tp / (tp+fp)
recall = tp / (tp+fn)
spec = tn / (tn + fn)
f1 = 2 * (prec * recall) / (prec + recall)


print("\nConfusion Matrix:")
print('tn:',tn,' fn:',fn)
print('fp:',fp, '  tp:',tp,)  

print('\nPredicted positive:', tp+fp)
print('Predicted negitive:', tn+fn)

print('\nAccuracy', acc)

print("\nPrecision:", prec)
print("Recall:", recall)
print("F1 score:", f1)

print("\nSensitivity:", recall)
print("Specificity:", spec)

print("\nThe area under ROC for train set is", evaluator.setMetricName("areaUnderROC").evaluate(pred_train))
print("The area under ROC for test set is", evaluator.evaluate(pred_test))


Confusion Matrix:
tn: 11300  fn: 542
fp: 3082   tp: 1317

Predicted positive: 4399
Predicted negitive: 11842

Accuracy 0.776861030724709

Precision: 0.2993862241418504
Recall: 0.708445400753093
F1 score: 0.4209012464046021

Sensitivity: 0.708445400753093
Specificity: 0.9542307042729269

The area under ROC for train set is 0.8250069761081206
The area under ROC for test set is 0.8252571482089136
