# Final Project: Admission Prediction from NHAMCS
## Initial LR model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/28/2020 
  
In this notebook creates and analyzes our lasso and ridge regression models.  The regularization parameters are found using cross-validation.

## Configuration

In [1]:
# preferences
weight_outcome = True        # use weights to handle class imbalance
scaling = True               # scale data

# variables to include
pred_totchron = True         # total chronic disease
pred_resid = True            # place of residence
pred_comorbid = True         # place of residence
pred_RFV = True              # historical admit rate based on RFV
pred_vitals = True           # vital signs (heart reate, blood pressure, etc.)
pred_arrival = True          # arrival time and year
pred_injury = True           # visits associated with injuries 

# ridge and lasso regression settings
max_iter = 100               # maximum iterations
elasticNet = 1                 # 0=Ridge, 1=Lasso

# seed for random split
SEED = 314

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [5]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator

In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [7]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## Select variables

In [8]:
# variables to include
predictors = ['AGEYEAR','SEXMALE']                    # all models have age and sex

if pred_totchron: predictors += ['TOTCHRON']          # total chronic conditions

if pred_comorbid: predictors += ['ALZHD','ASTHMA',    # preexisting health conditions
        'CAD','CANCER', 'CEBVD','CHF','CKD','COPD',
        'DEPRN','DIABTYP0','DIABTYP1','DIABTYP2',
        'EDHIV','ESRD','ETOHAB','HPE','HTN',
        'HYPLIPID','OBESITY','OSA','OSTPRSIS',
        'SUBSTAB','NOCHRON']

if pred_vitals: predictors += ['PULSE','TEMPF',        # vital signs
        'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE']
    
if pred_RFV: predictors += ['RFV1_admit_rate']         # historical admission rate based on RFV
    
if pred_arrival: predictors += ['ARRTIMEMIN','YEAR']   # arrival time and year

if pred_injury: predictors += ['INJURY','INJURY72']    # visits associated with injuries 
    
if pred_resid: predictors += ['RESONE']                # residence

In [9]:
# for crossvalidation for some reason we need to call the output 'label'
NHAMCS = NHAMCS.withColumnRenamed("ADM_OUTCOME", "label")

## Split data and calculate class weighting

In [10]:
# split into training and testing set
training, testing = NHAMCS.randomSplit([0.8, 0.2], SEED)

In [11]:
# handle class imbalance

# calculate balance ratio
balRatio = training.select("label").where('label == 0').count() / training.count()

# add weights
training = training.withColumn("classWeights", when(training.label == 1,balRatio).otherwise(1-balRatio))

## Create Pipeline

In [12]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")

# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')

# assemble vector
va = VectorAssembler(inputCols=predictors, outputCol="features", handleInvalid='skip')  

# scaler for data
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

# select whether to use scaled features
if scaling == True:
    featureCol = 'scaledFeatures'
else:
    featureCol = 'features'
    
print(featureCol)

# set up model
if weight_outcome:
    print("Using outcome weights")
    lr = LogisticRegression(featuresCol=featureCol, labelCol="label", weightCol="classWeights", \
                              maxIter=max_iter, elasticNetParam=elasticNet)
else:
    lr = LogisticRegression(featuresCol=featureCol, labelCol="label", \
                               maxIter=max_iter, elasticNetParam=elasticNet)

# Build the pipeline
pipeline = Pipeline(stages=[rsi, rohe, va, scaler, lr])

scaledFeatures
Using outcome weights


In [13]:
# set up parameters for cross validation
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.00001,0.00005,0.0001,0.0005,0.001,0.005,0.01,1.0,10]) \
    .build()

# set up  for cross validator
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=5,
                          seed = SEED)  

## Train and test model

In [14]:
%%time
# Fit the pipeline)
cvModel = crossval.fit(training)

CPU times: user 3.22 s, sys: 684 ms, total: 3.9 s
Wall time: 10min 23s


In [15]:
# extract LR model from cross validator
cvModel.avgMetrics

[0.8237444883658015,
 0.8240205191720915,
 0.8241668926829173,
 0.8237079296116474,
 0.8237803176866568,
 0.8231086145236612,
 0.8202309981106647,
 0.5,
 0.5]

In [16]:
lrBest = cvModel.bestModel.stages[-1]

In [17]:
# Print the coefficients and intercepts for logistic regression with multinomial family
print("Intercept: ", np.round(lrBest.intercept,4))
for i,coeff in enumerate(lrBest.coefficients):
    if(i<len(predictors)):
        print(predictors[i], ":", np.round(coeff,4))
    else:
        print("       :", np.round(coeff,4))

Intercept:  -0.0
AGEYEAR : 0.4151
SEXMALE : 0.0747
TOTCHRON : 0.0264
ALZHD : 0.0129
ASTHMA : -0.0745
CAD : 0.0832
CANCER : 0.1045
CEBVD : 0.0873
CHF : 0.0674
CKD : 0.0892
COPD : 0.0096
DEPRN : 0.0603
DIABTYP0 : 0.0036
DIABTYP1 : 0.0548
DIABTYP2 : 0.0267
EDHIV : -0.006
ESRD : 0.0175
ETOHAB : 0.0506
HPE : 0.0281
HTN : 0.0264
HYPLIPID : 0.0412
OBESITY : 0.0937
OSA : 0.004
OSTPRSIS : 0.0281
SUBSTAB : 0.0053
NOCHRON : -0.2099
PULSE : 0.2048
TEMPF : -0.0365
RESPR : 0.113
BPSYS : -0.0875
BPDIAS : -0.005
POPCT : -0.1125
PAINSCALE : 0.058
RFV1_admit_rate : 0.7581
ARRTIMEMIN : -0.0124
YEAR : -0.0011
INJURY : -0.0071
INJURY72 : -0.0164
RESONE : -0.0423
       : 0.017
       : 0.0057
       : 0.0389
       : 0.029
       : -0.0173


In [18]:
# Make a predictions
pred_train = cvModel.transform(training)
pred_test = cvModel.transform(testing)

## Evaluate model

In [19]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")

# compute confusion matrix
tp = pred_test.where('prediction == 1 and label==1').count() 
fp = pred_test.where('prediction == 1 and label==0').count() 
tn = pred_test.where('prediction == 0 and label==0').count() 
fn = pred_test.where('prediction == 0 and label==1').count() 

acc = (tp+tn)/(tp+fp+tn+fn)
prec = tp / (tp+fp)
recall = tp / (tp+fn)
spec = tn / (tn + fn)
f1 = 2 * (prec * recall) / (prec + recall)


print("\nConfusion Matrix:")
print('tn:',tn,' fn:',fn)
print('fp:',fp, '  tp:',tp,)  

print('\nPredicted positive:', tp+fp)
print('Predicted negitive:', tn+fn)

print('\nAccuracy', acc)

print("\nPrecision:", prec)
print("Recall:", recall)
print("F1 score:", f1)

print("\nSensitivity:", recall)
print("Specificity:", spec)

print("\nThe area under ROC for train set is", evaluator.setMetricName("areaUnderROC").evaluate(pred_train))
print("The area under ROC for test set is", evaluator.evaluate(pred_test))


Confusion Matrix:
tn: 11226  fn: 520
fp: 3156   tp: 1339

Predicted positive: 4495
Predicted negitive: 11746

Accuracy 0.7736592574348871

Precision: 0.2978865406006674
Recall: 0.7202797202797203
F1 score: 0.4214667925716084

Sensitivity: 0.7202797202797203
Specificity: 0.9557296100800272

The area under ROC for train set is 0.8254751056932993
The area under ROC for test set is 0.8258390198315085
