# Final Project: Admission Prediction from NHAMCS
## Initial LR model
### DS5559: Big Data Analysis
### Thomas Hartka, Alicia Doan, Michael Langmayr
Created: 7/15/2020 
  
In this notebook creates and analyzes our logistic regression model using all varaiblespredicting hospital admission in the NHAMCS database.  Weight of the outcomes is 

## Configuration

In [1]:
# preferences
weight_outcome = True    # use weights to handle class imbalance

# variables to include
pred_totchron = True     # total chronic disease
pred_resid = True        # place of residence
pred_comorbid = True     # place of residence
pred_RFV = True          # historical admit rate based on RFV
pred_vitals = True       # vital signs (heart reate, blood pressure, etc.)
pred_arrival = True      # arrival time and year
pred_injury = True       # visits associated with injuries 

# logistic regression settings
max_iter = 100           # maximum iterations

# seed for random split
SEED = 314

In [2]:
# set data directory
data_dir = "../data"
results_dir = "../results"

## Import libraries and set up Spark

In [3]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [4]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [5]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [6]:
spark = SparkSession.builder.getOrCreate()

## Read in data

In [7]:
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## Select variables

In [8]:
# variables to include
predictors = ['AGEYEAR','SEXMALE']                    # all models have age and sex

if pred_totchron: predictors += ['TOTCHRON']          # total chronic conditions

if pred_comorbid: predictors += ['ALZHD','ASTHMA',    # preexisting health conditions
        'CAD','CANCER', 'CEBVD','CHF','CKD','COPD',
        'DEPRN','DIABTYP0','DIABTYP1','DIABTYP2',
        'EDHIV','ESRD','ETOHAB','HPE','HTN',
        'HYPLIPID','OBESITY','OSA','OSTPRSIS',
        'SUBSTAB','NOCHRON']

if pred_vitals: predictors += ['PULSE','TEMPF',        # vital signs
        'RESPR','BPSYS','BPDIAS','POPCT','PAINSCALE']
    
if pred_RFV: predictors += ['RFV1_admit_rate']         # historical admission rate based on RFV
    
if pred_arrival: predictors += ['ARRTIMEMIN','YEAR']   # arrival time and year

if pred_injury: predictors += ['INJURY','INJURY72']    # visits associated with injuries 
    
if pred_resid: predictors += ['RESONE']                # residence

## Split data and calculate class weighting

In [9]:
# split into training and testing set
training, testing = NHAMCS.randomSplit([0.8, 0.2], SEED)

In [10]:
# handle class imbalance

# calculate balance ratio
balRatio = training.select("ADM_OUTCOME").where('ADM_OUTCOME == 0').count() / training.count()

# add weights
training = training.withColumn("classWeights", when(training.ADM_OUTCOME == 1,balRatio).otherwise(1-balRatio))

## Create Pipeline

In [11]:
# perform string indexing to prepare for OHE for residence variable
rsi = StringIndexer(inputCol="RESIDNCE", outputCol="RESINDEX")

# perform OHE on residence variable
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')

# assemble vector
va = VectorAssembler(inputCols=predictors, outputCol="features", handleInvalid='skip')  

# logistic regression model (no lasso or ridge)
# set up LR model
if weight_outcome:
    lr = LogisticRegression(featuresCol="features", labelCol="ADM_OUTCOME", weightCol="classWeights", \
                              maxIter=max_iter, regParam=0, elasticNetParam=0)
else:
    lr = LogisticRegression(featuresCol="features", labelCol="ADM_OUTCOME", \
                               maxIter=max_iter, regParam=0, elasticNetParam=0)

# Build the pipeline
pipeline = Pipeline(stages=[rsi, rohe, va, lr])

## Train and test model

In [12]:
%%time
# Fit the pipeline
model = pipeline.fit(training)

# extract LR model from pipeline
lrTrain = model.stages[-1]

CPU times: user 28.6 ms, sys: 5.08 ms, total: 33.7 ms
Wall time: 34 s


In [13]:
# Print the coefficients and intercepts for logistic regression with multinomial family
print("Intercept: ", np.round(lrTrain.intercept,4))
for i,coeff in enumerate(lrTrain.coefficients):
    if(i<len(predictors)):
        print(predictors[i], ":", np.round(coeff,4))
    else:
        print("       :", np.round(coeff,4))

Intercept:  -0.0
AGEYEAR : 0.016
SEXMALE : 0.145
TOTCHRON : 0.0609
ALZHD : 0.101
ASTHMA : -0.2731
CAD : 0.3404
CANCER : 0.5721
CEBVD : 0.4557
CHF : 0.3666
CKD : 0.5555
COPD : -0.0143
DEPRN : 0.1651
DIABTYP0 : -0.0403
DIABTYP1 : 0.5805
DIABTYP2 : 0.0983
EDHIV : -0.0754
ESRD : 0.162
ETOHAB : 0.2441
HPE : 0.2341
HTN : 0.0253
HYPLIPID : 0.0979
OBESITY : 0.458
OSA : 0.0273
OSTPRSIS : 0.3127
SUBSTAB : -0.0147
NOCHRON : -0.4222
PULSE : 0.0073
TEMPF : -0.0042
RESPR : 0.0162
BPSYS : 0.0001
BPDIAS : -0.002
POPCT : -0.0047
PAINSCALE : 0.0148
RFV1_admit_rate : 5.638
ARRTIMEMIN : -0.0
YEAR : -0.0009
INJURY : -0.0276
INJURY72 : -0.0478
RESONE : -0.1898
       : 0.1495
       : 0.0528
       : 0.4575
       : 0.3692
       : -0.2022


In [14]:
len(lrTrain.coefficients)

44

In [15]:
# Make a predictions
pred_train = model.transform(training)
pred_test = model.transform(testing)

## Evaluate model

In [16]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME")

# compute confusion matrix
tp = pred_test.where('prediction == 1 and ADM_OUTCOME==1').count() 
fp = pred_test.where('prediction == 1 and ADM_OUTCOME==0').count() 
tn = pred_test.where('prediction == 0 and ADM_OUTCOME==0').count() 
fn = pred_test.where('prediction == 0 and ADM_OUTCOME==1').count() 

acc = (tp+tn)/(tp+fp+tn+fn)
prec = tp / (tp+fp)
recall = tp / (tp+fn)
spec = tn / (tn + fn)
f1 = 2 * (prec * recall) / (prec + recall)


print("\nConfusion Matrix:")
print('tn:',tn,' fn:',fn)
print('fp:',fp, '  tp:',tp,)  

print('\nPredicted positive:', tp+fp)
print('Predicted negitive:', tn+fn)

print('\nAccuracy', acc)

print("\nPrecision:", prec)
print("Recall:", recall)
print("F1 score:", f1)

print("\nSensitivity:", recall)
print("Specificity:", spec)

print("\nThe area under ROC for train set is", evaluator.setMetricName("areaUnderROC").evaluate(pred_train))
print("The area under ROC for test set is", evaluator.evaluate(pred_test))


Confusion Matrix:
tn: 11169  fn: 514
fp: 3213   tp: 1345

Predicted positive: 4558
Predicted negitive: 11683

Accuracy 0.7705190567083308

Precision: 0.29508556384379114
Recall: 0.7235072619688004
F1 score: 0.41919900264921306

Sensitivity: 0.7235072619688004
Specificity: 0.9560044509115809

The area under ROC for train set is 0.8258379243671996
The area under ROC for test set is 0.825720453716986
