# Final Project: Admission Prediction from NHAMCS
## Progress report: Initial model evaluation
### DS5559: Big Data Analysis
### Thomas Hartka(trh6u), Alicia Doan(ad2ew), Michael Langmayr(ml8vp)
Created: 8/2/2020
  
This script performs logistic regression to predict hospital admissions.

## Configuration

In [12]:
# Directories used by this notebook, given relative to the notebook's working directory.
# data_dir is read below to locate the processed NHAMCS parquet data;
# results_dir is defined here but not referenced by the cells shown in this notebook.
data_dir = "../data"
results_dir = "../results"

In [13]:
# random seed passed to randomSplit below so the train/test split uses a fixed seed
SEED = 314

## Import libraries and set up Spark

In [14]:
# import python libraries
import os
import pandas as pd
import numpy as np
from functools import reduce

In [15]:
# set up pyspark
from pyspark.sql import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType

In [16]:
from pyspark.ml import Pipeline  
from pyspark.ml.feature import *  
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [17]:
# obtain the Spark session used for all DataFrame operations in this notebook
spark = SparkSession.builder.getOrCreate()

## Read in data

In [24]:
# load the processed NHAMCS dataset (parquet) from the data directory
NHAMCS = spark.read.parquet(data_dir + "/NHAMCS_processed_bc.2014-2017")

## One-hot encode residence

In [25]:
# Index the RESIDNCE categories into a numeric RESINDEX column;
# the one-hot encoding step that follows consumes RESINDEX.
rsi = StringIndexer().setInputCol("RESIDNCE").setOutputCol("RESINDEX")

# fit the indexer on the full dataset, then append the index column
simodel = rsi.fit(NHAMCS)
NHAMCS = simodel.transform(NHAMCS)

In [26]:
# perform OHE on residence variable: RESINDEX (category index) -> RESONE (one-hot vector)
# NOTE(review): transform() is called directly without fit(); the `pip show` output at the end of
# this notebook reports pyspark 2.4.5. Newer Spark releases are understood to make OneHotEncoder
# an estimator that must be fitted first — confirm before upgrading.
rohe = OneHotEncoder(inputCol='RESINDEX', outputCol='RESONE')
NHAMCS = rohe.transform(NHAMCS)

## Assemble vector

In [27]:
# Columns combined into the single "features" vector, in assembly order.
feature_cols = [
    "AGEYEAR", "RESONE", "SEXMALE", "ARRTIMEMIN", "YEAR", "PULSE", "TEMPF",
    "RESPR", "BPSYS", "BPDIAS", "POPCT", "PAINSCALE", "ALZHD", "ASTHMA", "CAD", "CANCER",
    "CEBVD", "CHF", "CKD", "COPD", "DEPRN", "DIABTYP0", "DIABTYP1", "DIABTYP2", "EDHIV",
    "ESRD", "ETOHAB", "HPE", "HTN", "HYPLIPID", "OBESITY", "OSA", "OSTPRSIS", "SUBSTAB",
    "NOCHRON", "TOTCHRON", "INJURY", "INJURY72", "RFV1_admit_rate",
]

# handleInvalid="skip": rows the assembler treats as invalid are left out of the result
va = VectorAssembler(inputCols=feature_cols, outputCol="features", handleInvalid="skip")

NHAMCS = va.transform(NHAMCS)

## Train and test model

In [28]:
# 80/20 random split into training and testing sets, seeded with SEED
training, testing = NHAMCS.randomSplit(weights=[0.8, 0.2], seed=SEED)

In [29]:
# Handle class imbalance by attaching a per-row weight to the training set.

# balRatio = share of training rows with ADM_OUTCOME == 0
n_training_rows = training.count()
n_outcome_zero = training.filter(training["ADM_OUTCOME"] == 0).count()
balRatio = n_outcome_zero / n_training_rows

# rows with ADM_OUTCOME == 1 are weighted by balRatio, every other row by 1 - balRatio
row_weight = when(training["ADM_OUTCOME"] == 1, balRatio).otherwise(1 - balRatio)
training = training.withColumn("classWeights", row_weight)

In [30]:
# function for logistic regression
def lr_nhamcs (training_set, testing_set, reg_param=0, method="Standard"):
    """Fit a weighted logistic regression and score it on the testing set.

    Parameters
    ----------
    training_set : DataFrame
        Training rows; must provide the "features", "ADM_OUTCOME" and
        "classWeights" columns used by the estimator.
    testing_set : DataFrame
        Rows to score; must provide "features" and "ADM_OUTCOME".
    reg_param : float, default 0
        Regularization strength for "Ridge" and "Lasso"; ignored for "Standard".
    method : str, default "Standard"
        "Standard" (regParam=0), "Ridge" (elasticNetParam=0) or
        "Lasso" (elasticNetParam=1).

    Returns
    -------
    float
        Area under the ROC curve of the fitted model on ``testing_set``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # (regParam, elasticNetParam) for each supported method
    penalties = {
        "Standard": (0, 0),
        "Ridge": (reg_param, 0),
        "Lasso": (reg_param, 1),
    }

    # fail early with a clear message instead of hitting an unbound `lr` further down
    if method not in penalties:
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(method, sorted(penalties))
        )
    reg_strength, elastic_net = penalties[method]

    lr = LogisticRegression(featuresCol="features", labelCol="ADM_OUTCOME", weightCol="classWeights",
                            maxIter=10, regParam=reg_strength, elasticNetParam=elastic_net)

    # Fit the model
    admModel = lr.fit(training_set)

    # predict on testing set
    predict_test = admModel.transform(testing_set)

    # make evaluator (areaUnderROC is the evaluator's default metric; stated explicitly here)
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME",
                                              metricName="areaUnderROC")

    return evaluator.evaluate(predict_test)

## Determine best hyperparameters

In [31]:
# baseline: logistic regression without regularization, scored on the testing set
standard_auc = lr_nhamcs(training, testing)
print("ROC-AUC for standard LR is: ", standard_auc)

ROC-AUC for standard LR is:  0.7878118746993313


In [32]:
# Sweep the Ridge regularization parameter over 0.0, 0.1, ..., 1.0.
# NOTE(review): every setting is scored on `testing`, the same split used for the
# final metrics further down — consider a separate validation split for tuning.
for step_value in np.arange(0.0, 1.1, 0.1):
    # round away floating-point noise from the arange steps
    reg_value = np.round(step_value, 1)
    ridge_auc = lr_nhamcs(training, testing, reg_value, "Ridge")
    print("ROC-AUC for Ridge LR with reg_param=", reg_value, " is: ", ridge_auc)

ROC-AUC for Ridge LR with reg_param= 0.0  is:  0.7878118746993296
ROC-AUC for Ridge LR with reg_param= 0.1  is:  0.7878120991146862
ROC-AUC for Ridge LR with reg_param= 0.2  is:  0.7878155027476266
ROC-AUC for Ridge LR with reg_param= 0.3  is:  0.7878225718314305
ROC-AUC for Ridge LR with reg_param= 0.4  is:  0.7878323338995342
ROC-AUC for Ridge LR with reg_param= 0.5  is:  0.7878399640217282
ROC-AUC for Ridge LR with reg_param= 0.6  is:  0.7878427318111566
ROC-AUC for Ridge LR with reg_param= 0.7  is:  0.7878414975266823
ROC-AUC for Ridge LR with reg_param= 0.8  is:  0.7878381686988603
ROC-AUC for Ridge LR with reg_param= 0.9  is:  0.7878372336348662
ROC-AUC for Ridge LR with reg_param= 1.0  is:  0.7878359993503897


In [33]:
# Sweep the Lasso regularization parameter over 0.0, 0.1, ..., 1.0.
# NOTE(review): every setting is scored on `testing`, the same split used for the
# final metrics further down — consider a separate validation split for tuning.
for step_value in np.arange(0.0, 1.1, 0.1):
    # round away floating-point noise from the arange steps
    reg_value = np.round(step_value, 1)
    lasso_auc = lr_nhamcs(training, testing, reg_value, "Lasso")
    print("ROC-AUC for Lasso LR with reg_param=", reg_value, " is: ", lasso_auc)

ROC-AUC for Lasso LR with reg_param= 0.0  is:  0.787811874699332
ROC-AUC for Lasso LR with reg_param= 0.1  is:  0.8122173441803751
ROC-AUC for Lasso LR with reg_param= 0.2  is:  0.7748009454469451
ROC-AUC for Lasso LR with reg_param= 0.3  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.4  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.5  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.6  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.7  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.8  is:  0.5
ROC-AUC for Lasso LR with reg_param= 0.9  is:  0.5
ROC-AUC for Lasso LR with reg_param= 1.0  is:  0.5


## Size of saved model

In [42]:
# create model with the best hyperparameters from the sweeps above:
# Lasso (elasticNetParam=1) with regParam=0.1, which gave the highest ROC-AUC in the recorded output
lr = LogisticRegression(featuresCol="features", labelCol="ADM_OUTCOME", weightCol="classWeights", \
                                  maxIter=10, regParam=0.1, elasticNetParam=1)   

# fit on the class-weighted training set
admModel = lr.fit(training)

In [43]:
# save model, overwriting anything already stored at this path
# NOTE(review): the path says "no_RFV", but the assembled feature list includes
# 'RFV1_admit_rate' — confirm the directory name is intended.
admModel.write().overwrite().save("../models/001-log_regress-no_RFV")

In [44]:
# get size on disk of the saved model directory (shell command; human-readable sizes)
!du -h ../models/001-log_regress-no_RFV

28K	../models/001-log_regress-no_RFV/data
20K	../models/001-log_regress-no_RFV/metadata
52K	../models/001-log_regress-no_RFV


## Get evaluation metrics

In [45]:
# predict on testing set with the fitted model; the metric cells below all read predict_test
predict_test=admModel.transform(testing)

In [46]:
# area under the ROC curve for the testing-set predictions
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME")
roc_auc = evaluator.evaluate(predict_test)
print("ROC-AUC:", roc_auc)

ROC-AUC: 0.812217344180374


In [47]:
# calculate area under the precision-recall curve (PR-AUC); this is not an F1 score.
# A dedicated evaluator is used so the metric of any other evaluator object is left
# untouched and re-running earlier metric cells keeps reporting what they claim to.
pr_evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="ADM_OUTCOME",
                                             metricName="areaUnderPR")
print("PR-AUC:", pr_evaluator.evaluate(predict_test))

F1 score: 0.37724191885729574


In [48]:
# accuracy = share of testing rows whose predicted label equals the recorded outcome
total = predict_test.count()
is_correct = predict_test["prediction"] == predict_test["ADM_OUTCOME"]
correct = predict_test.filter(is_correct).count()

print("Accuracy:", correct/total)

Accuracy: 0.7753217166430638


In [49]:
# Confusion matrix on the testing set: one count per (predicted, actual) combination.
predicted = predict_test["prediction"]
actual = predict_test["ADM_OUTCOME"]

tp = predict_test.filter((predicted == 1) & (actual == 1)).count()
fp = predict_test.filter((predicted == 1) & (actual == 0)).count()
tn = predict_test.filter((predicted == 0) & (actual == 0)).count()
fn = predict_test.filter((predicted == 0) & (actual == 1)).count()

# printed layout: rows are the predicted class (0, then 1); columns the actual class (0, then 1)
print("\nConfusion Matrix:")
print('tn:', tn, ' fn:', fn)
print('fp:', fp, '  tp:', tp)


Confusion Matrix:
tn: 11325  fn: 592
fp: 3057   tp: 1267


In [51]:
# record the installed pyspark package details (version, location) alongside the results
!pip show pyspark


Name: pyspark
Version: 2.4.5
Summary: Apache Spark Python API
Home-page: https://github.com/apache/spark/tree/master/python
Author: Spark Developers
Author-email: dev@spark.apache.org
License: http://www.apache.org/licenses/LICENSE-2.0
Location: /usr/local/spark-2.4.5-bin-hadoop2.7/python
Requires: py4j
Required-by: 
