# Build Depression Models 
This notebook contains all the code necessary to build, cross validate, and evaluate our models for Depression. 

# Load in Data and Explore Schema

In [1]:
from pyspark.sql import *
DATA_FILEPATH = 'data/depression.csv'

spark = SparkSession \
    .builder \
    .appName("Preprocessing") \
    .getOrCreate()

df = spark.read.csv(DATA_FILEPATH,  inferSchema=True, header = True)

In [2]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- SES: double (nullable = true)
 |-- RuralVsUrban: double (nullable = true)
 |-- EducationDegree: double (nullable = true)
 |-- SecondaryVsPrimary: double (nullable = true)
 |-- TeachPublicVsOther: double (nullable = true)
 |-- YearsAsTeacher: double (nullable = true)
 |-- EmployedVsNot: double (nullable = true)
 |-- PastCOVIDpositive: double (nullable = true)
 |-- COVIDvaccinated: double (nullable = true)
 |-- PrePandemicChronIllness: double (nullable = true)
 |-- PrePandemicMentIllness: double (nullable = true)
 |-- PrePandemicNeuroDis: double (nullable = true)
 |-- Depression: integer (nullable = true)
 |-- Anxiety: integer (nullable = true)
 |-- OverallHealth: double (nullable = true)
 |-- COVIDfear: double (nullable = true)
 |-- RelatImprov: double (nullable = true)
 |-- WorkloadNowVsPreCOVID: double (nullable = true)
 |-- ResourceSatisfaction: double (nullable = true)
 |-- SufficientCOVIDmeasures: doub

# Train/Validation/Test Split

In [3]:
train, test = df.randomSplit([0.85, 0.15], 314)
column_list = train.columns
column_list.remove('Depression')
column_list.remove('_c0')
train = train.withColumnRenamed('Depression', 'label')
test = test.withColumnRenamed('Depression', 'label')

# Benchmark Model: Logistic Regression

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [5]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

lr = LogisticRegression(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, lr])

paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0, .2, .4, .5, .6, .8, 1]) \
    .addGrid(lr.regParam, [0.1, 0.01, .001]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

cvModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [6]:
import numpy as np
best_params = cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

LogisticRegression_13cbfb8096b0__elasticNetParam: 0.6
LogisticRegression_13cbfb8096b0__regParam: 0.01


In [7]:
prediction = cvModel.transform(test)

In [8]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(prediction)
#confusion_matrix = evaluator.evaluate(prediction, {evaluator.metricName: "confusionMatrix"})

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.7509727626459144
Precision: 0.7686274509803922
Recall: 0.7396226415094339
F1 Score: 0.7510293254648168
AUROC: 0.7513374251723878


# Random Forest

In [9]:
from pyspark.ml.classification import RandomForestClassifier

In [10]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

rf = RandomForestClassifier(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [1, 5, 10, 15]) \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 35, 40, 45]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

rfModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [11]:
import numpy as np
best_params = rfModel.getEstimatorParamMaps()[np.argmax(rfModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

RandomForestClassifier_97119562daf6__maxDepth: 15
RandomForestClassifier_97119562daf6__numTrees: 45


In [12]:
metrics = rfModel.avgMetrics
all_params = rfModel.getEstimatorParamMaps()
params= []
for i in range(0, len(all_params)):
    param = all_params[i]
    for k, v in param.items():
        params.append({f"{k}": f"{v}"})     
for i in range(0, len(metrics)):
    s = f'Param set {i+1} Avg Metrics: {metrics[i]}\n'
    params = all_params[i]
    for k, v in params.items():
#         kname = str(k).split('__')[-1]
        s+= f"  {k.name}: {v}\n"
#         print(k.name)
    print(s)

Param set 1 Avg Metrics: 0.8288257081099475
  maxDepth: 1
  numTrees: 10

Param set 2 Avg Metrics: 0.823365195230668
  maxDepth: 1
  numTrees: 15

Param set 3 Avg Metrics: 0.843800324808525
  maxDepth: 1
  numTrees: 20

Param set 4 Avg Metrics: 0.8353203075512105
  maxDepth: 1
  numTrees: 25

Param set 5 Avg Metrics: 0.834049730165862
  maxDepth: 1
  numTrees: 30

Param set 6 Avg Metrics: 0.8306092667991258
  maxDepth: 1
  numTrees: 35

Param set 7 Avg Metrics: 0.8220845752044051
  maxDepth: 1
  numTrees: 40

Param set 8 Avg Metrics: 0.820395651292074
  maxDepth: 1
  numTrees: 45

Param set 9 Avg Metrics: 0.8915832855369671
  maxDepth: 5
  numTrees: 10

Param set 10 Avg Metrics: 0.8932436476529284
  maxDepth: 5
  numTrees: 15

Param set 11 Avg Metrics: 0.898807289136832
  maxDepth: 5
  numTrees: 20

Param set 12 Avg Metrics: 0.8980281131870574
  maxDepth: 5
  numTrees: 25

Param set 13 Avg Metrics: 0.9023377364574684
  maxDepth: 5
  numTrees: 30

Param set 14 Avg Metrics: 0.90046439273

In [13]:
rf_prediction = rfModel.transform(test)

In [14]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(rf_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(rf_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(rf_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(rf_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(rf_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.9494163424124513
Precision: 0.995850622406639
Recall: 0.9056603773584906
F1 Score: 0.9493795726827072
AUROC: 0.9508221565507312


# MLP Model

In [15]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [16]:
layers = [[len(column_list), k, k, 2] for k in [50, 75, 100]]
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


mlp = MultilayerPerceptronClassifier(labelCol='label', featuresCol='scaledFeatures', maxIter=1000, 
                                     blockSize=128, seed=1234, solver = 'gd')

pipeline = Pipeline(stages = [assembler, scaler, mlp])

paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, layers)\
    .addGrid(mlp.stepSize, [.001, .005, .01, .05])\
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

# train the model
mlpModel = crossval.setParallelism(6).fit(train) # train 4 models in parallel

In [17]:
import numpy as np
best_params = mlpModel.getEstimatorParamMaps()[np.argmax(mlpModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

MultilayerPerceptronClassifier_87f75b233eb4__layers: [45, 100, 100, 2]
MultilayerPerceptronClassifier_87f75b233eb4__stepSize: 0.05


In [18]:
mlp_prediction = mlpModel.transform(test)

In [19]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(mlp_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.7490272373540856
Precision: 0.788135593220339
Recall: 0.7018867924528301
F1 Score: 0.7486688772942527
AUROC: 0.7505417898007123


# SVM

In [20]:
from pyspark.ml.classification import LinearSVC

In [21]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

svm = LinearSVC(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, svm])

paramGrid = ParamGridBuilder() \
    .addGrid(svm.maxIter, [10, 20, 30, 40, 50]) \
    .addGrid(svm.regParam, [0.1, 0.01, .001]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

svmModel = crossval.setParallelism(6).fit(train) # train 4 models in parallel

In [22]:
import numpy as np
best_params = svmModel.getEstimatorParamMaps()[np.argmax(svmModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

LinearSVC_a5385fc9032e__maxIter: 10
LinearSVC_a5385fc9032e__regParam: 0.1


In [23]:
svm_prediction = svmModel.transform(test)

In [24]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(svm_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(svm_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(svm_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(svm_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(svm_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.7412451361867705
Precision: 0.7844827586206896
Recall: 0.6867924528301886
F1 Score: 0.7406950883343102
AUROC: 0.7429946199893915


# Gradient Boosted Tree

In [25]:
from pyspark.ml.classification import GBTClassifier

In [26]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

gbt = GBTClassifier(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, gbt])

paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10, 20, 30]) \
    .addGrid(gbt.maxBins, [8, 16, 32,64]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

gbtModel = crossval.setParallelism(6).fit(train) # train 4 models in parallel

In [27]:
import numpy as np
best_params = gbtModel.getEstimatorParamMaps()[np.argmax(gbtModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

GBTClassifier_a24fadf68a87__maxDepth: 10
GBTClassifier_a24fadf68a87__maxBins: 16


In [28]:
gbt_prediction = gbtModel.transform(test)

In [29]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(gbt_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.914396887159533
Precision: 1.0
Recall: 0.8339622641509434
F1 Score: 0.9139965142265631
AUROC: 0.9169811320754717
