# Build Anxiety Models 
This notebook contains all the code necessary to build, cross validate, and evaluate our models for Anxiety. 

# Load in Data and Explore Schema

In [1]:
from pyspark.sql import *
DATA_FILEPATH = 'data/anxiety.csv'

spark = SparkSession \
    .builder \
    .appName("Build Anxiety Models") \
    .getOrCreate()

train = spark.read.csv(DATA_FILEPATH,  inferSchema=True, header = True)
DATA_FILEPATH = 'data/test.csv'
test = spark.read.csv(DATA_FILEPATH,  inferSchema=True, header = True)

In [2]:
train.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- SES: double (nullable = true)
 |-- RuralVsUrban: double (nullable = true)
 |-- EducationDegree: double (nullable = true)
 |-- SecondaryVsPrimary: double (nullable = true)
 |-- TeachPublicVsOther: double (nullable = true)
 |-- YearsAsTeacher: double (nullable = true)
 |-- EmployedVsNot: double (nullable = true)
 |-- PastCOVIDpositive: double (nullable = true)
 |-- COVIDvaccinated: double (nullable = true)
 |-- PrePandemicChronIllness: double (nullable = true)
 |-- PrePandemicMentIllness: double (nullable = true)
 |-- PrePandemicNeuroDis: double (nullable = true)
 |-- Depression: integer (nullable = true)
 |-- Anxiety: integer (nullable = true)
 |-- OverallHealth: double (nullable = true)
 |-- COVIDfear: double (nullable = true)
 |-- RelatImprov: double (nullable = true)
 |-- WorkloadNowVsPreCOVID: double (nullable = true)
 |-- ResourceSatisfaction: double (nullable = true)
 |-- SufficientCOVIDmeasures: doub

# Train/Validation/Test Split

In [3]:
column_list = train.columns
column_list.remove('Anxiety')
column_list.remove('_c0')
train = train.withColumnRenamed('Anxiety', 'label')
test = test.withColumnRenamed('Anxiety', 'label')

# Benchmark Model: Logistic Regression

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator



In [5]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

lr = LogisticRegression(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, lr])

paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0, .2, .4, .5, .6, .8, 1]) \
    .addGrid(lr.regParam, [0.1, 0.01, .001]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

cvModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [6]:
import numpy as np
best_params = cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

LogisticRegression_b6d39b492788__elasticNetParam: 0.5
LogisticRegression_b6d39b492788__regParam: 0.01


In [30]:
metrics = cvModel.avgMetrics
all_params = cvModel.getEstimatorParamMaps()
params= []
for i in range(0, len(all_params)):
    param = all_params[i]
    for k, v in param.items():
        params.append({f"{k}": f"{v}"})     
for i in range(0, len(metrics)):
    s = f'Param set {i+1} Avg Metrics: {metrics[i]}\n'
    params = all_params[i]
    for k, v in params.items():
        s+= f"  {k.name}: {v}\n"
    print(s)

Param set 1 Avg Metrics: 0.8222156491623472
  elasticNetParam: 0.0
  regParam: 0.1

Param set 2 Avg Metrics: 0.8254080150063109
  elasticNetParam: 0.0
  regParam: 0.01

Param set 3 Avg Metrics: 0.8253492607789543
  elasticNetParam: 0.0
  regParam: 0.001

Param set 4 Avg Metrics: 0.8234676045610584
  elasticNetParam: 0.2
  regParam: 0.1

Param set 5 Avg Metrics: 0.826026779822251
  elasticNetParam: 0.2
  regParam: 0.01

Param set 6 Avg Metrics: 0.8254531628183647
  elasticNetParam: 0.2
  regParam: 0.001

Param set 7 Avg Metrics: 0.818087291695732
  elasticNetParam: 0.4
  regParam: 0.1

Param set 8 Avg Metrics: 0.8268036227870663
  elasticNetParam: 0.4
  regParam: 0.01

Param set 9 Avg Metrics: 0.8254887287576786
  elasticNetParam: 0.4
  regParam: 0.001

Param set 10 Avg Metrics: 0.8157424099885194
  elasticNetParam: 0.5
  regParam: 0.1

Param set 11 Avg Metrics: 0.8269153599847812
  elasticNetParam: 0.5
  regParam: 0.01

Param set 12 Avg Metrics: 0.8254338821913817
  elasticNetParam: 0.

In [7]:
prediction = cvModel.transform(test)

In [8]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")

Accuracy: 0.7579617834394905
Precision: 0.796875
Recall: 0.8052631578947368
F1 Score: 0.7576114961469753
AUROC: 0.7453735144312394


# Random Forest

In [31]:
from pyspark.ml.classification import RandomForestClassifier

In [32]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

rf = RandomForestClassifier(labelCol='label',
                        featuresCol='features')

pipeline = Pipeline(stages = [assembler, rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [1, 5, 10, 15]) \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 35, 40, 45]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

rfModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [33]:
import numpy as np
best_params = rfModel.getEstimatorParamMaps()[np.argmax(rfModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

RandomForestClassifier_5a7fb866f669__maxDepth: 15
RandomForestClassifier_5a7fb866f669__numTrees: 40


In [34]:
metrics = rfModel.avgMetrics
all_params = rfModel.getEstimatorParamMaps()
params= []
for i in range(0, len(all_params)):
    param = all_params[i]
    for k, v in param.items():
        params.append({f"{k}": f"{v}"})     
for i in range(0, len(metrics)):
    s = f'Param set {i+1} Avg Metrics: {metrics[i]}\n'
    params = all_params[i]
    for k, v in params.items():
        s+= f"  {k.name}: {v}\n"
    print(s)

Param set 1 Avg Metrics: 0.7117251668859514
  maxDepth: 1
  numTrees: 10

Param set 2 Avg Metrics: 0.7560858442513432
  maxDepth: 1
  numTrees: 15

Param set 3 Avg Metrics: 0.7644090592038224
  maxDepth: 1
  numTrees: 20

Param set 4 Avg Metrics: 0.7685659842012551
  maxDepth: 1
  numTrees: 25

Param set 5 Avg Metrics: 0.7704840442426043
  maxDepth: 1
  numTrees: 30

Param set 6 Avg Metrics: 0.7871112342330036
  maxDepth: 1
  numTrees: 35

Param set 7 Avg Metrics: 0.7942737537249087
  maxDepth: 1
  numTrees: 40

Param set 8 Avg Metrics: 0.789018517911004
  maxDepth: 1
  numTrees: 45

Param set 9 Avg Metrics: 0.823204684303359
  maxDepth: 5
  numTrees: 10

Param set 10 Avg Metrics: 0.8348237007000621
  maxDepth: 5
  numTrees: 15

Param set 11 Avg Metrics: 0.8285029207000176
  maxDepth: 5
  numTrees: 20

Param set 12 Avg Metrics: 0.8341806247384049
  maxDepth: 5
  numTrees: 25

Param set 13 Avg Metrics: 0.8343717999869938
  maxDepth: 5
  numTrees: 30

Param set 14 Avg Metrics: 0.83708855

In [35]:
rf_prediction = rfModel.transform(test)

In [36]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(rf_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(rf_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(rf_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(rf_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(rf_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")

Accuracy: 0.7292993630573248
Precision: 0.7312775330396476
Recall: 0.8736842105263158
F1 Score: 0.7175744477017777
AUROC: 0.690874363327674


# MLP Model

In [15]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [16]:
layers = [[len(column_list), k, k, 2] for k in [50, 75, 100]]
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


mlp = MultilayerPerceptronClassifier(labelCol='label', featuresCol='scaledFeatures', maxIter=1000, 
                                     blockSize=128, seed=1234, solver = 'gd')

pipeline = Pipeline(stages = [assembler, scaler, mlp])

paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, layers)\
    .addGrid(mlp.stepSize, [.001, .005, .01, .05])\
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

# train the model
mlpModel = crossval.setParallelism(6).fit(train) # train 6 models in parallel

In [17]:
import numpy as np
best_params = mlpModel.getEstimatorParamMaps()[np.argmax(mlpModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

MultilayerPerceptronClassifier_d9c0df9f3942__layers: [45, 100, 100, 2]
MultilayerPerceptronClassifier_d9c0df9f3942__stepSize: 0.05


In [18]:
mlp_prediction = mlpModel.transform(test)

In [19]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(mlp_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")

Accuracy: 0.7452229299363057
Precision: 0.8089887640449438
Recall: 0.7578947368421053
F1 Score: 0.7469484268154996
AUROC: 0.7418505942275042


# SVM

In [20]:
from pyspark.ml.classification import LinearSVC

In [21]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

svm = LinearSVC(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, svm])

paramGrid = ParamGridBuilder() \
    .addGrid(svm.maxIter, [10, 20, 30, 40, 50]) \
    .addGrid(svm.regParam, [0.1, 0.01, .001]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

svmModel = crossval.setParallelism(6).fit(train) # train 6 models in parallel

In [22]:
import numpy as np
best_params = svmModel.getEstimatorParamMaps()[np.argmax(svmModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

LinearSVC_090fc820782b__maxIter: 10
LinearSVC_090fc820782b__regParam: 0.01


In [23]:
svm_prediction = svmModel.transform(test)

In [24]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(svm_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(svm_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(svm_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(svm_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(svm_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")

Accuracy: 0.7547770700636943
Precision: 0.7897435897435897
Recall: 0.8105263157894737
F1 Score: 0.7538465571020418
AUROC: 0.7399405772495755


# Gradient Boosted Tree

In [37]:
from pyspark.ml.classification import GBTClassifier

In [38]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

gbt = GBTClassifier(labelCol='label',
                        featuresCol='features')

pipeline = Pipeline(stages = [assembler, gbt])

paramGrid = ParamGridBuilder() \
    .addGrid(gbt.maxDepth, [5, 10, 20, 30]) \
    .addGrid(gbt.maxBins, [8, 16, 32,64]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

gbtModel = crossval.setParallelism(6).fit(train) # train 6 models in parallel

In [39]:
import numpy as np
best_params = gbtModel.getEstimatorParamMaps()[np.argmax(gbtModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

GBTClassifier_686a01b5d611__maxDepth: 10
GBTClassifier_686a01b5d611__maxBins: 64


In [40]:
gbt_prediction = gbtModel.transform(test)

In [41]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(gbt_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(gbt_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")

Accuracy: 0.7133757961783439
Precision: 0.7551020408163265
Recall: 0.7789473684210526
F1 Score: 0.7120502528214845
AUROC: 0.6959252971137521
