# Build Anxiety Models 
This notebook contains all the code necessary to build, cross validate, and evaluate our three models for Anxiety. 

# Load in Data and Explore Schema

In [1]:
from pyspark.sql import *
DATA_FILEPATH = 'data/anxiety.csv'

spark = SparkSession \
    .builder \
    .appName("Preprocessing") \
    .getOrCreate()

df = spark.read.csv(DATA_FILEPATH,  inferSchema=True, header = True)

In [2]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Age: double (nullable = true)
 |-- SES: double (nullable = true)
 |-- RuralVsUrban: double (nullable = true)
 |-- EducationDegree: double (nullable = true)
 |-- SecondaryVsPrimary: double (nullable = true)
 |-- TeachPublicVsOther: double (nullable = true)
 |-- YearsAsTeacher: double (nullable = true)
 |-- EmployedVsNot: double (nullable = true)
 |-- PastCOVIDpositive: double (nullable = true)
 |-- COVIDvaccinated: double (nullable = true)
 |-- PrePandemicChronIllness: double (nullable = true)
 |-- PrePandemicMentIllness: double (nullable = true)
 |-- PrePandemicNeuroDis: double (nullable = true)
 |-- Depression: integer (nullable = true)
 |-- Anxiety: integer (nullable = true)
 |-- OverallHealth: double (nullable = true)
 |-- COVIDfear: double (nullable = true)
 |-- RelatImprov: double (nullable = true)
 |-- WorkloadNowVsPreCOVID: double (nullable = true)
 |-- ResourceSatisfaction: double (nullable = true)
 |-- SufficientCOVIDmeasures: doub

# Train/Validation/Test Split

In [3]:
train, test = df.randomSplit([0.85, 0.15], 314)

# Benchmark Model: Logistic Regression

In [47]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
column_list = train.columns
column_list.remove('Anxiety')
column_list.remove('_c0')
column_list

['Age',
 'SES',
 'RuralVsUrban',
 'EducationDegree',
 'SecondaryVsPrimary',
 'TeachPublicVsOther',
 'YearsAsTeacher',
 'EmployedVsNot',
 'PastCOVIDpositive',
 'COVIDvaccinated',
 'PrePandemicChronIllness',
 'PrePandemicMentIllness',
 'PrePandemicNeuroDis',
 'Depression',
 'OverallHealth',
 'COVIDfear',
 'RelatImprov',
 'WorkloadNowVsPreCOVID',
 'ResourceSatisfaction',
 'SufficientCOVIDmeasures',
 'MonthsOnlineTeach',
 'EducProblems',
 'BehavProblems',
 'EmotProblems',
 'SocialProblems',
 'FamilyProblems',
 'DiffOnlineTeach',
 'InstructAdjust',
 'BenefitOnlineTeach',
 'TotOnlineTraining',
 'BurnoutEmotExhaust',
 'BurnoutDepersonal',
 'BurnoutPerFulfill',
 'Man',
 'Partnered',
 "Bachelor's Degree",
 'Diploma',
 "Master's Degree",
 'Non_university_studies',
 'PhD',
 'Postdoctoral Fellowship',
 '0-5 years',
 '6-11 years',
 '12-16 years',
 '17-18 years']

In [11]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

train = train.withColumnRenamed('Anxiety', 'label')

lr = LogisticRegression(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, lr])

paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0, .2, .4, .5, .6, .8, 1]) \
    .addGrid(lr.regParam, [0.1, 0.01, .001]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

cvModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [36]:
import numpy as np
best_params = cvModel.getEstimatorParamMaps()[np.argmax(cvModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

LogisticRegression_865107c52f99__elasticNetParam: 1.0
LogisticRegression_865107c52f99__regParam: 0.01


In [12]:
test = test.withColumnRenamed('Anxiety', 'label')
prediction = cvModel.transform(test)

In [56]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(prediction)
#confusion_matrix = evaluator.evaluate(prediction, {evaluator.metricName: "confusionMatrix"})

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.7175572519083969
Precision: 0.6930232558139535
Recall: 0.7680412371134021
F1 Score: 0.716941784517797
AUROC: 0.7181914728280578


# Random Forest

In [57]:
from pyspark.ml.classification import RandomForestClassifier

In [58]:
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")

rf = RandomForestClassifier(labelCol='label',
                        featuresCol='scaledFeatures')

pipeline = Pipeline(stages = [assembler, scaler, rf])

paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [1, 5, 10, 15]) \
    .addGrid(rf.numTrees, [10, 15, 20, 25, 30, 35, 40, 45]) \
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

rfModel = crossval.setParallelism(4).fit(train) # train 4 models in parallel

In [59]:
import numpy as np
best_params = rfModel.getEstimatorParamMaps()[np.argmax(rfModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

RandomForestClassifier_e51927cd83c4__maxDepth: 15
RandomForestClassifier_e51927cd83c4__numTrees: 40


In [129]:
metrics = rfModel.avgMetrics
all_params = rfModel.getEstimatorParamMaps()
params= []
for i in range(0, len(all_params)):
    param = all_params[i]
    for k, v in param.items():
        params.append({f"{k}": f"{v}"})     
for i in range(0, len(metrics)):
    s = f'Param set {i+1} Avg Metrics: {metrics[i]}\n'
    params = all_params[i]
    for k, v in params.items():
#         kname = str(k).split('__')[-1]
        s+= f"  {k.name}: {v}\n"
#         print(k.name)
    print(s)

Param set 1 Avg Metrics: 0.7502279252272457
  maxDepth: 1
  numTrees: 10

Param set 2 Avg Metrics: 0.7753240768211531
  maxDepth: 1
  numTrees: 15

Param set 3 Avg Metrics: 0.7874249720602038
  maxDepth: 1
  numTrees: 20

Param set 4 Avg Metrics: 0.7868984956949058
  maxDepth: 1
  numTrees: 25

Param set 5 Avg Metrics: 0.7892783366964724
  maxDepth: 1
  numTrees: 30

Param set 6 Avg Metrics: 0.7866882587905141
  maxDepth: 1
  numTrees: 35

Param set 7 Avg Metrics: 0.7834728504678502
  maxDepth: 1
  numTrees: 40

Param set 8 Avg Metrics: 0.7869202019144231
  maxDepth: 1
  numTrees: 45

Param set 9 Avg Metrics: 0.8049769568277458
  maxDepth: 5
  numTrees: 10

Param set 10 Avg Metrics: 0.8161205422387199
  maxDepth: 5
  numTrees: 15

Param set 11 Avg Metrics: 0.81519853718124
  maxDepth: 5
  numTrees: 20

Param set 12 Avg Metrics: 0.8193333617284725
  maxDepth: 5
  numTrees: 25

Param set 13 Avg Metrics: 0.8186075074103067
  maxDepth: 5
  numTrees: 30

Param set 14 Avg Metrics: 0.82385029

rf_prediction = rfModel.transform(test)

In [63]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(rf_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(rf_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(rf_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(rf_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(rf_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.8371501272264631
Precision: 0.8735632183908046
Recall: 0.7835051546391752
F1 Score: 0.8366207889152324
AUROC: 0.8364761954100399


# MLP Model

In [72]:
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [77]:
layers = [[len(column_list), k, k, 2] for k in [50, 75, 100]]
assembler = VectorAssembler(inputCols=column_list,
                            outputCol="features")

scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")


mlp = MultilayerPerceptronClassifier(labelCol='label', featuresCol='scaledFeatures', maxIter=1000, 
                                     blockSize=128, seed=1234, solver = 'gd')

pipeline = Pipeline(stages = [assembler, scaler, mlp])

paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, layers)\
    .addGrid(mlp.stepSize, [.001, .005, .01, .05])\
    .build()

# Fit the model
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=BinaryClassificationEvaluator(),
                          numFolds=4)

# train the model
mlpModel = crossval.setParallelism(6).fit(train) # train 4 models in parallel

In [78]:
import numpy as np
best_params = mlpModel.getEstimatorParamMaps()[np.argmax(mlpModel.avgMetrics)]
for k, v in best_params.items():
    print(f"{k}: {v}")

MultilayerPerceptronClassifier_a00f1e87da6c__layers: [45, 75, 75, 2]
MultilayerPerceptronClassifier_a00f1e87da6c__stepSize: 0.05


In [79]:
mlp_prediction = mlpModel.transform(test)

In [80]:
# set up evaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",
                                          labelCol="label")
evaluator_2 = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                          labelCol="label",
                                          metricName="areaUnderROC")
#Evaluate Metrics
accuracy = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "accuracy"})
precision = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "precisionByLabel"})
recall = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "recallByLabel"})
f1_score = evaluator.evaluate(mlp_prediction, {evaluator.metricName: "f1"})
auc = evaluator_2.evaluate(mlp_prediction)

#Print Results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1_score}")
print(f"AUROC: {auc}")
#print(f"Confusion Matrix:\n{confusion_matrix}")

Accuracy: 0.7201017811704835
Precision: 0.7038834951456311
Recall: 0.7474226804123711
F1 Score: 0.7199495049374416
AUROC: 0.7204450085478942
