In [0]:
# Load the manually uploaded HR data file into a Spark DataFrame.
file_location = "/FileStore/tables/HR_data.csv"
df = (
    spark.read.csv(file_location, header="True", inferSchema="True")
    # swap the space in the column name for an underscore
    .withColumnRenamed('salary scale', 'salary_scale')
)
display(df)

# Print the (column name, data type) pairs of the loaded frame.
print(df.dtypes)

# Register the frame as a temporary view so that some initial SQL queries
# can be run against it to better understand the data.
temp_table_name = "HR_data_csv"
df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql
-- Total record count
-- NOTE(review): in a multi-statement cell some notebook front ends render only the
-- last statement's result -- confirm this count is actually displayed.
SELECT COUNT(*) AS total_records
FROM HR_data_csv;

-- Count and share of people who left vs. stayed.
-- GROUP BY already yields one row per distinct value, so DISTINCT is not needed;
-- the column name is back-quoted because LEFT is also a SQL keyword.
SELECT `left`,
       COUNT(*) AS employees,
       COUNT(*) / SUM(COUNT(*)) OVER () AS share
FROM HR_data_csv
GROUP BY `left`;

In [0]:
# Data cleaning: drop every row that holds a null and print the row count on
# either side of the drop -- matching counts mean no row contained a null.
rows_before = df.count()
print(rows_before)

df = df.dropna()

rows_after = df.count()
print(rows_after)

In [0]:
# Randomly split the data into train/test sets with 70/30 weights, using a fixed seed.
splits = df.randomSplit([.70, .30], seed=42)
train_df, test_df = splits

# Print the size of the training set, then of the test set.
for split_df in (train_df, test_df):
    print(split_df.count())

In [0]:
# Categorical feature columns that need encoding before modelling.
cat_cols = ["department", "salary_scale"]

# One-hot encode the categorical columns and string-index the label.
from pyspark.ml.feature import StringIndexer, OneHotEncoder

# handleInvalid="keep" routes category values that were not present when the
# indexer was fitted (e.g. a value that only occurs in the test set or in a
# held-out cross-validation fold) to an extra index instead of raising an
# error during transform().
stringIndexer = StringIndexer(
    inputCols=cat_cols,
    outputCols=[x + "_index" for x in cat_cols],
    handleInvalid="keep",
)
encoder = OneHotEncoder(
    inputCols=stringIndexer.getOutputCols(),
    outputCols=[x + "_OHE" for x in cat_cols],
)

# Index the target column "left" into the numeric "label" column.
# NOTE(review): StringIndexer assigns indices by descending frequency by default,
# so which class becomes label 1.0 depends on the class balance -- confirm that
# this is the intended positive class for the ROC metric.
labelToIndex = StringIndexer(inputCol="left", outputCol="label")

In [0]:
# Fit the categorical indexer on the training data and preview the indexed output.
stringIndexerModel = stringIndexer.fit(train_df)
indexed_train_df = stringIndexerModel.transform(train_df)
display(indexed_train_df)

In [0]:
# Combine all model inputs into a single feature vector column.
num_cols = [
    "satisfaction_level",
    "last_evaluation",
    "number_project",
    "average_montly_hours",
    "time_spend_company",
    "Work_accident",
    "promotion_last_5years",
]

from pyspark.ml.feature import VectorAssembler

# one-hot encoded categorical columns first, followed by the numeric columns
assemblerInputs = [name + "_OHE" for name in cat_cols]
assemblerInputs.extend(num_cols)
vecAssembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

In [0]:
# Set up a random forest (classification) model.
from pyspark.ml.classification import RandomForestClassifier

# A random forest is trained with random sampling, so the seed is pinned to make
# the fitted model (and the metrics derived from it) repeatable across runs;
# 42 matches the seed already used for the train/test split.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, seed=42)

In [0]:
# Chain the feature-engineering steps and the classifier into one pipeline.
from pyspark.ml import Pipeline

pipeline_stages = [stringIndexer, encoder, labelToIndex, vecAssembler, rf]
pipeline = Pipeline(stages=pipeline_stages)

# Fit all stages on the training set ...
pipelineModel = pipeline.fit(train_df)

# ... and apply the fitted pipeline to the test set.
pred_df = pipelineModel.transform(test_df)

In [0]:
# Render the full prediction frame, then a text preview of the key columns.
display(pred_df)

preview_cols = ["prediction", "label", "features"]
pred_df.select(*preview_cols).show()

# RF is not compatible with the ROC curve option -> display(pipelineModel.stages[-1], pred_df.drop("prediction", "rawPrediction", "probability"), "ROC")

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Score the pipeline's test-set predictions: area under the ROC curve ...
bcEvaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
untuned_auc = bcEvaluator.evaluate(pred_df)
print(f"Area under ROC curve: {untuned_auc}")

# ... followed by accuracy.
mcEvaluator = MulticlassClassificationEvaluator(metricName="accuracy")
untuned_accuracy = mcEvaluator.evaluate(pred_df)
print(f"Accuracy: {untuned_accuracy}")

In [0]:
# Hyperparameter tuning:
# uncomment to initially install mlflow -> %pip install mlflow
#import mlflow
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Search grid: every combination of the listed numTrees and maxDepth values
# (4 x 4 = 16 candidate settings).
paramGrid = (ParamGridBuilder()
            .addGrid(rf.numTrees, [3, 9, 12, 21])
            .addGrid(rf.maxDepth, [3, 5, 7, 9])
            .build())

# Create a 5-fold CrossValidator over the whole pipeline, scored with bcEvaluator
# and evaluating up to 4 candidates in parallel. The seed pins the random
# assignment of rows to folds so that the selected model is repeatable across
# runs; 42 matches the seed already used for the train/test split.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=bcEvaluator,
    numFolds=5,
    parallelism=4,
    seed=42,
)

# run the cross-validation
cvModel = cv.fit(train_df)

In [0]:
# Score the test set with the model selected by cross-validation.
cv_pred_df = cvModel.transform(test_df)

# Report the same two metrics for these predictions.
tuned_auc = bcEvaluator.evaluate(cv_pred_df)
print(f"Area under ROC Curve: {tuned_auc}")
tuned_accuracy = mcEvaluator.evaluate(cv_pred_df)
print(f"Accuracy: {tuned_accuracy}")

In [0]:
# Pull out the best pipeline found by cross-validation, then take its final stage.
bestPipeline = cvModel.bestModel
fitted_stages = bestPipeline.stages
bestModel = fitted_stages[-1]

In [0]:
# set up and display feature importance
# pyplot is the plotting interface conventionally aliased as `plt`; aliasing the
# top-level `matplotlib` package instead would make plt.<plotting call> fail.
import matplotlib.pyplot as plt

# importance scores exposed by the final stage of the best pipeline
importance = bestModel.featureImportances

