In [41]:
from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Read the CSV (first row is the header, column types are inferred) and
# rename the response column to "label" in the same expression.
infile = 'default.csv'
df = (
    spark.read
    .csv(path=infile, header=True, inferSchema=True)
    .withColumnRenamed("default payment next month", "label")
)

In [42]:
# Balance the data
# Split data into train set (70%) and test set (30%) using seed=314.
training, test = df.randomSplit([0.7, 0.3], seed=314)

# Balance the classes by oversampling label == 1. This is done on the train
# set only; the test set is left untouched.
from pyspark.sql.functions import col, explode, array, lit

major_train = training.filter(col('label') == 0)
minor_train = training.filter(col('label') == 1)

major_count = major_train.count()
minor_count = minor_train.count()
if minor_count == 0:
    # Fail with a clear message instead of a bare ZeroDivisionError.
    raise ValueError("training split has no rows with label == 1; cannot oversample")

# Whole-number replication factor for the label == 1 rows. Clamp to at least 1:
# with a factor of 0 the explode() below would run over an empty array and
# silently drop every label == 1 row from the training data.
ratio = max(1, major_count // minor_count)
ratio_range = range(ratio)

# explode() over an array of `ratio` literals emits each label == 1 row
# `ratio` times; the helper column is dropped straight afterwards.
oversampled_train = (
    minor_train
    .withColumn("dummy", explode(array([lit(x) for x in ratio_range])))
    .drop('dummy')
)

# NOTE(review): final_train contains exact duplicates of label == 1 rows. If it
# is later used for k-fold cross validation, copies of the same row can land in
# both the training and validation folds, which makes fold scores optimistic.
final_train = major_train.union(oversampled_train)

# Decision Tree

In [43]:
# Decision Tree pipeline: assemble the predictors into one vector column,
# then fit a decision tree classifier on it.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

# The 23 predictor columns, in the order they are packed into "features".
feature_cols = (
    ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE']
    + ['PAY_{}'.format(i) for i in (0, 2, 3, 4, 5, 6)]   # there is no PAY_1 in the list
    + ['BILL_AMT{}'.format(i) for i in range(1, 7)]
    + ['PAY_AMT{}'.format(i) for i in range(1, 7)]
)

assembler_dt = VectorAssembler(inputCols=feature_cols, outputCol="features")
dt = DecisionTreeClassifier(featuresCol="features", labelCol="label")

# Stage order matters: the assembler must produce "features" before the tree reads it.
pipeline_dt = Pipeline(stages=[assembler_dt, dt])

In [53]:
# Model selection: grid search over tree depth and bin count with k-fold CV.

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 3 x 3 = 9 parameter combinations for the decision tree stage.
paramGrid_dt = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [2, 5, 10])
    .addGrid(dt.maxBins, [5, 10, 20])
    .build()
)

# The whole pipeline is the estimator, so the assembler runs inside every fold.
# 10 folds, seed 314; the evaluator is constructed with its default settings.
crossVal_dt = CrossValidator(
    estimator=pipeline_dt,
    estimatorParamMaps=paramGrid_dt,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=10,
    seed=314,
)

In [65]:
# Run the cross-validated grid search on the oversampled training set.
cv_model_dt = crossVal_dt.fit(dataset=final_train)

# Score the held-out test set (which was not oversampled) with the fitted model.
prediction_dt = cv_model_dt.transform(dataset=test)

# Model Evaluation for Decision Tree

In [66]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.mllib.evaluation import MulticlassMetrics

In [98]:
# Confusion matrix on the test-set predictions.
def _to_prediction_label_pair(row):
    """Convert one scored row into a (prediction, label) tuple of floats."""
    return (float(row.prediction), float(row.label))

# Despite its name, label_pred_dt holds pairs ordered (prediction, label).
label_pred_dt = (
    prediction_dt
    .select("label", "prediction")
    .rdd
    .map(_to_prediction_label_pair)
)
matrix_dt = MulticlassMetrics(label_pred_dt)

# Convert the matrix to a NumPy array for printing.
confusionMatrix = matrix_dt.confusionMatrix().toArray()
print("Confusion Matrix:\n{}".format(confusionMatrix))

Confusion Matrix:
[[5757. 1322.]
 [ 800. 1196.]]


### Layout of the matrix above (rows = actual label, columns = predicted label):
### [TN, FP]
### [FN, TP]

In [100]:
# accuracy, precision, recall, and F1-score
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0

# Count all four confusion-matrix cells in a single pass over prediction_dt.
# Separate filter().count() calls would each launch their own Spark job and
# re-evaluate prediction_dt (unless it has been cached).
cells_dt = prediction_dt.selectExpr(
    "count(CASE WHEN prediction = 1 AND label = prediction THEN 1 END) AS TP",
    "count(CASE WHEN prediction = 0 AND label = prediction THEN 1 END) AS TN",
    "count(CASE WHEN prediction = 1 AND label <> prediction THEN 1 END) AS FP",
    "count(CASE WHEN prediction = 0 AND label <> prediction THEN 1 END) AS FN"
).first()

TP_dt = cells_dt['TP']
TN_dt = cells_dt['TN']
FP_dt = cells_dt['FP']
FN_dt = cells_dt['FN']

# Each ratio falls back to 0.0 instead of raising ZeroDivisionError, e.g. when
# the model predicts no positives (TP + FP == 0) or the test set is empty.
accuracy_dt = _safe_ratio(TN_dt + TP_dt, TN_dt + TP_dt + FN_dt + FP_dt)
precision_dt = _safe_ratio(TP_dt, TP_dt + FP_dt)
recall_dt = _safe_ratio(TP_dt, TP_dt + FN_dt)
F1_dt = 2 * _safe_ratio(precision_dt * recall_dt, precision_dt + recall_dt)

print("accuracy: %.4F" % accuracy_dt)
print("precision: %.4F" % precision_dt)
print("recall: %.4F" % recall_dt)
print("F1 score: %.4F" % F1_dt)

accuracy: 0.7662
precision: 0.4750
recall: 0.5992
F1 score: 0.5299


In [103]:
# Area under the ROC curve on the test-set predictions.
eval_dt = BinaryClassificationEvaluator()

# Request the metric through a per-call param override rather than on the evaluator itself.
roc_override = {eval_dt.metricName: "areaUnderROC"}
auc_dt = eval_dt.evaluate(prediction_dt, params=roc_override)

print("Area Under ROC: %.4F" % auc_dt)

Area Under ROC: 0.6801
