In [3]:
from pathlib import Path
# Root storage location; the parquet path read in the next cell is built from it.
home = "dbfs:/mnt/data"

In [4]:
# Read the processed parquet file and keep only the two columns used below,
# renamed to `features` and `label`.
processed_path = f"{home}/data/10-processed-data.parquet"
df = (
    spark.read.parquet(processed_path)
    .selectExpr("features_scaled as features", "Default as label")
)

In [5]:
# Split the data 70/30 into train and test sets.
# A fixed seed keeps the split (and therefore every count and metric below)
# repeatable when the notebook is re-run on the same input.
train_df, test_df = df.randomSplit([0.7, 0.3], seed=42)

In [6]:
# Number of rows in the training split.
train_df.count()

85177

In [7]:
# Rows per label value in the training split (class balance check).
train_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|78308|
|  1.0| 6869|
+-----+-----+



In [8]:
# Number of rows in the test split.
test_df.count()

36679

In [9]:
# Rows per label value in the test split (class balance check).
test_df.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|33711|
|  1.0| 2968|
+-----+-----+



In [10]:
#import from spark's MLlib
from pyspark.ml.classification import LogisticRegression

In [11]:
# Fit a logistic regression on the training split; only the label column is
# set explicitly, every other parameter is left as constructed.
lr_estimator = LogisticRegression(labelCol='label')
log_reg = lr_estimator.fit(train_df)

In [12]:
# Training results for the logistic regression model

In [13]:
# Summary object attached to the fitted model; the next cells read its metrics.
lr_summary=log_reg.summary

In [14]:
# Accuracy reported by the model's own summary.
# NOTE(review): the recorded value is exactly 1.0, which is unusual for real
# data — worth confirming that `features` does not encode the label.
lr_summary.accuracy

1.0

In [16]:
# Area under the ROC curve reported by the model's own summary.
lr_summary.areaUnderROC

0.9999874176807151

In [19]:
# Score the test split with the fitted logistic regression. The output keeps
# `features` and `label` and adds rawPrediction, probability and prediction
# columns (see the table printed below).
predictions = log_reg.transform(test_df)
# Preview the first 10 scored rows.
predictions.show(10)

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.0,0.4209415121...|  1.0|[-22.468463147612...|[1.74610533016634...|       1.0|
|[3.01169358481887...|  0.0|[25.5857653612941...|[0.99999999999226...|       0.0|
|[3.28548391071149...|  0.0|[24.9791637713798...|[0.99999999998581...|       0.0|
|[3.55927423660412...|  0.0|[24.7164347670079...|[0.99999999998155...|       0.0|
|[4.92822586606724...|  0.0|[27.2654876449040...|[0.99999999999855...|       0.0|
|[5.74959684374512...|  0.0|[20.3307376367208...|[0.99999999851928...|       0.0|
|[5.74959684374512...|  0.0|[19.5768612630449...|[0.99999999685314...|       0.0|
|[6.57096782142299...|  0.0|[24.2154357288228...|[0.99999999996956...|       0.0|
|[7.11854847320824...|  0.0|[23.3388504226903...|[0.99999999992687...|       0.0|
|[8.487500102671

In [20]:
# NOTE(review): this result is never read — the next cell rebinds
# `model_predictions` to log_reg.evaluate(test_df). It also repeats the
# transform already stored in `predictions` above.
model_predictions = log_reg.transform(test_df)

In [21]:
# Evaluate the logistic regression on the test split.
# Despite the variable name, the cells below read metric attributes
# (accuracy, weightedPrecision, recallByLabel, ...) from this object rather
# than treating it as a predictions DataFrame.
model_predictions = log_reg.evaluate(test_df)

In [22]:
# Accuracy of the logistic regression on the test split.
model_predictions.accuracy

0.9999727364431964

In [23]:
# Weighted precision of the logistic regression on the test split.
model_predictions.weightedPrecision

0.999972737251916

In [24]:
# Recall per label on the test split.
# Presumably ordered by label value (0.0 first, then 1.0) — TODO confirm.
model_predictions.recallByLabel

[1.0, 0.9996630727762803]

In [25]:
# Precision per label on the test split. Shown as the cell's value, like the
# other metric cells in this notebook, instead of being routed through print().
# Presumably ordered by label value (0.0 first, then 1.0) — TODO confirm.
model_predictions.precisionByLabel

[0.9999703369719981, 1.0]


In [26]:
# Area under the ROC curve of the logistic regression on the test split.
model_predictions.areaUnderROC

0.9996129599614801

In [27]:
from pyspark.ml.classification import RandomForestClassifier
# Baseline random forest with only the label column and a seed set.
# The fixed seed keeps the fitted forest the same across re-runs on the same
# training data; without it each run can produce a different model.
rf = RandomForestClassifier(labelCol="label", seed=42)
rf_model = rf.fit(train_df)

In [28]:
# Score the test split with the baseline random forest.
# NOTE(review): no later cell reads this value before `model_predictions` is
# reassigned from best_rf_model.transform(test_df).
model_predictions = rf_model.transform(test_df)

In [29]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Metric used to rank parameter combinations. The label column and metric are
# spelled out (they match the evaluator's documented defaults) so the reader
# does not have to look them up.
evaluator = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC")

# Seed the forest so repeated runs train the same trees for a given fold.
rf = RandomForestClassifier(labelCol="label", seed=42)

# 3 x 2 x 2 = 12 parameter combinations.
# Wider candidate lists previously kept here as commented-out lines:
#   maxDepth [5, 10, 20, 25, 30], maxBins [20, 30, 40], numTrees [5, 20, 50]
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [5, 10, 20])
             .addGrid(rf.maxBins, [20, 30])
             .addGrid(rf.numTrees, [5, 20])
             .build())

# 5-fold cross-validation over the grid; the seed fixes the fold assignment so
# the selected model does not change between runs on the same data.
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=5, seed=42)
cv_model = cv.fit(train_df)

In [30]:
# Model selected by the cross-validator.
best_rf_model = cv_model.bestModel

In [31]:
# Generate predictions for the held-out test split (not the entire dataset)
# using the best random forest found by cross-validation.
model_predictions = best_rf_model.transform(test_df)

In [32]:
# Count true positives, actual positives and predicted positives in a single
# aggregation, so the scored test split is scanned once instead of three times.
from pyspark.sql import functions as F

is_actual_pos = F.col('label') == 1
is_pred_pos = F.col('prediction') == 1

# when() without otherwise() yields null for non-matching rows and count()
# skips nulls, so each count() tallies exactly the rows matching its condition.
# An aggregation without groupBy always returns exactly one row.
pos_counts = model_predictions.agg(
    F.count(F.when(is_actual_pos & is_pred_pos, 1)).alias('true_pos'),
    F.count(F.when(is_actual_pos, 1)).alias('actual_pos'),
    F.count(F.when(is_pred_pos, 1)).alias('pred_pos'),
).first()

true_pos = pos_counts['true_pos']
actual_pos = pos_counts['actual_pos']
pred_pos = pos_counts['pred_pos']

In [33]:
# Recall on the test data: true positives / actual positives.
# Evaluates to NaN instead of raising ZeroDivisionError when the test split
# contains no positive rows.
true_pos / actual_pos if actual_pos else float('nan')

0.9976415094339622

In [34]:
# Precision on the test data: true positives / predicted positives.
# Evaluates to NaN instead of raising ZeroDivisionError when the model
# predicts no positives at all.
true_pos / pred_pos if pred_pos else float('nan')

1.0

In [35]:
# Score the test split with the cross-validated model, then reduce those
# predictions to the evaluator's single metric value.
cv_test_predictions = cv_model.transform(test_df)
evaluator.evaluate(cv_test_predictions)

1.0