# ML Pipelines

In [23]:
from pathlib import Path
# Root location under which this notebook reads its input data and saves the trained model.
home = "dbfs:/mnt/data"

In [24]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, LogisticRegression
from pyspark.sql.functions import count, when, isnan, col

In [25]:
# Read the processed dataset from parquet; the cells below fit and score the model on this DataFrame.
df = (
    spark.read
    .parquet(f"{home}/data/10-processed-data.parquet")
)

In [26]:
# Configure a logistic-regression estimator and fit it on the whole of `df`.
# NOTE(review): no hold-out split is made here, and the later cells score this same `df`,
# so what they show is in-sample behaviour only.
lr = LogisticRegression(
    featuresCol='features_scaled',
    labelCol='Default',
    maxIter=10,
    regParam=0.001,
)
model = lr.fit(df)

In [27]:
# Persist the fitted model, overwriting whatever is already saved at this path.
(
    model.write()
    .overwrite()
    .save(f"{home}/data/my_trained_model")
)

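## Predict and check the model predictions

The cells below score the same `df` that the model was fitted on, so they show in-sample behaviour only, not performance on unseen data.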

In [28]:
# Misclassified rows: the label and the predicted class disagree.
(
    model.transform(df)
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default<>prediction")
    .show()
)

+---------------+-------+----------+-----------+
|features_scaled|Default|prediction|probability|
+---------------+-------+----------+-----------+
+---------------+-------+----------+-----------+



In [29]:
# Correctly classified rows whose predicted class is 0.
(
    model.transform(df)
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default=prediction")
    .filter("prediction=0")
    .show()
)

+--------------------+-------+----------+--------------------+
|     features_scaled|Default|prediction|         probability|
+--------------------+-------+----------+--------------------+
|[0.00290765326097...|    0.0|       0.0|[0.99806031359132...|
|[0.01130480255610...|    0.0|       0.0|[0.99868282197925...|
|[0.01396878242704...|    0.0|       0.0|[0.99872008133509...|
|[1.0,0.8855349500...|    0.0|       0.0|[0.99862474744516...|
|[0.00815073800182...|    0.0|       0.0|[0.99875770186950...|
|[0.00324167745856...|    0.0|       0.0|[0.99857100253159...|
|[1.0,0.7787161198...|    0.0|       0.0|[0.99915618155069...|
|[1.0,0.8454778887...|    0.0|       0.0|[0.99924395257184...|
|[0.02159931880966...|    0.0|       0.0|[0.99912552801647...|
|[1.0,0.7321540656...|    0.0|       0.0|[0.99878601775829...|
|[0.00619587507495...|    0.0|       0.0|[0.99922808149635...|
|[0.00101028630254...|    0.0|       0.0|[0.99853795698309...|
|[0.00106504436772...|    0.0|       0.0|[0.99752165458

In [30]:
# Correctly classified rows whose predicted class is 1.
(
    model.transform(df)
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default=prediction")
    .filter("prediction=1")
    .show()
)

+--------------------+-------+----------+--------------------+
|     features_scaled|Default|prediction|         probability|
+--------------------+-------+----------+--------------------+
|[0.00625063314012...|    1.0|       1.0|[0.00815204667352...|
|[0.00217937099410...|    1.0|       1.0|[0.01066287882261...|
|[0.00352094359097...|    1.0|       1.0|[0.01114256195393...|
|[0.00623968152709...|    1.0|       1.0|[0.01229635907867...|
|[0.00389329843419...|    1.0|       1.0|[0.00576327745933...|
|[0.00200962099205...|    1.0|       1.0|[0.01161944756625...|
|[0.00759220573700...|    1.0|       1.0|[0.01144735834942...|
|[0.00418625408289...|    1.0|       1.0|[0.01423860564174...|
|[0.00177689921504...|    1.0|       1.0|[0.01240017659024...|
|[0.00180701615089...|    1.0|       1.0|[0.00787606498800...|
|[0.00321429842597...|    1.0|       1.0|[0.00892829800123...|
|[0.00171118953682...|    1.0|       1.0|[0.01053757757563...|
|[0.00319513310316...|    1.0|       1.0|[0.01850815473