# ML Pipelines

In [65]:
from pathlib import Path

# Root of the mounted storage; the dataset and the saved models are addressed
# relative to it.
home = "dbfs:/mnt/data"

# CSV that is loaded and scored in this notebook.
# NOTE(review): the variable says "test data" but the file is
# Train_Dataset.csv — confirm that scoring the training file is intended.
path_test_data = home + "/data/Automobile-Loan-Default/Train_Dataset.csv"

In [66]:
from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType

# Explicit schema for the loan CSV, written as (column name, Spark type) pairs
# in the order the fields are declared. Every field is nullable.
# "Cleint_City_Rating" is reproduced exactly as it was originally declared.
# NOTE(review): it looks like a misspelling of "Client" — confirm against the
# CSV header and downstream users before renaming.
schema_columns = [
    ("ID", StringType),
    ("Client_Income", DoubleType),
    ("Car_Owned", DoubleType),
    ("Bike_Owned", DoubleType),
    ("Active_Loan", DoubleType),
    ("House_Own", DoubleType),
    ("Child_Count", DoubleType),
    ("Credit_Amount", DoubleType),
    ("Loan_Annuity", DoubleType),
    ("Accompany_Client", StringType),
    ("Client_Income_Type", StringType),
    ("Client_Education", StringType),
    ("Client_Marital_Status", StringType),
    ("Client_Gender", StringType),
    ("Loan_Contract_Type", StringType),
    ("Client_Housing_Type", StringType),
    ("Population_Region_Relative", DoubleType),
    ("Age_Days", DoubleType),
    ("Employed_Days", DoubleType),
    ("Registration_Days", DoubleType),
    ("ID_Days", DoubleType),
    ("Own_House_Age", DoubleType),
    ("Mobile_Tag", DoubleType),
    ("Homephone_Tag", DoubleType),
    ("Workphone_Working", DoubleType),
    ("Client_Occupation", StringType),
    ("Client_Family_Members", DoubleType),
    ("Cleint_City_Rating", DoubleType),
    ("Application_Process_Day", DoubleType),
    ("Application_Process_Hour", DoubleType),
    ("Client_Permanent_Match_Tag", StringType),
    ("Client_Contact_Work_Tag", StringType),
    ("Type_Organization", StringType),
    ("Score_Source_1", DoubleType),
    ("Score_Source_2", DoubleType),
    ("Score_Source_3", DoubleType),
    ("Social_Circle_Default", DoubleType),
    ("Phone_Change", DoubleType),
    ("Credit_Bureau", DoubleType),
    ("Default", DoubleType),
]

# Each field gets its own freshly constructed type instance.
my_schema = StructType(
    [StructField(column_name, spark_type(), True) for column_name, spark_type in schema_columns]
)

In [67]:
# Columns kept from the raw CSV: the model inputs plus the "Default" label.
# An earlier, broader column list used to be assigned here and was overwritten
# on the very next line; only the list that actually takes effect is kept.
# NOTE(review): presumably this must match the columns the saved preprocessing
# pipeline was fitted on — confirm before changing it.
colums_to_load = [
    'Client_Education',
    'Employed_Days',
    'Age_Days',
    'Client_Income_Type',
    'Client_Gender',
    'Car_Owned',
    'ID_Days',
    'Score_Source_2',
    'Phone_Change',
    'Default',
]

In [68]:
# Read the CSV at path_test_data with the explicit schema (no type inference);
# the "header" option is set so the first line is treated as a header row.
df_test_data = (
    spark.read
    .schema(my_schema)
    .option("header", "true")
    .csv(path_test_data)
)

In [69]:
# Narrow the frame to the columns named in colums_to_load.
df_test_data = df_test_data.select(colums_to_load)

In [70]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegressionModel, LogisticRegression
from pyspark.sql.functions import count, when, isnan, col

### Load data processing pipeline from disk

In [71]:
# Restore the saved preprocessing PipelineModel from storage.
path_processing_pipeline = f"{home}/data/my_data_processing_pipeline"
model_process_data = PipelineModel.load(path_processing_pipeline)

In [72]:
# Run a 100-row slice of the loaded data through the preprocessing pipeline.
# The input is built directly from df_test_data so that re-running this cell
# never feeds an already-transformed frame back into the pipeline (Spark ML
# stages reject inputs that already contain their output columns).
df = model_process_data.transform(df_test_data.limit(100))

In [74]:
# Convert the transformed frame to pandas and display its first three rows,
# to inspect the columns the preprocessing pipeline added.
df.toPandas().head(3)

Unnamed: 0,Client_Education,Employed_Days,Age_Days,Client_Income_Type,Client_Gender,Car_Owned,ID_Days,Score_Source_2,Phone_Change,Default,Client_Education_index,Client_Income_Type_index,Client_Gender_index,Client_Education_index_OHE,Client_Income_Type_index_OHE,Client_Gender_index_OHE,features,features_scaled
0,Secondary,1062.0,13957.0,Commercial,Male,0.0,383.0,0.478787,63.0,0.0,0.0,1.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(1062.0, 13957.0, 0.0, 383.0, 0.47878667, 63.0...","[0.0029076532609796767, 0.35840228245363764, 0..."
1,Graduation,4129.0,14162.0,Service,Male,1.0,21.0,0.215068,962.074051,0.0,1.0,0.0,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(4129.0, 14162.0, 1.0, 21.0, 0.215068341, 962....","[0.011304802556106483, 0.37009985734664763, 1...."
2,Graduation dropout,5102.0,16790.0,Service,Male,0.0,331.0,0.552795,277.0,0.0,2.0,0.0,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(5102.0, 16790.0, 0.0, 331.0, 0.552794972, 277...","[0.013968782427041722, 0.5200570613409415, 0.0..."


# Load the trained model

In [75]:
# Restore the saved logistic-regression model from storage.
path_trained_model = f"{home}/data/my_trained_model"
model = LogisticRegressionModel.load(path_trained_model)

## Now we will test the model using sample data

In [76]:
# Get sample data for testing and run it through the preprocessing pipeline.
# Roughly 90% of the rows are drawn without replacement. The seed is fixed so
# the sample is repeatable from run to run, and the input is built directly
# from df_test_data so re-running this cell never passes an already-transformed
# frame back into the pipeline.
df_test = model_process_data.transform(
    df_test_data.sample(withReplacement=False, fraction=0.9, seed=42)
)

In [78]:
# Score the preprocessed sample with the loaded model; the result is queried
# below through its "prediction" and "probability" columns.
df_test_predicted = model.transform(df_test)

In [79]:
# Correctly classified rows where the predicted label is 1
# (prediction equals Default, and prediction is 1).
(
    df_test_predicted
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default=prediction")
    .filter("prediction=1")
    .show()
)

+--------------------+-------+----------+--------------------+
|     features_scaled|Default|prediction|         probability|
+--------------------+-------+----------+--------------------+
|[0.00625063314012...|    1.0|       1.0|[0.00815204667352...|
|[0.00217937099410...|    1.0|       1.0|[0.01066287882261...|
|[0.00352094359097...|    1.0|       1.0|[0.01114256195393...|
|[0.00623968152709...|    1.0|       1.0|[0.01229635907867...|
|[0.00389329843419...|    1.0|       1.0|[0.00576327745933...|
|[0.00200962099205...|    1.0|       1.0|[0.01161944756625...|
|[0.00759220573700...|    1.0|       1.0|[0.01144735834942...|
|[0.00418625408289...|    1.0|       1.0|[0.01423860564174...|
|[0.00180701615089...|    1.0|       1.0|[0.00787606498800...|
|[0.00321429842597...|    1.0|       1.0|[0.00892829800123...|
|[0.00171118953682...|    1.0|       1.0|[0.01053757757563...|
|[0.00319513310316...|    1.0|       1.0|[0.01850815473702...|
|[0.00292408068053...|    1.0|       1.0|[0.00707278282

In [80]:
# Correctly classified rows where the predicted label is 0
# (prediction equals Default, and prediction is 0).
(
    df_test_predicted
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default=prediction")
    .filter("prediction=0")
    .show()
)

+--------------------+-------+----------+--------------------+
|     features_scaled|Default|prediction|         probability|
+--------------------+-------+----------+--------------------+
|[0.01130480255610...|    0.0|       0.0|[0.99868282197925...|
|[0.01396878242704...|    0.0|       0.0|[0.99872008133509...|
|[0.00815073800182...|    0.0|       0.0|[0.99875770186950...|
|[0.00324167745856...|    0.0|       0.0|[0.99857100253159...|
|[1.0,0.7787161198...|    0.0|       0.0|[0.99915618155069...|
|[1.0,0.8454778887...|    0.0|       0.0|[0.99924395257184...|
|[0.02159931880966...|    0.0|       0.0|[0.99912552801647...|
|[1.0,0.7321540656...|    0.0|       0.0|[0.99878601775829...|
|[0.00619587507495...|    0.0|       0.0|[0.99922808149635...|
|[0.00101028630254...|    0.0|       0.0|[0.99853795698309...|
|[0.00106504436772...|    0.0|       0.0|[0.99752165458072...|
|[1.0,0.7828815977...|    0.0|       0.0|[0.99918969729459...|
|[0.00102123791557...|    0.0|       0.0|[0.99934847075

(
    df_test_predicted
    # Misclassified rows where the model predicted 0 but Default differs.
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default<>prediction")
    .filter("prediction=0")
    .show()
)

In [81]:
# Misclassified rows where the model predicted 1 but Default differs.
(
    df_test_predicted
    .select("features_scaled", "Default", "prediction", "probability")
    .filter("Default<>prediction")
    .filter("prediction=1")
    .show()
)

+---------------+-------+----------+-----------+
|features_scaled|Default|prediction|probability|
+---------------+-------+----------+-----------+
+---------------+-------+----------+-----------+



In [94]:
# Keep only the scaled feature vector and the label column for evaluation.
df_test_evaluate = df_test.select("features_scaled", "Default")

In [97]:
# Evaluate the loaded model on df_test_evaluate and keep the resulting summary.
# NOTE(review): despite the name, this holds the evaluation result for
# df_test_evaluate, not training output; the name is unchanged because the
# metric cells that follow read from it.
training_predictions = model.evaluate(df_test_evaluate)

In [112]:
# Accuracy of the model on df_test_evaluate.
# NOTE(review): the scored file is Train_Dataset.csv (see path_test_data), so a
# perfect score may reflect evaluating on data the model was trained on, or a
# label leaking into the features — confirm before relying on this number.
training_predictions.accuracy

1.0

In [114]:
# Area under the ROC curve for the same evaluation run.
training_predictions.areaUnderROC

0.9999501909778348

In [116]:
# False positive rate, one entry per label
# (presumably ordered by label value, 0.0 then 1.0 — confirm).
training_predictions.falsePositiveRateByLabel

[0.0, 0.0]

In [123]:
# Print a preview of the scored rows held by the evaluation summary.
training_predictions.predictions.show()

+--------------------+-------+--------------------+--------------------+----------+
|     features_scaled|Default|       rawPrediction|         probability|prediction|
+--------------------+-------+--------------------+--------------------+----------+
|[0.01130480255610...|    0.0|[6.63094564768314...|[0.99868282197925...|       0.0|
|[0.01396878242704...|    0.0|[6.65967800765018...|[0.99872008133509...|       0.0|
|[0.00815073800182...|    0.0|[6.68954921319903...|[0.99875770186950...|       0.0|
|[0.00324167745856...|    0.0|[6.54935213215870...|[0.99857100253159...|       0.0|
|[1.0,0.7787161198...|    0.0|[7.07672901929910...|[0.99915618155069...|       0.0|
|[1.0,0.8454778887...|    0.0|[7.18665011471914...|[0.99924395257184...|       0.0|
|[0.02159931880966...|    0.0|[7.04101544659649...|[0.99912552801647...|       0.0|
|[1.0,0.7321540656...|    0.0|[6.71263449465367...|[0.99878601775829...|       0.0|
|[0.00619587507495...|    0.0|[7.16585936215876...|[0.99922808149635...|    

In [125]:
# Recall, one entry per label
# (presumably ordered by label value, 0.0 then 1.0 — confirm).
training_predictions.recallByLabel

[1.0, 1.0]

In [128]:
# True positive rate, one entry per label; this is the same quantity as recall.
training_predictions.truePositiveRateByLabel

[1.0, 1.0]

In [None]:
# Shut down the Spark session now that the notebook is finished; cells above
# cannot be re-run after this without a new session.
spark.stop()