# ML Pipelines

In [3]:
from pathlib import Path
# Base location; the dataset and artifact paths in this notebook are built from it.
home = "dbfs:/mnt/data"

# Training dataset CSV.
path_train_data = home + "/data/Automobile-Loan-Default/Train_Dataset.csv"

In [4]:
from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType

# Explicit schema for the training CSV, written as ordered (column name, Spark type)
# pairs so each field can be read and diffed on its own line.
# NOTE(review): "Cleint_City_Rating" looks like a typo but is kept verbatim --
# presumably it mirrors the header of the source file; confirm before renaming.
_schema_columns = [
    ("ID", StringType()),
    ("Client_Income", DoubleType()),
    ("Car_Owned", DoubleType()),
    ("Bike_Owned", DoubleType()),
    ("Active_Loan", DoubleType()),
    ("House_Own", DoubleType()),
    ("Child_Count", DoubleType()),
    ("Credit_Amount", DoubleType()),
    ("Loan_Annuity", DoubleType()),
    ("Accompany_Client", StringType()),
    ("Client_Income_Type", StringType()),
    ("Client_Education", StringType()),
    ("Client_Marital_Status", StringType()),
    ("Client_Gender", StringType()),
    ("Loan_Contract_Type", StringType()),
    ("Client_Housing_Type", StringType()),
    ("Population_Region_Relative", DoubleType()),
    ("Age_Days", DoubleType()),
    ("Employed_Days", DoubleType()),
    ("Registration_Days", DoubleType()),
    ("ID_Days", DoubleType()),
    ("Own_House_Age", DoubleType()),
    ("Mobile_Tag", DoubleType()),
    ("Homephone_Tag", DoubleType()),
    ("Workphone_Working", DoubleType()),
    ("Client_Occupation", StringType()),
    ("Client_Family_Members", DoubleType()),
    ("Cleint_City_Rating", DoubleType()),
    ("Application_Process_Day", DoubleType()),
    ("Application_Process_Hour", DoubleType()),
    ("Client_Permanent_Match_Tag", StringType()),
    ("Client_Contact_Work_Tag", StringType()),
    ("Type_Organization", StringType()),
    ("Score_Source_1", DoubleType()),
    ("Score_Source_2", DoubleType()),
    ("Score_Source_3", DoubleType()),
    ("Social_Circle_Default", DoubleType()),
    ("Phone_Change", DoubleType()),
    ("Credit_Bureau", DoubleType()),
    ("Default", DoubleType()),
]

# Every field is declared nullable (third StructField argument is True).
my_schema = StructType(
    [StructField(col_name, col_type, True) for col_name, col_type in _schema_columns]
)

In [5]:
# Feature columns kept from the raw training data and passed to the preprocessing
# pipeline further down.
# NOTE(review): presumably this list must match the columns the saved pipeline was
# fitted on -- confirm before changing it.
selected_features = [
    "Client_Education",
    "Employed_Days",
    "Age_Days",
    "Client_Income_Type",
    "Client_Gender",
    "Car_Owned",
    "ID_Days",
    "Score_Source_2",
    "Phone_Change",
]

# Label column.
target_variable_name = "Default"

In [6]:
# Read the training CSV using the explicit schema, with the header option enabled.
df_train_data = (
    spark.read
    .schema(my_schema)
    .option("header", "true")
    .csv(path_train_data)
)

In [7]:
# Keep only the selected feature columns plus the label column.
cols_to_get = [*selected_features, target_variable_name]
df_train_data = df_train_data.select(cols_to_get)

# Length of the data

In [8]:
# Row count of the training DataFrame after column selection.
df_train_data.count()

121856

# Describe data

In [9]:
# Summary statistics per column, transposed so each column becomes one row.
(
    df_train_data
    .describe()
    .toPandas()
    .set_index("summary")
    .T
)

summary,count,mean,stddev,min,max
Client_Education,118194,,,Graduation,Secondary
Employed_Days,118190,67154.07061511127,138971.78295053402,0.0,365243.0
Age_Days,118239,16027.422948434947,4366.356503618858,7676.0,25201.0
Client_Income_Type,118139,,,Businessman,Unemployed
Client_Gender,119426,,,Female,XNA
Car_Owned,118258,0.3428774374672326,0.4746729459548691,0.0,1.0
ID_Days,115871,2987.471015180675,1511.8845759418805,0.0,7197.0
Score_Source_2,116154,0.5186100569859782,0.7402967506270233,5.0E-6,100.0
Phone_Change,118175,962.0740511952612,827.9477872022571,0.0,4185.0
Default,121839,0.0807951476949088,0.2725213778065368,0.0,1.0


## ML persistence: Loading Pipelines
- It is often worth saving a model or a pipeline to disk for later use.
- ML persistence works across Scala, Java and Python.


In [10]:
from pyspark.ml import PipelineModel

### Load data processing pipeline from disk

In [11]:
# Location of the previously saved preprocessing pipeline.
path_processing_pipeline = f"{home}/data/my_data_processing_pipeline"

# Load the fitted pipeline from disk.
model_process_data = PipelineModel.load(path_processing_pipeline)

In [12]:
# Apply the loaded preprocessing pipeline to the training data. Judging by the preview
# below, the result keeps the input columns and appends the pipeline's output columns.
df = model_process_data.transform(df_train_data)

In [13]:
# Preview a few transformed rows. Limiting in Spark first means only three rows are
# collected to the driver, instead of converting the whole DataFrame to pandas
# just to display its head.
df.limit(3).toPandas()

Unnamed: 0,Client_Education,Employed_Days,Age_Days,Client_Income_Type,Client_Gender,Car_Owned,ID_Days,Score_Source_2,Phone_Change,Default,Client_Education_index,Client_Income_Type_index,Client_Gender_index,Client_Education_index_OHE,Client_Income_Type_index_OHE,Client_Gender_index_OHE,features,features_scaled
0,Secondary,1062.0,13957.0,Commercial,Male,0.0,383.0,0.478787,63.0,0.0,0.0,1.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(1062.0, 13957.0, 0.0, 383.0, 0.47878667, 63.0...","[0.0029076532609796767, 0.35840228245363764, 0..."
1,Graduation,4129.0,14162.0,Service,Male,1.0,21.0,0.215068,962.074051,0.0,1.0,0.0,0.0,"(0.0, 1.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(4129.0, 14162.0, 1.0, 21.0, 0.215068341, 962....","[0.011304802556106483, 0.37009985734664763, 1...."
2,Graduation dropout,5102.0,16790.0,Service,Male,0.0,331.0,0.552795,277.0,0.0,2.0,0.0,0.0,"(0.0, 0.0, 1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)","(1.0, 0.0, 0.0)","(5102.0, 16790.0, 0.0, 331.0, 0.552794972, 277...","[0.013968782427041722, 0.5200570613409415, 0.0..."


In [16]:
# Persist the transformed data as Parquet, using overwrite mode for the target path.
df.write.parquet(f"{home}/data/10-processed-data.parquet", mode="overwrite")