# Water Potability Model Exploration

In [1]:
import sys
sys.path.append('/home/jovyan/work')

import mlflow
import mlflow.spark
from pyspark.sql import SparkSession, Window, DataFrame
from utilities.minio_utils import  log_artifacts_minio
from pipeline import feature_pipeline
import os

In [2]:
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'ml.combust.mleap:mleap-spark-base_2.11:0.16.0,ml.combust.mleap:mleap-spark_2.11:0.16.0')
    .config('spark.sql.execution.arrow.pyspark.enabled', True)
    .getOrCreate()
)
spark.sparkContext.setLogLevel('ERROR')



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
ml.combust.mleap#mleap-spark-base_2.11 added as a dependency
ml.combust.mleap#mleap-spark_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1c246194-c63f-420e-b863-c7e4d484c6a3;1.0
	confs: [default]
	found ml.combust.mleap#mleap-spark-base_2.11;0.16.0 in central
	found ml.combust.mleap#mleap-runtime_2.11;0.16.0 in central
	found ml.combust.mleap#mleap-core_2.11;0.16.0 in central
	found ml.combust.mleap#mleap-base_2.11;0.16.0 in central
	found ml.combust.mleap#mleap-tensor_2.11;0.16.0 in central
	found io.spray#spray-json_2.11;1.3.2 in central
	found com.github.rwl#jtransforms;2.4.0 in central
	found ml.combust.bundle#bundle-ml_2.11;0.16.0 in central
	found com.google.protobuf#protobuf-java;3.5.1 in central
	found com.thesamet.scalapb#scalapb-runtime_2.11;0.7.1 in central
	found com.thesamet.scalapb#lenses_2.11;0.7.0-test2 in central


## Load the Training Data

In [3]:
file_path = '../data/water_potability_train.csv'
df = spark.read.csv(file_path,inferSchema=True, header=True)
df.show(5)

+---+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|_c0|  ph|          Hardness|            Solids|       Chloramines|           Sulfate|      Conductivity|    Organic_carbon|  Trihalomethanes|         Turbidity|Potability|
+---+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|  0|null|  98.3679148956603| 28415.57583214058|10.558949998467961|  296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292| 4.119087300328971|         1|
|  1|null|103.46475866009455| 27420.16742458204| 8.417305032089528|              null|485.97450045781375|11.351132730708514| 67.8699636759021| 4.620793451653219|         0|
|  2|null|108.91662923953173|14476.335695268315| 5.398162017711099|  281.198274407849| 512.2323064106689|15.013793389990155| 86.6714587

## Model Experimentation with MLFlow

In [4]:
experiment = 'water_potability'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
mlflow.set_experiment(experiment)

In [5]:
degree = 3
feature_cols = ['ph','Hardness','Solids','Chloramines','Sulfate','Conductivity',
               'Organic_carbon', 'Trihalomethanes', 'Turbidity']
assembler_out_col = 'Features'
scaler_out_col = 'ScaledFeatures'
expander_out_col = 'ExpandedFeatures'

In [6]:
with mlflow.start_run() as run:
    mlflow.log_param("degree", degree)
    pipeline = feature_pipeline.create_feature_pipeline(df, feature_cols, assembler_out_col, 
                                                    scaler_out_col, expander_out_col, degree)
    # Save the model pipeline
    mlflow.spark.save_model(pipeline,
                            '../models/pipeline_model',
                            sample_input=df.select(
                                'ph','Hardness','Solids','Chloramines','Sulfate',
                                'Conductivity', 'Organic_carbon', 'Trihalomethanes',
                                'Turbidity'
                            )
    )
    #log_artifacts_minio(run, '../models/pipeline_model', 'pipeline_mode', True)
    run_id = run.info.run_id
    print(run.info)

                                                                                

ModuleNotFoundError: No module named 'mleap'