# Water Potability Model Exploration

In [1]:
import sys
sys.path.append('/home/jovyan/work')

import mlflow
import mlflow.spark
import mlflow.mleap

import mleap.pyspark

from pyspark.sql import SparkSession,DataFrame
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

from pipeline import feature_pipeline_builder

import os

In [2]:
spark = SparkSession.builder  \
    .config('spark.jars.packages', 'ml.combust.mleap:mleap-spark-base_2.11:0.17.0,ml.combust.mleap:mleap-spark_2.11:0.17.0') \
    .config('spark.sql.execution.arrow.pyspark.enabled', True) \
    .appName('ClassifierTraining')  \
    .getOrCreate()



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
ml.combust.mleap#mleap-spark-base_2.11 added as a dependency
ml.combust.mleap#mleap-spark_2.11 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-df6a793b-bff4-4bd9-92cb-7fa329182018;1.0
	confs: [default]
	found ml.combust.mleap#mleap-spark-base_2.11;0.17.0 in central
	found ml.combust.mleap#mleap-runtime_2.11;0.17.0 in central
	found ml.combust.mleap#mleap-core_2.11;0.17.0 in central
	found ml.combust.mleap#mleap-base_2.11;0.17.0 in central
	found ml.combust.mleap#mleap-tensor_2.11;0.17.0 in central
	found io.spray#spray-json_2.11;1.3.2 in central
	found com.github.rwl#jtransforms;2.4.0 in central
	found ml.combust.bundle#bundle-ml_2.11;0.17.0 in central
	found com.google.protobuf#protobuf-java;3.5.1 in central
	found com.thesamet.scalapb#scalapb-runtime_2.11;0.7.1 in central
	found com.thesamet.scalapb#lenses_2.11;0.7.0-test2 in central


## Load the Training Data

In [3]:
file_path = '../data/water_potability_train.csv'
df_train = spark.read.csv(file_path,inferSchema=True, header=True)
df_train = df_train.drop('_c0')
df_train.show(5)

+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|  ph|          Hardness|            Solids|       Chloramines|           Sulfate|      Conductivity|    Organic_carbon|  Trihalomethanes|         Turbidity|Potability|
+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|null|  98.3679148956603| 28415.57583214058|10.558949998467961|  296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292| 4.119087300328971|         1|
|null|103.46475866009455| 27420.16742458204| 8.417305032089528|              null|485.97450045781375|11.351132730708514| 67.8699636759021| 4.620793451653219|         0|
|null|108.91662923953173|14476.335695268315| 5.398162017711099|  281.198274407849| 512.2323064106689|15.013793389990155| 86.6714587149138| 3.89557206226812

## Load the Test Data

In [4]:
file_path = '../data/water_potability_test.csv'
df_test = spark.read.csv(file_path,inferSchema=True, header=True)
df_test = df_test.drop('_c0')
df_test.show(5)

+----+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|  ph|          Hardness|            Solids|      Chloramines|           Sulfate|      Conductivity|    Organic_carbon|   Trihalomethanes|         Turbidity|Potability|
+----+------------------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+----------+
|null|105.85926357195498| 37928.14217716675|5.609440345508508|              null|358.88876761151056|12.207108489369546| 71.11989017420973| 3.873853349593973|         0|
|null|115.39297941167533| 46077.35848526223|5.289306681961538| 437.5922998268262| 422.0173564256122|10.809631953564008| 53.61703537004023| 4.212510849647721|         0|
|null|118.98857909025189|14285.583854224515|7.804173553073094|  268.646940746221| 389.3755658712614| 12.70604896865791|53.928845767512236|3.595017180957615

### Set Up MLFlow Experiment

## Create the Pipeline

In [5]:
degree = 3
prediction_col = 'Potability'
feature_cols = ['ph','Hardness','Solids','Chloramines','Sulfate','Conductivity',
               'Organic_carbon', 'Trihalomethanes', 'Turbidity']

assembler_out_col = 'feature_vector'
scaler_out_col = 'scaled_vector'
expander_out_col = 'features'

In [6]:
feature_pipeline = feature_pipeline_builder.create_feature_pipeline(feature_cols,
                                                    assembler_out_col,
                                                    scaler_out_col,
                                                    expander_out_col, 
                                                    degree)
feature_model = feature_pipeline.fit(df_train)

train_with_features = feature_model.transform(df_train)

train_with_features.show(5)

+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+--------------------+--------------------+--------------------+
|               ph|          Hardness|            Solids|       Chloramines|           Sulfate|      Conductivity|    Organic_carbon|  Trihalomethanes|         Turbidity|Potability|      feature_vector|       scaled_vector|            features|
+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+--------------------+--------------------+--------------------+
|7.065394544064872|  98.3679148956603| 28415.57583214058|10.558949998467961|  296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292| 4.119087300328971|         1|[7.06539454406487...|[-1.0191119363888...|[-1.0191119363888...|
|7.065394544064872|1

In [7]:
train_expanded_features = train_with_features.select(prediction_col,expander_out_col)
train_persisted = train_expanded_features.persist()

## Model Experimentation with MLFlow

In [8]:
experiment = 'water_potability'
mlflow.set_tracking_uri(os.environ['MLFLOW_TRACKING_URI'])
mlflow.set_experiment(experiment)

In [9]:
#with mlflow.start_run() as run:
#    pass

In [10]:
lr = LogisticRegression(featuresCol=expander_out_col,labelCol=prediction_col)

In [11]:
fit_model = lr.fit(train_persisted)

21/07/15 14:26:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/07/15 14:26:44 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [12]:
test_with_features = feature_model.transform(df_test)
test_expanded_features = test_with_features.select(prediction_col, expander_out_col)
test_persisted = test_expanded_features.persist()

In [13]:
results = fit_model.transform(test_persisted)

In [14]:
my_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol=prediction_col)
results.select(prediction_col,'prediction')
AUC = my_eval.evaluate(results)
print("AUC score is : ",AUC)

AUC score is :  0.613414574701438
