# Water Potability Model Exploration

In [1]:
# Add the notebook work directory to the module search path. This has to run
# before the `from pipeline import ...` line below; presumably that package
# lives under this directory -- TODO confirm.
# NOTE(review): the absolute path ties the notebook to one container layout.
import sys
sys.path.append('/home/jovyan/work')

import mlflow
import mlflow.spark
import mlflow.mleap

from pyspark.sql import SparkSession,DataFrame
from pipeline import feature_pipeline_builder
import os

In [2]:
# Attach to the running Spark session if one exists, otherwise start a new one
# under the 'ClassifierTraining' application name.
spark = (
    SparkSession.builder
    .appName('ClassifierTraining')
    .getOrCreate()
)

# Keep the cell outputs readable: only ERROR-level Spark messages from here on.
spark.sparkContext.setLogLevel('ERROR')

21/07/14 17:41:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Load the Training Data

In [3]:
file_path = '../data/water_potability_train.csv'

# Read the training CSV with a header row and inferred column types, then drop
# the '_c0' column (presumably an unnamed index column left over from the CSV
# export -- TODO confirm). Dropping is a no-op if the column is absent.
df_train = (
    spark.read
    .csv(file_path, inferSchema=True, header=True)
    .drop('_c0')
)

# Preview the first rows.
df_train.show(5)

+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|  ph|          Hardness|            Solids|       Chloramines|           Sulfate|      Conductivity|    Organic_carbon|  Trihalomethanes|         Turbidity|Potability|
+----+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+
|null|  98.3679148956603| 28415.57583214058|10.558949998467961|  296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292| 4.119087300328971|         1|
|null|103.46475866009455| 27420.16742458204| 8.417305032089528|              null|485.97450045781375|11.351132730708514| 67.8699636759021| 4.620793451653219|         0|
|null|108.91662923953173|14476.335695268315| 5.398162017711099|  281.198274407849| 512.2323064106689|15.013793389990155| 86.6714587149138| 3.89557206226812

### Set Up MLflow Experiment

In [4]:
experiment = 'water_potability'

# The tracking server address comes from the environment; a missing
# MLFLOW_TRACKING_URI raises KeyError here rather than failing later.
tracking_uri = os.environ['MLFLOW_TRACKING_URI']
mlflow.set_tracking_uri(tracking_uri)

# Make `experiment` the active experiment for subsequent runs.
mlflow.set_experiment(experiment)

## Create the Pipeline

In [5]:
# Degree passed to the feature pipeline builder.
degree = 3

# Name of the 'Potability' column in the training data.
prediction_col = 'Potability'

# Input columns handed to the feature pipeline, in this order.
feature_cols = [
    'ph',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]

# Output column names for the successive pipeline stages.
assembler_out_col = 'Features'
scaler_out_col = 'ScaledFeatures'
expander_out_col = 'ExpandedFeatures'

In [6]:
# Build the (unfitted) feature pipeline from the configuration defined above.
feature_pipeline = feature_pipeline_builder.create_feature_pipeline(
    feature_cols,
    assembler_out_col,
    scaler_out_col,
    expander_out_col,
    degree,
)

# Fit on the training data, then apply the fitted model to the same frame.
feature_model = feature_pipeline.fit(df_train)
train_with_features = feature_model.transform(df_train)

# Preview the result, including the added feature columns.
train_with_features.show(5)

+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+--------------------+--------------------+--------------------+
|               ph|          Hardness|            Solids|       Chloramines|           Sulfate|      Conductivity|    Organic_carbon|  Trihalomethanes|         Turbidity|Potability|            Features|      ScaledFeatures|    ExpandedFeatures|
+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+----------+--------------------+--------------------+--------------------+
|7.065394544064872|  98.3679148956603| 28415.57583214058|10.558949998467961|  296.843207792478|505.24026927891407|12.882614472289333|85.32995534051292| 4.119087300328971|         1|[7.06539454406487...|[-1.0191119363888...|[-1.0191119363888...|
|7.065394544064872|1

## Serialize Pipeline to MLeap Bundle

## Model Experimentation with MLflow

In [7]:
# Open an MLflow run and close it straight away. The body is an empty
# placeholder: no parameters, metrics or models are logged explicitly here.
# NOTE(review): executing this cell still starts a run on the tracking server;
# `run` remains bound afterwards for inspection.
with mlflow.start_run() as run:
    pass