# Water Potability Pipeline Development Notebook

In [9]:
import sys
import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.sql import SparkSession, Window, DataFrame
from pyspark.mllib.stat import Statistics
from pyspark.ml.feature import Imputer
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, PolynomialExpansion, StandardScaler
sys.path.append('/home/jovyan/work')

In [10]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [11]:
# Load the training CSV: the first row is used as the header and column
# types are inferred from the data.
df_train = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('../data/water_potability_train.csv')
)

In [12]:
# Count the training rows once, then report the figure.
train_record_count = df_train.count()
print('Train record count: {}'.format(train_record_count))

Train record count: 2353


## Step 2 - Feature Pipeline Development

### Impute Missing Values

In [13]:
# Columns to impute; each one gets a mean-imputed copy named "<column>_imp".
impute_cols = ['ph', 'Sulfate', 'Trihalomethanes']
imputed_cols = ['{}_imp'.format(name) for name in impute_cols]

# Columns carried through to the next stage unchanged.
passthrough_cols = ['Hardness', 'Solids', 'Chloramines', 'Conductivity',
                    'Organic_carbon', 'Turbidity']

imputer = Imputer(strategy='mean', inputCols=impute_cols, outputCols=imputed_cols)

# Keep the fitted model: it holds the per-column means learned from the
# training data, so the very same values can be applied to any other split
# via imputer_model.transform(...) instead of re-fitting on that split.
imputer_model = imputer.fit(df_train)

# Drop the original (un-imputed) columns and keep the label last.
df_imputed = (
    imputer_model
    .transform(df_train)
    .select(passthrough_cols + imputed_cols + ['Potability'])
)
df_imputed.show(10)

+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-------------------+----------+
|          Hardness|            Solids|       Chloramines|      Conductivity|    Organic_carbon|         Turbidity|           ph_imp|       Sulfate_imp|Trihalomethanes_imp|Potability|
+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-------------------+----------+
|  98.3679148956603| 28415.57583214058|10.558949998467961|505.24026927891407|12.882614472289333| 4.119087300328971|7.065394544064872|  296.843207792478|  85.32995534051292|         1|
|103.46475866009455| 27420.16742458204| 8.417305032089528|485.97450045781375|11.351132730708514| 4.620793451653219|7.065394544064872| 333.8051408041043|   67.8699636759021|         0|
|108.91662923953173|14476.335695268315| 5.398162017711099| 512.2323064106689|15.

### Create Feature Vector

In [14]:
# Model inputs, in the order they are packed into the feature vector:
# six original columns followed by the three imputed ones.
feature_cols = [
    'Hardness', 'Solids', 'Chloramines', 'Conductivity', 'Organic_carbon',
    'Turbidity', 'ph_imp', 'Sulfate_imp', 'Trihalomethanes_imp',
]

vec_assembler = VectorAssembler(inputCols=feature_cols, outputCol='Features')

# Append the assembled 'Features' vector column and peek at the first row.
df_features = vec_assembler.transform(df_imputed)
df_features.select('Features').take(1)

[Row(Features=DenseVector([98.3679, 28415.5758, 10.5589, 505.2403, 12.8826, 4.1191, 7.0654, 296.8432, 85.33]))]

### Scale the Features

In [15]:
# Standardise every feature: subtract the mean (withMean) and divide by the
# standard deviation (withStd), both estimated from df_features.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol='Features',
    outputCol='ScaledFeatures',
)

# The fitted model is kept so the same statistics can be reused later.
scalerFit = scaler.fit(df_features)
df_features_scaled = scalerFit.transform(df_features)

# Peek at the first scaled vector.
df_features_scaled.select('ScaledFeatures').take(1)

[Row(ScaledFeatures=DenseVector([-2.9992, 0.7207, 2.1483, 0.9852, -0.4139, 0.1916, -0.0, -1.0138, 1.2052]))]

### Perform Polynomial Feature Expansion

In [16]:
# Expand the scaled feature vector into polynomial terms up to degree 3.
poly_feature_exp = PolynomialExpansion(
    inputCol="ScaledFeatures",
    outputCol="PolynomialFeatures",
    degree=3,
)
poly_features = poly_feature_exp.transform(df_features_scaled)

# Peek at the first expanded vector.
poly_features.select('PolynomialFeatures').take(1)

[Row(PolynomialFeatures=DenseVector([-2.9992, 8.9953, -26.9789, 0.7207, -2.1615, 6.4829, 0.5194, -1.5578, 0.3743, 2.1483, -6.4432, 19.3247, 1.5483, -4.6436, 1.1158, 4.6152, -13.842, 3.3262, 9.9149, 0.9852, -2.9547, 8.8619, 0.71, -2.1295, 0.5117, 2.1164, -6.3476, 1.5253, 4.5467, 0.9705, -2.9109, 0.6995, 2.085, 0.9561, -0.4139, 1.2414, -3.7232, -0.2983, 0.8947, -0.215, -0.8892, 2.6669, -0.6408, -1.9103, -0.4078, 1.223, -0.2939, -0.876, -0.4017, 0.1713, -0.5138, 0.1235, 0.368, 0.1688, -0.0709, 0.1916, -0.5748, 1.7238, 0.1381, -0.4142, 0.0995, 0.4117, -1.2347, 0.2967, 0.8844, 0.1888, -0.5662, 0.1361, 0.4056, 0.186, -0.0793, 0.2379, -0.0572, -0.1704, -0.0781, 0.0328, 0.0367, -0.1101, 0.0265, 0.0789, 0.0362, -0.0152, 0.007, -0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, 0.0, 0.0, -0.0, 0.0, -0.0, -1.0138, 3.0405, -9.1191, -0.7306, 2.1913, -0.5265, -2.1779, 6.5319, -1