## Machine Learning Pipeline

### IMPORTS

In [1]:
// Import other libraries
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import java.util.Calendar
import java.io
import java.text.SimpleDateFormat
import org.apache.spark._
import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.tree._
import org.apache.spark.ml.evaluation._
import spark.implicits._

Intitializing Scala interpreter ...

Spark Web UI available at http://75db835ef641:4040
SparkContext available as 'sc' (version = 2.4.3, master = local[*], app id = local-1560153697814)
SparkSession available as 'spark'


import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import java.util.Calendar
import java.io
import java.text.SimpleDateFormat
import org.apache.spark._
import org.apache.spark.ml._
import org.apache.spark.ml.feature._
import org.apache.spark.ml.classification._
import org.apache.spark.ml.tree._
import org.apache.spark.ml.evaluation._
import spark.implicits._


### ENCODE & IMPORT DATA FUNCTIONS

In [2]:
// Encoding features for machine learning models

def encode_features_ml(df: DataFrame, target_feat: String, assembled_feat: String): (DataFrame, Array[StringIndexer], StringIndexerModel, VectorAssembler) = {
    // Ensure only categorical columns are encoded
    //val feats = df.columns.filterNot(_.toLowerCase().contains(target_feat)).filterNot(_.toLowerCase().contains("name")).filterNot(_.toLowerCase().contains("date")).filterNot(_.toLowerCase().contains("time")).filterNot(_.toLowerCase().contains("dob")).filterNot(_.toLowerCase().contains("age")).filterNot(_.toLowerCase().contains("id"))
    val feats = df.columns.filterNot(_.toLowerCase().contains("name")).filterNot(_.toLowerCase().contains("date")).filterNot(_.toLowerCase().contains("time")).filterNot(_.toLowerCase().contains("dob")).filterNot(_.toLowerCase().contains("age")).filterNot(_.toLowerCase().contains("id"))
    // Define new encoded columns both indice and vectors
    val indexers = feats.map(name => new StringIndexer()
            .setInputCol(name)
            .setHandleInvalid("keep")
            .setOutputCol(name + "_index")            
    )
    val target_indexer = new StringIndexer()
            .setInputCol(target_feat)
            .setHandleInvalid("keep")
            .setOutputCol(target_feat + "_index")  
    val target_model = target_indexer.fit(df)
    val index_feats = feats.map(_+"_index")
    val feature_assembler = new VectorAssembler() 
            .setInputCols(index_feats)
            .setHandleInvalid("keep")
            .setOutputCol(assembled_feat)
    val pipeline = new Pipeline().setStages(indexers)  // Setting the indexer in the pipeline        
    val df_transformed = pipeline.fit(df).transform(df) // Fitting the encoding pipeline to the dataframe
    val df_return = feature_assembler.transform(df_transformed) // Assembling features into one combined feature
    return (df_return, indexers, target_model, feature_assembler)
}

//read data from Phase 2 wrangling task.
def read_phase2_csv(filename: String): DataFrame = {
    val phase2 = spark.read.format("csv").option("header", "true").load(filename)
   
    val df = phase2.toDF()
    return df
}

encode_features_ml: (df: org.apache.spark.sql.DataFrame, target_feat: String, assembled_feat: String)(org.apache.spark.sql.DataFrame, Array[org.apache.spark.ml.feature.StringIndexer], org.apache.spark.ml.feature.StringIndexerModel, org.apache.spark.ml.feature.VectorAssembler)
read_phase2_csv: (filename: String)org.apache.spark.sql.DataFrame


## Machine Learning Multinomial Classifier Algorithms
### Decision Tree (Random Forest)

In [5]:
// TRAIN DECISION TREE
def train_decision_tree_model(train_data: DataFrame, target_feat: String, assembled_feat: String): RandomForestClassificationModel  = {
    // Define the Random Forest model estimator
    val rf = new RandomForestClassifier()
      .setLabelCol(target_feat + "_index")
      .setFeaturesCol(assembled_feat)
      .setNumTrees(4) 
      .setMaxDepth(3) 
      .setMaxBins(1000)
    
    // Train model with training dataset
    val model = rf.fit(train_data)
    
    return model
} 

// TEST DECISION TREE
def test_decision_tree_model(test_data: DataFrame, model: RandomForestClassificationModel, target_model: StringIndexerModel): DataFrame = {
    // Test model with testing dataset
    var test_pred = model.transform(test_data)
    
    // Decode prediction transformer
    val decoder = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("prediction_decoded")  
        .setLabels(target_model.labels)
    
    // Generate decoded prediction dataframe
    val test_pred_decoded = decoder.transform(test_pred)
    
    return test_pred_decoded
}

train_decision_tree_model: (train_data: org.apache.spark.sql.DataFrame, target_feat: String, assembled_feat: String)org.apache.spark.ml.classification.RandomForestClassificationModel
test_decision_tree_model: (test_data: org.apache.spark.sql.DataFrame, model: org.apache.spark.ml.classification.RandomForestClassificationModel, target_model: org.apache.spark.ml.feature.StringIndexerModel)org.apache.spark.sql.DataFrame


### Naive Bayes Algorithm

In [6]:
// TRAIN NAIVE BAYES
def train_naive_bayes_model(train_data: DataFrame, target_feat: String, assembled_feat: String): NaiveBayesModel  = {
    // Define Naive Bayes Classifier
    val nb = new NaiveBayes()
        .setLabelCol(target_feat + "_index")
        .setFeaturesCol(assembled_feat)
    
    // Train model with training dataset
    val model = nb.fit(train_data)
    
    return model
} 

// TEST NAIVE BAYES
def test_naive_bayes_model(test_data: DataFrame, model: NaiveBayesModel, target_model: StringIndexerModel): DataFrame = {
    // Test model with testing dataset
    var test_pred = model.transform(test_data)
    
    // Decode prediction transformer
    val decoder = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("prediction_decoded")  
        .setLabels(target_model.labels)
    
    // Generate decoded prediction dataframe
    val test_pred_decoded = decoder.transform(test_pred)
    
    return test_pred_decoded
}

train_naive_bayes_model: (train_data: org.apache.spark.sql.DataFrame, target_feat: String, assembled_feat: String)org.apache.spark.ml.classification.NaiveBayesModel
test_naive_bayes_model: (test_data: org.apache.spark.sql.DataFrame, model: org.apache.spark.ml.classification.NaiveBayesModel, target_model: org.apache.spark.ml.feature.StringIndexerModel)org.apache.spark.sql.DataFrame


### Logistic Regression Classification Algorithm
#### Train model

In [17]:
def train_lr_model(train_data: DataFrame, target_feat: String, assembled_feat: String): LogisticRegressionModel  = {
    // Define Logistic Regressor Classifier
    val lr = new LogisticRegression()
        .setLabelCol(target_feat + "_index")
        .setFeaturesCol(assembled_feat)
        .setMaxIter(10)
        .setRegParam(0.3)
        .setElasticNetParam(0.8)
    
    // Train model with training dataset
    val model = lr.fit(train_data)
    
    return model
} 

train_lr_model: (train_data: org.apache.spark.sql.DataFrame, target_feat: String, assembled_feat: String)org.apache.spark.ml.classification.LogisticRegressionModel


#### Test model

In [18]:
def test_lr_model(test_data: DataFrame, model: LogisticRegressionModel, target_model: StringIndexerModel): DataFrame = {
    // Test model with testing dataset
    var test_pred = model.transform(test_data)
    
    // Decode prediction transformer
    val decoder = new IndexToString()
        .setInputCol("prediction")
        .setOutputCol("prediction_decoded")  
        .setLabels(target_model.labels)
    
    // Generate decoded prediction dataframe
    val test_pred_decoded = decoder.transform(test_pred)
    
    return test_pred_decoded
}

test_lr_model: (test_data: org.apache.spark.sql.DataFrame, model: org.apache.spark.ml.classification.LogisticRegressionModel, target_model: org.apache.spark.ml.feature.StringIndexerModel)org.apache.spark.sql.DataFrame


### Evaluate model Function

In [7]:
def evaluate_prediction(algorithm_name: String, df: DataFrame, target_feat: String, prediction_col: String) = {
    // Evaluate test data's accuracy
    var evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol(target_feat + "_index")
      .setPredictionCol(prediction_col)
      .setMetricName("accuracy") 
    val accuracy = evaluator.evaluate(df)
    println(s"$algorithm_name - Test dataset: Accuracy=$accuracy")
    
    // Evaluate test data's f1
    evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol(target_feat + "_index")
      .setPredictionCol(prediction_col)
      .setMetricName("f1")
    val f1 = evaluator.evaluate(df)
    println(s"$algorithm_name - Test dataset: F1=$f1")
}

evaluate_prediction: (algorithm_name: String, df: org.apache.spark.sql.DataFrame, target_feat: String, prediction_col: String)Unit


### Machine Learning Pipeline

In [11]:
phase2_df.show()

+---------+---------+--------------------+------------------+----------------+----------+------------+------+----------+--------------+-------------+-----------------+--------------------+-----------+----------+--------+-------+--------+----------+----------+---------------+---------------+-------------+-----------+----------+--------+--------+---------+-----------+------------------------+--------------------+-----------------------+--------------------+--------------------+--------------------+---------------------+--------------------+------------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+
|    id_in|  name_in|   location_found_in|       intake_type|intake_condition|species_in|age_years_in|sex_in|desexed_in|prim_colour_in|sec_colour_in|mix_breed_bool_in|       prim_breed_in| town_fou

|A635476_1|    Vince|3101 Shoreline Dr...|             Stray|          Normal|       Cat|         1.0|  male|      true|          blue|        tabby|             true|  domestic shorthair|Austin (TX)|12/17/2013|06:18:00|     AM|     Dec|       Tue|06/08/2012|       Transfer|           SCRP|          1.0|       true|12/22/2013|12:56:00|      PM|      Dec|        Sun|    {"location_found_...|{"intake_type_vec...|   {"intake_conditio...|{"species_in_vect...|{"sex_in_vector":...|{"desexed_in_vect...| {"prim_colour_in_...|{"sec_colour_in_v...|    {"mix_breed_bool_...|{"prim_breed_in_v...|{"town_found_vect...|{"ampm_in_vector"...|{"month_in_vector...|{"weekday_in_vect...|{"outcome_type_ve...|  {"outcome_subtype...|{"desexed_out_vec...|{"ampm_out_vector...|{"month_out_vecto...|{"weekday_out_vec...|
|A636427_1|   *Haley|12Th And Airport ...|             Stray|          Normal|       Dog|         2.0|female|      true|         brown|        white|             true|            pit bull|Austin (T

In [22]:
// Read Phase2 csv
val phase2_df = read_phase2_csv("encoded_output")

// Select features
val selected_join = phase2_df.select(
              "id_in",
              "intake_type",
              "intake_condition",
              "species_in",
              "age_years_in",
              "sex_in",
              "desexed_out",
              "prim_colour_in",
              "prim_breed_in",
              "outcome_type",
              "town_found"
)  

// Set target feature
val target_feat = "outcome_type"
val assembled_feat = "assembled_features"

// Encode and index features
val (result, indexers, target_model, assembler) = encode_features_ml(selected_join, target_feat, assembled_feat)

// Create train and test dataset
val Array(train_data, test_data) = result.randomSplit(Array(0.80, 0.20))

// Decision Tree (Random Forest) Algorithm
val decision_tree_model = train_decision_tree_model(train_data, target_feat, assembled_feat)
// Test model
val decision_tree_prediction = test_decision_tree_model(test_data, decision_tree_model, target_model)
// Evaluate model
evaluate_prediction("Decision Tree", decision_tree_prediction, target_feat, "prediction") 

// Use Naive Bayes Algorithm
val naive_bayes_model = train_naive_bayes_model(train_data, target_feat, assembled_feat)
// Test model
val naive_bayes_prediction = test_naive_bayes_model(test_data, naive_bayes_model, target_model)
// Evaluate model
evaluate_prediction("Naive Bayes", naive_bayes_prediction, target_feat, "prediction")

// Use Logistic Regression Algorithm
val lr_model = train_lr_model(train_data, target_feat, assembled_feat)
// Test model
val lr_prediction = test_lr_model(test_data, lr_model, target_model)
// Evaluate model
evaluate_prediction("Logistic Regression", lr_prediction, target_feat, "prediction")

Decision Tree - Test dataset: Accuracy=0.8655889610716624
Decision Tree - Test dataset: F1=0.8593474342142916
Naive Bayes - Test dataset: Accuracy=0.5639321146195296
Naive Bayes - Test dataset: F1=0.5997962921369467
Logistic Regression - Test dataset: Accuracy=0.43103187792717934
Logistic Regression - Test dataset: F1=0.32140104048253065


phase2_df: org.apache.spark.sql.DataFrame = [id_in: string, name_in: string ... 47 more fields]
selected_join: org.apache.spark.sql.DataFrame = [id_in: string, intake_type: string ... 9 more fields]
target_feat: String = outcome_type
assembled_feat: String = assembled_features
result: org.apache.spark.sql.DataFrame = [id_in: string, intake_type: string ... 19 more fields]
indexers: Array[org.apache.spark.ml.feature.StringIndexer] = Array(strIdx_af1b51cb0715, strIdx_4cc734162f64, strIdx_fa3f961b71bc, strIdx_fb0ada7566f6, strIdx_bdebe2d98620, strIdx_832c0d42be02, strIdx_a206ed346197, strIdx_d662abea567e, strIdx_ee6238e98c62)
target_model: org.apache.spark.ml.feature.StringIndexerModel = strIdx_39e387c535a4
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_11a10b7b8b17
...

In [23]:
decision_tree_prediction.show()

+---------+---------------+----------------+----------+------------+------+-----------+--------------+-------------------+---------------+--------------------+-----------------+----------------------+----------------+------------+-----------------+--------------------+-------------------+------------------+----------------+--------------------+--------------------+--------------------+----------+------------------+
|    id_in|    intake_type|intake_condition|species_in|age_years_in|sex_in|desexed_out|prim_colour_in|      prim_breed_in|   outcome_type|          town_found|intake_type_index|intake_condition_index|species_in_index|sex_in_index|desexed_out_index|prim_colour_in_index|prim_breed_in_index|outcome_type_index|town_found_index|  assembled_features|       rawPrediction|         probability|prediction|prediction_decoded|
+---------+---------------+----------------+----------+------------+------+-----------+--------------+-------------------+---------------+--------------------+---