# ML Notebook for Banking Churn Model

## Importing Brunel and ML Libraries

In [None]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

In [None]:
//import libraries
import org.apache.spark.{SparkConf, SparkContext, SparkFiles}
import org.apache.spark.sql.{SQLContext, SparkSession, Row}
import org.apache.spark.SparkFiles

import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.ibm.transformers.RenameColumn

import com.ibm.analytics.ngp.ingest.Sampling
import com.ibm.analytics.ngp.util._
import com.ibm.analytics.ngp.pipeline.evaluate.{Evaluator,MLProblemType}

import com.ibm.analytics.wml.{Learner, Target}
import com.ibm.analytics.wml.cads.CADSEstimator

# Loading Data from the Hortonworks Connection

In [None]:
// TODO:  Insert "cust_summary_notebook_training" remote data set as Spark DataFrame


In [None]:
// TODO: Rename the dataframe df1 in the statement below to match the dataframe automatically inserted above
val churnDataRaw = df1

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Column function that turns a boolean flag into a number: true becomes 1.0, false becomes 0.0.
val toDouble = udf { (hasChurned: Boolean) =>
  hasChurned match {
    case true  => 1.0
    case false => 0.0
  }
}

// Keep the seven predictor columns plus CHURN, derive the numeric "label" column from CHURN,
// then discard the original CHURN column so only "label" carries the target.
val churnData = {
  val modelColumns = churnDataRaw.select("AGE", "ACTIVITY", "EDUCATION", "GENDER", "STATE", "NEGTWEETS", "INCOME", "CHURN")
  val withLabel = modelColumns.withColumn("label", toDouble(churnDataRaw.col("CHURN")))
  withLabel.drop("CHURN")
}

// Preview the first five prepared rows.
churnData.show(5)

In [None]:
// Relative sizes handed to the splitter for each partition
// (presumably percentages, since they sum to 100 — confirm against Sampling.trainingSplit).
val train = 70
val test = 15
val validate = 15

// Split the prepared data three ways; the result's elements are read below as
// training, testing and validation data sets, in that order.
val splits = Sampling.trainingSplit(churnData, train, test, validate)

val trainingDF = splits._1
val testDF = splits._2
val validationDF = splits._3

// Print a heading followed by the first five rows of each partition.
Seq(
  ("Training data set", trainingDF),
  ("Testing data set", testDF),
  ("Validation data set", validationDF)
).foreach { case (heading, partition) =>
  println(heading)
  partition.show(5)
}

# Building and Evaluating the Logistic Regression (LR) Model

In [None]:
// Feature definition

// Each categorical text column gets its own StringIndexer that writes a numeric *_code column.
// NOTE(review): no handleInvalid setting is applied, so the indexers keep Spark's default
// treatment of categories that were not seen during fit — confirm that is acceptable for scoring.
val genderIndexer = new StringIndexer().
  setInputCol("GENDER").
  setOutputCol("gender_code")

val stateIndexer = new StringIndexer().
  setInputCol("STATE").
  setOutputCol("state_code")

val educationIndexer = new StringIndexer().
  setInputCol("EDUCATION").
  setOutputCol("education_code")

// Combine the numeric columns and the indexed codes into the single "features" vector.
// The order of the input columns fixes the position of each value inside that vector.
val featuresAssembler = {
  val inputColumns = Array(
    "AGE",
    "ACTIVITY",
    "education_code",
    "NEGTWEETS",
    "INCOME",
    "gender_code",
    "state_code")
  new VectorAssembler().setInputCols(inputColumns).setOutputCol("features")
}

In [None]:
// Logistic Regression estimator: reads the "features" vector, is fitted against the "label"
// column, and uses a regularization parameter of 0.01.
val lr = new LogisticRegression().
  setFeaturesCol("features").
  setLabelCol("label").
  setRegParam(0.01)

In [None]:
import org.apache.spark.ml.{Pipeline, PipelineStage}

// Assemble the pipeline: the three string indexers run first, then the feature assembler,
// and finally the logistic regression estimator, which ends up as the last stage.
val pipeline = {
  val orderedStages: Array[PipelineStage] = Array(
    genderIndexer,
    stateIndexer,
    educationIndexer,
    featuresAssembler,
    lr)
  new Pipeline().setStages(orderedStages)
}

// Fit every stage on the training partition.
val newModel = pipeline.fit(trainingDF)

In [None]:
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}

// Extract the fitted LogisticRegressionModel from the pipeline model.
// The stage is located by its type rather than by a fixed position, so adding, removing or
// reordering pipeline stages cannot silently select (or fail to cast) the wrong stage.
val lrModel = newModel.stages.collectFirst {
  case fitted: LogisticRegressionModel => fitted
}.getOrElse(
  throw new IllegalStateException("The fitted pipeline does not contain a LogisticRegressionModel stage")
)

In [None]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint

// Score the test partition with the complete fitted pipeline.
val testDFWithPredictions = newModel.transform(testDF)

// Remove the three columns added by the scoring step before passing the rows to
// lrModel.evaluate (presumably because evaluate generates them again — confirm).
val testData = {
  val scoringOutputs = Seq("prediction", "rawPrediction", "probability")
  testDFWithPredictions.drop(scoringOutputs: _*)
}

// Despite its name, this summary is computed on the test rows, not on the training rows.
val trainingSummary = lrModel.evaluate(testData)
val binarySummary = trainingSummary.asInstanceOf[BinaryLogisticRegressionSummary]

// ROC curve and area under the ROC curve, both measured on the test data.
val rocOnTestData = binarySummary.roc
println("Area under ROC curve for the initial model: " + binarySummary.areaUnderROC)

## Displaying the evaluation results - ROC curve with Brunel

In [None]:
%%brunel data('rocOnTestData') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid) title('ROC')

# Save Locally: Save the Trained Model to the DSX Local Project

In [None]:
// TODO: In the code below, replace the placeholder TODO_CHANGE_TO_TEAMNAME with your lab team's name

// DSX Local Machine Learning - Use ML client to save model.

import com.ibm.analytics.ngp.dsxML._
import spray.json._

// ML client used to save the model into the DSX Local project.
val ml_client=ML()

// Model name without spaces. The earlier spelling with spaces is kept here, disabled, for reference:
//val modelName="TODO_CHANGE_TO_TEAMNAME Banking Churn Notebook Model LR"
val modelName="TODO_CHANGE_TO_TEAMNAME-BankingChurnNotebookModelLR"

// API specification:  save(model, trainData, testData, metrics, name, description,filename, algorithmType, props: (String,String)*)
// No metrics are supplied (None) and no extra props are passed.
val saveResult = {
  val description = "Prediction for customer to churn from business"
  val notebookFile = "Churn ML Training Notebook Scala HDP LR.ipynb"
  val algorithmType = "Classification"
  ml_client.save(newModel, trainingDF, testDF, None, modelName, description, notebookFile, algorithmType)
}

// Leave the result as the cell's last expression so the notebook displays it.
saveResult

## Test Locally: Test the Model in the DSX Local Project

In [None]:
import play.api.libs.json._
import scalaj.http.{Http,HttpOptions}

// Sample customer record used to try out the saved model; it is sent as a one-element JSON array.
val json_map=Json.toJson(List(Json.toJson(Map(
    "AGE"->Json.toJson(23),
    "ACTIVITY"->Json.toJson(3),
    "EDUCATION"->Json.toJson("Masters degree"),
    "GENDER"->Json.toJson("M"),
    "STATE"->Json.toJson("NY"),
    "NEGTWEETS"->Json.toJson(7),
    "INCOME"->Json.toJson(878657)
))))

// Read the project name and token from the environment, failing with a message that names
// the missing variable instead of a bare "key not found".
val projectName=sys.env.getOrElse("DSX_PROJECT_NAME", sys.error("Environment variable DSX_PROJECT_NAME is not set"))
val authToken=sys.env.getOrElse("DSX_TOKEN", sys.error("Environment variable DSX_TOKEN is not set"))

// Project and model names become URL path segments, so percent-encode them; otherwise a name
// containing a space or another reserved character yields a malformed request.
// URLEncoder targets form encoding and writes a space as '+', which is turned into %20 for use in a path.
val scoringURL = {
  def pathSegment(raw: String): String =
    java.net.URLEncoder.encode(raw, "UTF-8").replace("+", "%20")
  s"http://dsx-scripted-ml-python2-svc.dsxl-ml:7300/api/v1/score/unpublished/${pathSegment(projectName)}/${pathSegment(modelName)}"
}
println(scoringURL)

val payload_scoring=Json.stringify(json_map)
println(payload_scoring)

// POST the payload with a 10 s connect timeout and a 50 s read timeout; the token is sent
// unchanged in the Authorization header.
val response_scoring=Http(scoringURL).
  postData(payload_scoring).
  header("Content-Type","application/json").
  header("Authorization",authToken).
  option(HttpOptions.connTimeout(10000)).
  option(HttpOptions.readTimeout(50000)).
  asString

// Leave the response as the cell's last expression so the notebook displays it.
response_scoring

Developed/Updated by Alexander Petrov, Matt Walli, and Anup Nair — Data Science Elite Team, IBM Analytics