# ML Notebook for Banking Churn Model

## Importing Brunel and ML Libraries

In [1]:
%AddJar -magic https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar -f

Starting download from https://brunelvis.org/jar/spark-kernel-brunel-all-2.3.jar
Finished download of spark-kernel-brunel-all-2.3.jar


In [2]:
//import libraries
import org.apache.spark.{SparkConf, SparkContext, SparkFiles}
import org.apache.spark.sql.{SQLContext, SparkSession, Row}
import org.apache.spark.SparkFiles

import org.apache.spark.ml.feature.{StringIndexer, IndexToString, VectorIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.classification.{LogisticRegression, DecisionTreeClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.ml.ibm.transformers.RenameColumn

import com.ibm.analytics.ngp.ingest.Sampling
import com.ibm.analytics.ngp.util._
import com.ibm.analytics.ngp.pipeline.evaluate.{Evaluator,MLProblemType}

import com.ibm.analytics.wml.{Learner, Target}
import com.ibm.analytics.wml.cads.CADSEstimator

# Loading Data

In [1]:
// Load the "cust_summary_notebook_training.csv" data asset into a Spark DataFrame.

import org.apache.spark.sql.SQLContext
val sqlContext = new SQLContext(sc)

// The data asset is read from the project's datasets directory on the file system.
val trainingCsvPath = sys.env("DSX_PROJECT_DIR") + "/datasets/cust_summary_notebook_training.csv"

// Reader settings: the first row holds the column names, column types are inferred,
// and rows that fail to parse are dropped (mode DROPMALFORMED).
val csvReadOptions = Map(
  "header" -> "true",
  "inferSchema" -> "true",
  "mode" -> "DROPMALFORMED"
)

val df2 = sqlContext.read.format("csv").options(csvReadOptions).csv(trainingCsvPath)
df2.show(5)

+------+---+----------+-------+--------+---------+------------+-------+----------+---------+-----+----------------+-----+
|GENDER|AGE|INVESTMENT| INCOME|ACTIVITY| YRLY_AMT|AVG_DAILY_TX|YRLY_TX|AVG_TX_AMT|NEGTWEETS|STATE|       EDUCATION|label|
+------+---+----------+-------+--------+---------+------------+-------+----------+---------+-----+----------------+-----+
|     F| 84|    114368|3852862|       5| 700259.0|    0.917808|    335|   2090.32|        3|   TX|Bachelors degree|  0.0|
|     F| 44|     90298|3849843|       1| 726977.0|    0.950685|    347|   2095.04|        2|   CA|Bachelors degree|  0.0|
|     F| 23|     94881|3217364|       1| 579084.0|    0.920548|    336|   1723.46|        5|   CA|Bachelors degree|  1.0|
|     F| 24|    112099|2438218|       4| 470964.0|    0.994521|    363| 1297.4199|        2|   WA|Bachelors degree|  1.0|
|     F| 67|     84638|2428245|       3| 446615.0|    0.917808|    335| 1333.1799|        3|   CT|       Doctorate|  0.0|
+------+---+----------+-

In [2]:
// Keep the loaded frame under a descriptive name before narrowing it down.
val churnDataRaw = df2

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

// Columns carried forward for modelling: the candidate predictors plus "label".
val churnColumns = Seq("AGE", "ACTIVITY", "EDUCATION", "GENDER", "STATE", "NEGTWEETS", "INCOME", "label")

val churnData = churnDataRaw.select(churnColumns.head, churnColumns.tail: _*)
churnData.show(5)

+---+--------+----------------+------+-----+---------+-------+-----+
|AGE|ACTIVITY|       EDUCATION|GENDER|STATE|NEGTWEETS| INCOME|label|
+---+--------+----------------+------+-----+---------+-------+-----+
| 84|       5|Bachelors degree|     F|   TX|        3|3852862|  0.0|
| 44|       1|Bachelors degree|     F|   CA|        2|3849843|  0.0|
| 23|       1|Bachelors degree|     F|   CA|        5|3217364|  1.0|
| 24|       4|Bachelors degree|     F|   WA|        2|2438218|  1.0|
| 67|       3|       Doctorate|     F|   CT|        3|2428245|  0.0|
+---+--------+----------------+------+-----+---------+-------+-----+
only showing top 5 rows



In [4]:
// Relative sizes of the three partitions (presumably percentages; they add up to 100).
val train = 70
val test = 15
val validate = 15

// Partition the churn data into a training, a testing, and a validation data set.
val splits = Sampling.trainingSplit(churnData, train, test, validate)

val trainingDF = splits._1
val testDF = splits._2
val validationDF = splits._3

// Print each partition's heading followed by a preview of its first five rows.
Seq(
  "Training data set" -> trainingDF,
  "Testing data set" -> testDF,
  "Validation data set" -> validationDF
).foreach { case (heading, frame) =>
  println(heading)
  frame.show(5)
}

Training data set
+---+--------+--------------------+------+-----+---------+------+-----+
|AGE|ACTIVITY|           EDUCATION|GENDER|STATE|NEGTWEETS|INCOME|label|
+---+--------+--------------------+------+-----+---------+------+-----+
| 20|       0|High school graduate|     F|   CA|        7| 17088|  1.0|
| 20|       0|High school graduate|     F|   ID|       13| 17877|  1.0|
| 20|       0|High school graduate|     F|   WA|       15| 15497|  1.0|
| 20|       0|High school graduate|     M|   CA|        7| 16982|  1.0|
| 20|       1|    Associate degree|     F|   PA|       15| 26700|  1.0|
+---+--------+--------------------+------+-----+---------+------+-----+
only showing top 5 rows

Testing data set
+---+--------+--------------------+------+-----+---------+------+-----+
|AGE|ACTIVITY|           EDUCATION|GENDER|STATE|NEGTWEETS|INCOME|label|
+---+--------+--------------------+------+-----+---------+------+-----+
| 20|       1|High school graduate|     F|   ID|        7| 19761|  1.0|
| 20

# Building and Evaluating the Logistic Regression (LR) Model

In [5]:
// Feature definition
//
// Each string-valued column gets its own StringIndexer that writes a numeric "*_code" column.
// NOTE(review): no setHandleInvalid is configured on the indexers, so their default handling
// applies to values not seen during fit - confirm this is acceptable for scoring.
val genderIndexer = new StringIndexer()
  .setInputCol("GENDER")
  .setOutputCol("gender_code")

val stateIndexer = new StringIndexer()
  .setInputCol("STATE")
  .setOutputCol("state_code")

val educationIndexer = new StringIndexer()
  .setInputCol("EDUCATION")
  .setOutputCol("education_code")

// Numeric columns and the indexed codes, in the order they appear in the "features" vector.
val featureColumns = Array(
  "AGE",
  "ACTIVITY",
  "education_code",
  "NEGTWEETS",
  "INCOME",
  "gender_code",
  "state_code"
)

val featuresAssembler = new VectorAssembler()
  .setInputCols(featureColumns)
  .setOutputCol("features")

In [6]:
// Logistic Regression
//
// Classifier reading the assembled "features" vector and the "label" column,
// with the regularization parameter set to 0.01.
val lr = new LogisticRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setRegParam(0.01)

In [7]:
import org.apache.spark.ml.{Pipeline, PipelineStage}

// Stage order: the three indexers first, then the assembler that reads their output
// columns, and finally the classifier that consumes the assembled vector.
val churnStages: Array[PipelineStage] = Array(
  genderIndexer,
  stateIndexer,
  educationIndexer,
  featuresAssembler,
  lr
)

val pipeline = new Pipeline().setStages(churnStages)

// Fit all stages on the training partition.
val newModel = pipeline.fit(trainingDF)

In [8]:
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegressionModel}

// Extract the fitted LogisticRegressionModel from the pipeline model.
// The stage is located by type rather than by a hard-coded index plus an unchecked cast,
// so adding or reordering feature stages in the pipeline cannot silently pick the wrong
// stage, and a missing stage fails with a descriptive message.
val lrModel = newModel.stages
  .collectFirst { case fitted: LogisticRegressionModel => fitted }
  .getOrElse(
    throw new IllegalStateException(
      "The fitted pipeline does not contain a LogisticRegressionModel stage"))

In [9]:
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.LabeledPoint

// Run the full pipeline on the test partition so the indexed and assembled "features"
// column is present, then remove the three columns the classifier added - presumably so
// that lrModel.evaluate can add them again without a column-name clash.
val testDFWithPredictions = newModel.transform(testDF)
val testData = testDFWithPredictions.drop("prediction", "rawPrediction", "probability")

// NOTE: despite its name, this summary is computed on the test data; the name is kept
// unchanged so that any other cell referring to it keeps working.
val trainingSummary = lrModel.evaluate(testData)

// Narrow to the binary summary with a pattern match instead of an unchecked cast, so an
// unexpected summary type is reported with a descriptive message.
val binarySummary = trainingSummary match {
  case binary: BinaryLogisticRegressionSummary => binary
  case other =>
    throw new IllegalStateException(
      s"Expected a BinaryLogisticRegressionSummary but got ${other.getClass.getName}")
}

// The ROC curve and area under the ROC curve on test data
val rocOnTestData = binarySummary.roc
println(s"Area under ROC curve for the initial model: ${binarySummary.areaUnderROC}")

Area under ROC curve for the initial model: 0.9949513568868407


## Displaying the evaluation results - ROC curve with Brunel

In [10]:
%%brunel data('rocOnTestData') x(FPR) y(TPR) line tooltip(#all) axes(x:'False Positive Rate':grid, y:'True Positive Rate':grid) title('ROC')

# Save Locally: Save the Trained Model to the DSX Local Project

In [11]:
// DSX Local Machine Learning - store the trained pipeline model through the ML client.

import com.ibm.analytics.ngp.dsxML._
import spray.json._

val ml_client = ML()

//val modelName="TODO_CHANGE_TO_TEAMNAME Banking Churn Notebook Model LR"
val modelName = "BankingChurnMLNotebookModelLR"

// Descriptive arguments passed along with the model.
val modelDescription = "Prediction for customer to churn from business"
val sourceNotebook = "Churn ML Training Notebook Scala HDP LR.ipynb"
val algorithmType = "Classification"

// API specification:
//   save(model, trainData, testData, metrics, name, description, filename, algorithmType, props: (String,String)*)
// No metrics are supplied (None) and no extra props are passed.
val saveResult = ml_client.save(
  newModel,
  trainingDF,
  testDF,
  None,
  modelName,
  modelDescription,
  sourceNotebook,
  algorithmType
)

// Last expression of the cell, so the notebook displays the outcome of the save.
saveResult

Success(/user-home/1010/DSX_Projects/AllHandsCustomerChurnLab/models/BankingChurnMLNotebookModelLR/1)

## Test Locally: Test the Model in the DSX Local Project

In [None]:
import play.api.libs.json._
import scalaj.http.{Http,HttpOptions}

// One sample customer record, keyed by the input columns used for training (no "label").
val sampleCustomer = Map(
    "AGE" -> Json.toJson(23),
    "ACTIVITY" -> Json.toJson(3),
    "EDUCATION" -> Json.toJson("Masters degree"),
    "GENDER" -> Json.toJson("M"),
    "STATE" -> Json.toJson("NY"),
    "NEGTWEETS" -> Json.toJson(7),
    "INCOME" -> Json.toJson(878657)
)

// The request body is a JSON array holding that single record.
val json_map = Json.toJson(List(Json.toJson(sampleCustomer)))

// Project name and auth token are taken from the notebook environment.
val projectName = sys.env("DSX_PROJECT_NAME")
val authToken = sys.env("DSX_TOKEN")

// Scoring endpoint for the unpublished model saved under modelName in this project.
val scoringURL = s"http://dsx-scripted-ml-python2-svc.dsxl-ml:7300/api/v1/score/unpublished/${projectName}/${modelName}"
println(scoringURL)

val payload_scoring = Json.stringify(json_map)
println(payload_scoring)

// POST the payload as JSON with the token in the Authorization header;
// connect timeout 10000 ms, read timeout 50000 ms.
val scoringRequest = Http(scoringURL)
  .postData(payload_scoring)
  .header("Content-Type", "application/json")
  .header("Authorization", authToken)
  .option(HttpOptions.connTimeout(10000))
  .option(HttpOptions.readTimeout(50000))

val response_scoring = scoringRequest.asString
response_scoring

http://dsx-scripted-ml-python2-svc.dsxl-ml:7300/api/v1/score/unpublished/AllHandsCustomerChurnLab/BankingChurnMLNotebookModelLR


Developed/Updated by Alexander Petrov, Matt Walli, and Anup Nair — Data Science Elite Team, IBM Analytics