# Support Vector Machines: Churn Analysis

Let's look at a classification example in Spark MLlib.  We are going to use some telecom data to predict whether or not a customer "churned".


In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# `sc` is assumed to be the SparkContext already created by the notebook kernel.
# uiWebUrl is expected to look like 'http://host:port'; splitting on ':' leaves the port as the third piece.
print('Spark UI running on http://YOURIPADDRESS:' + sc.uiWebUrl.split(':')[2])

## Step 1: Load the data

In [None]:
%%time
## `spark` is assumed to be the SparkSession provided by the notebook kernel.
## header=True      : take column names from the first row
## inferSchema=True : let Spark work out the column types from the data
dataset = spark.read.csv("/data/churn/telco.csv.gz", header=True, inferSchema=True)

In [None]:
## how many records were loaded (with thousands separators), then the column names / types
print(f"read {dataset.count():,} records")

dataset.printSchema()

In [None]:
## DataFrame.show() output is hard to read for wide tables
# dataset.show()

## pretty print the first 10 records with pandas
## horizontally (one row per record)
dataset.limit(10).toPandas()

## vertically (one column per record)
# dataset.limit(10).toPandas().T

## Step 2 : Basic Analytics of Data

In [None]:
## describe : summary statistics of the dataset

## the following output is hard to read
# dataset.describe().show() 

## use pandas for pretty print; the trailing .T transposes the resulting table
## TODO : convert to pandas ('toPandas')
dataset.describe().???().T

In [None]:
## TODO : Distribution by 'Churn'
## (replace ??? with the column name to count records per distinct value)
dataset.groupBy('???').count().show()

In [None]:
## TODO : Distribution by 'Contract'
## (replace ??? with the column name to count records per distinct value)
dataset.groupBy('???').count().show()

In [None]:
## TODO : Distribution by 'gender'
## (replace ??? with the column name to count records per distinct value)
dataset.groupBy('???').count().show()

## Step 3 : Categorical Data

In [None]:
## Define columns

## label column
prediction_column = ['Churn']

## categorical feature columns; each one gets a matching '<name>_index' column
categorical_columns = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
categorical_index = [name + '_index' for name in categorical_columns]

## remaining feature columns, passed to the vector assembler together with the '*_index' columns
columns = [
    'SeniorCitizen', 'PhoneService', 'Partner', 'Dependents', 'tenure',
    'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'MonthlyCharges', 'TotalCharges',
]

In [None]:
## peek at the first 5 rows of the categorical feature columns, then of the label column
for selection in (categorical_columns, prediction_column):
    dataset.select(selection).show(5)


## Step 4: Deal with Categorical Columns

Let's deal with the categorical columns, including the output label.

Workflow:
- **Feature Indexers** :  ( category columns --> '*_index' columns)
- **Label indexer** : 'Churn' --> 'indexedLabel'
- **Vector Assembler** : `columns` + '*_index' columns --> 'features' 
- **Scaler** :  'features' --> 'scaledFeatures'

In [None]:
## handy function to pretty print indexers, scalers, assemblers

from pyspark.ml.feature import StringIndexer, StandardScaler, VectorAssembler, MinMaxScaler

def pretty_print_transformer(transformer):
    """Return a one-line 'ClassName : input -> output' summary of a pipeline stage.

    Single-input stages (StringIndexer, StandardScaler, MinMaxScaler) are
    described by their input column; a VectorAssembler by its list of input
    columns.  Any other object falls back to just its class name, so printing
    an unsupported stage never shows a bare 'None'.
    """
    name = type(transformer).__name__

    # stages configured with a single inputCol
    if isinstance(transformer, (StringIndexer, StandardScaler, MinMaxScaler)):
        return name + " : " + transformer.getInputCol() + ' -> ' + transformer.getOutputCol()

    # the assembler takes a list of input columns
    if isinstance(transformer, VectorAssembler):
        return name + " : " + str(transformer.getInputCols()) + ' -> ' + transformer.getOutputCol()

    return name
    


In [None]:
## 4.1 - Feature Indexers

from pyspark.ml.feature import StringIndexer

print("indexing categorical columns : ", categorical_columnscategorical)

## TODO : create indexers in a loop
## loop through 'categorical_columns'
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")\
            for column in ??? ]

for indexer in indexers:
    print(pretty_print_transformer(indexer))


In [None]:
## 4.2 - label indexer

from pyspark.ml.feature import StringIndexer

## TODO : we need to index 'Churn' column too
## Create a StringIndexer with inputCol='Churn' and outputCol='indexedLabel'
labelIndexer = ???(inputCol="???", outputCol="???")

print(pretty_print_transformer(labelIndexer))


In [None]:
## 4.3 - Vector assembler 
from pyspark.ml.feature import VectorAssembler

## combine the plain feature columns and the '*_index' columns into a single 'features' vector column
assembler = VectorAssembler(inputCols=columns + categorical_index, outputCol="features")

print (pretty_print_transformer(assembler))


In [None]:
## 4.4 - Scaler
from pyspark.ml.feature import StandardScaler

## TODO : scale 'features' column into 'scaledFeatures'
scaler = StandardScaler(inputCol="???", outputCol="???")

print (pretty_print_transformer(scaler))

## Step 5: Build the Pipeline
We are going to transform the data using Spark pipeline.

In [None]:
from pyspark.ml import Pipeline

##  with scaler
stages = indexers + [labelIndexer, assembler,  scaler] 

## without scaler
#stages = indexers + [assembler, labelIndexer] 

i = 0
for stage in stages:
    i = i+1
    print ("stage ", i , " : ", pretty_print_transformer(stage))
print()

## TODO : Create a 'Pipeline' passing 'stages' as input
pipeline = ???(stages=???)

print ("pipeline : ", pipeline.explainParams())

In [None]:
%%time
## TODO : Run data through the pipeline
## Hint : first call 'fit' and then 'transform'
## ('fit' learns the stage models from the data; 'transform' applies them to produce the new columns)
processed_data = pipeline.???(dataset).???(dataset)

print ("processed data count ", processed_data.count())

In [None]:
## pretty print transformed data using pandas
x = processed_data.limit(2).toPandas()
# print horizontally
# x
# print vertically
x.T

## Step 6: Split into training and test.

In [None]:
## TODO : training=80%,  test=20%
(training, test) = processed_data.randomSplit([???, ???])

print("training set count : ", training.count())
print("testing set count : ", test.count())

## Step 7 - Create SVM Model

In [None]:
from pyspark.ml.classification import LinearSVC

## TODO : create 'LinearSVC' model
##    with labelCol='indexedLabel'
##    with featuresCol='scaledFeatures'
##    with maxIter=100
## (regParam=0.1 is already filled in)
lsvc = ???(labelCol="???", featuresCol="???", maxIter=???, regParam=0.1)

## Step 8: Train  Linear SVM model

In [None]:
## size of the training split that the model will be fitted on
print(f"training starting on  {training.count()}  records")

In [None]:
%%time 

## TODO : train the model
## Hint :    call 'fit' on 'training' data
## the fitted model is kept in 'lsvcModel', which the following cells use
lsvcModel = lsvc.???(???)
print ("training done")

In [None]:
# Show the intercept and the per-input coefficients of the fitted LinearSVC model
coef = lsvcModel.coefficients

# one row per assembled input column, listed in the same order the assembler was given
df = pd.DataFrame({'input' : columns + categorical_index, 'coefficient': coef})
print(f"Intercept: {lsvcModel.intercept}")

df
#df.sort_values(by=['input'])

## Step 9 : Predict on Test Data

In [None]:
## size of the held-out test split
print(f"predicting on  {test.count()}  records")

In [None]:
%%time

## TODO : predict on test data
## Hint : 'transform' on 'test'
## the evaluation cells below read the 'rawPrediction' and 'prediction' columns of the result
predictions = lsvcModel.???(???)


## Step 10: See the evaluation metrics

### 10.1 - AUC

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

## areaUnderROC is this evaluator's default metric; it is spelled out here for clarity
evaluator = BinaryClassificationEvaluator(
    labelCol='indexedLabel',
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC")
evaluator.evaluate(predictions)  #AUC


**=> What does AUC mean?** 

### 10.2 Model Accuracy

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compare 'prediction' with the true label ('indexedLabel') and compute accuracy / test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print ("accuracy ", accuracy)
print("Test Error = %g" % (1.0 - accuracy))


### 10.3 : Confusion matrix

**Interpret the confusion matrix output**

In [None]:
# Confusion matrix
# rows : actual 'Churn' value ; columns : predicted class (0 / 1) ; cells : record counts
# na.fill(0) shows 0 instead of null for combinations that never occur
predictions.groupBy('Churn').pivot('prediction', [0,1]).count().na.fill(0).orderBy('Churn').show()

## Step 11: Try running without scaling features

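In Step 5 we add a scaler as the last pipeline stage to standardize the feature vector.  
Try without the scaler.  

Uncomment the following line in Step 5   
```
#stages = indexers + [assembler, labelIndexer] 
```
Because the pipeline then no longer produces a `scaledFeatures` column, also set `featuresCol="features"` when creating the `LinearSVC` model in Step 7.  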

And run the whole notebook (Cell --> Run All)  
Do you see any improvement/degradation in accuracy / AUC ?