# Support Vector Machines: Churn Analysis

Let's look at a classification example in Spark MLlib. We looked at the college admission dataset before; this time we apply a linear Support Vector Machine to a telco customer churn dataset.


In [None]:
%matplotlib inline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LinearSVC
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import pandas as pd



## Step 1: Load the data

In [None]:
# Read the telco churn CSV: the first row is the header and column types are
# inferred by Spark.
dataset = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/data/churn/telco.csv.gz")
)

# Target column, kept as a one-element list so it can be passed to select().
prediction = ['Churn']

# Categorical inputs, and the names their indexed counterparts are given
# (each original name with an "_index" suffix).
categorical = ['gender', 'InternetService', 'Contract', 'PaymentMethod']
categorical_index = [name + '_index' for name in categorical]

# Remaining input columns, used directly when the feature vector is assembled.
columns = [
    'SeniorCitizen', 'PhoneService', 'Partner', 'Dependents', 'tenure',
    'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies', 'PaperlessBilling',
    'MonthlyCharges', 'TotalCharges',
]

In [None]:
# Preview each column group: five rows of the categorical inputs, five rows of
# the label, and a single row of the remaining input columns.
for column_group, row_count in ((categorical, 5), (prediction, 5), (columns, 1)):
    dataset.select(column_group).show(row_count)

In [None]:
# Print summary statistics for the columns of the loaded dataset.
dataset.describe().show()

## Step 2: Deal with Categorical Columns

Let's index the categorical columns, including the output label (`Churn`), so that each string value is mapped to a numeric index.

In [None]:
# Show which columns are categorical and preview their raw values.
print(categorical)
dataset.select(categorical).show(5)

# One StringIndexer per categorical input column, writing to "<column>_index".
# They are deliberately left as unfitted estimators: Pipeline.fit() will learn
# each category -> index mapping from the training split only. Fitting them on
# the full dataset here would leak test-set information into preprocessing and
# trigger extra Spark jobs before the split even exists.
# handleInvalid="keep" sends categories not seen while fitting to an extra
# index instead of raising an error at transform time.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index", handleInvalid="keep")
    for column in categorical
]

# Indexes the "Churn" target column into the numeric "indexedLabel" column.
labelIndexer = StringIndexer(inputCol="Churn", outputCol="indexedLabel")


## Step 3: Build the Vector



In [None]:
# Combine the plain input columns and the indexed categorical columns into a
# single "features" vector column.
assembler = VectorAssembler(
    inputCols=[*columns, *categorical_index],
    outputCol="features",
)


In [None]:
# Detect categorical features inside the assembled vector and index them.
# With maxCategories=4, any feature that has more than 4 distinct values is
# treated as continuous rather than categorical.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=4,
)



In [None]:
# Standardize the indexed feature vector before it reaches the classifier.
# withStd / withMean are spelled out at their default values: scale each
# feature to unit standard deviation, without mean-centering.
scaler = StandardScaler(
    inputCol="indexedFeatures",
    outputCol="scaledFeatures",
    withStd=True,
    withMean=False,
)

## Step 4: Split into training and test.

**=> Split into training/test with an 80/20 split**

In [None]:
# Split the dataset into training (80%) and test (20%) sets.
# A fixed seed keeps the split -- and therefore every metric computed from it
# later in the notebook -- repeatable across kernel restarts.
(training, test) = dataset.randomSplit([0.8, 0.2], seed=42)

## Step 5: Build the Linear SVM model

In [None]:
# Linear SVM classifier: trained on the scaled feature vector against the
# indexed label.
lsvc = LinearSVC(
    labelCol="indexedLabel",
    featuresCol="scaledFeatures",
    maxIter=10,
    regParam=0.1,
)

# Stage order: categorical indexers -> vector assembler -> feature indexer ->
# label indexer -> scaler -> classifier.
stages = [*indexers, assembler, featureIndexer, labelIndexer, scaler, lsvc]

# Chain the preprocessing stages and the SVM into a single Pipeline
pipeline = Pipeline(stages=stages)

# Fit the whole pipeline on the training split
lsvcModel = pipeline.fit(training)


## Step 6: Run the test set and get the predictions

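**=> Note: the label column `Churn` is already indexed into `indexedLabel` by the pipeline's label indexer, so no manual renaming of the label is needed here.**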

**=> TODO: Transform the test dataset to get predictions**



In [None]:
# Run the fitted pipeline model on the held-out test split to obtain predictions.
predictions = lsvcModel.transform(test)

## Step 7: See the evaluation metrics

In [None]:
# Area under the ROC curve on the test predictions. areaUnderROC is the
# evaluator's default metric; it is named explicitly here for readability.
evaluator = BinaryClassificationEvaluator(
    labelCol='indexedLabel',
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator.evaluate(predictions)  # AUC


**=> What does AUC mean?** 

In [None]:

# Compare the predicted class with the true (indexed) label to get accuracy,
# then report its complement as the test error.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="indexedLabel",
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
test_error = 1.0 - accuracy
print("Test Error = %g" % test_error)


## Step 8: Show the confusion matrix

In [None]:
# Confusion matrix: one row per actual Churn value, one column per predicted
# class (0 / 1). Each cell is a row count; missing combinations are shown as 0.
(
    predictions
    .groupBy('Churn')
    .pivot('prediction', [0, 1])
    .count()
    .na.fill(0)
    .orderBy('Churn')
    .show()
)

**=> TODO: What is the meaning of the confusion matrix?**



## Step 9: Try running a prediction on your own data

**=> Create a few rows in your own dataframe (start with a pandas dataframe)**

**=> Run .transform from your model to see the results.**