# Cross Validation & Hyperparameter Tuning

### Overview
Choose the best logistic regression model by tuning its hyperparameters with a parameter grid and k-fold cross validation.
 
### Builds on
None

### Run time
approx. 10-20 minutes

### Notes



In [None]:
# initialize Spark Session
import os
import sys
# Put the parent of the current working directory on sys.path so the
# `init_spark` module imported below can be resolved.
# NOTE(review): assumes init_spark.py lives one level above the directory the
# notebook is run from — confirm.
top_dir = os.path.abspath(os.path.join(os.getcwd(), "../"))
if top_dir not in sys.path:  # skip the append when the entry is already present (e.g. cell re-run)
    sys.path.append(top_dir)

# This import must stay after the sys.path adjustment above.
from init_spark import init_spark
spark = init_spark()
# Bare expression: displayed as the cell's output when run in a notebook.
spark


## Step 1: Perform Logistic Regression

In [None]:
# Imports for this step, grouped up front.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Load the admissions CSV; the first row is the header and column types are inferred.
admissions = spark.read.csv(
    "/data/college-admissions/admission-data.csv",
    header=True,
    inferSchema=True,
)

# Pack the three predictor columns into one "features" vector column, then
# expose the "admit" column under the name "label".
assembler = VectorAssembler(
    inputCols=["gpa", "gre", "rank"],
    outputCol="features",
)
assembled = assembler.transform(admissions)
featureVector = assembled.withColumn("label", assembled["admit"])

# 70/30 random split; no seed is passed to randomSplit here.
train, test = featureVector.randomSplit([0.7, 0.3])

# Baseline model with fixed, hand-picked hyperparameters.
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)

# Print the coefficients and intercept for logistic regression
print(f"Coefficients: {lrModel.coefficients}")
print(f"Intercept: {lrModel.intercept}")

### Evaluate

In [None]:
# Score both splits with the fitted baseline model so that training and test
# metrics can be compared side by side.
predictions_train = lrModel.transform(train)
predictions_test = lrModel.transform(test)

# The training summary is kept for inspection; the training-set predictions
# are also available from it as `lrModel.summary.predictions`, as an
# alternative to the transform above.
trainingSummary = lrModel.summary

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy evaluator comparing the "prediction" column against "label".
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="label",
    predictionCol="prediction",
)

# Compute both scores first, then report them.
train_accuracy = evaluator.evaluate(predictions_train)
test_accuracy = evaluator.evaluate(predictions_test)

print("Training set accuracy = " , train_accuracy)
print("Test set accuracy = " , test_accuracy)

In [None]:
## Confusion Matrix
# One row per actual 'admit' value, one column per predicted class; missing
# row/column combinations are filled with 0 before the rows are sorted.
counts_by_outcome = predictions_test.groupBy('admit').pivot('prediction').count()
cm = counts_by_outcome.na.fill(0).orderBy('admit')
cm.show()

## Step 2: Hyperparameter Tuning

### 2.1 - Explain the model's parameters

In [None]:
# Print the parameter descriptions of the `lr` estimator; regParam,
# elasticNetParam and maxIter are the ones tuned in the parameter grid later on.
print(lr.explainParams())

### 2.2 - Create grid search & Cross validator

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")

## TODO : Experiment with one of the following evaluators

## MulticlassClassificationEvaluator is a good all purpose evaluator
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
#                                               metricName="accuracy")

## AUC
evaluator = BinaryClassificationEvaluator (metricName='areaUnderROC')

## AreaUnderPR - useful for skewed datasets
# evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

# Create ParamGrid for Cross Validation
## TODO 1 - fill in 'regParam' values we want to test  (min 0.0)
## TODO 2 - fill in 'elasticNetParam' we want to test (values  between 0.0 to 1.0)
## TODO 3 - fill in 'maxIter' values we want to test (10, 50, 100)
paramGrid = (ParamGridBuilder()
             .addGrid(lrModel.regParam, [0.0, 0.01, 0.1, ???, ???, 2.0])
             .addGrid(lrModel.elasticNetParam, [0.0, ???, 1.0])
             .addGrid(lrModel.maxIter, [10, ???, 100])
             .build())

# TODO : Create 5-fold CrossValidator
## Hint : numFolds=5
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=???)

In [None]:
%%time

# The %%time cell magic above reports how long this cell takes; it has to
# remain the first line of the cell.
## TODO : Run cross validation on training data
## Hint : 'train'
# Fit the cross validator `cv`; the result is kept in `cvModel` for the
# following cells.
print("cross validation starting ...")
cvModel = cv.fit(???)
print("cross validation done.")

In [None]:
## TODO : Identify the values for the following hyper parameters that cross validation figured out
##     - regParam
##     - elasticNetParam
##     - maxIter
## Hint : look at the output of 'explainParams'

# Pull the winning model out of the cross-validation result and collect its
# parameter listing before reporting.
bestModel = cvModel.bestModel
best_params_text = bestModel.explainParams()

print("Best model : ", bestModel)
print(f"Coefficients: {bestModel.coefficients}")
print(f"Intercept: {bestModel.intercept}")
print()
print("Best model params : \n", best_params_text)

### 2.3 - Evaluate

In [None]:
# Score the test and training splits with the cross-validated model so both
# can be evaluated in the next cell.
predictions_cv_test = cvModel.transform(test)
predictions_cv_train = cvModel.transform(train)

In [None]:
## TODO : calculate training & testing accuracy

print ("Training accuracy for cross validated model : ", evaluator.evaluate(predictions_cv_???)) # Hint : train

print ("Test accuracy for cross validated model : ", evaluator.evaluate(predictions_cv_???))  # Hint : test

In [None]:
## TODO : Inspect the confusion matrix output
## does this CM look better than previous CM?

# Same layout as the earlier confusion matrix: one row per actual 'admit'
# value, one column per predicted class, missing combinations filled with 0.
cv_counts_by_outcome = predictions_cv_test.groupBy('admit').pivot('prediction').count()
cm = cv_counts_by_outcome.na.fill(0).orderBy('admit')
cm.show()