# Cross-Validation & Hyperparameter Tuning

### Overview
Select the best logistic regression model by tuning its hyperparameters with a grid search and k-fold cross-validation.
 
### Builds on
None

### Run time
approx. 10-20 minutes

### Notes



In [1]:
# Put the parent directory on the import path so the shared init_spark
# helper can be imported, then obtain the Spark session from it.
import os
import sys

top_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if top_dir not in sys.path:
    sys.path.append(top_dir)

from init_spark import init_spark

spark = init_spark()
spark  # last expression: display the session object

Initializing Spark...
Spark found in :  /Users/sujee/spark
Spark config:
	 spark.app.name=TestApp
	spark.master=local[*]
	executor.memory=2g
	spark.sql.warehouse.dir=/var/folders/lp/qm_skljd2hl4xtps5vw0tdgm0000gn/T/tmp3_wjo5r1
	some_property=some_value
Spark UI running on port 4040



## Step 1: Perform Logistic Regression

In [2]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

# Load the admissions data; the header row supplies the column names and the
# column types are inferred from the values.
admissions = spark.read.csv("/data/college-admissions/admission-data.csv",
                            header=True, inferSchema=True)

# Pack the three predictor columns into a single "features" vector column and
# copy "admit" into a "label" column (the estimator's default label column).
assembler = VectorAssembler(inputCols=["gpa", "gre", "rank"], outputCol="features")
featureVector = assembler.transform(admissions)
featureVector = featureVector.withColumn("label", featureVector["admit"])

# Fixed seed: without it the 70/30 split, and every metric computed from it
# further down, changes on each run of the notebook.
(train, test) = featureVector.randomSplit([0.7, 0.3], seed=42)

# Baseline model with hand-picked regularization settings.
lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0.8)
lrModel = lr.fit(train)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: [0.0,0.001731356923420387,0.0]
Intercept: -1.319320200338312


### Evaluate

In [3]:
# Score both splits with the fitted baseline model.
predictions_train = lrModel.transform(train)
predictions_test = lrModel.transform(test)

# Summary object of the fit. The training-set predictions are also
# available from it:
# predictions_train = lrModel.summary.predictions
trainingSummary = lrModel.summary

In [4]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

## Choose one of the following evaluators
## MulticlassClassificationEvaluator is a good all purpose evaluator
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
#                                               metricName="accuracy")

## AUC
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

## AreaUnderPR - useful for skewed datasets
# evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

# Label the scores with the evaluator's own metric name. The active evaluator
# reports area under the ROC curve, not accuracy, so a hard-coded "accuracy"
# label would misdescribe the numbers (and go stale when the evaluator is swapped).
metric_name = evaluator.getMetricName()
print("Training set", metric_name, "= ", evaluator.evaluate(predictions_train))
print("Test set", metric_name, "= ", evaluator.evaluate(predictions_test))

Training set accuracy =  0.8548094373865699
Test set accuracy =  0.8458646616541353


In [5]:
## Confusion matrix: one row per actual 'admit' value, one column per predicted value
counts_by_outcome = predictions_test.groupBy('admit').pivot('prediction').count()
# Combinations that never occur come back as null from the pivot; show them as 0.
cm = counts_by_outcome.na.fill(0).orderBy('admit')
cm.show()

+-----+---+---+
|admit|0.0|1.0|
+-----+---+---+
|    0| 19|  0|
|    1| 12|  2|
+-----+---+---+



## Step 2: Hyperparameter Tuning

### 2.1 - Explain the model's parameters

In [6]:
# Show the estimator's parameters with their descriptions and default/current values.
param_docs = lr.explainParams()
print(param_docs)

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0, current: 0.8)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The bounds vector size m

### 2.2 - Create grid search & Cross validator

In [7]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
# BinaryClassificationEvaluator is imported here as well so this cell does not
# depend on an earlier cell's import.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator


## Choose one of the following evaluators
## MulticlassClassificationEvaluator is a good all purpose evaluator
# evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
#                                               metricName="accuracy")

## AUC
evaluator = BinaryClassificationEvaluator(metricName='areaUnderROC')

## AreaUnderPR - useful for skewed datasets
# evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

# Create ParamGrid for Cross Validation.
# The grid is keyed on the params of `lr`, the estimator handed to the
# CrossValidator below, rather than on the already-fitted `lrModel`.
# 6 x 3 x 3 = 54 parameter combinations.
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 0.5, 0.8, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [10, 50, 100])
             .build())

# Create 5-fold CrossValidator; the seed fixes the fold assignment so the
# selected model is reproducible from run to run.
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=5, seed=42)

In [8]:
%%time

# Run cross validations
# Fits the CrossValidator on the training split only; the %%time cell magic
# above reports how long the whole cell took.
print("cross validation starting ...")
cvModel = cv.fit(train)
print("cross validation done.")

cross validation starting ...
cross validation done.
CPU times: user 5.54 s, sys: 1.29 s, total: 6.83 s
Wall time: 2min 8s


In [9]:
# Report the model chosen by cross validation: identity, learned weights,
# and the full parameter listing.
bestModel = cvModel.bestModel
print(f"Best model :  {bestModel}")
print(f"Coefficients: {bestModel.coefficients}")
print(f"Intercept: {bestModel.intercept}")
print(f"Best model params : \n {bestModel.explainParams()}")

Best model :  LogisticRegression_450a993c2da89b12d7f1
Coefficients: [0.004168532687271339,0.007645239340917154,0.0]
Intercept: -4.983846952223279
Best model params : 
 aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0, current: 1.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial. (default: auto)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term (default: True)
labelCol: label column name (default: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. (undefined)
maxIter: maximum nu

### 2.3 - Evaluate

In [10]:
# Score both splits with the cross-validated model.
predictions_cv_train, predictions_cv_test = (
    cvModel.transform(split) for split in (train, test)
)

In [11]:
# Label the scores with the evaluator's own metric name: the evaluator in use
# is not necessarily an accuracy evaluator, so a hard-coded "accuracy" label
# can misdescribe the numbers.
metric_name = evaluator.getMetricName()
print("Training", metric_name, "for cross validated model : ", evaluator.evaluate(predictions_cv_train))
print("Test", metric_name, "for cross validated model : ", evaluator.evaluate(predictions_cv_test))

Training accuracy for cross validated model :  0.8593466424682396
Test accuracy for cross validated model :  0.8609022556390978


In [12]:
# Confusion matrix for the cross-validated model: rows are the actual 'admit'
# values, columns are the predicted values.
cv_counts_by_outcome = predictions_cv_test.groupBy('admit').pivot('prediction').count()
# Combinations that never occur come back as null from the pivot; show them as 0.
cm = cv_counts_by_outcome.na.fill(0).orderBy('admit')
cm.show()

+-----+---+---+
|admit|0.0|1.0|
+-----+---+---+
|    0| 16|  3|
|    1|  5|  9|
+-----+---+---+

