In [1]:
import argparse
import re

from pyspark.sql import SparkSession

from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, RandomForestRegressionModel
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.mllib.evaluation import RegressionMetrics

In [2]:
# Paths of the CSV files read by the loading cell:
#   /FileStore/tables/cb1iklgz1490855926352/test.csv
#   /FileStore/tables/cb1iklgz1490855926352/train.csv

In [3]:
# Both files are read with the first line as the header and with column types
# inferred by Spark; the options are shared so the two reads cannot drift apart.
csvReadOptions = {"header": True, "inferSchema": True}

trainInput = spark.read.csv('/FileStore/tables/cb1iklgz1490855926352/train.csv', **csvReadOptions)
testInput = spark.read.csv('/FileStore/tables/cb1iklgz1490855926352/test.csv', **csvReadOptions)

In [4]:
# Preview the training DataFrame loaded above.
# NOTE(review): `display` is not imported anywhere in this notebook, so it is
# presumably the notebook environment's built-in renderer — confirm.
display(trainInput)

In [5]:
def setParams(args=None):
  """Parse the job parameters and return them as an argparse Namespace.

  Args:
    args: Optional list of argument strings to parse. When None (the
      default) argparse falls back to ``sys.argv[1:]``, which is the
      behaviour of the original zero-argument call. Passing an explicit
      list makes the function usable from a notebook cell or a test, where
      ``sys.argv`` belongs to the kernel rather than to this job.

  Returns:
    argparse.Namespace with the attributes trainInput, testInput,
    outputFile, algoNumTrees, algoMaxDepth, algoMaxBins (lists of int),
    numFolds (int), trainSample and testSample (float).

  Raises:
    SystemExit: raised by argparse when a required argument is missing or
      a value cannot be converted to the declared type.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument("--trainInput",  help="Path to file/directory for training data", required=True)
  parser.add_argument("--testInput",   help="Path to file/directory for test data", required=True)
  parser.add_argument("--outputFile",  help="Path to output file")
  # The three algo* options accept one or more values (nargs='+') so they can
  # be fed to a parameter grid.
  parser.add_argument("--algoNumTrees", nargs='+', type=int, help="One or more options for number of trees for RandomForest model. Default: 3", default=[3])
  parser.add_argument("--algoMaxDepth", nargs='+', type=int, help="One or more values for depth limit. Default: 4", default=[4])
  parser.add_argument("--algoMaxBins",  nargs='+', type=int, help="One or more values for max bins for RandomForest model. Default: 32", default=[32])
  parser.add_argument("--numFolds",    type=int,   help="Number of folds for K-fold Cross Validation. Default: 10", default=10)
  parser.add_argument("--trainSample", type=float, help="Sample fraction from 0.0 to 1.0 for train data", default=1.0)
  parser.add_argument("--testSample",  type=float, help="Sample fraction from 0.0 to 1.0 for test data", default=1.0)

  return parser.parse_args(args)

In [6]:
# The argparse-based parameters are disabled in this cell; settings are inline.
#params = setParams()

# Fixed seed so the 70/30 train/validation split is repeatable across runs
# instead of changing (together with every downstream metric) on each execution.
SPLIT_SEED = 42

# "loss" is renamed to "label", the name used as labelCol by the pipeline cell.
data = trainInput.withColumnRenamed("loss", "label")
trainingData, validationData = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Both splits are reused several times (cross-validation, then evaluation).
trainingData.cache()
validationData.cache()


In [7]:
#******************************************
print("Building Machine Learning pipeline")
#******************************************

# Fixed seed passed to the estimator and to the cross-validator so repeated
# runs of this cell are comparable.
SEED = 42

# Categorical columns excluded from the feature set because they have too
# many categories.
TOO_MANY_CATEGS = {"cat109", "cat110", "cat112", "cat113", "cat116"}

# Columns that are never features: the row identifier and the target.
NON_FEATURE_COLS = {"id", "label"}


def isCateg(c):
  """Return True when column name `c` denotes a categorical column ("cat*")."""
  return c.startswith("cat")


def categNewCol(c):
  """Return "idx_<c>" for a categorical column, otherwise `c` unchanged."""
  return "idx_{0}".format(c) if isCateg(c) else c


def removeTooManyCategs(c):
  """Return False for the categorical columns listed in TOO_MANY_CATEGS."""
  return c not in TOO_MANY_CATEGS


def onlyFeatureCols(c):
  """Return False for the id and label columns (exact match, not prefix)."""
  return c not in NON_FEATURE_COLS


# StringIndexer for categorical columns (OneHotEncoder should be evaluated as well).
# Each indexer is fitted on the union of the train and test values of its
# column (presumably so that labels occurring only in the test file still get
# an index). This is a real list rather than a lazy `map` object: on Python 3
# a `map` has no `append` and can only be iterated once.
stringIndexerStages = [
  StringIndexer(inputCol=c, outputCol=categNewCol(c))
    .fit(trainInput.select(c).union(testInput.select(c)))
  for c in trainingData.columns
  if isCateg(c)
]

# Definitive set of feature columns, materialised as a list because it is
# consumed twice: by the VectorAssembler here and by the report cell later.
featureCols = [
  categNewCol(c)
  for c in trainingData.columns
  if removeTooManyCategs(c) and onlyFeatureCols(c)
]

# VectorAssembler for training features
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

# Estimator algorithm
algo = RandomForestRegressor(featuresCol="features", labelCol="label", seed=SEED)

# New list, so `stringIndexerStages` itself is not mutated.
stages = stringIndexerStages + [assembler, algo]

# Building the Pipeline for transformations and predictor
pipeline = Pipeline(stages=stages)


#*********************************************************
print("Preparing K-fold Cross Validation and Grid Search")
#*********************************************************

paramGrid = (ParamGridBuilder()
     .addGrid(algo.numTrees, [3])
     .addGrid(algo.maxDepth, [4])
     .addGrid(algo.maxBins, [32])
     .build())

cv = CrossValidator(estimator=pipeline,
                    evaluator=RegressionEvaluator(),
                    estimatorParamMaps=paramGrid,
                    numFolds=10,
                    seed=SEED)


#**********************************************************
print("Training model with RandomForest algorithm")
#**********************************************************

cvModel = cv.fit(trainingData)


#********************************************************************
print("Evaluating model on train and test data and calculating RMSE")
#********************************************************************

# RegressionMetrics takes (prediction, observation) pairs, in that order.
# MSE/RMSE/MAE are symmetric in the two values, but R-squared and explained
# variance are not, so the prediction column must come first.
trainPredictionsAndLabels = cvModel.transform(trainingData).select("prediction", "label").rdd

validPredictionsAndLabels = cvModel.transform(validationData).select("prediction", "label").rdd

trainRegressionMetrics = RegressionMetrics(trainPredictionsAndLabels)
validRegressionMetrics = RegressionMetrics(validPredictionsAndLabels)

# The last stage of the best pipeline is the fitted random forest (`algo` is
# the final entry of `stages`); its importances line up with `featureCols`.
bestModel = cvModel.bestModel
featureImportances = bestModel.stages[-1].featureImportances.toArray()

In [8]:
# Assemble the textual report section by section and join once at the end.
# Only the metric and feature-importance sections are emitted.
RULE = "=====================================================================\n"

# One "<feature> = <importance>" line per feature column.
importanceLines = "\n".join(
    "{0} = {1}".format(str(name), str(score))
    for name, score in zip(featureCols, featureImportances))

reportSections = [
    "\n" + RULE,
    RULE,
    "Training data MSE = {0}\n".format(trainRegressionMetrics.meanSquaredError),
    "Training data RMSE = {0}\n".format(trainRegressionMetrics.rootMeanSquaredError),
    "Training data R-squared = {0}\n".format(trainRegressionMetrics.r2),
    "Training data MAE = {0}\n".format(trainRegressionMetrics.meanAbsoluteError),
    "Training data Explained variance = {0}\n".format(trainRegressionMetrics.explainedVariance),
    RULE,
    "Validation data MSE = {0}\n".format(validRegressionMetrics.meanSquaredError),
    "Validation data RMSE = {0}\n".format(validRegressionMetrics.rootMeanSquaredError),
    "Validation data R-squared = {0}\n".format(validRegressionMetrics.r2),
    "Validation data MAE = {0}\n".format(validRegressionMetrics.meanAbsoluteError),
    "Validation data Explained variance = {0}\n".format(validRegressionMetrics.explainedVariance),
    RULE,
    "RandomForest features importances:\n {0}\n".format(importanceLines),
    RULE,
]

output = "".join(reportSections)
print(output)

In [9]:
# Apply the fitted cross-validated model to the test DataFrame and show each
# row's id next to its predicted value.
display(cvModel.transform(testInput).select("id", "prediction"))