In [9]:
import java.time.LocalTime
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.lag
import org.apache.spark.sql.expressions.Window

import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.evaluation.MulticlassMetrics 
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics  

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer, VectorAssembler}


val spark = SparkSession
  .builder()
  .appName("Spark ML")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

import spark.sqlContext.implicits._

val bucket = "dataproc-temp-us-central1-1044206227610-i54vpwyj"
spark.conf.set("temporaryGcsBucket", bucket)

println("Scala language: "+util.Properties.versionString)

// spark.sparkContext.version
spark.version

val data_raw = spark.read.parquet("gs://dataset-flight/final-for-ml.parquet")
//     .drop("ARR_DATETIME_RND")
//     .drop("ARR_WBAN")

val data = data_raw
    .drop(data_raw.columns.filter(colName => colName.contains("2")) : _*)
    .drop(data_raw.columns.filter(colName => colName.contains("3")) : _*)
    .drop(data_raw.columns.filter(colName => colName.contains("WindSpeed")) : _*)
    .na.drop()

val nbRows = data.count()
println(data.columns.size, nbRows)

Scala language: version 2.12.18
(49,2499643)


In [10]:
data.select("IS_DELAYED").summary("count", "mean").show()

+-------+--------------------+
|summary|          IS_DELAYED|
+-------+--------------------+
|  count|             2499643|
|   mean|0.029532617257744406|
+-------+--------------------+



In [11]:
// columns that need to added to feature column
val featColumns = data.columns.filter(name => !name.contains("IS_DELAYED"))
println(data.columns.size, featColumns.length)

// VectorAssembler to add feature column
// input columns - cols
// feature column - features
val vectorAssembler = new VectorAssembler()
    .setInputCols(featColumns)
    .setOutputCol("features")
    .setHandleInvalid("skip") // options are "keep", "error" or "skip"


val featureDf = vectorAssembler.transform(data)
// featureDf.printSchema()


// StringIndexer define new 'label' column with 'result' column
// Index labels, adding metadata to the label column.
// Fit on whole dataset to include all labels in index.
val labelIndexer = new StringIndexer()
  .setInputCol("IS_DELAYED")
  .setOutputCol("label")


val labelDf = labelIndexer.fit(featureDf).transform(featureDf)
// labelDf.printSchema()

(49,48)


featColumns = Array(DEP_DryBulbFarenheit, DEP_Visibility, DEP_WindDirection, DEP_StationPressure, DEP_WetBulbFarenheit, DEP_DewPointCelsius, DEP_clearSky, DEP_fewClouds, DEP_scatterClouds, DEP_brokenClouds, DEP_overCast, DEP_obscuredSky, DEP_partiallyObscuredSky, DEP_WindDirection1, DEP_StationPressure1, DEP_WetBulbFarenheit1, DEP_DewPointCelsius1, DEP_clearSky1, DEP_fewClouds1, DEP_scatterClouds1, DEP_brokenClouds1, DEP_overCast1, DEP_obscuredSky1, DEP_partiallyObscuredSky1, ARR_DryBulbFarenheit, ARR_Visibility, ARR_WindDirection, ARR_StationPressure, ARR_WetBulbFarenheit, ARR_DewPointCelsius, ARR_clearSky, ARR_fewClouds, ARR_scatterClouds, ARR_brokenClouds, ARR_overCast, ARR_obscuredSky, ARR_partiallyObscuredSky, ARR_WindDirection1, ARR_StationPressure1, ARR_WetBulbFare...


Array(DEP_DryBulbFarenheit, DEP_Visibility, DEP_WindDirection, DEP_StationPressure, DEP_WetBulbFarenheit, DEP_DewPointCelsius, DEP_clearSky, DEP_fewClouds, DEP_scatterClouds, DEP_brokenClouds, DEP_overCast, DEP_obscuredSky, DEP_partiallyObscuredSky, DEP_WindDirection1, DEP_StationPressure1, DEP_WetBulbFarenheit1, DEP_DewPointCelsius1, DEP_clearSky1, DEP_fewClouds1, DEP_scatterClouds1, DEP_brokenClouds1, DEP_overCast1, DEP_obscuredSky1, DEP_partiallyObscuredSky1, ARR_DryBulbFarenheit, ARR_Visibility, ARR_WindDirection, ARR_StationPressure, ARR_WetBulbFarenheit, ARR_DewPointCelsius, ARR_clearSky, ARR_fewClouds, ARR_scatterClouds, ARR_brokenClouds, ARR_overCast, ARR_obscuredSky, ARR_partiallyObscuredSky, ARR_WindDirection1, ARR_StationPressure1, ARR_WetBulbFare...

In [12]:
// Split the data into training and test sets (30% held out for testing).
// split data set training and test
// training data set - 70%
// test data set - 30%
val seed = 42

val zeros = labelDf.filter(col("IS_DELAYED") === 0).sample(0.1, seed)
val ones = labelDf.filter(col("IS_DELAYED") === 1)

// split datasets into training and testing
val Array(train0, test0) = zeros.randomSplit(Array(0.7, 0.3), seed)
val Array(train1, test1) = ones.randomSplit(Array(0.7, 0.3), seed)

// stack datasets back together
val trainingData = train0.union(train1)
val testData = test0.union(test1)

println(f"""There are ${trainingData.count} rows in the training set, and ${testData.count} in the test set""")

There are 221701 rows in the training set, and 94543 in the test set


seed = 42
zeros = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 49 more fields]
ones = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 49 more fields]
train0 = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 49 more fields]
test0 = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 49 more fields]
train1 = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 49 more fields]
test1 = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigi...


[IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigi...

In [13]:
testData.select("IS_DELAYED").summary("count", "mean").show()

+-------+-------------------+
|summary|         IS_DELAYED|
+-------+-------------------+
|  count|              94543|
|   mean|0.23189448187597178|
+-------+-------------------+



In [14]:
trainingData.select("IS_DELAYED").summary("count", "mean").show()

+-------+------------------+
|summary|        IS_DELAYED|
+-------+------------------+
|  count|            221701|
|   mean|0.2340855476520178|
+-------+------------------+



In [15]:
// train Random Forest model with training data set
val randomForestClassifier = new RandomForestClassifier()
  .setImpurity("gini")
  .setMaxDepth(15)
  .setNumTrees(30)
  .setFeatureSubsetStrategy("auto")
  .setSeed(seed)
  .setLabelCol("label") // indexedLabel
  .setFeaturesCol("features") // indexedFeatures

val randomForestModel = randomForestClassifier.fit(trainingData)

val predictionDf = randomForestModel.transform(testData)

// Select example rows to display.
predictionDf.select("IS_DELAYED", "features", "label", "rawPrediction", "probability", "prediction").show(5)

+----------+--------------------+-----+--------------------+--------------------+----------+
|IS_DELAYED|            features|label|       rawPrediction|         probability|prediction|
+----------+--------------------+-----+--------------------+--------------------+----------+
|         0|(48,[0,1,2,3,4,5,...|  0.0|[24.4463421293850...|[0.81487807097950...|       0.0|
|         0|(48,[0,1,3,4,5,7,...|  0.0|[24.1233321464375...|[0.80411107154791...|       0.0|
|         0|(48,[0,1,3,4,5,9,...|  0.0|[25.3166075611762...|[0.84388691870587...|       0.0|
|         0|(48,[0,1,2,3,4,5,...|  0.0|[27.9958769651046...|[0.93319589883682...|       0.0|
|         0|(48,[0,1,3,4,5,7,...|  0.0|[26.4653602134310...|[0.88217867378103...|       0.0|
+----------+--------------------+-----+--------------------+--------------------+----------+
only showing top 5 rows



randomForestClassifier = rfc_e217715724d7
randomForestModel = RandomForestClassificationModel: uid=rfc_e217715724d7, numTrees=30, numClasses=2, numFeatures=48
predictionDf = [IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 52 more fields]


[IS_DELAYED: bigint, DEP_DryBulbFarenheit: bigint ... 52 more fields]

Confusion matrix

In [16]:
val predictionAndLabels = predictionDf
    .select("prediction", "label")
    .as[(Double, Double)]
    .rdd

// Instantiate a new metrics objects
val bMetrics = new BinaryClassificationMetrics(predictionAndLabels)
val mMetrics = new MulticlassMetrics(predictionAndLabels)
val labels = mMetrics.labels

// Print out the Confusion matrix
println("Confusion matrix:")
println(mMetrics.confusionMatrix)

val trueNegative = predictionDf.filter(col("prediction") === 0 && col("label") === col("prediction")).count().toDouble
val truePositive = predictionDf.filter(col("prediction") === 1 && col("label") === col("prediction")).count().toDouble
val falseNegative = predictionDf.filter(col("prediction") === 0 && col("label") =!= col("prediction")).count().toDouble
val falsePositive = predictionDf.filter(col("prediction") === 1 && col("label") =!= col("prediction")).count().toDouble

val totalDelays = predictionDf.filter(col("label") === 1).count()

val precision = truePositive / (truePositive + falsePositive) 
val recall = truePositive / (truePositive + falseNegative) 
val acc = (truePositive + trueNegative) / (truePositive + trueNegative + falsePositive + falseNegative) 
val f1score = 2 * precision * recall / (precision + recall) 

Confusion matrix:
70945.0  1674.0  
16735.0  5189.0  


predictionAndLabels = MapPartitionsRDD[269] at rdd at <console>:113
bMetrics = org.apache.spark.mllib.evaluation.BinaryClassificationMetrics@5f9519f7
mMetrics = org.apache.spark.mllib.evaluation.MulticlassMetrics@371c5d51
labels = Array(0.0, 1.0)
trueNegative = 70945.0
truePositive = 5189.0
falseNegative = 16735.0
falsePositive = 1674.0
totalDelays = 21924
precision = 0.7560833454757395
recall = 0.23668126254333152
acc = 0.8052843679595528
f1score = 0.3605099524090735


0.3605099524090735

## Conclusion:
- nulls intelligement + FEAT selection on enlève _2 & _3
- stratified split + undersampling
- Random Forest MaxDepth(__15__)  .setNumTrees(__30__)
- precision = 0.75
- recall = 0.24
- acc = 0.81
- f1score = 0.36

feature importance

https://stackoverflow.com/questions/47043836/scala-random-forest-feature-importance-extraction-with-names-labels

In [26]:
val featureImportance = decisionTreeModel.featureImportances

// featureImportance.getClass

val res = featColumns.zip(featureImportance.toArray).sortBy(-_._2)


for (e <- res.take(20)) println(e)

(DEP_DryBulbFarenheit,0.09382043746894786)
(DEP_WindSpeed,0.04370395003943582)
(DEP_WindDirection,0.03871905883838827)
(ARR_DryBulbFarenheit,0.038422662389145906)
(DEP_StationPressure,0.036825118940613154)
(DEP_WindSpeed1,0.028278354339229934)
(ARR_WindSpeed,0.024823332949721562)
(DEP_Visibility,0.024500251206569505)
(ARR_WindDirection3,0.0237414653741213)
(DEP_WindDirection1,0.02215922532080796)
(ARR_WindDirection,0.021235844920245953)
(DEP_WindSpeed2,0.020899678261048275)
(ARR_DryBulbFarenheit2,0.020110689751338978)
(ARR_WindSpeed1,0.019921161509000935)
(ARR_Visibility,0.019465331837996484)
(ARR_WindSpeed3,0.0184810343380359)
(ARR_StationPressure,0.018194620859103575)
(ARR_WindSpeed2,0.01629911240552861)
(ARR_WindDirection1,0.015676205758860497)
(ARR_WindDirection2,0.01559791315197842)


featureImportance = (108,[0,1,2,3,4,5,6,7,8,9,10,11,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,33,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,50,51,54,55,56,57,58,59,60,61,62,63,64,65,68,69,70,71,72,73,74,75,76,77,78,80,81,82,83,84,85,86,88,89,90,91,92,94,95,96,97,98,99,100,101,102,103,104,105,106],[0.09382043746894786,0.024500251206569505,0.03871905883838827,0.04370395003943582,0.036825118940613154,0.01062246343702173,0.007872779224264983,9.380841791536837E-4,0.005963770656519379,0.0037234492316601717,0.008461155404203068,0.014366613387173618,0.02215922532080796,0.028278354339229934,0.014111237875049462,0.013685486108309995,0.010640734604133248,7.917591873401629E-4,0.003089402446096833,0.002670016894893009,0.006981975850576899,0.0037139099...


(108,[0,1,2,3,4,5,6,7,8,9,10,11,14,15,16,17,18,19,20,21,22,23,24,26,27,28,29,30,31,32,33,34,35,36,37,38,40,41,42,43,44,45,46,47,48,49,50,51,54,55,56,57,58,59,60,61,62,63,64,65,68,69,70,71,72,73,74,75,76,77,78,80,81,82,83,84,85,86,88,89,90,91,92,94,95,96,97,98,99,100,101,102,103,104,105,106],[0.09382043746894786,0.024500251206569505,0.03871905883838827,0.04370395003943582,0.036825118940613154,0.01062246343702173,0.007872779224264983,9.380841791536837E-4,0.005963770656519379,0.0037234492316601717,0.008461155404203068,0.014366613387173618,0.02215922532080796,0.028278354339229934,0.014111237875049462,0.013685486108309995,0.010640734604133248,7.917591873401629E-4,0.003089402446096833,0.002670016894893009,0.006981975850576899,0.0037139099...