In [1]:
%classpath add mvn org.apache.spark spark-sql_2.11 2.1.0
org.apache.log4j.Logger.getRootLogger().setLevel(org.apache.log4j.Level.ERROR);


null

In [4]:
import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._ 


import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._


In [5]:
val spark = SparkSession.builder() 
  .master("local[*]") 
  .config("spark.executor.memory", "3g")
  .config("spark.sql.warehouse.dir", "/tmp/spark-warehouse")
  .appName("NohupReader")
  .getOrCreate()


org.apache.spark.sql.SparkSession@694b6248

In [6]:
import spark.implicits._

val research_home: String = scala.util.Properties.envOrElse("RESEARCH_HOME", "/home/acald013/Research/")
val folder = s"${research_home}Scripts/Python/"
val prefix = "nohup"

val nohup = spark.read.textFile(s"${folder}${prefix}.out")


org.apache.spark.sql.SparkSession$implicits$@7d0bb9f2

In [7]:
println(nohup.count)

4763


null

In [8]:
org.apache.spark.sql.catalyst.encoders.OuterScopes.addOuterScope(this)
case class Line(line: String, n: Long)
case class Run(runID: Long, date: String, method: String, cores: Int, epsilon: Double, mu: Int, delta: Int, methodTime: Double)
case class Stage(runID: Long, n: Long, timestamp: String, stage: String, stageTime: Double, load: Int, unit: String)
case class MDFrow(mdfID: Long, n: Long, method: String, epsilon: Double, mu: Int, delta: Int, stage: String, time: Double, load: Int, unit: String)

implicit class DatasetOps(ds: org.apache.spark.sql.Dataset[_]) {
    def display(rows: Int = 20) = {
        import com.twosigma.beakerx.scala.table.TableDisplay
        val columns = ds.columns
        val rowVals = ds.toDF.take(rows)
        val t = new TableDisplay(rowVals map (row => (columns zip row.toSeq).toMap))
        t.display()
    }
}

defined class Line
defined class Run
defined class Stage
defined class MDFrow
defined class DatasetOps


In [9]:
val lines = nohup.toDF("line").withColumn("n", monotonicallyIncreasingId).as[Line].cache()
val nLines = lines.count()
lines.show(10)

+--------------------+---+
|                line|  n|
+--------------------+---+
|FLOCKFINDER=Merge...|  0|
|acald013@dblab-ra...|  2|
|acald013@dblab-ra...|  3|
|acald013@dblab-ra...|  4|
|acald013@dblab-ra...|  5|
|no org.apache.spa...|  6|
|starting org.apac...|  7|
|acald013@dblab-ra...|  8|
|acald013@dblab-ra...|  9|
+--------------------+---+
only showing top 10 rows



null

In [10]:
val indicesRun = lines.filter{ l => 
        l.line.contains("=== MergeLast Start ===") || l.line.contains("method=MergeLast,") ||
        l.line.contains("=== SpatialJoin Start ===") || l.line.contains("method=SpatialJoin,")
    }
    .orderBy("n")
    .select("n")
    .collect()
    .toList
    .map(_.getLong(0))
    .grouped(2)
    .toList
    .map(pair => (pair.head, pair.last))
    .filter(r => r._1 != r._2)
    .zipWithIndex
val indexRun = spark.createDataset(indicesRun)
    .flatMap{ pair => 
        (pair._1._1 to pair._1._2)
        .toList.map(v => (pair._2, v))
    }
    .toDF("runID","n")
    .cache
indexRun.show(10, truncate = false)

+-----+---+
|runID|n  |
+-----+---+
|0    |18 |
|0    |19 |
|0    |20 |
|0    |21 |
|0    |22 |
|0    |23 |
|0    |24 |
|0    |25 |
|0    |26 |
|0    |27 |
+-----+---+
only showing top 10 rows



null

In [11]:
val runs = indexRun.join(lines, "n").
    groupBy("runID").
    agg(max($"n").alias("n")).
    join(lines, "n").
    select("runID", "line").
    orderBy("runID").
    map{ row =>
        val runID = row.getInt(0)
        val line  = row.getString(1)
        var arr1  = line.split(" -> ")
        val date  = arr1(0)
        val arr2  = arr1(1).split(",")
        val method  = arr2(0).split("=")(1)
        val cores   = arr2(1).split("=")(1).toInt
        val epsilon = arr2(2).split("=")(1).toDouble
        val mu      = arr2(3).split("=")(1).toInt
        val delta   = arr2(4).split("=")(1).toInt
        val time    = arr2(5).split("=")(1).toDouble
        Run(runID, date, method, cores, epsilon, mu, delta, time)
    }.
    cache
val nRuns = runs.count()
runs.show(nRuns.toInt, truncate = false)



+-----+-----------------------+---------+-----+-------+---+-----+----------+
|runID|date                   |method   |cores|epsilon|mu |delta|methodTime|
+-----+-----------------------+---------+-----+-------+---+-----+----------+
|0    |2018-06-07 22:22:17,016|MergeLast|28   |100.0  |4  |5    |927.875   |
|1    |2018-06-08 10:40:22,043|MergeLast|28   |60.0   |4  |5    |654.964   |
|2    |2018-06-08 10:52:07,203|MergeLast|28   |70.0   |4  |5    |686.385   |
|3    |2018-06-08 11:03:42,503|MergeLast|28   |80.0   |4  |5    |677.862   |
|4    |2018-06-08 11:17:24,874|MergeLast|28   |90.0   |4  |5    |798.868   |
+-----+-----------------------+---------+-----+-------+---+-----+----------+



null

In [12]:
val sampleRuns = runs.filter($"method" === "MergeLast").filter($"mu" === 4 || $"mu" === 3).filter($"delta" === 3 || $"delta" === 5)
val nSampleRuns = sampleRuns.count()
sampleRuns.show(nSampleRuns.toInt, truncate = false)

+-----+-----------------------+---------+-----+-------+---+-----+----------+
|runID|date                   |method   |cores|epsilon|mu |delta|methodTime|
+-----+-----------------------+---------+-----+-------+---+-----+----------+
|0    |2018-06-07 22:22:17,016|MergeLast|28   |100.0  |4  |5    |927.875   |
|1    |2018-06-08 10:40:22,043|MergeLast|28   |60.0   |4  |5    |654.964   |
|2    |2018-06-08 10:52:07,203|MergeLast|28   |70.0   |4  |5    |686.385   |
|3    |2018-06-08 11:03:42,503|MergeLast|28   |80.0   |4  |5    |677.862   |
|4    |2018-06-08 11:17:24,874|MergeLast|28   |90.0   |4  |5    |798.868   |
+-----+-----------------------+---------+-----+-------+---+-----+----------+



null

In [13]:
val stages = lines.filter(_.line.contains("|")).
    join(indexRun, "n").
    map{ m =>
        val n         = m.getLong(0)
        val line      = m.getString(1)
        val runID     = m.getInt(2)
        var arr1      = line.split(" -> ")
        val timestamp = arr1(0).trim
        val arr2      = arr1(1).split(" \\| ")
        val stage     = arr2(0).trim
        val time      = arr2(1).trim.dropRight(1).toDouble
        val arr3      = arr2(2).trim.split(" ")
        val load      = arr3(0).toInt
        val unit      = arr3(1)
        Stage(runID, n, timestamp, stage, time, load, unit)
    }.
    join(sampleRuns.select($"runID"), "runID").
    cache
    
println(stages.count())
stages.show(10, truncate = false)

org.apache.spark.SparkException:  Job aborted due to stage failure

In [19]:
val ml_stages = stages.join(runs, "runID").
    filter($"method" === "MergeLast").
    select($"runID", $"n", $"method", $"epsilon", $"mu", $"delta", $"stage".alias("stage0"), $"stageTime").
    withColumn("stage1", regexp_replace($"stage0", "Reporting locations at t=\\d+", "0.Reporting locations")).
    withColumn("stage2", regexp_replace($"stage1", "Checking internal timestamps", "4.Checking internals")).
    withColumn("stage3", regexp_replace($"stage2", "\\.\\.\\.", "")).
    select($"runID", $"n", $"method", $"epsilon", $"mu", $"delta", $"stage3".alias("stage"), $"stageTime").
    filter(!$"stage".rlike("4.Distance Join phase")).
    filter(!$"stage".rlike("5.Getting candidates"))
ml_stages.show(truncate = false)


+-----+---+---------+-------+---+-----+----------------------------+---------+
|runID|n  |method   |epsilon|mu |delta|stage                       |stageTime|
+-----+---+---------+-------+---+-----+----------------------------+---------+
|1    |291|MergeLast|30.0   |4  |3    |0.Reporting locations       |5.63     |
|1    |307|MergeLast|30.0   |4  |3    |1.Set of disks for t_i      |31.31    |
|1    |308|MergeLast|30.0   |4  |3    |0.Reporting locations       |4.98     |
|1    |324|MergeLast|30.0   |4  |3    |2.Set of disks for t_i+delta|20.51    |
|1    |325|MergeLast|30.0   |4  |3    |3.Joining timestams         |8.55     |
|1    |326|MergeLast|30.0   |4  |3    |4.Checking internals        |11.68    |
|1    |327|MergeLast|30.0   |4  |3    |0.Reporting locations       |4.91     |
|1    |343|MergeLast|30.0   |4  |3    |1.Set of disks for t_i      |19.58    |
|1    |344|MergeLast|30.0   |4  |3    |0.Reporting locations       |4.93     |
|1    |360|MergeLast|30.0   |4  |3    |2.Set of disk

null

In [20]:
val sj_stages = stages.join(runs, "runID").
    filter($"method" === "SpatialJoin").
    select($"runID", $"n", $"method", $"epsilon", $"mu", $"delta", $"stage".alias("stage0"), $"stageTime").
    withColumn("stage1", regexp_replace($"stage0", "\\.\\.\\.", "")).
    withColumn("stage2", regexp_replace($"stage1", "Reporting", "0.Reporting")).
    select($"runID", $"n", $"method", $"epsilon", $"mu", $"delta", $"stage2".alias("stage"), $"stageTime").
    filter(!$"stage".rlike("4.Distance Join phase")).
    filter(!$"stage".rlike("5.Getting candidates"))
sj_stages.show(truncate = false)



+-----+---+------+-------+---+-----+-----+---------+
|runID|n  |method|epsilon|mu |delta|stage|stageTime|
+-----+---+------+-------+---+-----+-----+---------+
+-----+---+------+-------+---+-----+-----+---------+



null

In [22]:
val data = runs.select($"runID", $"method", $"epsilon", $"mu", $"delta", $"methodTime".alias("time")).
    filter($"method" === "MergeLast").filter($"mu" === 4 || $"mu" === 3).filter($"delta" === 3 || $"delta" === 5).
    orderBy($"runID", $"epsilon", $"method").
    cache
println(data.count())
data.show(data.count().toInt, truncate = false)

10
+-----+---------+-------+---+-----+-------+
|runID|method   |epsilon|mu |delta|time   |
+-----+---------+-------+---+-----+-------+
|1    |MergeLast|30.0   |4  |3    |592.782|
|3    |MergeLast|30.0   |4  |3    |584.446|
|5    |MergeLast|30.0   |4  |3    |593.125|
|7    |MergeLast|30.0   |4  |3    |577.23 |
|9    |MergeLast|30.0   |4  |3    |585.75 |
|11   |MergeLast|30.0   |3  |5    |689.323|
|16   |MergeLast|30.0   |3  |5    |668.433|
|20   |MergeLast|30.0   |3  |5    |681.964|
|24   |MergeLast|30.0   |3  |5    |689.051|
|28   |MergeLast|30.0   |3  |5    |687.891|
+-----+---------+-------+---+-----+-------+



null

In [23]:
val d = data.collect.map(_.mkString(";")).mkString("\n")

import java.io._
val pw = new PrintWriter(new File(s"${folder}methods_${prefix}_draft.csv" ))
pw.write(s"$d\n")
pw.close

null

In [24]:
val d = ml_stages.union(sj_stages).collect.map(_.mkString(";")).mkString("\n")

import java.io._
val pw = new PrintWriter(new File(s"${folder}stages_${prefix}_draft.csv" ))
pw.write(s"$d\n")
pw.close

null

In [None]:
val indicesMdf = lines.filter{ l => 
        l.line.contains(" -> Setting mu=") || l.line.contains(" ->   berlin0-10,") 
    }
    .orderBy("n")
    .select("n")
    .collect()
    .toList
    .map(_.getLong(0))
    .grouped(2)
    .toList
    .map(pair => (pair.head, pair.last))
    .filter(r => r._1 != r._2)
    .zipWithIndex
val indexMdf = spark.createDataset(indicesMdf)
    .flatMap{ pair => 
        (pair._1._1 to pair._1._2)
        .toList.map(v => (pair._2, v))
    }
    .toDF("mdfID","n")
    .cache
indexMdf.show(15, truncate = false)

In [None]:
val mdfInfo = indexMdf.groupBy($"mdfID").agg(max($"n").alias("m")).orderBy($"m")
mdfInfo.show(10, truncate = false)

In [None]:
val mdfInfo = indexMdf.groupBy($"mdfID").agg(max($"n").alias("n")).orderBy($"n").
    join(indexRun, "n").
    join(runs, "runID").
    join(lines, "n").
    select($"method", $"epsilon", $"mu", $"delta", $"line").
    withColumn("timestamp", substring($"line", 122, 124)).
    select($"method", $"epsilon", $"mu", $"delta", $"timestamp")

mdfInfo.show(truncate = false)

In [None]:
val d = mdfInfo.collect.map(_.mkString(";")).mkString("\n")

import java.io._
val pw = new PrintWriter(new File(s"${folder}mdfInfo_${prefix}.csv" ))
pw.write(s"$d\n")
pw.close