In [1]:
import org.apache.spark.mllib

In [2]:
import org.apache.spark.sql.functions.explode

In [3]:
// Reuse the SQLContext owned by the notebook's SparkSession (`spark`).
// Constructing one directly with `new SQLContext(sc)` is deprecated as of Spark 2.0.
val sqlContext = spark.sqlContext

sqlContext = org.apache.spark.sql.SQLContext@5d0f338c




org.apache.spark.sql.SQLContext@5d0f338c

In [4]:
import sqlContext.implicits._ 

In [5]:
import org.apache.spark.sql.functions._

In [6]:
// Show the Spark version of the running session (echoed as the cell result).
spark.version

2.0.0

DATAFRAME STATISTICS TUTORIAL

In [9]:
// Load the question tags CSV: the first row is a header and column types are
// inferred; the two columns are then named id and tag.
val dfTags = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("resources/question_tags_10K.csv")
  .toDF("id", "tag")

dfTags = [id: int, tag: string]


[id: int, tag: string]

In [10]:
// Build a DataFrame from the questions file questions_10K.csv and name its columns.
val dfQuestionsCSV = {
  // CSV reader settings, kept together so they are easy to scan
  val csvOptions = Map(
    "header" -> "true",
    "inferSchema" -> "true",
    "dateFormat" -> "yyyy-MM-dd HH:mm:ss"
  )
  // Column names, in file order
  val questionColumns = Seq(
    "id", "creation_date", "closed_date", "deletion_date",
    "score", "owner_userid", "answer_count"
  )

  spark.read
    .options(csvOptions)
    .csv("resources/questions_10K.csv")
    .toDF(questionColumns: _*)
}


dfQuestionsCSV = [id: int, creation_date: string ... 5 more fields]


[id: int, creation_date: string ... 5 more fields]

In [11]:
dfQuestionsCSV.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- closed_date: string (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: string (nullable = true)
 |-- answer_count: string (nullable = true)



In [12]:
// Cast every column of the raw CSV DataFrame to its intended data type.
val dfQuestions = {
  // (column name -> target Spark SQL type), listed in output column order
  val targetTypes = Seq(
    "id"            -> "integer",
    "creation_date" -> "timestamp",
    "closed_date"   -> "timestamp",
    "deletion_date" -> "date",
    "score"         -> "integer",
    "owner_userid"  -> "integer",
    "answer_count"  -> "integer"
  )
  val typedColumns = targetTypes.map { case (name, dataType) =>
    dfQuestionsCSV.col(name).cast(dataType)
  }
  dfQuestionsCSV.select(typedColumns: _*)
}

dfQuestions = [id: int, creation_date: timestamp ... 5 more fields]


[id: int, creation_date: timestamp ... 5 more fields]

In [13]:
dfQuestions.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



In [14]:
// Average of the score column
// (avg and col come from org.apache.spark.sql.functions._, imported earlier)
dfQuestions
  .select(avg(col("score")))
  .show()


+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



In [15]:
  // Maximum, minimum, mean and sum of the score column.
  // Each aggregate is selected and shown on its own, in that order,
  // so four separate one-row tables are printed.
  Seq(max("score"), min("score"), mean("score"), sum("score"))
    .foreach(aggregate => dfQuestions.select(aggregate).show())


+----------+
|max(score)|
+----------+
|      4443|
+----------+

+----------+
|min(score)|
+----------+
|       -27|
+----------+

+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+

+----------+
|sum(score)|
+----------+
|    361427|
+----------+



In [16]:
 // Group by with statistics:
 // restrict to questions with 400 < id < 450 that have an owner, join them to
 // their tags on id, then report the average score and the largest answer_count
 // per owner_userid.
  dfQuestions
    .where("id > 400 and id < 450")
    .where("owner_userid is not null")
    .join(dfTags, dfQuestions("id") === dfTags("id"))
    .groupBy(dfQuestions("owner_userid"))
    .agg(avg("score"), max("answer_count"))
    .show()


+------------+----------+-----------------+
|owner_userid|avg(score)|max(answer_count)|
+------------+----------+-----------------+
|         268|      26.0|                1|
|         136|      57.6|                9|
|         123|      20.0|                3|
+------------+----------+-----------------+



In [17]:
// DataFrame Statistics using describe() method
// describe() is called without column names. The recorded output below has one
// row each for count, mean, stddev, min and max, covering the id, score,
// owner_userid and answer_count columns; every column of the result is a string.
  val dfQuestionsStatistics = dfQuestions.describe()
  dfQuestionsStatistics.show()


+-------+-----------------+------------------+-----------------+------------------+
|summary|               id|             score|     owner_userid|      answer_count|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             9999|              9999|             7388|              9922|
|   mean|33929.17081708171| 36.14631463146315|47389.99472116947|6.6232614392259626|
| stddev|19110.09560532429|160.48316753972045|280943.1070344427| 9.069109116851138|
|    min|                1|               -27|                1|                -5|
|    max|            66037|              4443|          3431280|               316|
+-------+-----------------+------------------+-----------------+------------------+



dfQuestionsStatistics = [summary: string, id: string ... 3 more fields]


[summary: string, id: string ... 3 more fields]

In [18]:
 // Correlation
 // Correlation between the score and answer_count columns, returned as a
 // single number and printed.
  val correlation = dfQuestions.stat.corr("score", "answer_count")
  println(s"correlation between column score and answer_count = $correlation")


  // Covariance
  // Covariance of the same two columns, also returned as a single number.
  val covariance = dfQuestions.stat.cov("score", "answer_count")
  println(s"covariance between column score and answer_count = $covariance")


 

correlation between column score and answer_count = 0.3699847903294707
covariance between column score and answer_count = 537.513381444165


correlation = 0.3699847903294707
covariance = 537.513381444165


537.513381444165

In [19]:
 // Frequent Items
 // Finds frequently occurring values of answer_count. The result is a
 // DataFrame with one array column named answer_count_freqItems
 // (see the recorded output below).
 // NOTE(review): the val is called dfFrequentScore although the column analysed
 // is answer_count, not score — consider renaming if nothing else relies on it.
  val dfFrequentScore = dfQuestions.stat.freqItems(Seq("answer_count"))
  dfFrequentScore.show()


+----------------------+
|answer_count_freqItems|
+----------------------+
|  [23, 131, 77, 86,...|
+----------------------+



dfFrequentScore = [answer_count_freqItems: array<int>]


[answer_count_freqItems: array<int>]

In [20]:
// Crosstab
// Cross-tabulates score against owner_userid for rows whose owner_userid is
// strictly between 0 and 20. In the recorded output the first column is named
// score_owner_userid and holds the score values; the remaining columns are
// named after the owner_userid values and hold bigint counts.
  val dfScoreByUserid = dfQuestions
    .filter("owner_userid > 0 and owner_userid < 20")
    .stat
    .crosstab("score", "owner_userid")
  // Print only the first 10 rows of the table.
  dfScoreByUserid.show(10)


  // Stratified sampling using sampleBy — preparation step.
  // Keep only questions that have an owner and whose answer_count is 5, 10 or 20.
  val dfQuestionsByAnswerCount = dfQuestions
    .where("owner_userid > 0")
    .where("answer_count in (5, 10, 20)")

  // Baseline before sampling: number of matching rows per answer_count value.
  dfQuestionsByAnswerCount.groupBy("answer_count").count().show()


+------------------+---+---+---+---+---+---+---+---+---+---+
|score_owner_userid|  1| 11| 13| 17|  2|  3|  4|  5|  8|  9|
+------------------+---+---+---+---+---+---+---+---+---+---+
|                56|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|               472|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|                14|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|
|                20|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|               179|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|
|                84|  0|  0|  0|  0|  1|  0|  0|  0|  0|  0|
|               160|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|
|                21|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|                 9|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|
|                 2|  0|  0|  0|  0|  0|  0|  0|  1|  0|  1|
+------------------+---+---+---+---+---+---+---+---+---+---+
only showing top 10 rows

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  811|
|          10|  272|
+------------+-----+



dfScoreByUserid = [score_owner_userid: string, 1: bigint ... 9 more fields]
dfQuestionsByAnswerCount = [id: int, creation_date: timestamp ... 5 more fields]


[id: int, creation_date: timestamp ... 5 more fields]

In [21]:
// Sampling fraction per answer_count value (each fraction must lie in [0, 1]):
//   answer_count = 5  -> keep about 50% of the rows
//   answer_count = 10 -> keep about 10% of the rows
//   answer_count = 20 -> keep 100% of the rows
val fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

// Draw the stratified sample twice, first with random seed 7 and then with 37,
// and print the per-stratum row counts each time. Changing the seed changes
// which rows are drawn, so the two sets of counts can differ.
Seq(7L, 37L).foreach { seed =>
  dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, seed)
    .groupBy("answer_count")
    .count()
    .show()
}


  // Approximate Quantile
  // Requests the 0.0, 0.5 and 1.0 quantiles of score; the last argument (0.25)
  // is the relative error passed to approxQuantile.
  val quantiles = dfQuestions
    .stat
    .approxQuantile("score", Array(0.0, 0.5, 1.0), 0.25)
  // Label spelling corrected from "Qauntiles".
  println(s"Quantiles segments = ${quantiles.toSeq}")


+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  400|
|          10|   26|
+------------+-----+

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  388|
|          10|   25|
+------------+-----+

Qauntiles segments = WrappedArray(-27.0, 2.0, 4443.0)


fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)
quantiles = Array(-27.0, 2.0, 4443.0)


[-27.0, 2.0, 4443.0]

In [22]:
// Bloom Filter
// Builds a Bloom filter over the tag column of dfTags, then probes it for one
// tag that is looked up as present ("java") and one made-up tag.
// NOTE(review): the two numeric arguments look like the expected number of
// items (1000) and the target false-positive probability (0.1) — confirm
// against the DataFrameStatFunctions.bloomFilter API docs.
val tagsBloomFilter = dfTags.stat.bloomFilter("tag", 1000L, 0.1)
  println(s"bloom filter contains java tag = ${tagsBloomFilter.mightContain("java")}")
  println(s"bloom filter contains some unknown tag = ${tagsBloomFilter.mightContain("unknown tag")}")


bloom filter contains java tag = true
bloom filter contains some unknown tag = false


tagsBloomFilter = org.apache.spark.util.sketch.BloomFilterImpl@809c4023


org.apache.spark.util.sketch.BloomFilterImpl@809c4023

In [23]:
 // Count Min Sketch
 // Builds a count-min sketch over the tag column of dfTags and uses it to
 // estimate how often the tag "java" occurs.
 // NOTE(review): the numeric arguments look like relative error (0.1),
 // confidence (0.9) and random seed (37) — confirm against the
 // DataFrameStatFunctions.countMinSketch API docs.
  val cmsTag = dfTags.stat.countMinSketch("tag", 0.1, 0.9, 37)
  val estimatedFrequency = cmsTag.estimateCount("java")
  println(s"Estimated frequency for tag java = $estimatedFrequency")


Estimated frequency for tag java = 513


cmsTag = org.apache.spark.util.sketch.CountMinSketchImpl@431a88ed
estimatedFrequency = 513


513

In [24]:
  // Sampling With Replacement
  // Draw a sample of dfTags with replacement, using fraction 0.2 and a fixed
  // seed of 37, then print the sample's row count next to the full row count.
  val dfTagsSample = dfTags.sample(withReplacement = true, fraction = 0.2, seed = 37L)
  println(s"Number of rows in sample dfTagsSample = ${dfTagsSample.count()}")
  println(s"Number of rows in dfTags = ${dfTags.count()}")


Number of rows in sample dfTagsSample = 1948
Number of rows in dfTags = 9999


dfTagsSample = [id: int, tag: string]


[id: int, tag: string]