# More demos are at: http://allaboutscala.com/big-data/spark/

In [1]:
import org.apache.spark.mllib

In [2]:
import org.apache.spark.sql.functions.explode

In [3]:
val sqlContext = new org.apache.spark.sql.SQLContext(sc)

sqlContext = org.apache.spark.sql.SQLContext@408eb9b




org.apache.spark.sql.SQLContext@408eb9b

In [4]:
import sqlContext.implicits._ 

In [5]:
import org.apache.spark.sql.functions._

In [6]:
spark.version

2.0.0

# DATAFRAME STATISTICS TUTORIAL

In [7]:
// read a csv file and create a dataframe
val dfTags = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("resources/question_tags_10K.csv")
    .toDF("id", "tag")

dfTags = [id: int, tag: string]


[id: int, tag: string]

In [8]:
//show dfTags schema
dfTags.printSchema()

root
 |-- id: integer (nullable = true)
 |-- tag: string (nullable = true)



In [9]:
// show a few examples
dfTags.take(10)

0,1
1,data
4,c#
4,winforms
4,type-conversion
4,decimal
4,opacity
6,html
6,css
6,css3
6,internet-explorer-7


In [10]:
// Create a dataframe from questions file questions_10K.csv
  val dfQuestionsCSV = spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("dateFormat","yyyy-MM-dd HH:mm:ss")
    .csv("resources/questions_10K.csv")
    .toDF("id", "creation_date", "closed_date", "deletion_date", "score", "owner_userid", "answer_count")


dfQuestionsCSV = [id: int, creation_date: string ... 5 more fields]


[id: int, creation_date: string ... 5 more fields]

In [11]:
dfQuestionsCSV.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: string (nullable = true)
 |-- closed_date: string (nullable = true)
 |-- deletion_date: string (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: string (nullable = true)
 |-- answer_count: string (nullable = true)



In [12]:
dfQuestionsCSV.take(5)

0,1,2,3,4,5,6
1,2008-07-31T21:26:37Z,,2011-03-28T00:53:47Z,1,,0
4,2008-07-31T21:42:52Z,,,472,8.0,13
6,2008-07-31T22:08:08Z,,,210,9.0,5
8,2008-07-31T23:33:19Z,2013-06-03T04:00:25Z,2015-02-11T08:26:40Z,42,,8
9,2008-07-31T23:40:59Z,,,1452,1.0,58


In [13]:
// cast columns to data types
  val dfQuestions = dfQuestionsCSV.select(
    dfQuestionsCSV.col("id").cast("integer"),
    dfQuestionsCSV.col("creation_date").cast("timestamp"),
    dfQuestionsCSV.col("closed_date").cast("timestamp"),
    dfQuestionsCSV.col("deletion_date").cast("date"),
    dfQuestionsCSV.col("score").cast("integer"),
    dfQuestionsCSV.col("owner_userid").cast("integer"),
    dfQuestionsCSV.col("answer_count").cast("integer")
  )

dfQuestions = [id: int, creation_date: timestamp ... 5 more fields]


[id: int, creation_date: timestamp ... 5 more fields]

In [14]:
// show a few examples
dfQuestions.show()

+---+--------------------+--------------------+-------------+-----+------------+------------+
| id|       creation_date|         closed_date|deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+-------------+-----+------------+------------+
|  1|2008-07-31 21:26:...|                null|   2011-03-28|    1|        null|           0|
|  4|2008-07-31 21:42:...|                null|         null|  472|           8|          13|
|  6|2008-07-31 22:08:...|                null|         null|  210|           9|           5|
|  8|2008-07-31 23:33:...|2013-06-03 04:00:...|   2015-02-11|   42|        null|           8|
|  9|2008-07-31 23:40:...|                null|         null| 1452|           1|          58|
| 11|2008-07-31 23:55:...|                null|         null| 1154|           1|          33|
| 13|2008-08-01 00:42:...|                null|         null|  464|           9|          25|
| 14|2008-08-01 00:59:...|                null|         null

In [15]:
dfQuestions.printSchema()

root
 |-- id: integer (nullable = true)
 |-- creation_date: timestamp (nullable = true)
 |-- closed_date: timestamp (nullable = true)
 |-- deletion_date: date (nullable = true)
 |-- score: integer (nullable = true)
 |-- owner_userid: integer (nullable = true)
 |-- answer_count: integer (nullable = true)



# Examples For Calculating Dataframe Statistics

In [16]:
// Average
//  import org.apache.spark.sql.functions._
  dfQuestions
    .select(avg("score"))
    .show()


+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



In [17]:
// Maximum
  dfQuestions
    .select(max("score"))
    .show()


+----------+
|max(score)|
+----------+
|      4443|
+----------+



In [18]:
// Minimum
  dfQuestions
    .select(min("score"))
    .show()



+----------+
|min(score)|
+----------+
|       -27|
+----------+



In [19]:
 
  // Mean
  dfQuestions
    .select(mean("score"))
    .show()


+-----------------+
|       avg(score)|
+-----------------+
|36.14631463146315|
+-----------------+



In [20]:

  // Sum
  dfQuestions
    .select(sum("score"))
    .show()


+----------+
|sum(score)|
+----------+
|    361427|
+----------+



In [21]:
// Filtering
dfQuestions
    .filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .show()


+---+--------------------+-----------+-------------+-----+------------+------------+
| id|       creation_date|closed_date|deletion_date|score|owner_userid|answer_count|
+---+--------------------+-----------+-------------+-----+------------+------------+
|402|2008-08-02 10:47:...|       null|         null|   81|         136|           9|
|419|2008-08-02 11:52:...|       null|         null|   26|         268|           1|
|427|2008-08-02 12:12:...|       null|         null|   20|         123|           3|
|438|2008-08-02 12:56:...|       null|         null|   42|         136|           8|
+---+--------------------+-----------+-------------+-----+------------+------------+



In [22]:
// Filter then join two dataframes
dfQuestions
    .filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .join(dfTags, dfQuestions.col("id").equalTo(dfTags("id")))
    .show()

+---+--------------------+-----------+-------------+-----+------------+------------+---+-----------------+
| id|       creation_date|closed_date|deletion_date|score|owner_userid|answer_count| id|              tag|
+---+--------------------+-----------+-------------+-----+------------+------------+---+-----------------+
|402|2008-08-02 10:47:...|       null|         null|   81|         136|           9|402|      objective-c|
|402|2008-08-02 10:47:...|       null|         null|   81|         136|           9|402|           iphone|
|419|2008-08-02 11:52:...|       null|         null|   26|         268|           1|419|              vb6|
|419|2008-08-02 11:52:...|       null|         null|   26|         268|           1|419|              com|
|427|2008-08-02 12:12:...|       null|         null|   20|         123|           3|427|         pocketpc|
|427|2008-08-02 12:12:...|       null|         null|   20|         123|           3|427|   windows-mobile|
|427|2008-08-02 12:12:...|       null

In [23]:
// Filter, join, then group and aggregate two dataframes by userIDs
  dfQuestions
    .filter("id > 400 and id < 450")
    .filter("owner_userid is not null")
    .join(dfTags, dfQuestions.col("id").equalTo(dfTags("id")))
    .groupBy(dfQuestions.col("owner_userid"))
    .agg(avg("score"), max("answer_count"))
    .show()


|owner_userid|avg(score)|max(answer_count)|
+------------+----------+-----------------+
|         268|      26.0|                1|
|         136|      57.6|                9|
|         123|      20.0|                3|
+------------+----------+-----------------+



In [24]:
dfQuestions.show()

+---+--------------------+--------------------+-------------+-----+------------+------------+
| id|       creation_date|         closed_date|deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+-------------+-----+------------+------------+
|  1|2008-07-31 21:26:...|                null|   2011-03-28|    1|        null|           0|
|  4|2008-07-31 21:42:...|                null|         null|  472|           8|          13|
|  6|2008-07-31 22:08:...|                null|         null|  210|           9|           5|
|  8|2008-07-31 23:33:...|2013-06-03 04:00:...|   2015-02-11|   42|        null|           8|
|  9|2008-07-31 23:40:...|                null|         null| 1452|           1|          58|
| 11|2008-07-31 23:55:...|                null|         null| 1154|           1|          33|
| 13|2008-08-01 00:42:...|                null|         null|  464|           9|          25|
| 14|2008-08-01 00:59:...|                null|         null

In [25]:
// DataFrame Statistics using describe() method
  val dfQuestionsStatistics = dfQuestions.describe()
  dfQuestionsStatistics.show()


+-------+-----------------+------------------+-----------------+------------------+
|summary|               id|             score|     owner_userid|      answer_count|
+-------+-----------------+------------------+-----------------+------------------+
|  count|             9999|              9999|             7388|              9922|
|   mean|33929.17081708171| 36.14631463146315|47389.99472116947|6.6232614392259626|
| stddev|19110.09560532429|160.48316753972045|280943.1070344427| 9.069109116851138|
|    min|                1|               -27|                1|                -5|
|    max|            66037|              4443|          3431280|               316|
+-------+-----------------+------------------+-----------------+------------------+



dfQuestionsStatistics = [summary: string, id: string ... 3 more fields]


[summary: string, id: string ... 3 more fields]

In [26]:
 // Correlation
  val correlation = dfQuestions.stat.corr("score", "answer_count")
  println(s"correlation between column score and answer_count = $correlation")

correlation between column score and answer_count = 0.3699847903294707


correlation = 0.3699847903294707


0.3699847903294707

In [27]:
 // Covariance
  val covariance = dfQuestions.stat.cov("score", "answer_count")
  println(s"covariance between column score and answer_count = $covariance")


covariance between column score and answer_count = 537.513381444165


covariance = 537.513381444165


537.513381444165

In [28]:
 // Frequent Items
  val dfFrequentScore = dfQuestions.stat.freqItems(Seq("answer_count"))
  dfFrequentScore.show()


+----------------------+
|answer_count_freqItems|
+----------------------+
|  [23, 131, 77, 86,...|
+----------------------+



dfFrequentScore = [answer_count_freqItems: array<int>]


[answer_count_freqItems: array<int>]

In [29]:
// show the whole array
dfFrequentScore.take(1)

0
"[23, 131, 77, 86, 41, 50, 32, 53, 35, 17, 8, 44, 26, 80, 89, -1, 71, 11, 56, 47, 38, 29, 20, 2, 65, 316, 5, -4, 14, 214, 46, 100, 55, 73, 67, 58, 40, 49, 13, 4, 31, 22, 103, -5, 97, 16, 7, -2, 43, 52, 25, 34, 61, 10, 37, 1, 19, 28, 129, 87, 114, 69, 78, 99, 63, 54, 45, 36, 27, 9, 18, 57, 21, 48, 12, 3, 30, 39, 15, 42, 33, 6, 24, -3, 296, 0, null]"


In [30]:
// Crosstab
  val dfScoreByUserid = dfQuestions
    .filter("owner_userid > 0 and owner_userid < 20")
    .stat
    .crosstab("score", "owner_userid")
  dfScoreByUserid.show(15)


+------------------+---+---+---+---+---+---+---+---+---+---+
|score_owner_userid|  1| 11| 13| 17|  2|  3|  4|  5|  8|  9|
+------------------+---+---+---+---+---+---+---+---+---+---+
|                56|  0|  0|  0|  1|  0|  0|  0|  0|  0|  0|
|               472|  0|  0|  0|  0|  0|  0|  0|  0|  1|  0|
|                14|  0|  0|  0|  1|  0|  0|  0|  1|  0|  0|
|                20|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|               179|  0|  0|  0|  0|  0|  1|  0|  0|  0|  0|
|                84|  0|  0|  0|  0|  1|  0|  0|  0|  0|  0|
|               160|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|
|                21|  0|  0|  0|  0|  0|  0|  0|  1|  0|  0|
|                 9|  0|  0|  0|  0|  0|  0|  1|  1|  0|  0|
|                 2|  0|  0|  0|  0|  0|  0|  0|  1|  0|  1|
|               296|  0|  1|  0|  0|  0|  0|  0|  0|  0|  0|
|              1154|  1|  0|  0|  0|  0|  0|  0|  0|  0|  0|
|                12|  0|  0|  0|  0|  0|  0|  0|  2|  0|  0|
|                 7|  0|

dfScoreByUserid = [score_owner_userid: string, 1: bigint ... 9 more fields]


[score_owner_userid: string, 1: bigint ... 9 more fields]

In [31]:
  // find all rows where answer_count in (5, 10, 20)
  val dfQuestionsByAnswerCount = dfQuestions
    .filter("owner_userid > 0")
    .filter("answer_count in (5, 10, 20)")
    .show()

+---+--------------------+--------------------+-------------+-----+------------+------------+
| id|       creation_date|         closed_date|deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+-------------+-----+------------+------------+
|  6|2008-07-31 22:08:...|                null|         null|  210|           9|           5|
| 16|2008-08-01 04:59:...|                null|         null|   84|           2|           5|
| 25|2008-08-01 12:13:...|                null|         null|  101|          23|          10|
| 48|2008-08-01 13:01:...|                null|         null|  200|          40|          20|
| 59|2008-08-01 13:14:...|                null|         null|   66|          45|           5|
| 88|2008-08-01 14:36:...|                null|         null|   71|          61|          10|
|129|2008-08-01 16:22:...|2012-07-03 14:30:...|         null|   72|          48|           5|
|176|2008-08-01 18:37:...|                null|         null

dfQuestionsByAnswerCount: Unit = ()


In [32]:


  // Stratified sampling using sampleBy
  // find all rows where answer_count in (5, 10, 20)
  val dfQuestionsByAnswerCount = dfQuestions
    .filter("owner_userid > 0")
    .filter("answer_count in (5, 10, 20)")

  // count how many rows match answer_count in (5, 10, 20)
  dfQuestionsByAnswerCount
    .groupBy("answer_count")
    .count()
    .show()


+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  811|
|          10|  272|
+------------+-----+



dfQuestionsByAnswerCount = [id: int, creation_date: timestamp ... 5 more fields]


[id: int, creation_date: timestamp ... 5 more fields]

In [33]:
// Create a fraction map where we are only interested:
  // - 50% of the rows that have answer_count = 5
  // - 10% of the rows that have answer_count = 10
  // - 100% of the rows that have answer_count = 20
  // Note also that fractions should be in the range [0, 1]
  val fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

  // Stratified sample using the fractionKeyMap.
  dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, 7L)
    .show()


+----+--------------------+--------------------+-------------+-----+------------+------------+
|  id|       creation_date|         closed_date|deletion_date|score|owner_userid|answer_count|
+----+--------------------+--------------------+-------------+-----+------------+------------+
|  16|2008-08-01 04:59:...|                null|         null|   84|           2|           5|
|  48|2008-08-01 13:01:...|                null|         null|  200|          40|          20|
|  59|2008-08-01 13:14:...|                null|         null|   66|          45|           5|
| 129|2008-08-01 16:22:...|2012-07-03 14:30:...|         null|   72|          48|           5|
| 176|2008-08-01 18:37:...|                null|         null|   85|          91|          10|
| 336|2008-08-02 03:34:...|                null|         null|   48|          61|           5|
| 561|2008-08-02 21:34:...|                null|         null|   13|         157|           5|
| 564|2008-08-02 21:47:...|                null|  

fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)


Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

In [34]:
// Create a fraction map where we are only interested:
  // - 50% of the rows that have answer_count = 5
  // - 10% of the rows that have answer_count = 10
  // - 100% of the rows that have answer_count = 20
  // Note also that fractions should be in the range [0, 1]
  val fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

  // Stratified sample using the fractionKeyMap. 7L is the random seed
  dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, 7L)
    .groupBy("answer_count")
    .count()
    .show()


+------------+-----+                                                            
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  400|
|          10|   26|
+------------+-----+



fractionKeyMap = Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)


Map(5 -> 0.5, 10 -> 0.1, 20 -> 1.0)

In [35]:

  // Note that changing the random seed will modify your sampling outcome. As an example, let's change the random seed to 37.
  dfQuestionsByAnswerCount
    .stat
    .sampleBy("answer_count", fractionKeyMap, 37L)
    .groupBy("answer_count")
    .count()
    .show()

+------------+-----+
|answer_count|count|
+------------+-----+
|          20|   34|
|           5|  388|
|          10|   25|
+------------+-----+



In [36]:
dfQuestions.show()


+---+--------------------+--------------------+-------------+-----+------------+------------+
| id|       creation_date|         closed_date|deletion_date|score|owner_userid|answer_count|
+---+--------------------+--------------------+-------------+-----+------------+------------+
|  1|2008-07-31 21:26:...|                null|   2011-03-28|    1|        null|           0|
|  4|2008-07-31 21:42:...|                null|         null|  472|           8|          13|
|  6|2008-07-31 22:08:...|                null|         null|  210|           9|           5|
|  8|2008-07-31 23:33:...|2013-06-03 04:00:...|   2015-02-11|   42|        null|           8|
|  9|2008-07-31 23:40:...|                null|         null| 1452|           1|          58|
| 11|2008-07-31 23:55:...|                null|         null| 1154|           1|          33|
| 13|2008-08-01 00:42:...|                null|         null|  464|           9|          25|
| 14|2008-08-01 00:59:...|                null|         null

In [37]:
 // Approximate Quantile of the score column
// See documentation here: https://spark.apache.org/docs/2.0.2/api/java/org/apache/spark/sql/DataFrameStatFunctions.html

  val quantiles = dfQuestions
    .stat
    .approxQuantile("score", Array(0, 0.33, 0.5, 0.6, 0.7, 0.8, 1), 0.05)
  println(s"Qauntiles segments = ${quantiles.toSeq}")


Qauntiles segments = WrappedArray(-27.0, 3.0, 6.0, 9.0, 12.0, 22.0, 4443.0)


quantiles = Array(-27.0, 3.0, 6.0, 9.0, 12.0, 22.0, 4443.0)


[-27.0, 3.0, 6.0, 9.0, 12.0, 22.0, 4443.0]

In [38]:
dfTags.show()

+---+-------------------+
| id|                tag|
+---+-------------------+
|  1|               data|
|  4|                 c#|
|  4|           winforms|
|  4|    type-conversion|
|  4|            decimal|
|  4|            opacity|
|  6|               html|
|  6|                css|
|  6|               css3|
|  6|internet-explorer-7|
|  8|                 c#|
|  8|    code-generation|
|  8|                 j#|
|  8|           visualj#|
|  9|                 c#|
|  9|               .net|
|  9|           datetime|
| 11|                 c#|
| 11|           datetime|
| 11|           datediff|
+---+-------------------+
only showing top 20 rows



In [39]:
  // Sampling With Replacement
// dfTags.sample( with replacement, fraction of original, random seed)
  val dfTagsSample = dfTags.sample(true, 0.2, 37L)
  println(s"Number of rows in sample dfTagsSample = ${dfTagsSample.count()}")
  println(s"Number of rows in dfTags = ${dfTags.count()}")


Number of rows in sample dfTagsSample = 1948
Number of rows in dfTags = 9999


dfTagsSample = [id: int, tag: string]


[id: int, tag: string]

### bloomFilter

bloomFilter(Column col, long expectedNumItems, double fpp)

Builds a Bloom filter over a specified column.
A Bloom filter is a space-efficient probabilistic data structure that offers an approximate containment 
test with one-sided error: if it claims that an item is contained in it, this might be in error, 
but if it claims that an item is not contained in it, then this is definitely true. Currently supported data types include:
Byte,Short,Integer,Long,String

The false positive probability (FPP) of a Bloom filter is defined as the probability that mightContain(Object) 
will erroneously return true for an object that hasu not actually been put in the BloomFilter. 
The implementation is largely based on the BloomFilter class from Guava.


In [40]:
 
val tagsBloomFilter = dfTags.stat.bloomFilter("tag", 1000L, 0.1)
  println(s"bloom filter contains java tag = ${tagsBloomFilter.mightContain("java")}")
  println(s"bloom filter contains c# tag = ${tagsBloomFilter.mightContain("c#")}")

  println(s"bloom filter contains some unknown tag = ${tagsBloomFilter.mightContain("unknown tag")}")


bloom filter contains java tag = true
bloom filter contains c# tag = true
bloom filter contains some unknown tag = false


tagsBloomFilter = org.apache.spark.util.sketch.BloomFilterImpl@809c4023


org.apache.spark.util.sketch.BloomFilterImpl@809c4023

### Count-min sketch
A Count-min sketch is a probabilistic data structure used for summarizing streams of data in sub-linear space. 
Currently, supported data types include: Byte Short Integer Long String
A CountMinSketch is initialized with a random seed, and a pair of parameters:
relative error (or eps), and
confidence (or delta)
Suppose you want to estimate the number of times an element x has appeared in a data stream so far. 
With probability delta, the estimate of this frequency is within 
the range true frequency <= estimate <= true frequency + eps * N, where N is the total 
count of items have appeared the data stream so far. Under the cover, a CountMinSketch is essentially a 
two-dimensional long array with depth d and width w, where
d = ceil(2 / eps)
w = ceil(-log(1 - confidence) / log(2))

In [41]:
 // Count Min Sketch
//countMinSketch(Column col, double eps, double confidence, int seed)

  val cmsTag = dfTags.stat.countMinSketch("tag", 0.1, 0.99, 38)
  val estimatedFrequency = cmsTag.estimateCount("java")
  println(s"Estimated frequency for tag java = $estimatedFrequency")

  val estimatedFrequency_2 = cmsTag.estimateCount("C#")
  println(s"Estimated frequency for tag C# = $estimatedFrequency_2")

Estimated frequency for tag java = 484
Estimated frequency for tag C# = 329


cmsTag = org.apache.spark.util.sketch.CountMinSketchImpl@fbf07245
estimatedFrequency = 484
estimatedFrequency_2 = 329


329

In [42]:
//ceil (-log(1 - 0.99) / log(2))=7
//ceil (2 / 0.1) =20

println(cmsTag.depth) 
println(cmsTag.width)

7
20
