In [1]:
%showTypes on

Types will be printed.


Either[org.apache.toree.magic.CellMagicOutput,org.apache.toree.magic.LineMagicOutput] = Right(())

Toree creates a SparkContext automatically 

In [2]:
sc

org.apache.spark.SparkContext = org.apache.spark.SparkContext@b10b77b

In [3]:
// Report the version, master, and local-mode flag of the SparkContext that Toree created.
println(s"Spark version:      ${sc.version}")
println(s"Spark master:       ${sc.master}")
println(s"Running 'locally'?: ${sc.isLocal}")

Spark version:      2.3.0
Spark master:       local[*]
Running 'locally'?: true


example methods in scala

In [8]:
/**
 * Prints `message` on a line of its own and hands the same string back.
 *
 * @param message the text to print
 * @return `message`, with no additional formatting
 */
def info(message: String): String = {
    println(message)

    // A block evaluates to its last expression, so this is the result of the
    // method; no "return" keyword is needed.
    message
}

In [20]:
/**
 * Prints `message` inside an error banner and returns the whole banner.
 *
 * @param message the error text to show; it is inserted into the banner verbatim
 * @return the full banner that was printed (not just `message`), including its
 *         leading and trailing line breaks
 */
def error(message: String): String = {

    // The banner layout is a triple-quoted string; stripMargin removes each line's
    // leading whitespace up to and including the "|". It is applied to the template
    // *before* the message is inserted, so a multi-line message that itself contains
    // a line starting with "|" is not altered.
    val template = """
        |********************************************************************
        |
        |  ERROR: %s
        |
        |********************************************************************
        |""".stripMargin
    val fullMessage = template.format(message)

    // println adds one more line break after the banner.
    println(fullMessage)

    fullMessage
}

In [17]:
val infoString = info("All is well.")

All is well.


In [18]:
infoString

String = All is well.

In [21]:
val errorString = error("Uh oh...")


********************************************************************

  ERROR: Uh oh...

********************************************************************



In [22]:
s"""
    |line 1
    |  line 2
    |  | line 3
    |""".stripMargin

String = "
line 1
  line 2
  | line 3
"

In [27]:
"""
    |line 1
    |  line 2
    |""".stripMargin

String = "
line 1
  line 2
"

In [28]:
// Import File. Unlike Java, the semicolon ';' is not required.
import java.io.File

In [29]:
val shakespeare = new File("JustEnoughScalaForSpark/data/shakespeare")

In [30]:
// Check that the data directory is present, report the outcome, and record it in `success`.
val success = if (shakespeare.exists) {
    info(s"$shakespeare already exists")
    true
} else {
    error(s"Data directory path doesn't exist! $shakespeare")  // the returned banner is not needed here
    false
}
println(s"success = $success")

JustEnoughScalaForSpark/data/shakespeare already exists
success = true


In [34]:
val pathSeparator = File.separator
val targetDirName = shakespeare.toString
val plays = Seq(
    "tamingoftheshrew", "comedyoferrors", "loveslabourslost", "midsummersnightsdream",
    "merrywivesofwindsor", "muchadoaboutnothing", "asyoulikeit", "twelfthnight")

// Only look for the individual plays when the data directory itself was found.
if (success) {
    println(s"Checking that the plays are in $shakespeare:")

    // One message per expected play whose file is absent from the data directory.
    val failures = plays.
        map(play => targetDirName + pathSeparator + play).
        filterNot(playFileName => new File(playFileName).exists).
        map(playFileName => s"$playFileName:\tNOT FOUND!")

    println("Finished!")
    if (failures.isEmpty) {
        info("All plays found!")
    } else {
        println("The following expected plays were not found:")
        failures.foreach(play => error(play))
    }
}

Checking that the plays are in JustEnoughScalaForSpark/data/shakespeare:
Finished!
All plays found!


Any = All plays found!

Passing Functions as Arguments

In [35]:
println("Pass println as the function to use for each element:")
plays.foreach(println)

Pass println as the function to use for each element:
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [36]:
println("\nUsing an anonymous function that calls println: `str => println(str)`")
println("(Note that the type of the argument `str` is inferred to be String.)")
plays.foreach(str => println(str))


Using an anonymous function that calls println: `str => println(str)`
(Note that the type of the argument `str` is inferred to be String.)
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [37]:
println("\nAdding the argument type explicitly. Note that the parentheses are required.")
plays.foreach((str: String) => println(str))


Adding the argument type explicitly. Note that the parentheses are required.
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [38]:
println("\nWhy do we need to name this argument? Scala lets us use _ as a placeholder.")
plays.foreach(println(_))


Why do we need to name this argument? Scala lets us use _ as a placeholder.
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [39]:
println("\nFor longer functions, you can use {...} instead of (...).")
println("Why? Because it gives you the familiar multiline block syntax with {...}")
plays.foreach {
  (str: String) => println(str)
}


For longer functions, you can use {...} instead of (...).
Why? Because it gives you the familiar multiline block syntax with {...}
tamingoftheshrew
comedyoferrors
loveslabourslost
midsummersnightsdream
merrywivesofwindsor
muchadoaboutnothing
asyoulikeit
twelfthnight


In [53]:
// Set up the data for the following cells, which sum these integers with `reduceLeft`.
println("As an example, use `reduceLeft` to sum some integers.")
val integers = 0 to 10   // A Range covering 0 through 10, inclusive

As an assume, use `reduceLeft` to sum some integers.


In [55]:
// Method 1: Use a function
val f = (x:Int, y:Int) => x + y
integers.reduceLeft(f)

Int = 55

In [56]:
// Method 2: placeholder syntax
println("\nThe _ placeholder can be used *once* for each argument in the list.")
integers.reduceLeft(_+_)


The _ placeholder can be used *once* for each argument in the list.


Int = 55

In [57]:
// Method 3: anonymous function with named arguments
integers.reduceLeft((i,j) => i+j)

Int = 55

Our First Spark Program:

This program builds an inverted index: for each word, it outputs a list of the documents that contain it, along with the corresponding counts.

In [113]:
// Inverted index in a single chained expression: each word is paired with a
// comma-separated list of (fileName, count) entries, highest count first.
val iiFirstPass1 = sc.wholeTextFiles(shakespeare.toString).
    flatMap { location_contents_tuple2 =>
        val words = location_contents_tuple2._2.split("""\W+""")
        // wholeTextFiles reports each location as a Hadoop-style path, which uses "/"
        // on every platform. File.separator is "\" on Windows, which is neither the
        // separator used here nor a valid regex for split, so split on a literal "/".
        val fileName = location_contents_tuple2._1.split("/").last
        words.map(word => ((word, fileName), 1))
    }.
    reduceByKey((count1, count2) => count1 + count2).   // occurrences per (word, fileName)
    map { word_file_count_tup3 =>
        // Re-key by word alone: ((word, fileName), count) becomes (word, (fileName, count)).
        (word_file_count_tup3._1._1, (word_file_count_tup3._1._2, word_file_count_tup3._2))
    }.
    groupByKey.
    sortByKey(ascending = true).
    mapValues { iterable =>
        // Negating the count puts larger counts first; the file name breaks ties.
        val vect = iterable.toVector.sortBy { file_count_tup2 =>
            (-file_count_tup2._2, file_count_tup2._1)
        }
        vect.mkString(",")
    }

In [114]:
iiFirstPass1.take(30).foreach(println)

(,(asyoulikeit,1),(comedyoferrors,1),(loveslabourslost,1),(merrywivesofwindsor,1),(midsummersnightsdream,1),(muchadoaboutnothing,1),(tamingoftheshrew,1),(twelfthnight,1))
(A,(loveslabourslost,78),(tamingoftheshrew,59),(twelfthnight,47),(comedyoferrors,42),(midsummersnightsdream,39),(merrywivesofwindsor,38),(asyoulikeit,34),(muchadoaboutnothing,31))
(ABOUT,(muchadoaboutnothing,18))
(ACT,(merrywivesofwindsor,23),(asyoulikeit,22),(twelfthnight,18),(muchadoaboutnothing,17),(tamingoftheshrew,12),(comedyoferrors,11),(loveslabourslost,9),(midsummersnightsdream,9))
(ADAM,(asyoulikeit,16))
(ADO,(muchadoaboutnothing,18))
(ADRIANA,(comedyoferrors,85))
(ADRIANO,(loveslabourslost,111))
(AEGEON,(comedyoferrors,20))
(AEMELIA,(comedyoferrors,16))
(AEMILIA,(comedyoferrors,3))
(AEacides,(tamingoftheshrew,1))
(AEgeon,(comedyoferrors,7))
(AEgle,(midsummersnightsdream,1))
(AEmilia,(comedyoferrors,4))
(AEsculapius,(merrywivesofwindsor,1))
(AGUECHEEK,(twelfthnight,2))
(ALL,(midsummersnightsdream,2),(tamingof

In [110]:
val fileContents = sc.wholeTextFiles(shakespeare.toString)
fileContents   // force the notebook to print the type.

org.apache.spark.rdd.RDD[(String, String)] = JustEnoughScalaForSpark/data/shakespeare MapPartitionsRDD[85] at wholeTextFiles at <console>:22

In [65]:
("foo", 101, 3.14159, ("bar", 202L))

(String, Int, Double, (String, Long)) = (foo,101,3.14159,(bar,202))

In [67]:
fileContents.count

Long = 8

In [104]:
// First step of the inverted index: emit ((word, fileName), 1) for every word occurrence.
val wordFileNameOnes = fileContents.flatMap { location_contents_tuple2 =>
    // example input record: (file_path, "all the words in the file")
    // mytuple._2 => give me the 2nd element
    val words = location_contents_tuple2._2.split("""\W+""")
    // mytuple._1 => give me the 1st element
    // The path is Hadoop-style and uses "/" on every platform, so split on a literal
    // "/" rather than File.separator ("\" on Windows, which is not a valid regex).
    val fileName = location_contents_tuple2._1.split("/").last
    // create a new tuple to return. Note how we structured it!
    words.map(word => ((word, fileName), 1))
}
wordFileNameOnes

org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[12] at flatMap at <console>:26

In [101]:
wordFileNameOnes

org.apache.spark.rdd.RDD[((String, String), Int)] = MapPartitionsRDD[12] at flatMap at <console>:26

In [72]:
wordFileNameOnes.take(10).foreach(println)

((,asyoulikeit),1)
((AS,asyoulikeit),1)
((YOU,asyoulikeit),1)
((LIKE,asyoulikeit),1)
((IT,asyoulikeit),1)
((DRAMATIS,asyoulikeit),1)
((PERSONAE,asyoulikeit),1)
((DUKE,asyoulikeit),1)
((SENIOR,asyoulikeit),1)
((living,asyoulikeit),1)


In [73]:
val uniques = wordFileNameOnes.reduceByKey((count1, count2) => count1 + count2)
uniques

org.apache.spark.rdd.RDD[((String, String), Int)] = ShuffledRDD[13] at reduceByKey at <console>:28

In [74]:
uniques.count

Long = 27276

In [75]:
uniques.take(30).foreach(println)

((dexterity,merrywivesofwindsor),1)
((crest,asyoulikeit),1)
((whole,comedyoferrors),2)
((lamb,muchadoaboutnothing),2)
((force,muchadoaboutnothing),2)
((letter,merrywivesofwindsor),19)
((blunt,tamingoftheshrew),3)
((bestow,asyoulikeit),1)
((rear,midsummersnightsdream),1)
((crossing,tamingoftheshrew),1)
((wronged,merrywivesofwindsor),4)
((S,tamingoftheshrew),10)
((HIPPOLYTA,midsummersnightsdream),19)
((revolve,twelfthnight),1)
((er,merrywivesofwindsor),11)
((renown,asyoulikeit),1)
((cubiculo,twelfthnight),1)
((All,twelfthnight),3)
((power,loveslabourslost),8)
((Albeit,asyoulikeit),1)
((lips,tamingoftheshrew),3)
((upshot,twelfthnight),1)
((approach,midsummersnightsdream),4)
((mean,muchadoaboutnothing),5)
((embossed,asyoulikeit),1)
((varnish,loveslabourslost),2)
((Apollo,midsummersnightsdream),1)
((spangled,midsummersnightsdream),1)
((gentlemen,comedyoferrors),1)
((Rebuke,loveslabourslost),1)


In [76]:
// Re-key each record by word alone: ((word, fileName), count) => (word, (fileName, count)).
val words = uniques.map { keyed_count =>
    val word_file = keyed_count._1
    (word_file._1, (word_file._2, keyed_count._2))
}
words

org.apache.spark.rdd.RDD[(String, (String, Int))] = MapPartitionsRDD[14] at map at <console>:30

In [77]:
val wordGroups = words.groupByKey.sortByKey(ascending = true)
wordGroups

org.apache.spark.rdd.RDD[(String, Iterable[(String, Int)])] = ShuffledRDD[18] at sortByKey at <console>:32

In [78]:
wordGroups.count

Long = 11951

In [79]:
wordGroups.take(30).foreach(println)

(,CompactBuffer((tamingoftheshrew,1), (asyoulikeit,1), (merrywivesofwindsor,1), (comedyoferrors,1), (midsummersnightsdream,1), (twelfthnight,1), (loveslabourslost,1), (muchadoaboutnothing,1)))
(A,CompactBuffer((loveslabourslost,78), (midsummersnightsdream,39), (muchadoaboutnothing,31), (merrywivesofwindsor,38), (comedyoferrors,42), (asyoulikeit,34), (twelfthnight,47), (tamingoftheshrew,59)))
(ABOUT,CompactBuffer((muchadoaboutnothing,18)))
(ACT,CompactBuffer((asyoulikeit,22), (comedyoferrors,11), (tamingoftheshrew,12), (loveslabourslost,9), (muchadoaboutnothing,17), (twelfthnight,18), (merrywivesofwindsor,23), (midsummersnightsdream,9)))
(ADAM,CompactBuffer((asyoulikeit,16)))
(ADO,CompactBuffer((muchadoaboutnothing,18)))
(ADRIANA,CompactBuffer((comedyoferrors,85)))
(ADRIANO,CompactBuffer((loveslabourslost,111)))
(AEGEON,CompactBuffer((comedyoferrors,20)))
(AEMELIA,CompactBuffer((comedyoferrors,16)))
(AEMILIA,CompactBuffer((comedyoferrors,3)))
(AEacides,CompactBuffer((tamingoftheshrew,1)

In [80]:
// Turn each word's group of (fileName, count) pairs into one comma-separated string.
val iiFirstPass2 = wordGroups.mapValues { fileCounts =>
    // Negating the count sorts the highest count first; the file name breaks ties.
    fileCounts.toVector.
        sortBy(file_count => (-file_count._2, file_count._1)).
        mkString(",")
}
iiFirstPass2

org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[19] at mapValues at <console>:34

In [92]:
iiFirstPass2.count

Long = 11951

In [112]:
iiFirstPass2.take(30).foreach(println)

(,(asyoulikeit,1),(comedyoferrors,1),(loveslabourslost,1),(merrywivesofwindsor,1),(midsummersnightsdream,1),(muchadoaboutnothing,1),(tamingoftheshrew,1),(twelfthnight,1))
(A,(loveslabourslost,78),(tamingoftheshrew,59),(twelfthnight,47),(comedyoferrors,42),(midsummersnightsdream,39),(merrywivesofwindsor,38),(asyoulikeit,34),(muchadoaboutnothing,31))
(ABOUT,(muchadoaboutnothing,18))
(ACT,(merrywivesofwindsor,23),(asyoulikeit,22),(twelfthnight,18),(muchadoaboutnothing,17),(tamingoftheshrew,12),(comedyoferrors,11),(loveslabourslost,9),(midsummersnightsdream,9))
(ADAM,(asyoulikeit,16))
(ADO,(muchadoaboutnothing,18))
(ADRIANA,(comedyoferrors,85))
(ADRIANO,(loveslabourslost,111))
(AEGEON,(comedyoferrors,20))
(AEMELIA,(comedyoferrors,16))
(AEMILIA,(comedyoferrors,3))
(AEacides,(tamingoftheshrew,1))
(AEgeon,(comedyoferrors,7))
(AEgle,(midsummersnightsdream,1))
(AEmilia,(comedyoferrors,4))
(AEsculapius,(merrywivesofwindsor,1))
(AGUECHEEK,(twelfthnight,2))
(ALL,(midsummersnightsdream,2),(tamingof

Exercises

In [136]:
// Exercise solution:
//   * filter out the blank word ""
//   * convert all words to lower case
val iiFirstPass1 = sc.wholeTextFiles(shakespeare.toString).
    flatMap { location_contents_tuple2 =>
        val words = location_contents_tuple2._2.split("""\W+""")
        // The location is a Hadoop-style path that uses "/" on every platform, so split
        // on a literal "/" rather than File.separator ("\" on Windows, not a valid regex).
        val fileName = location_contents_tuple2._1.split("/").last
        words.
            filter(word => word.nonEmpty).
            map(word => ((word.toLowerCase, fileName), 1))
    }.
    reduceByKey((count1, count2) => count1 + count2).
    map { word_file_count_tup3 =>
        // ((word, fileName), count) becomes (word, (fileName, count)).
        (word_file_count_tup3._1._1, (word_file_count_tup3._1._2, word_file_count_tup3._2))
    }.
    groupByKey.
    sortByKey(ascending = true).
    mapValues { iterable =>
        // Highest count first; the file name breaks ties.
        val vect = iterable.toVector.sortBy { file_count_tup2 =>
            (-file_count_tup2._2, file_count_tup2._1)
        }
        vect.mkString(",")
    }

In [137]:
iiFirstPass1.take(30).foreach(println)

(a,(loveslabourslost,507),(merrywivesofwindsor,494),(muchadoaboutnothing,492),(asyoulikeit,461),(tamingoftheshrew,445),(twelfthnight,416),(midsummersnightsdream,281),(comedyoferrors,254))
(abandon,(asyoulikeit,4),(tamingoftheshrew,1),(twelfthnight,1))
(abate,(loveslabourslost,1),(midsummersnightsdream,1),(tamingoftheshrew,1))
(abatement,(twelfthnight,1))
(abbess,(comedyoferrors,8))
(abbey,(comedyoferrors,9))
(abbominable,(loveslabourslost,1))
(abbreviated,(loveslabourslost,1))
(abed,(asyoulikeit,1),(twelfthnight,1))
(abetting,(comedyoferrors,1))
(abhominable,(loveslabourslost,1))
(abhor,(asyoulikeit,1),(comedyoferrors,1),(loveslabourslost,1),(merrywivesofwindsor,1),(muchadoaboutnothing,1))
(abhors,(twelfthnight,2))
(abide,(merrywivesofwindsor,3),(midsummersnightsdream,2))
(abides,(muchadoaboutnothing,1))
(ability,(muchadoaboutnothing,1),(twelfthnight,1))
(abject,(comedyoferrors,1),(tamingoftheshrew,1))
(abjure,(midsummersnightsdream,1))
(abjured,(tamingoftheshrew,1),(twelfthnight,1))
(

In [135]:
iiFirstPass1.count

Long = 10409

Pattern Matching

In [149]:
// The same inverted index, written with pattern matching so that tuple elements get
// meaningful names instead of _1 / _2 accessors.
val ii1 = sc.wholeTextFiles(shakespeare.toString).
    flatMap {
        case (location, contents) =>
            val words = contents.split("""\W+""").
                filter(word => word.size > 0)                      // #1
            // The location is a Hadoop-style path that uses "/" on every platform, so split
            // on a literal "/" rather than File.separator ("\" on Windows, not a valid regex).
            val fileName = location.split("/").last
            words.map(word => ((word.toLowerCase, fileName), 1))   // #2
    }.
    reduceByKey((count1, count2) => count1 + count2).
    map {
        case ((word, fileName), count) => (word, (fileName, count))
    }.
    groupByKey.
    sortByKey(ascending = true).
    mapValues { iterable =>
        // Highest count first; the file name breaks ties.
        val vect = iterable.toVector.sortBy {
            case (fileName, count) => (-count, fileName)
        }
        vect.mkString(",")
    }

use Spark's DataFrame API

In [150]:
import org.apache.spark.sql.SQLContext
// Create a SQLContext on top of the existing SparkContext, then wrap the ii1 RDD in a
// DataFrame whose two columns are named "word" and "locations_counts".
// NOTE(review): in Spark 2.x, SparkSession is presumably the preferred entry point over
// constructing a SQLContext directly — confirm before changing.
val sqlContext = new SQLContext(sc)
val ii1DF = sqlContext.createDataFrame(ii1).toDF("word", "locations_counts")

In [152]:
ii1DF

org.apache.spark.sql.DataFrame = [word: string, locations_counts: string]

In [164]:
ii1DF.show()

+-----------+--------------------+
|       word|    locations_counts|
+-----------+--------------------+
|          a|(loveslabourslost...|
|    abandon|(asyoulikeit,4),(...|
|      abate|(loveslabourslost...|
|  abatement|    (twelfthnight,1)|
|     abbess|  (comedyoferrors,8)|
|      abbey|  (comedyoferrors,9)|
|abbominable|(loveslabourslost,1)|
|abbreviated|(loveslabourslost,1)|
|       abed|(asyoulikeit,1),(...|
|   abetting|  (comedyoferrors,1)|
|abhominable|(loveslabourslost,1)|
|      abhor|(asyoulikeit,1),(...|
|     abhors|    (twelfthnight,2)|
|      abide|(merrywivesofwind...|
|     abides|(muchadoaboutnoth...|
|    ability|(muchadoaboutnoth...|
|     abject|(comedyoferrors,1...|
|     abjure|(midsummersnights...|
|    abjured|(tamingoftheshrew...|
|       able|(merrywivesofwind...|
+-----------+--------------------+
only showing top 20 rows



Supporting SQL Queries

In [165]:
// Instead of creating a string for the list of (location,count) pairs,
// which is opaque to our SQL schema (i.e., just a string), "unzip" the collection
// into two arrays, one for the locations and one for the counts.
val ii = sc.wholeTextFiles(shakespeare.toString).
    flatMap {
        case (location, contents) =>
            val words = contents.split("""\W+""").
                filter(word => word.size > 0)                      // #1
            // The location is a Hadoop-style path that uses "/" on every platform, so split
            // on a literal "/" rather than File.separator ("\" on Windows, not a valid regex).
            val fileName = location.split("/").last
            words.map(word => ((word.toLowerCase, fileName), 1))   // #2
    }.
    reduceByKey((count1, count2) => count1 + count2).
    map {
        case ((word, fileName), count) => (word, (fileName, count))
    }.
    groupByKey.
    sortByKey(ascending = true).
    map {                         // Must use map now, because we'll format new records.
      case (word, iterable) =>    // Hence, pattern match on the whole input record.

        // Highest count first; the file name breaks ties.
        val vect = iterable.toVector.sortBy {
            case (fileName, count) => (-count, fileName)
        }

        // `Vector.unzip` returns a single, two element tuple, where each element is a
        // collection, one for the locations and one for the counts. Pattern matching
        // extracts these two collections into variables.
        val (locations, counts) = vect.unzip

        // Lastly, compute the total count across all locations and return a new
        // record with all four fields. The `reduceLeft` method takes a function
        // that knows how to "reduce" the collection down to a final value, working
        // from the left.
        val totalCount = counts.reduceLeft((n1,n2) => n1+n2)

        (word, totalCount, locations, counts)
    }

In [166]:
ii.take(30).foreach(println)

(a,3350,Vector(loveslabourslost, merrywivesofwindsor, muchadoaboutnothing, asyoulikeit, tamingoftheshrew, twelfthnight, midsummersnightsdream, comedyoferrors),Vector(507, 494, 492, 461, 445, 416, 281, 254))
(abandon,6,Vector(asyoulikeit, tamingoftheshrew, twelfthnight),Vector(4, 1, 1))
(abate,3,Vector(loveslabourslost, midsummersnightsdream, tamingoftheshrew),Vector(1, 1, 1))
(abatement,1,Vector(twelfthnight),Vector(1))
(abbess,8,Vector(comedyoferrors),Vector(8))
(abbey,9,Vector(comedyoferrors),Vector(9))
(abbominable,1,Vector(loveslabourslost),Vector(1))
(abbreviated,1,Vector(loveslabourslost),Vector(1))
(abed,2,Vector(asyoulikeit, twelfthnight),Vector(1, 1))
(abetting,1,Vector(comedyoferrors),Vector(1))
(abhominable,1,Vector(loveslabourslost),Vector(1))
(abhor,5,Vector(asyoulikeit, comedyoferrors, loveslabourslost, merrywivesofwindsor, muchadoaboutnothing),Vector(1, 1, 1, 1, 1))
(abhors,2,Vector(twelfthnight),Vector(2))
(abide,5,Vector(merrywivesofwindsor, midsummersnightsdream),Vect

In [167]:
// Expose the inverted index as a DataFrame with named columns, mark it for caching
// (several queries follow), and register it under the name "inverted_index" for SQL.
val iiDF = sqlContext.createDataFrame(ii).toDF("word", "total_count", "locations", "counts")
iiDF.cache()
// createOrReplaceTempView replaces registerTempTable, which has been deprecated since
// Spark 2.0 (this notebook reports Spark 2.3.0).
iiDF.createOrReplaceTempView("inverted_index")

In [168]:
iiDF.printSchema

root
 |-- word: string (nullable = true)
 |-- total_count: integer (nullable = false)
 |-- locations: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- counts: array (nullable = true)
 |    |-- element: integer (containsNull = false)



In [169]:
%%SQL
SELECT word, total_count, locations[0] AS top_location, counts[0] AS top_count 
FROM inverted_index

Either[org.apache.toree.magic.CellMagicOutput,org.apache.toree.magic.LineMagicOutput] = Left(Map(text/plain -> +-----------+-----------+----------------+---------+
|       word|total_count|    top_location|top_count|
+-----------+-----------+----------------+---------+
|          a|       3350|loveslabourslost|      507|
|    abandon|          6|     asyoulikeit|        4|
|      abate|          3|loveslabourslost|        1|
|  abatement|          1|    twelfthnight|        1|
|     abbess|          8|  comedyoferrors|        8|
|      abbey|          9|  comedyoferrors|        9|
|abbominable|          1|loveslabourslost|        1|
|abbreviated|          1|loveslabourslost|        1|
|       abed|          2|     asyoulikeit|        1|
|   abetting|          1|  comedyoferrors|...

In [199]:
// Query the "inverted_index" table for words starting with "love", "unlove", or "hate",
// selecting each word's total count plus its first two (location, count) entries.
// The second-location columns come back as null for a word that has only one location
// (see the "hateth" row in the output of the next cell).
val topLocations = sqlContext.sql("""
    SELECT word,  total_count, locations[0] AS top_1st_location, counts[0] AS top_1st_count, 
    locations[1] AS top_2nd_location, counts[1] AS top_2nd_count
    FROM inverted_index 
    WHERE word LIKE 'love%' OR word LIKE 'unlove%' OR word LIKE 'hate%' 
""")

In [200]:
topLocations.show(numRows = 40, truncate = false)

+-------+-----------+---------------------+-------------+---------------------+-------------+
|word   |total_count|top_1st_location     |top_1st_count|top_2nd_location     |top_2nd_count|
+-------+-----------+---------------------+-------------+---------------------+-------------+
|hate   |22         |midsummersnightsdream|9            |asyoulikeit          |6            |
|hated  |6          |midsummersnightsdream|4            |asyoulikeit          |2            |
|hateful|5          |midsummersnightsdream|3            |loveslabourslost     |1            |
|hates  |5          |asyoulikeit          |2            |merrywivesofwindsor  |1            |
|hateth |1          |midsummersnightsdream|1            |null                 |null         |
|love   |662        |loveslabourslost     |121          |asyoulikeit          |119          |
|loved  |38         |asyoulikeit          |13           |muchadoaboutnothing  |13           |
|lovely |15         |midsummersnightsdream|7            |tam

In [174]:
topLocations

org.apache.spark.sql.DataFrame = [word: string, total_count: int ... 2 more fields]