# Demos from https://github.com/nadimbahadoor/allaboutscala
License: Apache 2.0 License - see the LICENSE.TXT file at https://github.com/nadimbahadoor/allaboutscala

In [2]:
// Sum the integers 1 to 100: distribute the range as an RDD, then combine the elements pairwise.
sc.parallelize(1 to 100).reduce((left, right) => left + right)

[Stage 0:>                                                          (0 + 0) / 4]

5050

In [3]:
// Average of the integers 1 to 100, computed on an RDD built from the local range.
sc.parallelize(1 to 100).mean()

50.5

In [4]:
// Bind the greeting and print it. The binding is never reassigned, so it is declared
// as an immutable `val` rather than a `var`.
val name = "Hello"
println(name)

Hello


name = Hello


Hello

In [5]:
import org.apache.spark.mllib

In [6]:
// Sample data: (donut name, price) pairs.
val donuts = Seq(
  ("plain donut", 1.50),
  ("vanilla donut", 2.0),
  ("glazed donut", 2.50)
)

// Build a DataFrame from the tuples and give its two columns readable names.
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price")

df.show()


+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



donuts = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df = [Donut Name: string, Price: double]


[Donut Name: string, Price: double]

In [7]:
// Fetch the DataFrame's column names and print one per line.
val columnNames: Array[String] = df.columns
columnNames.foreach(println)

Donut Name
Price


columnNames = Array(Donut Name, Price)


[Donut Name, Price]

In [8]:
// Split the (name, type) pairs reported by the DataFrame into two parallel arrays.
val (columnNames, columnDataTypes) = df.dtypes.unzip

// Print each array as a comma-separated list under its label.
Seq(
  "DataFrame column names" -> columnNames,
  "DataFrame column data types" -> columnDataTypes
).foreach { case (label, values) =>
  println(s"$label = ${values.mkString(", ")}")
}

DataFrame column names = Donut Name, Price
DataFrame column data types = StringType, DoubleType


columnNames = Array(Donut Name, Price)
columnDataTypes = Array(StringType, DoubleType)


[StringType, DoubleType]

In [9]:
// Reuse the SQLContext that belongs to the notebook's SparkSession.
// The `new SQLContext(sc)` constructor is deprecated as of Spark 2.0, and depending on the
// Spark version it may wrap a separate session whose state is not shared with `spark`.
val sqlContext = spark.sqlContext

sqlContext = org.apache.spark.sql.SQLContext@64552914




org.apache.spark.sql.SQLContext@64552914

In [10]:
import sqlContext.implicits._ 

In [13]:
// Load the sample tags JSON file into a DataFrame.
// NOTE(review): "inferSchema" is documented as a CSV reader option; the JSON reader presumably
// ignores it and infers the schema on its own — confirm before relying on it.
val tagsDF = spark.read
  .option("inferSchema", true)
  .json("resources/tags_sample_1line.json")

 

tagsDF = [stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]


lastException: Throwable = null


[stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]

In [14]:
// Report the Spark version of the running session.
spark.version

2.0.0

In [15]:
import org.apache.spark.sql.functions.explode

In [16]:
// Turn each element of the "stackoverflow" array into its own row.
val df = tagsDF.select(
  explode($"stackoverflow").as("stackoverflow_tags")
)

 

df = [stackoverflow_tags: struct<tag: struct<author: string, frameworks: array<struct<id:bigint,name:string>> ... 2 more fields>>]


[stackoverflow_tags: struct<tag: struct<author: string, frameworks: array<struct<id:bigint,name:string>> ... 2 more fields>>]

In [17]:
 // Print the nested structure of the exploded rows.
 df.printSchema()

 // Flatten the nested tag struct into top-level columns. Selecting a field through the
 // "frameworks" array yields an array of that field's values (see the output below).
 df.select(
   $"stackoverflow_tags.tag.id".as("id"),
   $"stackoverflow_tags.tag.author".as("author"),
   $"stackoverflow_tags.tag.name".as("tag_name"),
   $"stackoverflow_tags.tag.frameworks.id".as("frameworks_id"),
   $"stackoverflow_tags.tag.frameworks.name".as("frameworks_name")
 ).show()

root
 |-- stackoverflow_tags: struct (nullable = true)
 |    |-- tag: struct (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- frameworks: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)

+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
|  2| James Gosling|    java|       [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+



In [18]:
// Sample data: (id, donut name, price) triples.
val donuts = Seq(
  ("111", "plain donut", 1.50),
  ("222", "vanilla donut", 2.0),
  ("333", "glazed donut", 2.50)
)

// DataFrame of donuts keyed by "Id".
val dfDonuts = spark.createDataFrame(donuts).toDF("Id", "Donut Name", "Price")

dfDonuts.show()

+---+-------------+-----+
| Id|   Donut Name|Price|
+---+-------------+-----+
|111|  plain donut|  1.5|
|222|vanilla donut|  2.0|
|333| glazed donut|  2.5|
+---+-------------+-----+



donuts = List((111,plain donut,1.5), (222,vanilla donut,2.0), (333,glazed donut,2.5))
dfDonuts = [Id: string, Donut Name: string ... 1 more field]


[Id: string, Donut Name: string ... 1 more field]

In [19]:
// Sample data: (id, units in stock) pairs, using the same ids as the donuts.
val inventory = Seq(
  ("111", 10),
  ("222", 20),
  ("333", 30)
)

// DataFrame of stock levels keyed by "Id".
val dfInventory = spark.createDataFrame(inventory).toDF("Id", "Inventory")

dfInventory.show()



+---+---------+
| Id|Inventory|
+---+---------+
|111|       10|
|222|       20|
|333|       30|
+---+---------+



inventory = List((111,10), (222,20), (333,30))
dfInventory = [Id: string, Inventory: int]


[Id: string, Inventory: int]

In [20]:
// Inner-join the donuts with their inventory on the shared "Id" column.
// Joining on a column-name sequence keeps a single "Id" column in the result.
val dfDonutsInventory = dfDonuts.join(
  dfInventory,
  Seq("Id"),
  "inner"
)
dfDonutsInventory.show()

+---+-------------+-----+---------+
| Id|   Donut Name|Price|Inventory|
+---+-------------+-----+---------+
|111|  plain donut|  1.5|       10|
|222|vanilla donut|  2.0|       20|
|333| glazed donut|  2.5|       30|
+---+-------------+-----+---------+



dfDonutsInventory = [Id: string, Donut Name: string ... 2 more fields]


[Id: string, Donut Name: string ... 2 more fields]

In [21]:
import org.apache.spark.sql.functions._

In [22]:
// Read the sample tags JSON file again.
// NOTE(review): the session reports Spark 2.0.0; "multiLine" was presumably introduced in a later
// release and "inferSchema" is documented for CSV — confirm whether either option has any effect.
val tagsDF = spark.read
  .option("multiLine", true)
  .option("inferSchema", true)
  .json("resources/tags_sample_1line.json")

// Explode the "stackoverflow" array into rows, then lift the nested tag fields to top-level columns.
val df = tagsDF
  .select(explode($"stackoverflow").as("stackoverflow_tags"))
  .select(
    $"stackoverflow_tags.tag.id".as("id"),
    $"stackoverflow_tags.tag.author".as("author"),
    $"stackoverflow_tags.tag.name".as("tag_name"),
    $"stackoverflow_tags.tag.frameworks.id".as("frameworks_id"),
    $"stackoverflow_tags.tag.frameworks.name".as("frameworks_name")
  )

df.show()

+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
|  2| James Gosling|    java|       [1, 2]|[Apache Tomcat, S...|
+---+--------------+--------+-------------+--------------------+



tagsDF = [stackoverflow: array<struct<tag:struct<author:string,frameworks:array<struct<id:bigint,name:string>>,id:bigint,name:string>>>]
df = [id: bigint, author: string ... 3 more fields]


[id: bigint, author: string ... 3 more fields]

In [23]:
// Keep only the rows whose "frameworks_name" array contains "Play Framework".
// No explicit select("*") is needed: `where` already keeps every column.
df
  .where(array_contains($"frameworks_name", "Play Framework"))
  .show()

+---+--------------+--------+-------------+--------------------+
| id|        author|tag_name|frameworks_id|     frameworks_name|
+---+--------------+--------+-------------+--------------------+
|  1|Martin Odersky|   scala|       [1, 2]|[Play Framework, ...|
+---+--------------+--------+-------------+--------------------+



In [24]:
// Sample data: (donut name, price) pairs.
val donuts = Seq(
  ("plain donut", 1.50),
  ("vanilla donut", 2.0),
  ("glazed donut", 2.50)
)
val df = spark
  .createDataFrame(donuts)
  .toDF("Donut Name", "Price")

df.show()

// Check for a column by exact (case-sensitive) name against the DataFrame's column list.
val priceColumnExists = df.columns.exists(_ == "Price")
println(s"Does price column exist = $priceColumnExists")

+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+

Does price column exist = true


donuts = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df = [Donut Name: string, Price: double]
priceColumnExists = true


true

In [25]:
// Sample data: each donut name is paired with an array holding two prices.
val targets = Seq(
  ("Plain Donut", Array(1.50, 2.0)),
  ("Vanilla Donut", Array(2.0, 2.50)),
  ("Strawberry Donut", Array(2.50, 3.50))
)

// The Array[Double] field becomes an array<double> column.
val df = spark.createDataFrame(targets).toDF("Name", "Prices")

df.show()
df.printSchema()

+----------------+----------+
|            Name|    Prices|
+----------------+----------+
|     Plain Donut|[1.5, 2.0]|
|   Vanilla Donut|[2.0, 2.5]|
|Strawberry Donut|[2.5, 3.5]|
+----------------+----------+

root
 |-- Name: string (nullable = true)
 |-- Prices: array (nullable = true)
 |    |-- element: double (containsNull = false)



targets = List((Plain Donut,Array(1.5, 2.0)), (Vanilla Donut,Array(2.0, 2.5)), (Strawberry Donut,Array(2.5, 3.5)))
df = [Name: string, Prices: array<double>]


[Name: string, Prices: array<double>]

In [26]:
// Split the two-element "Prices" array into separate columns by position.
val df2 = df.select(
  $"Name",
  $"Prices".getItem(0).as("Low Price"),
  $"Prices".getItem(1).as("High Price")
)

df2.show()

+----------------+---------+----------+
|            Name|Low Price|High Price|
+----------------+---------+----------+
|     Plain Donut|      1.5|       2.0|
|   Vanilla Donut|      2.0|       2.5|
|Strawberry Donut|      2.5|       3.5|
+----------------+---------+----------+



df2 = [Name: string, Low Price: double ... 1 more field]


[Name: string, Low Price: double ... 1 more field]

In [27]:
// Sample data: (donut name, price) pairs.
val donuts = Seq(
  ("plain donut", 1.50),
  ("vanilla donut", 2.0),
  ("glazed donut", 2.50)
)

val df = spark
  .createDataFrame(donuts)
  .toDF("Donut Name", "Price")

df.show()


+-------------+-----+
|   Donut Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



donuts = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df = [Donut Name: string, Price: double]


[Donut Name: string, Price: double]

In [28]:
// Rename the "Donut Name" column to "Name"; the result is bound to df2 and df itself is not modified.
val df2 = df.withColumnRenamed("Donut Name", "Name")
df2.show()

+-------------+-----+
|         Name|Price|
+-------------+-----+
|  plain donut|  1.5|
|vanilla donut|  2.0|
| glazed donut|  2.5|
+-------------+-----+



df2 = [Name: string, Price: double]


[Name: string, Price: double]

In [29]:
// Sample data: (donut name, price) pairs.
val donuts = Seq(
  ("plain donut", 1.50),
  ("vanilla donut", 2.0),
  ("glazed donut", 2.50)
)
val df = spark
  .createDataFrame(donuts)
  .toDF("Donut Name", "Price")

// Minimum and maximum stock levels per donut name; any other name falls back to (150, 150).
val stockMinMax: String => Seq[Int] = {
  case "plain donut"   => Seq(100, 500)
  case "vanilla donut" => Seq(200, 400)
  case "glazed donut"  => Seq(300, 600)
  case _               => Seq(150, 150)
}

// Wrap the Scala function as a Spark UDF and apply it to the name column.
val udfStockMinMax = udf(stockMinMax)
val df2 = df.withColumn("Stock Min Max", udfStockMinMax($"Donut Name"))
df2.show()

+-------------+-----+-------------+
|   Donut Name|Price|Stock Min Max|
+-------------+-----+-------------+
|  plain donut|  1.5|   [100, 500]|
|vanilla donut|  2.0|   [200, 400]|
| glazed donut|  2.5|   [300, 600]|
+-------------+-----+-------------+



donuts = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df = [Donut Name: string, Price: double]
stockMinMax = > Seq[Int] = <function1>
udfStockMinMax = UserDefinedFunction(<function1>,ArrayType(IntegerType,false),Some(List(StringType)))
df2 = [Donut Name: string, Price: double ... 1 more field]


[Donut Name: string, Price: double ... 1 more field]

In [30]:
// Sample data: (donut name, price) pairs.
val donuts = Seq(("plain donut", 1.50), ("vanilla donut", 2.0), ("glazed donut", 2.50))

val df = spark
    .createDataFrame(donuts)
    .toDF("Donut Name", "Price")

// Fetch the first row once. The lookups below read from this Row instead of calling
// df.first() again, which would run another Spark job for each call.
val firstRow = df.first()
println(s"First row = $firstRow")

// Positional access: get(0) returns the first column's value untyped.
val firstRowColumn1 = firstRow.get(0)
println(s"First row column 1 = $firstRowColumn1")

// Access by column name with an explicit result type.
val firstRowColumnPrice = firstRow.getAs[Double]("Price")
println(s"First row column Price = $firstRowColumnPrice")

First row = [plain donut,1.5]
First row column 1 = plain donut
First row column Price = 1.5


donuts = List((plain donut,1.5), (vanilla donut,2.0), (glazed donut,2.5))
df = [Donut Name: string, Price: double]
firstRow = [plain donut,1.5]
firstRowColumn1 = plain donut
firstRowColumnPrice = 1.5


1.5

In [31]:
// Sample data: (donut name, price, purchase date as a yyyy-MM-dd string).
val donuts = Seq(
  ("plain donut", 1.50, "2018-04-17"),
  ("vanilla donut", 2.0, "2018-04-01"),
  ("glazed donut", 2.50, "2018-04-02")
)
val df = spark.createDataFrame(donuts).toDF("Donut Name", "Price", "Purchase Date")

// Keep every original column and append the formatted variants in one projection:
// number and string formatting, case conversion, and date parts taken from "Purchase Date".
df
  .select(
    $"*",
    format_number($"Price", 2).as("Price Formatted"),
    format_string("awesome %s", $"Donut Name").as("Name Formatted"),
    upper($"Donut Name").as("Name Uppercase"),
    lower($"Donut Name").as("Name Lowercase"),
    date_format($"Purchase Date", "yyyyMMdd").as("Date Formatted"),
    dayofmonth($"Purchase Date").as("Day"),
    month($"Purchase Date").as("Month"),
    year($"Purchase Date").as("Year")
  )
  .show()

+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+
|   Donut Name|Price|Purchase Date|Price Formatted|      Name Formatted|Name Uppercase|Name Lowercase|Date Formatted|Day|Month|Year|
+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+
|  plain donut|  1.5|   2018-04-17|           1.50| awesome plain donut|   PLAIN DONUT|   plain donut|      20180417| 17|    4|2018|
|vanilla donut|  2.0|   2018-04-01|           2.00|awesome vanilla d...| VANILLA DONUT| vanilla donut|      20180401|  1|    4|2018|
| glazed donut|  2.5|   2018-04-02|           2.50|awesome glazed donut|  GLAZED DONUT|  glazed donut|      20180402|  2|    4|2018|
+-------------+-----+-------------+---------------+--------------------+--------------+--------------+--------------+---+-----+----+



donuts = List((plain donut,1.5,2018-04-17), (vanilla donut,2.0,2018-04-01), (glazed donut,2.5,2018-04-02))
df = [Donut Name: string, Price: double ... 1 more field]


[Donut Name: string, Price: double ... 1 more field]

In [32]:
// Sample data: (donut name, price, purchase date as a yyyy-MM-dd string).
val donuts = Seq(
  ("plain donut", 1.50, "2018-04-17"),
  ("vanilla donut", 2.0, "2018-04-01"),
  ("glazed donut", 2.50, "2018-04-02")
)
val df = spark
  .createDataFrame(donuts)
  .toDF("Donut Name", "Price", "Purchase Date")

// Add one column per hashing function applied to the donut name.
// `hash` uses murmur3 by default; the second argument of `sha2` is the number of bits.
df
  .withColumn("Hash", hash($"Donut Name"))
  .withColumn("MD5", md5($"Donut Name"))
  .withColumn("SHA1", sha1($"Donut Name"))
  .withColumn("SHA2", sha2($"Donut Name", 256))
  .show()

+-------------+-----+-------------+----------+--------------------+--------------------+--------------------+
|   Donut Name|Price|Purchase Date|      Hash|                 MD5|                SHA1|                SHA2|
+-------------+-----+-------------+----------+--------------------+--------------------+--------------------+
|  plain donut|  1.5|   2018-04-17|1594998220|53a70d9f08d8bb249...|7882fd7481cb43452...|4aace471ed4433f1b...|
|vanilla donut|  2.0|   2018-04-01| 673697474|254c8f04be947ec2c...|5dbbc954723a74fe0...|ccda17c5bc47d1671...|
| glazed donut|  2.5|   2018-04-02| 715175419|44199f422534a5736...|aaee30ecdc523fa1e...|6d1568ca8c20ffc0b...|
+-------------+-----+-------------+----------+--------------------+--------------------+--------------------+



donuts = List((plain donut,1.5,2018-04-17), (vanilla donut,2.0,2018-04-01), (glazed donut,2.5,2018-04-02))
df = [Donut Name: string, Price: double ... 1 more field]


[Donut Name: string, Price: double ... 1 more field]

In [33]:
// Sample data: (donut name, price, purchase date as a yyyy-MM-dd string).
val donuts = Seq(("plain donut", 1.50, "2018-04-17"), ("vanilla donut", 2.0, "2018-04-01"), ("glazed donut", 2.50, "2018-04-02"))

val df = spark
    .createDataFrame(donuts)
    .toDF("Donut Name", "Price", "Purchase Date")

// Add one column per string function applied to the donut name.
// NOTE: despite its "Contains plain" label, the first column holds the 1-based position of
// the substring "donut" within the name, not a test for "plain".
// `substring` positions are 1-based in Spark SQL, so the first five characters start at 1.
df
  .withColumn("Contains plain", instr($"Donut Name", "donut"))
  .withColumn("Length", length($"Donut Name"))
  .withColumn("Trim", trim($"Donut Name"))
  .withColumn("LTrim", ltrim($"Donut Name"))
  .withColumn("RTrim", rtrim($"Donut Name"))
  .withColumn("Reverse", reverse($"Donut Name"))
  .withColumn("Substring", substring($"Donut Name", 1, 5))
  .withColumn("IsNull", isnull($"Donut Name"))
  .withColumn("Concat", concat_ws(" - ", $"Donut Name", $"Price"))
  .withColumn("InitCap", initcap($"Donut Name"))
  .show()

+-------------+-----+-------------+--------------+------+-------------+-------------+-------------+-------------+---------+------+-------------------+-------------+
|   Donut Name|Price|Purchase Date|Contains plain|Length|         Trim|        LTrim|        RTrim|      Reverse|Substring|IsNull|             Concat|      InitCap|
+-------------+-----+-------------+--------------+------+-------------+-------------+-------------+-------------+---------+------+-------------------+-------------+
|  plain donut|  1.5|   2018-04-17|             7|    11|  plain donut|  plain donut|  plain donut|  tunod nialp|    plain| false|  plain donut - 1.5|  Plain Donut|
|vanilla donut|  2.0|   2018-04-01|             9|    13|vanilla donut|vanilla donut|vanilla donut|tunod allinav|    vanil| false|vanilla donut - 2.0|Vanilla Donut|
| glazed donut|  2.5|   2018-04-02|             8|    12| glazed donut| glazed donut| glazed donut| tunod dezalg|    glaze| false| glazed donut - 2.5| Glazed Donut|
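+-------------+-----+-------------+--------------+------+-------------+-------------+-------------+-------------+---------+------+-------------------+-------------+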

donuts = List((plain donut,1.5,2018-04-17), (vanilla donut,2.0,2018-04-01), (glazed donut,2.5,2018-04-02))
df = [Donut Name: string, Price: double ... 1 more field]


[Donut Name: string, Price: double ... 1 more field]

In [35]:
// Sample data in which the second row has no donut name (a null String).
val donuts = Seq(
  ("plain donut", 1.50),
  ((null: String), 2.0),
  ("glazed donut", 2.50)
)

val dfWithNull = spark.createDataFrame(donuts).toDF("Donut Name", "Price")

dfWithNull.show()

// Drop the rows that contain a null value; the null-name row is gone in the second table.
val dfWithoutNull = dfWithNull.na.drop()

dfWithoutNull.show()

+------------+-----+
|  Donut Name|Price|
+------------+-----+
| plain donut|  1.5|
|        null|  2.0|
|glazed donut|  2.5|
+------------+-----+

+------------+-----+
|  Donut Name|Price|
+------------+-----+
| plain donut|  1.5|
|glazed donut|  2.5|
+------------+-----+



donuts = List((plain donut,1.5), (null,2.0), (glazed donut,2.5))
dfWithNull = [Donut Name: string, Price: double]
dfWithoutNull = [Donut Name: string, Price: double]


[Donut Name: string, Price: double]