In [None]:
import $ivy.`org.apache.spark::spark-sql:2.4.5` 
import $ivy.`sh.almond::almond-spark:0.4.0`

import org.apache.spark.sql.{NotebookSparkSession, SparkSession}
import org.apache.spark.sql.{functions => func, _}
import org.apache.spark.sql.types._

val spark = NotebookSparkSession
      .builder()
      .config("spark.sql.join.preferSortMergeJoin", false)
      .config("spark.sql.shuffle.partitions", 64)
      .master("local[*]")
      .getOrCreate()

import spark.implicits._

import org.slf4j.LoggerFactory
import org.apache.log4j.{Level, Logger}

Logger.getRootLogger().setLevel(Level.ERROR)

In [None]:
spark.read.format("json").load("D:/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json").schema

In [None]:
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
import org.apache.spark.sql.types.Metadata

val myManualSchema = StructType(Array(
  StructField("DEST_COUNTRY_NAME", StringType, true),
  StructField("ORIGIN_COUNTRY_NAME", StringType, true),
  StructField("count", LongType, false,
    Metadata.fromJson("{\"hello\":\"world\"}"))
))

val df = spark.read.format("json").schema(myManualSchema)
  .load("D:/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")

In [None]:
df.orderBy($"count".asc).show

In [None]:
df.col("count")

In [None]:
import org.apache.spark.sql.functions.expr
expr("(((someCol + 5) * 200) - 6) < otherCol")

In [None]:
df.columns

In [None]:
df.first

In [None]:
//import org.apache.spark.sql.Row
val myRow = Row("Hello", null, 1, false)

In [None]:
val df = spark.read.format("json")
  .load("D:/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")
df.createOrReplaceTempView("dfTable")

In [None]:
val myManualSchema = new StructType(Array(
  new StructField("some", StringType, true),
  new StructField("col", StringType, true),
  new StructField("names", LongType, false)))
val myRows = Seq(Row("Hello", null, 1L))
val myRDD = spark.sparkContext.parallelize(myRows)
val myDf = spark.createDataFrame(myRDD, myManualSchema)
myDf.show()

In [None]:
df.select("DEST_COUNTRY_NAME").show(2)

In [None]:
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME"))
  .show(2)

In [None]:
df.selectExpr(
    "*", // include all original columns
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")
  .show(2)

In [None]:
import org.apache.spark.sql.functions.lit
df.select(expr("*"), lit(1).as("One")).show(2)

In [None]:
df.withColumn("numberOne", lit(1)).show(2)

In [None]:
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME"))
  .show(2)

In [None]:
df.withColumn("count2", $"count".cast("long"))

In [None]:
import org.apache.spark.sql.functions.{col, column}
//df.filter(col("count") < 2).show(2)
df.filter($"count" < 2).show(2)
//df.where("count < 2").show(2)
df.where($"count" < 2).show(2)

In [None]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

In [None]:
val seed = 5
val withReplacement = false
val fraction = 0.5
df.sample(withReplacement, fraction, seed).count()

In [None]:
val schema = df.schema
val newRows = Seq(
  Row("New Country", "Other Country", 5L),
  Row("New Country 2", "Other Country 3", 1L)
)
val parallelizedRows = spark.sparkContext.parallelize(newRows)
val newDF = spark.createDataFrame(parallelizedRows, schema)
newDF.show()

In [None]:
df.union(newDF)
  .where("count = 1")
  .where($"ORIGIN_COUNTRY_NAME" =!= "United States")
  .show()

In [None]:
df.limit(5).show()

In [None]:
df.rdd.getNumPartitions

In [None]:
val collectDF = df.limit(10)
collectDF.take(5) // take works with an Integer count

In [None]:
collectDF.show() // this prints it out nicely

In [None]:
collectDF.show(5, false)

In [None]:
collectDF.collect()

## Capítulo 6

In [None]:
// Load one day of retail data from CSV, using the first line as header and
// letting Spark infer the column types, then print the resulting schema.
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("D:/Spark-The-Definitive-Guide-master/data/retail-data/by-day/2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("dfTable")    // registers the DataFrame as a temporary view named dfTable

In [None]:
import org.apache.spark.sql.functions.lit
df.select(lit(5), lit("five"), lit(5.0))

In [None]:
df.where(col("InvoiceNo").equalTo(536365))
  .select("InvoiceNo", "Description")
  .show(5, false)

Lo mismo, pero escribiendo el filtro como una expresión SQL:

In [None]:
df.where("InvoiceNo = 536365")
  .show(5, false)

In [None]:
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.where(col("StockCode").isin("DOT")).where(priceFilter.or(descripFilter))
  .show()

In [None]:
val DOTCodeFilter = col("StockCode") === "DOT"
val priceFilter = col("UnitPrice") > 600
val descripFilter = col("Description").contains("POSTAGE")
df.withColumn("isExpensive", DOTCodeFilter.and(priceFilter.or(descripFilter)))
  .where("isExpensive")
  //.filter("isExpensive")
  .select("unitPrice", "isExpensive").show(5)

In [None]:
import org.apache.spark.sql.functions.{expr, not, col}
df.withColumn("isExpensive", not(col("UnitPrice").leq(250)))
  .filter("isExpensive")
  .select("Description", "UnitPrice").show(5)
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))
  .filter("isExpensive")
  .select("Description", "UnitPrice").show(5)

In [None]:
// Null-safe equality filter on the Description column.
// NOTE(review): "CustumerID" is passed as a plain string, so it is presumably compared
// as a literal value rather than as a column, and it is spelled differently from the
// CustomerId column used elsewhere in this notebook — confirm the intended comparison.
df.where(col("Description").eqNullSafe("CustumerID")).show()

In [None]:
val fabricatedQuantity = func.pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)

In [None]:
// SQL-expression version: selects CustomerId together with
// (Quantity * UnitPrice)^2 + 5 exposed as realQuantity.
df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

In [None]:
df.select(func.round(col("UnitPrice"), 1).alias("rounded"), col("UnitPrice")).show(5)

In [None]:
df.select(func.round(lit("2.5")), func.bround(lit("2.5"))).show(2)

In [None]:
df.stat.corr("Quantity", "UnitPrice")
df.select(func.corr("Quantity", "UnitPrice")).show()

In [None]:
df.describe().show()

In [None]:
df.select(func.monotonically_increasing_id()).show(7)

In [None]:
import org.apache.spark.sql.functions.{initcap}
df.select(initcap(col("Description"))).show(2, false)

In [None]:
import org.apache.spark.sql.functions.{lit, ltrim, rtrim, rpad, lpad, trim}
// Demonstrates the whitespace trimming and padding functions on literal strings.
df.select(
    ltrim(lit("    HELLO    ")).as("ltrim"),          // removes the whitespace on the left
    rtrim(lit("    HELLO    ")).as("rtrim"),          // removes the whitespace on the right
    trim(lit("    HELLO    ")).as("trim"),            // removes the whitespace on both the left and the right
    lpad(lit("HELLO"), 7, " ").as("lp"),              // keeps the given number of positions, adding spaces on the left
    rpad(lit("HELLO"), 10, " ").as("rp")).show(2)     // keeps the given number of positions, adding spaces on the right

In [None]:
import org.apache.spark.sql.functions.regexp_replace
val simpleColors = Seq("black", "white", "red", "green", "blue")
val regexString = simpleColors.map(_.toUpperCase).mkString("|")
// the | signifies `OR` in regular expression syntax
df.select(
  regexp_replace(col("Description"), regexString, "COLOR").alias("color_clean"),
  col("Description")).show(2)

In [None]:
import org.apache.spark.sql.functions.translate
df.select(translate(col("Description"), "LEET", "1337"), col("Description"))
  .show(2)

In [None]:
import org.apache.spark.sql.functions.regexp_extract
val regexString = simpleColors.map(_.toUpperCase).mkString("(", "|", ")")
// the | signifies OR in regular expression syntax
df.select(
     regexp_extract(col("Description"), regexString, 1).alias("color_clean"),
     col("Description")).show(2)

In [None]:
val simpleColors = Seq("black", "white", "red", "green", "blue")
val selectedColumns = simpleColors.map(color => {
   col("Description").contains(color.toUpperCase).alias(s"is_$color")
}):+expr("*") // could also append this value
df.select(selectedColumns:_*).where(col("is_white").or(col("is_red")))
  .select("Description").show(3, false)

In [None]:
import org.apache.spark.sql.functions.{current_date, current_timestamp}
val dateDF = spark.range(10)
  .withColumn("today", current_date())
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")
dateDF.show()
dateDF.printSchema()

In [None]:
import org.apache.spark.sql.functions.{date_add, date_sub}
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

In [None]:
import org.apache.spark.sql.functions.{datediff, months_between, to_date}
dateDF.withColumn("week_ago", date_sub(col("today"), 7))
  .select(datediff(col("week_ago"), col("today"))).show(1)
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))
  .select(months_between(col("start"), col("end"))).show(1)

In [None]:
spark.range(5).withColumn("date", lit("2017-01-01"))
  .select(to_date(col("date"))).show(1)

In [None]:
import org.apache.spark.sql.functions.to_date
val dateFormat = "yyyy-dd-MM"
val cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [None]:
// Keep only rows whose date2 is later than 2017-12-12.
// The comparison value must be the bare date text: wrapping it in an extra pair of
// single quotes inside the Scala string makes the quotes part of the value itself,
// which is not a valid date and silently breaks the comparison.
cleanDateDF.filter(col("date2") > "2017-12-12").show()

In [None]:
import org.apache.spark.sql.functions.coalesce
df.select(coalesce(col("Description"), col("CustomerId"))).show()

In [None]:
df.na.fill("All Null values become this string")

In [None]:
df.show()

In [None]:
df.na.fill(5, Seq("StockCode", "InvoiceNo"))

In [None]:
val fillColValues = Map("StockCode" -> 5, "Description" -> "No Value")
df.na.fill(fillColValues)

In [None]:
df.selectExpr("(Description, InvoiceNo) as complex", "*")

In [None]:
import org.apache.spark.sql.functions.struct
val complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")
complexDF.show()

In [None]:
complexDF.select("complex.Description").show()
complexDF.select(col("complex").getField("InvoiceNo")).show()

In [None]:
import org.apache.spark.sql.functions.split
df.select(split(col("Description"), " ").alias("array_col"))
  .selectExpr("array_col[0]").show(2)

In [None]:
import org.apache.spark.sql.functions.array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

In [None]:
import org.apache.spark.sql.functions.map
df.select(map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)

In [None]:
val jsonDF = spark.range(1).selectExpr("""
  '{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString""")

In [None]:
import org.apache.spark.sql.functions.{get_json_object, json_tuple}
jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]") as "column",
    json_tuple(col("jsonString"), "myJSONKey")).show(2)

In [None]:
import org.apache.spark.sql.functions.to_json
df.selectExpr("(InvoiceNo, Description) as myStruct")
  .select(to_json(col("myStruct")))

In [None]:
val udfExampleDF = spark.range(5).toDF("num")
/** Returns the cube of `number`, computed by two successive multiplications. */
def power3(number: Double): Double = {
  val squared = number * number
  squared * number
}
power3(2.0)

In [None]:
import org.apache.spark.sql.functions.udf
val power3udf = udf(power3(_:Double):Double)

In [None]:
udfExampleDF.select(power3udf(col("num"))).show()

In [None]:
spark.udf.register("power3", power3(_:Double):Double)
udfExampleDF.selectExpr("power3(num)").show(2)

In [None]:
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("D:/Spark-The-Definitive-Guide-master/data/retail-data/all/*.csv")
  .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

In [None]:
df.show()

In [None]:
df.count()

In [None]:
import org.apache.spark.sql.functions.countDistinct
df.select(countDistinct("StockCode")).show()

In [None]:
import org.apache.spark.sql.functions.{sum, count, avg, expr}

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

In [None]:
import org.apache.spark.sql.functions.{var_pop, stddev_pop}
import org.apache.spark.sql.functions.{var_samp, stddev_samp}
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()

In [None]:
import org.apache.spark.sql.functions.{skewness, kurtosis}
df.select(skewness("Quantity"), kurtosis("Quantity")).show()

In [None]:
import org.apache.spark.sql.functions.{collect_set, collect_list}
df.agg(collect_set("Country"), collect_list("Country")).show()

In [None]:
df.groupBy("InvoiceNo", "CustomerId").count().show()

In [None]:
import org.apache.spark.sql.functions.count

df.groupBy("InvoiceNo").agg(
  count("Quantity").alias("quan"),
  expr("count(Quantity)")).show()

In [None]:
// in Scala
df.groupBy("InvoiceNo").agg("Quantity"->"avg", "Quantity"->"stddev_pop").show()

In [None]:
import org.apache.spark.sql.functions.{col, to_date}
val dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"),
  "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")

In [None]:
dfWithDate.show()

In [None]:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.col
val windowSpec = Window
  .partitionBy("CustomerId", "date")
  .orderBy(col("Quantity").desc)
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

In [None]:
import org.apache.spark.sql.functions.max
val maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)

In [None]:
import org.apache.spark.sql.functions.{dense_rank, rank}
val purchaseDenseRank = dense_rank().over(windowSpec)
val purchaseRank = rank().over(windowSpec)

In [None]:
import org.apache.spark.sql.functions.col

dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity")).show()

In [None]:
// Build the null-free DataFrame used by the rollup/cube cells.
// `na.drop()` removes every row that contains a null in any column; a bare `drop()`
// with no column names removes nothing and would leave the nulls in place.
val dfNoNull = dfWithDate.na.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

In [None]:
// Roll up the summed Quantity over Date and Country, rename the aggregate column
// to total_quantity and sort the result by Date.
val rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))
  .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")
  .orderBy("Date")
rolledUpDF.show()

// the rows where null values appear are where the overall totals are found

In [None]:
rolledUpDF.where("Country IS NULL").show()

In [None]:
dfNoNull.cube("Date", "Country").agg(sum($"Quantity"))
  .select("Date", "Country", "sum(Quantity)").orderBy("Date").show()

In [None]:
import org.apache.spark.sql.functions.{grouping_id, sum, expr}

dfNoNull.cube("customerId", "stockCode").agg(grouping_id(), sum("Quantity"))
.orderBy(expr("grouping_id()").desc)
.show()

In [None]:
val pivoted = dfWithDate.groupBy("date").pivot("Country").sum()

In [None]:
import org.apache.spark.sql.expressions.MutableAggregationBuffer
import org.apache.spark.sql.expressions.UserDefinedAggregateFunction
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
/**
 * User-defined aggregate that combines boolean inputs with logical AND.
 *
 * The aggregation buffer holds a single boolean that starts as `true` and is
 * AND-ed with every input value and with every other partial buffer.
 */
class BoolAnd extends UserDefinedAggregateFunction {
  // One boolean input column, named "value".
  def inputSchema: org.apache.spark.sql.types.StructType =
    StructType(Seq(StructField("value", BooleanType)))

  // One boolean buffer slot, named "result".
  def bufferSchema: StructType =
    StructType(Seq(StructField("result", BooleanType)))

  def dataType: DataType = BooleanType

  def deterministic: Boolean = true

  // The running result starts out as true.
  def initialize(buffer: MutableAggregationBuffer): Unit =
    buffer.update(0, true)

  // Fold one input row into the running result.
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val soFar = buffer.getAs[Boolean](0)
    buffer.update(0, soFar && input.getAs[Boolean](0))
  }

  // Fold another partial buffer into this one.
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val soFar = buffer1.getAs[Boolean](0)
    buffer1.update(0, soFar && buffer2.getAs[Boolean](0))
  }

  // The final answer is whatever the buffer slot holds.
  def evaluate(buffer: Row): Any = buffer.get(0)
}

In [None]:
val ba = new BoolAnd
spark.udf.register("booland", ba)
import org.apache.spark.sql.functions._
spark.range(1)
  .selectExpr("explode(array(TRUE, TRUE, TRUE)) as t")
  .selectExpr("explode(array(TRUE, FALSE, TRUE)) as f", "t")
  .select(ba(col("t")), expr("booland(f)"))
  .show()

In [None]:
val person = Seq(
    (0, "Bill Chambers", 0, Seq(100)),
    (1, "Matei Zaharia", 1, Seq(500, 250, 100)),
    (2, "Michael Armbrust", 1, Seq(250, 100)))
  .toDF("id", "name", "graduate_program", "spark_status")
val graduateProgram = Seq(
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"),
    (1, "Ph.D.", "EECS", "UC Berkeley"))
  .toDF("id", "degree", "department", "school")
val sparkStatus = Seq(
    (500, "Vice President"),
    (250, "PMC Member"),
    (100, "Contributor"))
  .toDF("id", "status")

In [None]:
person.createOrReplaceTempView("person")
graduateProgram.createOrReplaceTempView("graduateProgram")
sparkStatus.createOrReplaceTempView("sparkStatus")

In [None]:
val joinExpression = person.col("graduate_program") === graduateProgram.col("id")

In [None]:
val wrongJoinExpression = person.col("name") === graduateProgram.col("school")

In [None]:
person.join(graduateProgram, joinExpression).show()

In [None]:
val joinType = "outer"
person.join(graduateProgram, joinExpression, joinType).show()

In [None]:
val joinType = "left_outer"
graduateProgram.join(person, joinExpression, joinType).show()

In [None]:
val joinType = "right_outer"
person.join(graduateProgram, joinExpression, joinType).show()

In [None]:
val joinType = "left_semi"
graduateProgram.join(person, joinExpression, joinType).show()

In [None]:
val joinType = "cross"
graduateProgram.join(person, joinExpression, joinType).show()

In [None]:
person.crossJoin(graduateProgram).show()

In [None]:
import org.apache.spark.sql.functions.expr

person.withColumnRenamed("id", "personId")
  .join(sparkStatus, expr("array_contains(spark_status, id)")).show()

In [None]:
val gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")
val joinExpr = gradProgramDupe.col("graduate_program") === person.col(
  "graduate_program")
gradProgramDupe.printSchema()

In [None]:
person.join(gradProgramDupe, joinExpr).show()

In [None]:
person.join(gradProgramDupe,"graduate_program").show()

In [None]:
person.join(gradProgramDupe, joinExpr).drop(person.col("graduate_program"))
  .select("graduate_program").show()

In [None]:
import org.apache.spark.sql.functions.broadcast
val joinExpr = person.col("graduate_program") === graduateProgram.col("id")
person.join(broadcast(graduateProgram), joinExpr).explain()

In [None]:
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}
val myManualSchema = new StructType(Array(
  new StructField("DEST_COUNTRY_NAME", StringType, true),
  new StructField("ORIGIN_COUNTRY_NAME", StringType, true),
  new StructField("count", LongType, false)
))
spark.read.format("csv")
  .option("header", "true")
  .option("mode", "FAILFAST")
  .schema(myManualSchema)
  .load("D:/Spark-The-Definitive-Guide-master/data/flight-data/csv/2010-summary.csv")
  .show(5)

In [None]:
val csvFile = spark.read.format("csv")
  .option("header", "true").option("mode", "FAILFAST").schema(myManualSchema)
  .load("D:/Spark-The-Definitive-Guide-master/data/flight-data/csv/2010-summary.csv")

In [None]:
csvFile.write.format("csv").mode("overwrite").option("sep", "\t")
  .save("D:/Spark-The-Definitive-Guide-master/tmp/my-tsv-file.tsv")

In [None]:
csvFile.write.format("json").mode("overwrite").save("D:/Spark-The-Definitive-Guide-master/tmp/my-json-file.json")

In [None]:
spark.read.format("parquet")
  .load("D:/Spark-The-Definitive-Guide-master/data/flight-data/parquet/2010-summary.parquet").show(5)

In [None]:
csvFile.write.format("parquet").mode("overwrite")
  .save("D:/Spark-The-Definitive-Guide-master/tmp/my-parquet-file.parquet")

In [None]:
spark.read.textFile("D:/Spark-The-Definitive-Guide-master/data/flight-data/csv/2010-summary.csv")
  .selectExpr("split(value, ',') as rows").show()

In [None]:
//csvFile.select("DEST_COUNTRY_NAME").write.text("D:/Spark-The-Definitive-Guide-master/tmp/simple-text-file.txt")

In [None]:
//csvFile.repartition(5).write.format("csv").save("D:/Spark-The-Definitive-Guide-master/tmp/multiple.csv")

In [None]:
csvFile.limit(10).write.mode("overwrite").partitionBy("DEST_COUNTRY_NAME")
  .save("D:/Spark-The-Definitive-Guide-master/tmp/partitioned-files.parquet")

In [None]:
val numberBuckets = 10
val columnToBucketBy = "count"

//csvFile.write.format("parquet").mode("overwrite")
  //.bucketBy(numberBuckets, columnToBucketBy).saveAsTable("bucketedFiles")

In [None]:
spark.sql("SELECT 1 + 1").show()

In [None]:
spark.read.json("D:/Spark-The-Definitive-Guide-master/data/flight-data/json/2015-summary.json")
  .createOrReplaceTempView("some_sql_view") // DF => SQL

spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count)
FROM some_sql_view GROUP BY DEST_COUNTRY_NAME
""")
  .where("DEST_COUNTRY_NAME like 'S%'").where("`sum(count)` > 10")
  .count() // SQL => DF

In [None]:
spark.sparkContext

In [None]:
spark.range(500).rdd

In [None]:
spark.range(10).toDF.rdd.map(rowObject => rowObject.getLong(0))

In [None]:
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"
  .split(" ")
val words = spark.sparkContext.parallelize(myCollection, 2)

In [None]:
words.setName("myWords")
words.name // myWords

In [None]:
spark.sparkContext.textFile("D:/Spark-The-Definitive-Guide-master/some/path/withTextFiles")

In [None]:
spark.sparkContext.wholeTextFiles("D:/Spark-The-Definitive-Guide-master/some/path/withTextFiles")

In [None]:
/** Tells whether `individual` begins with an upper-case "S". */
def startsWithS(individual: String): Boolean =
  individual.startsWith("S")
words.filter(word => startsWithS(word)).collect()

In [None]:
val words2 = words.map(word => (word, word(0), word.startsWith("S")))

In [None]:
words2.filter(record => record._3).take(5)

In [None]:
words.sortBy(word => word.length() * -1).take(2)

In [None]:
val fiftyFiftySplit = words.randomSplit(Array[Double](0.5, 0.5))

In [None]:
spark.sparkContext.parallelize(1 to 20).reduce(_ + _) 

In [None]:
/**
 * Picks the longer of two words.
 *
 * @param leftWord  first candidate
 * @param rightWord second candidate
 * @return `leftWord` only when it is strictly longer; on a tie `rightWord` wins
 */
def wordLengthReducer(leftWord:String, rightWord:String): String =
  // `if` is an expression in Scala, so no explicit `return` is needed.
  if (leftWord.length > rightWord.length) leftWord else rightWord

words.reduce(wordLengthReducer)

In [None]:
val confidence = 0.95
val timeoutMilliseconds = 400
words.countApprox(timeoutMilliseconds, confidence)

words.count()

In [None]:
words.countApproxDistinct(0.05)

In [None]:
words.countByValue()

In [None]:
words.first()

In [None]:
words.take(5)
words.takeOrdered(5)
words.top(5)
val withReplacement = true
val numberToTake = 6
val randomSeed = 100L
words.takeSample(withReplacement, numberToTake, randomSeed)

In [None]:
words.cache()
words.getStorageLevel

In [None]:
/**
 * Prefixes every element of a partition with the index of that partition.
 *
 * @param partitionIndex     index of the partition being processed
 * @param withinPartIterator the elements of that partition
 * @return an iterator of strings of the form "Partition: <index> => <value>"
 */
def indexedFunc(partitionIndex:Int, withinPartIterator: Iterator[String]): Iterator[String] = {
  // Map the iterator directly: converting to a List first would pull the whole
  // partition into memory just to turn it back into an iterator.
  withinPartIterator.map(value => s"Partition: $partitionIndex => $value")
}
words.mapPartitionsWithIndex(indexedFunc).collect()

In [None]:
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"
  .split(" ")
val words = spark.sparkContext.parallelize(myCollection, 2)

In [None]:
words.map(word => (word.toLowerCase, 1))

In [None]:
val keyword = words.keyBy(word => word.toLowerCase.toSeq(0).toString)

In [None]:
keyword.mapValues(word => word.toUpperCase).collect()

In [None]:
keyword.flatMapValues(word => word.toUpperCase).collect()

In [None]:
keyword.keys.collect()
keyword.values.collect()

In [None]:
keyword.lookup("s")

In [None]:
val distinctChars = words.flatMap(word => word.toLowerCase.toSeq).distinct
  .collect()
import scala.util.Random
val sampleMap = distinctChars.map(c => (c, new Random().nextDouble())).toMap
words.map(word => (word.toLowerCase.toSeq(0), word))
  .sampleByKey(true, sampleMap, 6L)
  .collect()

In [None]:
words.map(word => (word.toLowerCase.toSeq(0), word))
  .sampleByKeyExact(true, sampleMap, 6L).collect()

In [None]:
val chars = words.flatMap(word => word.toLowerCase.toSeq)
val KVcharacters = chars.map(letter => (letter, 1))
/** Returns the larger of the two integers. */
def maxFunc(left: Int, right: Int): Int =
  if (left >= right) left else right
/** Returns the sum of the two integers. */
def addFunc(left: Int, right: Int): Int =
  left + right
//val nums = sc.parallelize(1 to 30, 5)

In [None]:
// in Scala
val timeout = 1000L //milliseconds
val confidence = 0.95
KVcharacters.countByKey()
KVcharacters.countByKeyApprox(timeout, confidence)

In [None]:
KVcharacters.groupByKey().map(row => (row._1, row._2.reduce(addFunc))).collect()

In [None]:
KVcharacters.reduceByKey(addFunc).collect()

In [None]:
KVcharacters.aggregateByKey(0)(addFunc, maxFunc).collect()

In [None]:
val valToCombiner = (value:Int) => List(value)
val mergeValuesFunc = (vals:List[Int], valToAppend:Int) => valToAppend :: vals
val mergeCombinerFunc = (vals1:List[Int], vals2:List[Int]) => vals1 ::: vals2
// now we define these as function variables
val outputPartitions = 6
KVcharacters
  .combineByKey(
    valToCombiner,
    mergeValuesFunc,
    mergeCombinerFunc,
    outputPartitions)
  .collect()

In [None]:
KVcharacters.foldByKey(0)(addFunc).collect()

In [None]:
import scala.util.Random
val distinctChars = words.flatMap(word => word.toLowerCase.toSeq).distinct
val charRDD = distinctChars.map(c => (c, new Random().nextDouble()))
val charRDD2 = distinctChars.map(c => (c, new Random().nextDouble()))
val charRDD3 = distinctChars.map(c => (c, new Random().nextDouble()))
charRDD.cogroup(charRDD2, charRDD3).take(5)

In [None]:
val keyedChars = distinctChars.map(c => (c, new Random().nextDouble()))
val outputPartitions = 10
KVcharacters.join(keyedChars).count()
KVcharacters.join(keyedChars, outputPartitions).count()

In [None]:
words.repartition(10)

In [None]:
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple"
  .split(" ")
val words = spark.sparkContext.parallelize(myCollection, 2)

In [None]:
val supplementalData = Map("Spark" -> 1000, "Definitive" -> 200,
                           "Big" -> -300, "Simple" -> 100)

In [None]:
val suppBroadcast = spark.sparkContext.broadcast(supplementalData)

In [None]:
suppBroadcast.value

In [None]:
words.map(word => (word, suppBroadcast.value.getOrElse(word, 0)))
  .sortBy(wordPair => wordPair._2)
  .collect()

In [None]:
// Typed record for one flight-summary row: destination country name,
// origin country name and the associated count.
case class Flight(DEST_COUNTRY_NAME: String,
                  ORIGIN_COUNTRY_NAME: String, count: BigInt)
org.apache.spark.sql.catalyst.encoders.OuterScopes.addOuterScope(this)
val flights = spark.read
  .parquet("D:/Spark-The-Definitive-Guide-master/data/flight-data/parquet/2010-summary.parquet")
  .as[Flight]

In [None]:
import org.apache.spark.util.LongAccumulator
val accUnnamed = new LongAccumulator
val acc = spark.sparkContext.register(accUnnamed)

In [None]:
val accChina = new LongAccumulator
val accChina2 = spark.sparkContext.longAccumulator("China")
spark.sparkContext.register(accChina, "China")

In [None]:
/**
 * Adds the row's flight count to `accChina` once for each endpoint
 * (destination first, then origin) whose country name is "China".
 */
def accChinaFunc(flight_row: Flight) = {
  Seq(flight_row.DEST_COUNTRY_NAME, flight_row.ORIGIN_COUNTRY_NAME)
    .filter(_ == "China")
    .foreach(_ => accChina.add(flight_row.count.toLong))
}

In [None]:
flights.foreach(flight_row => accChinaFunc(flight_row))

In [None]:
accChina.value // 953

In [None]:
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.util.AccumulatorV2

val arr = ArrayBuffer[BigInt]()

/**
 * Accumulator that keeps a running sum of the even values added to it;
 * odd values are ignored.
 */
class EvenAccumulator extends AccumulatorV2[BigInt, BigInt] {
  // Running total of the even values seen so far.
  private var num:BigInt = 0
  /** Sets the running total back to zero. */
  def reset(): Unit = {
    this.num = 0
  }
  /** Adds `intValue` to the total only when it is even. */
  def add(intValue: BigInt): Unit = {
    if (intValue % 2 == 0) {
        this.num += intValue
    }
  }
  /** Folds the total of another accumulator into this one. */
  def merge(other: AccumulatorV2[BigInt,BigInt]): Unit = {
    this.num += other.value
  }
  /** Current running total. */
  def value():BigInt = {
    this.num
  }
  /** Returns a new accumulator holding the same running total as this one. */
  def copy(): AccumulatorV2[BigInt,BigInt] = {
    // A copy has to carry the current state over; handing back a brand-new
    // accumulator would silently reset the total to zero.
    val duplicate = new EvenAccumulator
    duplicate.num = this.num
    duplicate
  }
  /** True when nothing has been accumulated (the total is zero). */
  def isZero():Boolean = {
    this.num == 0
  }
}
val acc = new EvenAccumulator
//val newAcc = sc.register(acc, "evenAcc")

In [None]:
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Databricks Spark Example")
  .config("spark.sql.warehouse.dir", "/user/hive/warehouse")
  .getOrCreate()

In [None]:
import org.apache.spark.SparkContext
val sc = SparkContext.getOrCreate()

In [None]:
spark.sparkContext.setLogLevel("INFO")

In [None]:
spark.read
   .option("header", "true")
   .csv("D:/Spark-The-Definitive-Guide-master/data/retail-data/all/online-retail-dataset.csv")
   .repartition(2)
   .selectExpr("instr(Description, 'GLASS') >= 1 as is_glass")
   .groupBy("is_glass")
   .count()
  .collect()