In [1]:
import org.apache.spark.sql._

// Root s3a:// location used as the prefix for every path built in this notebook.
// NOTE(review): "S3_BUCKET" looks like a placeholder - substitute the real bucket name.
val workingBucket = "s3a://S3_BUCKET"
// Sub-directory of the bucket under which the Parquet copies are written.
val output_directory = "processed_jupyter_spark"
// The SQLContext(sc) constructor is deprecated since Spark 2.0; take the context
// from the (existing or newly created) SparkSession instead. `sqlCtx` keeps the
// same name and type, so `sqlCtx.read` and `sqlCtx.implicits._` still work.
val sqlCtx = SparkSession.builder().getOrCreate().sqlContext
val hc = sc.hadoopConfiguration
// NOTE(review): this is a Hive property being set on the Hadoop configuration -
// confirm it actually has an effect for the jobs in this notebook.
hc.set("hive.execution.engine", "mr")

/** Returns `path` prefixed with `workingBucket`, joined by a single "/". */
def bucketPath(path: String) =
    Seq(workingBucket, path).mkString("/")
/** Returns `path` placed under `workingBucket` and `output_directory`, "/"-separated. */
def fullPath(path: String) =
    Seq(workingBucket, output_directory, path).mkString("/")

In [2]:
// Read carriers.csv from the bucket: first row is the header, column types are inferred.
// Trailing-dot chaining is kept so the expression continues across lines in a notebook cell.
val carriers = sqlCtx.read.
    format("com.databricks.spark.csv").
    options(Map("inferSchema" -> "true", "header" -> "true")).
    load(bucketPath("carriers.csv"))

// Save a Parquet copy, replacing whatever an earlier run left at that location.
carriers.write.
    mode(SaveMode.Overwrite).
    parquet(fullPath("carriers/"))

// Register the DataFrame under the name "carriers" for SQL queries, then preview it.
carriers.createOrReplaceTempView("carriers")
carriers.show(20)

+----+--------------------+
|Code|         Description|
+----+--------------------+
| 02Q|       Titan Airways|
| 04Q|  Tradewind Aviation|
| 05Q| Comlux Aviation, AG|
| 06Q|Master Top Linhas...|
| 07Q| Flair Airlines Ltd.|
| 09Q|      Swift Air, LLC|
| 0BQ|                 DCA|
| 0CQ|ACM AIR CHARTER GmbH|
| 0FQ|Maine Aviation Ai...|
| 0GQ|Inter Island Airw...|
| 0HQ|Polar Airlines de...|
|  0J|          JetClub AG|
| 0JQ|     Vision Airlines|
| 0KQ|Mokulele Flight S...|
| 0LQ|   Metropix UK, LLP.|
| 0MQ|Multi-Aero, Inc. ...|
|  0Q| Flying Service N.V.|
|  16|   PSA Airlines Inc.|
|  17|   Piedmont Airlines|
|  1I|Sky Trek Int'l Ai...|
+----+--------------------+
only showing top 20 rows



In [5]:
// Read airports.csv from the bucket: first row is the header, column types are inferred.
// Trailing-dot chaining is kept so the expression continues across lines in a notebook cell.
val airports = sqlCtx.read.
    format("com.databricks.spark.csv").
    options(Map("inferSchema" -> "true", "header" -> "true")).
    load(bucketPath("airports.csv"))

// Save a Parquet copy, replacing whatever an earlier run left at that location.
airports.write.
    mode(SaveMode.Overwrite).
    parquet(fullPath("airports/"))

// Register the DataFrame under the name "airports" for SQL queries, then preview it.
airports.createOrReplaceTempView("airports")
airports.show(20)

+----+--------------------+------------------+-----+-------+-----------+------------+
|iata|             airport|              city|state|country|        lat|        long|
+----+--------------------+------------------+-----+-------+-----------+------------+
| 00M|            Thigpen |       Bay Springs|   MS|    USA|31.95376472|-89.23450472|
| 00R|Livingston Municipal|        Livingston|   TX|    USA|30.68586111|-95.01792778|
| 00V|         Meadow Lake|  Colorado Springs|   CO|    USA|38.94574889|-104.5698933|
| 01G|        Perry-Warsaw|             Perry|   NY|    USA|42.74134667|-78.05208056|
| 01J|    Hilliard Airpark|          Hilliard|   FL|    USA| 30.6880125|-81.90594389|
| 01M|   Tishomingo County|           Belmont|   MS|    USA|34.49166667|-88.20111111|
| 02A|         Gragg-Wade |           Clanton|   AL|    USA|32.85048667|-86.61145333|
| 02C|             Capitol|        Brookfield|   WI|    USA|   43.08751|-88.17786917|
| 02G|   Columbiana County|    East Liverpool|   OH|  

In [4]:
import sqlCtx.implicits._

// Read the bzip2-compressed 2008 flights CSV: first row is the header, column types
// are inferred, and the literal string "NA" is read as null.
val flights_w_na = sqlCtx.read.
    format("com.databricks.spark.csv").
    options(Map(
        "inferSchema" -> "true",
        "header" -> "true",
        "nullValue" -> "NA")).
    load(bucketPath("2008.csv.bz2"))

// Replace nulls in numeric columns with 0.
// NOTE(review): this also zero-fills the delay columns, so a missing delay becomes
// indistinguishable from "no delay" in later aggregates - confirm that is intended.
val flights = flights_w_na.na.fill(0)

// Save a Parquet copy, replacing whatever an earlier run left at that location.
flights.write.
    mode(SaveMode.Overwrite).
    parquet(fullPath("flights/"))

// Register the DataFrame under the name "flights" for SQL queries, then preview a few columns.
flights.createOrReplaceTempView("flights")
flights.
    select("ArrDelay", "CarrierDelay", "WeatherDelay", "Distance").
    show(20)

+--------+------------+------------+--------+
|ArrDelay|CarrierDelay|WeatherDelay|Distance|
+--------+------------+------------+--------+
|     -14|           0|           0|     810|
|       2|           0|           0|     810|
|      14|           0|           0|     515|
|      -6|           0|           0|     515|
|      34|           2|           0|     515|
|      11|           0|           0|     688|
|      57|          10|           0|    1591|
|     -18|           0|           0|    1591|
|       2|           0|           0|     451|
|     -16|           0|           0|     451|
|       1|           0|           0|     828|
|      80|           8|           0|     828|
|       1|           0|           0|     162|
|      10|           0|           0|     162|
|      -4|           0|           0|     162|
|      11|           0|           0|     162|
|      15|           3|           0|    1489|
|     -15|           0|           0|    1489|
|      16|           0|           