# Reading JSON Files into DataFrames

This notebook shows several examples of how to import and transform JSON files into Spark DataFrames, using Scala.

In [69]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

// Reuse the SQLContext that belongs to the notebook's SparkSession instead of
// constructing one from `sc`: the public SQLContext constructor has been
// deprecated since Spark 2.0, and this cell already reads through `spark`.
// The `sqlC` name is kept so anything that refers to it keeps working.
val sqlC = spark.sqlContext
// Brings Spark SQL's implicit conversions into scope; the later cells rely on
// them for the $"column" syntax.
import sqlC.implicits._

// Load the JSON document into a DataFrame and print the schema Spark inferred.
val test = spark.read.json("Downloads/yql.json")

test.printSchema()

root
 |-- query: struct (nullable = true)
 |    |-- count: long (nullable = true)
 |    |-- created: string (nullable = true)
 |    |-- lang: string (nullable = true)
 |    |-- results: struct (nullable = true)
 |    |    |-- channel: struct (nullable = true)
 |    |    |    |-- astronomy: struct (nullable = true)
 |    |    |    |    |-- sunrise: string (nullable = true)
 |    |    |    |    |-- sunset: string (nullable = true)
 |    |    |    |-- atmosphere: struct (nullable = true)
 |    |    |    |    |-- humidity: string (nullable = true)
 |    |    |    |    |-- pressure: string (nullable = true)
 |    |    |    |    |-- rising: string (nullable = true)
 |    |    |    |    |-- visibility: string (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- image: struct (nullable = true)
 |    |    |    |    |-- height: string (nullable = true)
 |    |    |    |    |-- link: string (nullable = true)
 |    |    |    |    |-- title: string (nullable = true)

In [70]:
// Flatten the nested response into one row per forecast entry.
// The first projection keeps the four scalar fields and explodes the
// `forecast` array (one output row per array element); the second pulls the
// date/high/low fields out of each exploded element under their final names,
// leaving the intermediate `forecast_explode` column behind.
val testFinal = test.
    select(
        $"query.created",
        $"query.results.channel.item.lat",
        $"query.results.channel.item.long",
        $"query.results.channel.units.temperature",
        explode($"query.results.channel.item.forecast").as("forecast_explode")).
    select(
        $"created",
        $"lat",
        $"long",
        $"temperature",
        $"forecast_explode.date".as("date"),
        $"forecast_explode.high".as("forecast_high"),
        $"forecast_explode.low".as("forecast_low"))

testFinal.show()

+--------------------+--------+----------+-----------+-----------+-------------+------------+
|             created|     lat|      long|temperature|       date|forecast_high|forecast_low|
+--------------------+--------+----------+-----------+-----------+-------------+------------+
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|08 Nov 2017|           48|          39|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|09 Nov 2017|           52|          39|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|10 Nov 2017|           47|          27|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|11 Nov 2017|           40|          25|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|12 Nov 2017|           48|          33|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|13 Nov 2017|           51|          45|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|14 Nov 2017|           53|          43|
|2017-11-08T14:57:15Z|40.71455|-74.007118|          F|15 Nov

In [71]:
// Read every JSON file matched by the glob into a single DataFrame, then
// print the inferred schema.
val airports = spark.read.json("Documents/jsons/airports/*.json")

airports.printSchema()

root
 |-- IATA: string (nullable = true)
 |-- ICAO: string (nullable = true)
 |-- city: string (nullable = true)
 |-- delay: string (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- status: struct (nullable = true)
 |    |-- avgDelay: string (nullable = true)
 |    |-- closureBegin: string (nullable = true)
 |    |-- closureEnd: string (nullable = true)
 |    |-- endTime: string (nullable = true)
 |    |-- maxDelay: string (nullable = true)
 |    |-- minDelay: string (nullable = true)
 |    |-- reason: string (nullable = true)
 |    |-- trend: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- weather: struct (nullable = true)
 |    |-- meta: struct (nullable = true)
 |    |    |-- credit: string (nullable = true)
 |    |    |-- updated: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |-- temp: string (nullable = true)
 |    |-- visibility: double (nullable = true)
 |    |-- weather: string (nullable = true)

In [72]:
// Project a few top-level fields plus nested `weather` fields for each
// airport and print the result. Nested fields are addressed with dotted
// paths; the output column takes the name of the last path segment.
airports.
    select(
        $"IATA",
        $"city",
        $"state",
        $"weather.temp",
        $"weather.visibility",
        $"weather.weather",
        $"weather.meta.updated").
    show()

+----+-----------+-------------+---------------+----------+-------------+--------------+
|IATA|       city|        state|           temp|visibility|      weather|       updated|
+----+-----------+-------------+---------------+----------+-------------+--------------+
| BOS|     Boston|Massachusetts| 44.0 F (6.7 C)|      10.0| A Few Clouds| 1:54 PM Local|
| LAX|Los Angeles|   California|73.0 F (22.8 C)|      10.0|Partly Cloudy|10:53 AM Local|
| JFK|   New York|     New York| 48.0 F (8.9 C)|      10.0|Mostly Cloudy| 1:51 PM Local|
| ORD|    Chicago|     Illinois| 45.0 F (7.2 C)|      10.0| A Few Clouds|12:51 PM Local|
+----+-----------+-------------+---------------+----------+-------------+--------------+



In [73]:
// Read every JSON file matched by the glob into a single DataFrame, then
// print the inferred schema.
val pops = spark.read.json("Documents/jsons/populations/*.json")
pops.printSchema()

root
 |-- total_population: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- population: long (nullable = true)



In [74]:
// Turn each element of the `total_population` array into its own row, then
// promote the element's `date` and `population` fields to top-level columns.
val pops2 = pops.
    select(explode(col("total_population")).alias("pop_array")).
    select($"pop_array.date", $"pop_array.population")


In [75]:
// Print the flattened population rows; 20 is the row limit that the
// no-argument show() applies by default.
pops2.show(20)

+----------+----------+
|      date|population|
+----------+----------+
|2017-11-08|1348088878|
|2017-11-09|1348131714|
|2017-11-08| 130782294|
|2017-11-09| 130786594|
|2017-11-08| 327309303|
|2017-11-09| 327315752|
+----------+----------+

