On this website you can validate the structure of a JSON file: http://jsonlint.com/

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse, if one already exists) a Spark session running in local
# mode under the application name 'appSparkSql'.
spSession = (
    SparkSession.builder
    .master('local')
    .appName('appSparkSql')
    .getOrCreate()
)

In [3]:
# Load the employees dataset from its JSON file into a DataFrame; the schema
# is inferred by Spark from the file contents.
dfEmployees = spSession.read.format('json').load('aux/datasets/employees.json')

In [4]:
# Display the first rows of the DataFrame as a text table
# (20 rows, long cell values truncated -- the defaults of `show`).
dfEmployees.show(n=20, truncate=True)

+---+------+------+----------------+------+
|age|gender|id_dep|            name|salary|
+---+------+------+----------------+------+
| 42|     M|     1|  Gilmar Rezende|  5100|
| 50|     M|     2|  Matias Tavares|  8500|
| 36|     M|     1|   Paulo Miranda|  9700|
| 41|     F|     1|Ana Paula Soares|  9500|
| 34|     F|     2|   Carolina Maia|  6500|
+---+------+------+----------------+------+



In [5]:
# Print the DataFrame's schema as a tree: one entry per column with its
# data type and nullability.
dfEmployees.printSchema()

root
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id_dep: string (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)



In [6]:
# Project only the `name` column and display it.
dfEmployees.select(dfEmployees['name']).show()

+----------------+
|            name|
+----------------+
|  Gilmar Rezende|
|  Matias Tavares|
|   Paulo Miranda|
|Ana Paula Soares|
|   Carolina Maia|
+----------------+



In [7]:
# Employees older than 40, oldest first.
#
# `age` is a string column in this DataFrame, so it is cast to int explicitly:
# sorting the raw strings would be lexicographic (e.g. '9' > '50').
# The former `.orderBy(age)` call was dropped because the following
# `.sort(age.desc())` replaced its ordering entirely.
(
    dfEmployees
    .filter(dfEmployees['age'].cast('int') > 40)
    .sort(dfEmployees['age'].cast('int').desc())
    .show()
)

+---+------+------+----------------+------+
|age|gender|id_dep|            name|salary|
+---+------+------+----------------+------+
| 50|     M|     2|  Matias Tavares|  8500|
| 42|     M|     1|  Gilmar Rezende|  5100|
| 41|     F|     1|Ana Paula Soares|  9500|
+---+------+------+----------------+------+



In [8]:
# Number of employees per gender.
(
    dfEmployees
    .groupBy('gender')
    .count()
    .show()
)

+------+-----+
|gender|count|
+------+-----+
|     F|    2|
|     M|    3|
+------+-----+



In [9]:
# Average salary and maximum age per department.
#
# `age` is a string column in this DataFrame, so it is cast to int before
# aggregating: `max` over strings returns the lexicographic maximum, which is
# not the numeric maximum once ages differ in digit count. The cast column
# keeps the name `age`, so the result columns are still `avg(salary)` and
# `max(age)`.
(
    dfEmployees
    .withColumn('age', dfEmployees['age'].cast('int'))
    .groupBy('id_dep')
    .agg({'salary': 'avg', 'age': 'max'})
    .show()
)

+------+-----------+--------+
|id_dep|avg(salary)|max(age)|
+------+-----------+--------+
|     1|     8100.0|      42|
|     2|     7500.0|      50|
+------+-----------+--------+



In [10]:
# Expose the DataFrame to Spark SQL under the name `tt_employees`.
# `createOrReplaceTempView` replaces `registerTempTable`, which has been
# deprecated since Spark 2.0; re-running this cell simply replaces the view.
dfEmployees.createOrReplaceTempView('tt_employees')

In [11]:
# Same per-department aggregation expressed in Spark SQL against the
# `tt_employees` temporary view.
#
# `age` is a string column, so it is cast to INT inside MAX(); otherwise the
# maximum would be taken lexicographically rather than numerically.
spSession.sql("""
    SELECT
        id_dep,
        MAX(CAST(age AS INT)) AS MAX_AGE,
        AVG(salary) AS SALARY_AVG
    FROM tt_employees
    GROUP BY id_dep
""").show()

+------+-------+----------+
|id_dep|MAX_AGE|SALARY_AVG|
+------+-------+----------+
|     1|     42|    8100.0|
|     2|     50|    7500.0|
+------+-------+----------+

