In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the notebook's Spark session: reuse an existing one if present,
# otherwise create a new session registered under this application name.
spark = (
    SparkSession.builder
    .appName("Dsay 6 DataFrame")
    .getOrCreate()
)

## Data Preparation
* a.	Copy the following data into a JSON file and name it employee.JSON. <br>
* b.	Create a DataFrame from employee.JSON using PySpark.

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
# Explicit schema for the employee records: integer id, string name and
# integer age, every column nullable. Passing it to the reader avoids
# relying on type inference.
schema = StructType(
    [
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("id", IntegerType()),
            ("name", StringType()),
            ("age", IntegerType()),
        )
    ]
)

In [4]:
import os

# Location of the employee JSON file. The original local path stays the
# default; set the EMPLOYEE_JSON_PATH environment variable to point the
# notebook at a copy elsewhere without editing this cell.
json_file_path = os.environ.get(
    "EMPLOYEE_JSON_PATH",
    "C:/Users/Lenovo/Documents/employee.JSON",
)

In [5]:
# Load the employee file into a DataFrame using the explicit schema.
# multiLine=False is passed through unchanged (the reader then expects
# one JSON record per line of the file).
df = spark.read.json(
    path=json_file_path,
    schema=schema,
    multiLine=False,
)

In [6]:
# Print the DataFrame's columns with their types and nullability as a tree,
# to confirm the custom schema was applied.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)



### `(1) Query all the data.`

In [7]:
# Display the loaded rows as a text table; duplicate records from the
# file are still present at this stage.
df.show()

+---+-----+----+
| id| name| age|
+---+-----+----+
|  1| Ella|  36|
|  2|  Bob|  29|
|  3| Jack|  29|
|  4|  Jim|  28|
|  4|  Jim|  28|
|  5|Damon|null|
|  5|Damon|null|
+---+-----+----+



### `(2) Query all the data while removing the duplicates.`

In [8]:
from pyspark.sql.functions import col
# Keep a single copy of each (id, name, age) row and order the result by
# id ascending; later cells reuse this de-duplicated frame.
df_unique = (
    df.select("id", "name", "age")
    .distinct()
    .orderBy("id")
)
df_unique.show()

+---+-----+----+
| id| name| age|
+---+-----+----+
|  1| Ella|  36|
|  2|  Bob|  29|
|  3| Jack|  29|
|  4|  Jim|  28|
|  5|Damon|null|
+---+-----+----+



### `(3) Query all the data without printing ID field.`

In [9]:
from pyspark.sql.functions import col
# Show every de-duplicated record without the id column; dropping id
# leaves the remaining columns (name, age) in their original order.
df_unique.drop("id").show()

+-----+----+
| name| age|
+-----+----+
| Ella|  36|
|  Bob|  29|
| Jack|  29|
|  Jim|  28|
|Damon|null|
+-----+----+



### `(4) The records with age > 30.`

In [10]:
from pyspark.sql.functions import col, expr
# Keep only the employees whose age is greater than 30. Rows with a null
# age do not satisfy the comparison and are therefore excluded.
(
    df_unique.select("id", "name", "age")
    .filter(col("age") > 30)
    .show()
)

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Ella| 36|
+---+----+---+



### `(5) Group data by age.`

In [11]:
from pyspark.sql.functions import count
# Group the de-duplicated employees by age and pivot on name, so each row
# is an age and each column counts how many employees with that name have
# that age.
#
# The group whose age is null is dropped explicitly BEFORE the null counts
# are filled with 0. Filling first would turn the null age into 0 and force
# a filter on "age != 0", which would also discard a genuine age-0 group.
(
    df_unique.groupBy(col("age").alias("Age"))
    .pivot("name")
    .agg(count("id").alias("Counts"))
    .where(col("Age").isNotNull())
    .fillna(0)  # empty pivot cells -> 0 employees
    .orderBy(col("Age").asc())
    .show()
)

+---+---+-----+----+----+---+
|Age|Bob|Damon|Ella|Jack|Jim|
+---+---+-----+----+----+---+
| 28|  0|    0|   0|   0|  1|
| 29|  1|    0|   0|   1|  0|
| 36|  0|    0|   1|   0|  0|
+---+---+-----+----+----+---+



### `(6) Arrange the data in ascending order of name.`

In [12]:
# List the de-duplicated employees sorted alphabetically by name (A -> Z).
(
    df_unique.select("id", "name", "age")
    .sort("name", ascending=True)
    .show()
)

+---+-----+----+
| id| name| age|
+---+-----+----+
|  2|  Bob|  29|
|  5|Damon|null|
|  1| Ella|  36|
|  3| Jack|  29|
|  4|  Jim|  28|
+---+-----+----+



### `(7) Take out the first three lines of data.`

In [16]:
# Print only the first three rows of the de-duplicated frame, which was
# ordered by id when it was built.
df_unique.select("id", "name", "age").show(n=3)

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Ella| 36|
|  2| Bob| 29|
|  3|Jack| 29|
+---+----+---+
only showing top 3 rows



### `(8) Find the average value of age.`

In [14]:
from pyspark.sql.functions import avg
# Average age across the de-duplicated employees; the record with a null
# age does not contribute to the mean.
df_unique.agg(avg("age")).show()

+--------+
|avg(age)|
+--------+
|    30.5|
+--------+



### `(9) Query the minimum value of age.`

In [15]:
from pyspark.sql import functions as F

# Smallest age among the de-duplicated employees, shown under the label
# "Minimum age". The Spark aggregate is reached through the F namespace so
# that Python's builtin min() is not shadowed for the rest of the kernel.
(
    df_unique.agg(F.min("age").alias("Minimum age"))
    .show()
)

+-----------+
|Minimum age|
+-----------+
|         28|
+-----------+

