In [2]:
import pyspark
from pyspark.sql import SparkSession


In [3]:
spark = SparkSession.builder.appName("json-read-write").config("spark.driver.memory","40g").getOrCreate()

In [6]:
#df = spark.read.json("C:/Users/bibhusha.ojha_genese/Desktop/SparkSQL/data.json")

In [7]:
#using DDL type to define schema
df_schema = "id INT , name STRING, age INT , city STRING"

In [8]:
df = spark.read \
.schema(df_schema) \
.json("C:/Users/bibhusha.ojha_genese/Desktop/SparkSQL/data.json")

In [10]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [11]:
#drop unwanted column(s)
dropped_df = df.drop("age")

In [12]:
dropped_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)



In [13]:
#another way to drop
from pyspark.sql.functions import col
dropped_df = df.drop(col("age"))

In [18]:
from pyspark.sql.functions import current_timestamp
#rename columns and add a new column
final_df = dropped_df.withColumnRenamed("id", "serial_no") \
.withColumn("ingestion_date", current_timestamp()) #add new column

In [22]:
final_df.show(5)

+---------+--------------+-------------+--------------------+
|serial_no|          name|         city|      ingestion_date|
+---------+--------------+-------------+--------------------+
|     NULL|          NULL|         NULL|2024-01-24 10:40:...|
|        1|      John Doe|     New York|2024-01-24 10:40:...|
|        2|    Jane Smith|  Los Angeles|2024-01-24 10:40:...|
|        3|Robert Johnson|      Chicago|2024-01-24 10:40:...|
|        4|   Emily Davis|San Francisco|2024-01-24 10:40:...|
+---------+--------------+-------------+--------------------+
only showing top 5 rows



In [21]:
final_df.show(5, False)

+---------+--------------+-------------+-------------------------+
|serial_no|name          |city         |ingestion_date           |
+---------+--------------+-------------+-------------------------+
|NULL     |NULL          |NULL         |2024-01-24 10:40:31.36331|
|1        |John Doe      |New York     |2024-01-24 10:40:31.36331|
|2        |Jane Smith    |Los Angeles  |2024-01-24 10:40:31.36331|
|3        |Robert Johnson|Chicago      |2024-01-24 10:40:31.36331|
|4        |Emily Davis   |San Francisco|2024-01-24 10:40:31.36331|
+---------+--------------+-------------+-------------------------+
only showing top 5 rows



In [56]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, BooleanType, ArrayType

snippet_schema = StructType(fields = [StructField("channelId", StringType(), True),
                                      StructField("title", StringType(),True),
                                      StructField("assignable", BooleanType(), True)
                            
])

items_schema = StructType(fields = [StructField("kind", StringType(), True),
                                    StructField("etag", StringType(), True),
                                    StructField("id", IntegerType(), True),
                                    StructField("snippet", snippet_schema)
                                   ])

df_schema = StructType(fields = [StructField("kind", StringType(), True),
                                 StructField("etag", StringType(), True),
                                 StructField("items", ArrayType(items_schema)),
                                ])

In [63]:
#nested json
df = spark.read.schema(df_schema).json("C:/Users/bibhusha.ojha_genese/Desktop/SparkSQL/youtube-data.json")


In [64]:
df.printSchema() #nested schema json

root
 |-- kind: string (nullable = true)
 |-- etag: string (nullable = true)
 |-- items: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- kind: string (nullable = true)
 |    |    |-- etag: string (nullable = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- snippet: struct (nullable = true)
 |    |    |    |-- channelId: string (nullable = true)
 |    |    |    |-- title: string (nullable = true)
 |    |    |    |-- assignable: boolean (nullable = true)



In [65]:
df.show(5)
#solve?

+----+----+-----+
|kind|etag|items|
+----+----+-----+
|NULL|NULL| NULL|
|NULL|NULL| NULL|
|NULL|NULL| NULL|
|NULL|NULL| NULL|
|NULL|NULL| NULL|
+----+----+-----+
only showing top 5 rows



In [45]:
name_schema = StructType ([ StructField("firstname", StringType(), True),
                           StructField("lastname", StringType(), True),
])

nested_schema = StructType(fields= [ StructField("id", IntegerType(), True),
                                    StructField("name", name_schema),
                                    StructField("dob", StringType(), True),
                                    StructField("nationality", StringType(), True)
])

In [46]:
df = spark.read.schema(nested_schema).json("C:/Users/bibhusha.ojha_genese/Desktop/SparkSQL/nested-json.json")


In [47]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- nationality: string (nullable = true)



In [48]:
df.show(5)

+---+------------------+----------+-----------+
| id|              name|       dob|nationality|
+---+------------------+----------+-----------+
|  1|       {John, Doe}|1990-05-15|   American|
|  2|     {Jane, Smith}|1985-12-10|   Canadian|
|  3|{Michael, Johnson}|1995-08-22|    British|
|  4|    {Emily, Brown}|1988-03-28| Australian|
|  5|  {Robert, Miller}|1992-11-07|     German|
+---+------------------+----------+-----------+
only showing top 5 rows



In [50]:
from pyspark.sql.functions import col, concat, current_timestamp, lit
df_with_column = df.withColumn("name", concat(col("name.firstname"),lit(" "),col("name.lastname")))

In [None]:
#concatenate the names into one

In [51]:
df_with_column.show(5)

+---+---------------+----------+-----------+
| id|           name|       dob|nationality|
+---+---------------+----------+-----------+
|  1|       John Doe|1990-05-15|   American|
|  2|     Jane Smith|1985-12-10|   Canadian|
|  3|Michael Johnson|1995-08-22|    British|
|  4|    Emily Brown|1988-03-28| Australian|
|  5|  Robert Miller|1992-11-07|     German|
+---+---------------+----------+-----------+
only showing top 5 rows

