<a href="https://colab.research.google.com/github/bsrikanth24/Best-websites-a-programmer-should-visit/blob/master/Cracking_PySpark_JSON_Handling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. from_json()

  This function parses a JSON string column into a PySpark StructType or other complex data types. It requires a schema to be specified.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StringType, IntegerType

# Create (or reuse) the Spark session that backs this example
spark = (
    SparkSession.builder
    .appName("FromJSONExample")
    .getOrCreate()
)

# One-row DataFrame whose only column, json_data, holds a raw JSON string
data = [("""{"name": "Srikanth", "age": 31}""",)]
df = spark.createDataFrame(data, ["json_data"])

# Schema handed to from_json(); add() appends a field to the struct in place,
# and both fields are declared nullable
schema = StructType()
schema.add("name", StringType(), True)
schema.add("age", IntegerType(), True)

# Parse json_data with the schema and attach the result as a struct column
parsed_column = from_json(col("json_data"), schema)
df_parsed = df.withColumn("parsed_json", parsed_column)

# Expand the struct's fields into top-level columns and print them
df_parsed.select("parsed_json.*").show()

+--------+---+
|    name|age|
+--------+---+
|Srikanth| 31|
+--------+---+



2. **to_json()**

    This function converts columns in a DataFrame into a JSON string. It works well for nested structures and complex data types.

In [None]:
from pyspark.sql.functions import to_json, struct

# One structured row with plain "name" and "age" columns
data = [("Srikanth", 31)]
df = spark.createDataFrame(data, ["name", "age"])

# Bundle both columns into a struct, then serialize that struct as a JSON
# string stored in a new "json_data" column
df_json = df.withColumn(
    "json_data",
    to_json(struct("name", "age")),
)

# Print without truncation so the whole JSON string is visible
df_json.show(truncate=False)

+--------+---+----------------------------+
|name    |age|json_data                   |
+--------+---+----------------------------+
|Srikanth|31 |{"name":"Srikanth","age":31}|
+--------+---+----------------------------+



3. json_tuple()
      
  This function extracts multiple values from a JSON string based on the provided field names and creates new columns. It is useful for quickly extracting values from flat JSON strings.

In [None]:
from pyspark.sql.functions import json_tuple

# One row whose json_data column holds a flat JSON object
data = [("""{"name": "Srikanth", "age": 31, "city": "Hyderabad"}""",)]
df = spark.createDataFrame(data, ["json_data"])

# json_tuple produces one output column per requested field name;
# the multi-name alias() labels those columns in the same order
extracted_fields = json_tuple(col("json_data"), "name", "age", "city")
df_extracted = df.select(extracted_fields.alias("name", "age", "city"))

# Print the extracted columns
df_extracted.show()

+--------+---+---------+
|    name|age|     city|
+--------+---+---------+
|Srikanth| 31|Hyderabad|
+--------+---+---------+



4. get_json_object()

  This function extracts a specific value from a JSON string based on a JSON path expression.

In [None]:
from pyspark.sql.functions import get_json_object

# One row whose json_data column holds a flat JSON object
data = [("""{"name": "Srikanth", "age": 31, "city": "Hyderabad"}""",)]
df = spark.createDataFrame(data, ["json_data"])

# Keep the original column and add one new column per JSON path looked up
json_col = col("json_data")
df_extracted = (
    df
    .withColumn("name", get_json_object(json_col, "$.name"))
    .withColumn("city", get_json_object(json_col, "$.city"))
)

# Print without truncation so the full JSON string stays readable
df_extracted.show(truncate=False)

+----------------------------------------------------+--------+---------+
|json_data                                           |name    |city     |
+----------------------------------------------------+--------+---------+
|{"name": "Srikanth", "age": 31, "city": "Hyderabad"}|Srikanth|Hyderabad|
+----------------------------------------------------+--------+---------+



5. schema_of_json()

 This function infers the schema of a JSON string. It is useful when you don't know the schema beforehand and want to extract it dynamically.

In [None]:
from pyspark.sql.functions import schema_of_json

# JSON document whose schema should be inferred
json_string = '{"name": "Srikanth", "age": 31}'

# schema_of_json is a column expression, so evaluate it against a one-row
# helper DataFrame and bring the single resulting row back to the driver
schema_row = (
    spark.range(1)
    .select(schema_of_json(json_string).alias("schema"))
    .collect()[0]
)
schema = schema_row[0]

# Display the inferred schema
print(schema)

STRUCT<age: BIGINT, name: STRING>


Question 1:

In PySpark, handling nested JSON data involves working with complex data types such as `ArrayType`, `MapType`, and `StructType`. Here's an example of how to process a nested JSON structure using `StructType` and `ArrayType` (this particular data does not need `MapType`). We will:

1. Define a schema with `StructType` and `ArrayType`.
2. Read the JSON data.
3. Access and manipulate the nested fields.

**Step 1: Create a Spark session**

In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

# All of the PySpark code in the following steps runs against this session
spark = (
    SparkSession.builder
    .appName("JSONProcessing")
    .getOrCreate()
)

Step 2: Define sample data

In [37]:
# Sample input: a single row whose only value is a nested JSON document
# (a nested "info" object, an "addresses" array and a nested "metadata" object).
#
# Alternative sample with two address entries, kept disabled for reference:
# data = [(
#     '''{
#         "user_id": "U123",
#         "info": {
#             "name": "Srikanth",
#             "age": 31,
#             "preferences": {
#                 "colors": ["blue", "green"],
#                 "hobbies": ["reading", "traveling"]
#             }
#         },
#         "addresses": [
#             {
#                 "type": "home",
#                 "city": "Hyderabad",
#                 "postalCode": "500048"
#             },
#             {
#                 "type": "office",
#                 "city": "Bangalore",
#                 "postalCode": "560001"
#             }
#         ],
#         "metadata": {
#             "likes": 100,
#             "social": {
#                 "twitter": "@SmartInvGuide",
#                 "linkedin": "http://www.linkedin.com/in/srikanthb24/ "
#             }
#         }
#     }'''
# ,)]

# Raw JSON text for the one sample user
user_json = '''{
    "user_id": "12345",
    "info": {
        "name": "Srikanth",
        "age": 30,
        "preferences": {
            "colors": ["red", "blue"],
            "hobbies": ["reading", "gaming"]
        }
    },
    "addresses": [
        {
            "type": "home",
            "city": "Hyderabad",
            "postalCode": "500048"
        }
    ],
    "metadata": {
        "likes": 100,
        "social": {
            "twitter": "@srikanth9",
            "linkedin": "srikanth24"
        }
    }
}'''

# One row, one column: each row is a 1-tuple holding the JSON string
data = [(user_json,)]

Step 3: Create a DataFrame with the JSON string data

In [38]:
# Wrap the raw JSON text in a DataFrame with a single column named "json_data"
df = spark.createDataFrame(data, schema=["json_data"])

Step 4: Parse the JSON string

In [39]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Describe the JSON document bottom-up: inner structs first, then the
# top-level schema that ties them together. Every field is nullable.
preferences_type = StructType([
    StructField("colors", ArrayType(StringType()), True),
    StructField("hobbies", ArrayType(StringType()), True),
])

info_type = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("preferences", preferences_type, True),
])

address_type = StructType([
    StructField("type", StringType(), True),
    StructField("city", StringType(), True),
    StructField("postalCode", StringType(), True),
])

social_type = StructType([
    StructField("twitter", StringType(), True),
    StructField("linkedin", StringType(), True),
])

metadata_type = StructType([
    StructField("likes", IntegerType(), True),
    StructField("social", social_type, True),
])

schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("info", info_type, True),
    StructField("addresses", ArrayType(address_type), True),
    StructField("metadata", metadata_type, True),
])

# Expose the JSON text as a string column named "json", then parse it into a
# single struct column named "data"
raw_json_df = df.selectExpr("CAST(json_data AS STRING) as json")
json_df = raw_json_df.select(from_json("json", schema).alias("data"))

Step 5: Flatten the DataFrame

In [40]:
# Lift nested values to top-level columns using dotted paths into "data".
# The addresses array is exploded so each address entry becomes its own row,
# carried in a struct column named "address".
# NOTE: explode() emits no row when the array is null or empty, so a record
# without addresses would not appear in the output.
flatten_columns = [
    col("data.user_id"),
    col("data.info.name"),
    col("data.info.age"),
    col("data.info.preferences.colors"),              # array of preferred colors
    col("data.info.preferences.hobbies"),             # array of hobbies
    explode(col("data.addresses")).alias("address"),  # one row per address
    col("data.metadata.likes"),
    col("data.metadata.social.twitter"),
    col("data.metadata.social.linkedin"),
]
flattened_df = json_df.select(*flatten_columns)

Step 6: Select and rename relevant fields

In [41]:
# Break the exploded "address" struct into individually named columns
address_fields = [
    col("address.type").alias("address_type"),  # e.g. home / office
    col("address.city").alias("address_city"),
    col("address.postalCode").alias("address_postal_code"),
]

# Final column order: user details, then the address columns, then metadata
result_df = flattened_df.select(
    "user_id",
    "name",
    "age",
    "colors",
    "hobbies",
    *address_fields,
    "likes",
    "twitter",
    "linkedin",
)

Step 7: Display the DataFrame

In [42]:
# Print the flattened DataFrame with every selected field.
# truncate=False keeps long cell values from being cut off in the output.
result_df.show(truncate=False)

+-------+--------+---+-----------+-----------------+------------+------------+-------------------+-----+----------+----------+
|user_id|name    |age|colors     |hobbies          |address_type|address_city|address_postal_code|likes|twitter   |linkedin  |
+-------+--------+---+-----------+-----------------+------------+------------+-------------------+-----+----------+----------+
|12345  |Srikanth|30 |[red, blue]|[reading, gaming]|home        |Hyderabad   |500048             |100  |@srikanth9|srikanth24|
+-------+--------+---+-----------+-----------------+------------+------------+-------------------+-----+----------+----------+



**Read multi-line JSON**

To read a multi-line JSON file in PySpark (one where a single JSON record spans several lines), set the `multiLine` option to `true` when reading the file.

In [43]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session for this example
spark = (
    SparkSession.builder
    .appName("ReadMultilineJSON")
    .getOrCreate()
)

# Placeholder location: point this at a real JSON file before running,
# otherwise the read fails because the path does not exist
json_file_path = "/path/multiline_file.json"

# Turn on multi-line parsing on the reader, then load the file
multiline_reader = spark.read.option("multiline", "true")
df = multiline_reader.json(json_file_path)

# Print the loaded rows without truncating long values
df.show(truncate=False)

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/path/multiline_file.json.