In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName("NESTED ADDRESS JSON") \
.getOrCreate()

In [8]:
raw_profiles = [
("U001","Hyderabad,Telangana,500081"),
("U002",{"city":"Delhi","state":"Delhi","pincode":"110001"}),
("U003",("Bangalore","Karnataka",560001)),
("U004","Mumbai,MH"),
("U005",None)
]

In [9]:
from pyspark.sql.types import StructType,StructField,StringType,MapType,ArrayType, IntegerType

In [10]:
profile_schema = StructType([
    StructField("user_id",StringType(),True),
    StructField("address_raw",MapType(StringType(),StringType()),True)
])

In [14]:
cleaned_raw_profiles = [(u, {"full_address":a}) if isinstance(a, str) else (u, {"city":a[0],"state":a[1],"pincode":str(a[2])}) if isinstance(a, tuple) and len(a)==3 else (u, {k:str(v) for k,v in a.items()}) if isinstance(a, dict) else (u,a) for u,a in raw_profiles]
df_profile_raw = spark.createDataFrame(cleaned_raw_profiles, schema=profile_schema)

1. Design a nested StructType for address

In [15]:
address_struct = StructType([
    StructField("city",StringType(),True),
    StructField("state",StringType(),True),
    StructField("pincode",IntegerType(),True)
])

2. Normalize all address formats into struct

In [21]:
df_profile_raw_normalized.show(truncate=False)
df_profile_raw_normalized.printSchema()

+-------+----------------------------------------------------------+------------------------------+
|user_id|address_raw                                               |address_structured            |
+-------+----------------------------------------------------------+------------------------------+
|U001   |{full_address -> Hyderabad,Telangana,500081}              |{Hyderabad, Telangana, 500081}|
|U002   |{pincode -> 110001, state -> Delhi, city -> Delhi}        |{Delhi, Delhi, 110001}        |
|U003   |{pincode -> 560001, state -> Karnataka, city -> Bangalore}|{Bangalore, Karnataka, 560001}|
|U004   |{full_address -> Mumbai,MH}                               |{Mumbai, MH, NULL}            |
|U005   |NULL                                                      |NULL                          |
+-------+----------------------------------------------------------+------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- address_raw: map (nullable = true)
 |    |-- key: 

3. Extract city, state, pincode safely

In [22]:
df_extracted_address = df_profile_raw_normalized.select(
    col("user_id"),
    col("address_structured.city").alias("city"),
    col("address_structured.state").alias("state"),
    col("address_structured.pincode").alias("pincode")
)

df_extracted_address.show(truncate=False)
df_extracted_address.printSchema()

+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |NULL   |
|U005   |NULL     |NULL     |NULL   |
+-------+---------+---------+-------+

root
 |-- user_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- pincode: integer (nullable = true)



4. Set default pincode when missing

In [23]:
df_profile_with_default_pincode = df_profile_raw_normalized.withColumn(
    "address_structured",
    F.when(
        F.col("address_structured.pincode").isNull(),
        F.struct(
            F.col("address_structured.city").alias("city"),
            F.col("address_structured.state").alias("state"),
            F.lit(999999).cast(IntegerType()).alias("pincode") # Set default pincode
        )
    ).otherwise(F.col("address_structured"))
)

df_profile_with_default_pincode.show(truncate=False)
df_profile_with_default_pincode.printSchema()

+-------+----------------------------------------------------------+------------------------------+
|user_id|address_raw                                               |address_structured            |
+-------+----------------------------------------------------------+------------------------------+
|U001   |{full_address -> Hyderabad,Telangana,500081}              |{Hyderabad, Telangana, 500081}|
|U002   |{pincode -> 110001, state -> Delhi, city -> Delhi}        |{Delhi, Delhi, 110001}        |
|U003   |{pincode -> 560001, state -> Karnataka, city -> Bangalore}|{Bangalore, Karnataka, 560001}|
|U004   |{full_address -> Mumbai,MH}                               |{Mumbai, MH, 999999}          |
|U005   |NULL                                                      |{NULL, NULL, 999999}          |
+-------+----------------------------------------------------------+------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- address_raw: map (nullable = true)
 |    |-- key: 

5. Drop irrecoverable records

In [24]:
df_cleaned_profiles = df_profile_with_default_pincode.filter(F.col("address_structured").isNotNull())

df_cleaned_profiles.show(truncate=False)
df_cleaned_profiles.printSchema()

+-------+----------------------------------------------------------+------------------------------+
|user_id|address_raw                                               |address_structured            |
+-------+----------------------------------------------------------+------------------------------+
|U001   |{full_address -> Hyderabad,Telangana,500081}              |{Hyderabad, Telangana, 500081}|
|U002   |{pincode -> 110001, state -> Delhi, city -> Delhi}        |{Delhi, Delhi, 110001}        |
|U003   |{pincode -> 560001, state -> Karnataka, city -> Bangalore}|{Bangalore, Karnataka, 560001}|
|U004   |{full_address -> Mumbai,MH}                               |{Mumbai, MH, 999999}          |
|U005   |NULL                                                      |{NULL, NULL, 999999}          |
+-------+----------------------------------------------------------+------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- address_raw: map (nullable = true)
 |    |-- key: 

6. Flatten the struct into columns

In [25]:
df_flattened_profiles = df_cleaned_profiles.select(
    col("user_id"),
    col("address_structured.city").alias("city"),
    col("address_structured.state").alias("state"),
    col("address_structured.pincode").alias("pincode")
)
df_flattened_profiles.show(truncate=False)
df_flattened_profiles.printSchema()

+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |999999 |
|U005   |NULL     |NULL     |999999 |
+-------+---------+---------+-------+

root
 |-- user_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- pincode: integer (nullable = true)

