In [22]:
# Declaration of the packages to be used
from pyspark.sql import SparkSession, pandas
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import from_json, col

# Obtain the Spark session for this job (an already-running one is reused)
# and restrict Spark's own logging to errors.
spark = (
    SparkSession.builder
    .appName("FlattenNestedJSON")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Load the raw cc_sample transaction records from the JSON file.
df = spark.read.format("json").load(
    "/home/agileox/Project/payn_project/data/cc_sample_transaction.json"
)

# Step 1: Schema for the innermost "address" object.
# The data is nested two levels deep (personal_detail -> address), so the
# deepest level is described first. Every field is a nullable string.
address_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("street", "city", "state", "zip")
    ]
)

# Step 2: Schema for the "personal_detail" object.
# Every field is a nullable string. "address" is deliberately a string too:
# it carries nested JSON text that is decoded separately with address_schema.
personal_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "person_name",
            "gender",
            "address",  # nested JSON as string
            "lat",
            "long",
            "city_pop",
            "job",
            "dob",
        )
    ]
)

# Step 3: Decode the "personal_detail" JSON string in place, replacing the
# raw string column with a struct that follows personal_schema.
df_level1 = df.withColumn(
    colName="personal_detail",
    col=from_json(col("personal_detail"), schema=personal_schema),
)

# Step 4: Decode the JSON string held in personal_detail.address and expose
# the result as a new top-level "address" struct column.
df_level2 = df_level1.withColumn(
    colName="address",
    col=from_json(
        col("personal_detail").getField("address"),
        schema=address_schema,
    ),
)

# Step 5: Flatten everything into a single level of columns.
# Order: the top-level transaction columns as they are, then each
# personal_detail field under its own name, then each address field
# prefixed with "address_".
df_final = df_level2.select(
    "Unnamed: 0",
    "amt",
    "category",
    "cc_bic",
    "cc_num",
    "is_fraud",
    "merch_eff_time",
    "merch_last_update_time",
    "merch_lat",
    "merch_long",
    "merch_zipcode",
    "merchant",
    "trans_date_trans_time",
    "trans_num",
    *[
        col(f"personal_detail.{field_name}").alias(field_name)
        for field_name in (
            "person_name", "gender", "lat", "long", "city_pop", "job", "dob",
        )
    ],
    *[
        col(f"address.{field_name}").alias(f"address_{field_name}")
        for field_name in ("street", "city", "state", "zip")
    ],
)

# Preview the flattened result.
# The two-row sample is converted to pandas once, outside the try block, so
# the NameError handler only reacts to `display` being undefined and never
# to an error raised while Spark computes the sample.
preview = df_final.limit(2).toPandas()
try:
    # Rich table output where a `display` function exists
    # (e.g. Databricks, Jupyter, VS Code notebooks).
    display(preview)
except NameError:
    # Fallback for a standard Python interpreter, which has no `display`.
    print(preview)

# Also print the first five rows as a Spark text table, without truncating values.
df_final.show(5, truncate=False)

                                                                                

Unnamed: 0.1,Unnamed: 0,amt,category,cc_bic,cc_num,is_fraud,merch_eff_time,merch_last_update_time,merch_lat,merch_long,...,gender,lat,long,city_pop,job,dob,address_street,address_city,address_state,address_zip
0,0,4.97,misc_net,CITIUS33CHI,2703186189652095,0,1325376018798532,1325376018666,36.011293,-82.048315,...,F,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,561 Perry Cove,Moravian Falls,NC,28654
1,1,107.23,grocery_pos,ADMDUS41,630423337322,0,1325376044867960,132537604479,49.159047,-118.186462,...,F,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,43039 Riley Greens Suite 393,Orient,WA,99160


+----------+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+----------------------------------+---------------------+--------------------------------+--------------------+------+-------+---------+--------+---------------------------------+----------+----------------------------+--------------+-------------+-----------+
|Unnamed: 0|amt   |category     |cc_bic     |cc_num          |is_fraud|merch_eff_time  |merch_last_update_time|merch_lat         |merch_long |merch_zipcode|merchant                          |trans_date_trans_time|trans_num                       |person_name         |gender|lat    |long     |city_pop|job                              |dob       |address_street              |address_city  |address_state|address_zip|
+----------+------+-------------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+----------