In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import from_json, col

# Location of the sample credit-card transactions, stored as JSON
transactions_path = "/home/agileox/Project/payn_project/data/cc_sample_transaction.json"

# Start (or reuse) the notebook's Spark session and quiet the logs so that
# only ERROR-level messages are printed from here on
spark = (
    SparkSession.builder
    .appName("ReadJSON")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

# Load the file into a DataFrame
df = spark.read.json(transactions_path)

# Peek at the first record without truncation, then list columns and types
df.show(n=1, truncate=False)
df.printSchema()


                                                                                

+----------+----+--------+-----------+----------------+--------+----------------+----------------------+---------+----------+-------------+--------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------+--------------------------------+
|Unnamed: 0|amt |category|cc_bic     |cc_num          |is_fraud|merch_eff_time  |merch_last_update_time|merch_lat|merch_long|merch_zipcode|merchant                  |personal_detail                                                                                                                                                                                                                                                        |trans_date_trans_time|trans_num                       |
+----------+----+--------+--

In [23]:
# Display the first row's personal_detail value in full (no truncation)
raw_personal_detail = df.select(col("personal_detail"))
raw_personal_detail.show(n=1, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|personal_detail                                                                                                                                                                                                                                                        |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|{"person_name":"Jennifer,Banks,eeeee","gender":"F","address":"{\"street\":\"561 Perry Cove\",\"city\":\"Moravian Falls\",\"state\":\"NC\",\"zip\":\"28654\"}","lat":"36.0788","long":"-81.1781","city_pop

In [27]:
# Schema for the JSON string stored in "personal_detail".
# In the sample record, "city" is NOT a top-level key: it lives inside
# "address", whose value is itself a JSON-encoded string. "address" is
# therefore read as a plain string here and parsed in a second pass below.
# NOTE(review): the visible part of the sample record shows no "person_age"
# key; the field is kept so the output columns stay the same, but it will be
# NULL unless the data actually carries it -- confirm against the full record.
personal_schema = StructType([
    StructField("person_name", StringType(), True),
    StructField("person_age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StringType(), True),
])

# Schema for the JSON string held in personal_detail.address; only the city
# is needed for the flattened output.
address_schema = StructType([
    StructField("city", StringType(), True),
])

# First pass: turn the "personal_detail" JSON string into a struct
df_parsed = df.withColumn(
    "personal_detail",
    from_json(col("personal_detail"), personal_schema),
)

# Second pass: parse the JSON string nested in personal_detail.address
address_struct = from_json(col("personal_detail.address"), address_schema)

# Flatten the structure: keep every original column, add the extracted
# person_* columns, then drop the now-redundant struct column
df_final = df_parsed.select(
    "*",
    col("personal_detail.person_name").alias("person_name"),
    col("personal_detail.person_age").alias("person_age"),
    address_struct.getField("city").alias("person_city"),
    col("personal_detail.gender").alias("person_gender"),
).drop("personal_detail")

# Show final results
df_final.show(1, truncate=False)
df_final.printSchema()

+----------+----+--------+-----------+----------------+--------+----------------+----------------------+---------+----------+-------------+--------------------------+---------------------+--------------------------------+--------------------+----------+-----------+-------------+
|Unnamed: 0|amt |category|cc_bic     |cc_num          |is_fraud|merch_eff_time  |merch_last_update_time|merch_lat|merch_long|merch_zipcode|merchant                  |trans_date_trans_time|trans_num                       |person_name         |person_age|person_city|person_gender|
+----------+----+--------+-----------+----------------+--------+----------------+----------------------+---------+----------+-------------+--------------------------+---------------------+--------------------------------+--------------------+----------+-----------+-------------+
|0         |4.97|misc_net|CITIUS33CHI|2703186189652095|0       |1325376018798532|1325376018666         |36.011293|-82.048315|28705        |fraud_Rippin, Kub and

+----------+------+-----------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+-------------------------------+---------------------+--------------------------------+--------------------+----------+-----------+------+
|Unnamed: 0|amt   |category   |cc_bic     |cc_num          |is_fraud|merch_eff_time  |merch_last_update_time|merch_lat         |merch_long |merch_zipcode|merchant                       |trans_date_trans_time|trans_num                       |person_name         |person_age|person_city|gender|
+----------+------+-----------+-----------+----------------+--------+----------------+----------------------+------------------+-----------+-------------+-------------------------------+---------------------+--------------------------------+--------------------+----------+-----------+------+
|0         |4.97  |misc_net   |CITIUS33CHI|2703186189652095|0       |1325376018798532|1325376018666         |36.011293   