# PayNet Assessment 
# Data Cleaning & Visualization using PySpark

#### Prepared by: Ainur Afifah

### Steps:
1. Initialize session 
2. Read JSON into dataframe
3. Convert JSON into a tabular format (Flatten the JSON)
4. Separate person_name into first name & last name
5. Convert timestamp
6. Other normalization 
7. Create charts and visualizations (based on this dataset and additional datasets; optional)

In [1]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, split, concat, lit, substring, regexp_extract
from pyspark.sql.functions import date_format, from_unixtime, to_timestamp, from_utc_timestamp
from pyspark.sql.functions import regexp_replace, when, array_contains, size, get_json_object
from pyspark.sql.functions import to_date, year, month, dayofmonth
from pyspark.sql.types import StringType, StructType, StructField
import re
from IPython.display import display



# Spark session used by every step of this cell.
spark = SparkSession.builder.appName("cc-sample-data").getOrCreate()

# Location of the raw transaction export; it is read with the "multiline" option.
json_file_path = "/Users/ainurafifah/Desktop/PROJECTS/portfolio/PayNet/cc_sample_transaction.json"


def _name_part(parts, index):
    """Return the name fragment at *index*, or NULL when it is absent or empty."""
    return when(
        (size(parts) > index) & (parts[index] != ""),
        parts[index],
    ).otherwise(lit(None))


# Only the read is guarded by the "reading" handler, so the message printed
# matches what actually failed. Later stages report under their own message.
try:
    df = spark.read.option("multiline", "true").json(json_file_path)
except Exception as e:
    print(f"Error reading JSON file: {e}")
else:
    try:
        print("Full schema:")
        df.printSchema()

        # --- Timestamps -----------------------------------------------------
        # Render/interpret timestamps in the session time zone (UTC+8).
        spark.conf.set("spark.sql.session.timeZone", "UTC+8")

        # NOTE(review): to_timestamp only parses the string as a wall-clock
        # time in the session time zone; it does not shift the value. If the
        # source is recorded in another zone, an explicit conversion is
        # needed -- confirm against the data specification.
        df = df.withColumn(
            "trans_date_trans_time_utc8",
            to_timestamp(col("trans_date_trans_time")),
        )

        # from_unixtime expects seconds. The divisors presumably reflect
        # microsecond and millisecond epochs respectively -- TODO confirm.
        df = df.withColumn(
            "merch_eff_time_utc8",
            from_unixtime(col("merch_eff_time") / 1000000),
        )
        df = df.withColumn(
            "merch_last_update_time_utc8",
            from_unixtime(col("merch_last_update_time") / 1000),
        )

        # --- BIC breakdown --------------------------------------------------
        # (start, length) of each fixed-position slice of cc_bic (1-based).
        bic_segments = {
            "bank_code": (1, 4),
            "country_code": (5, 2),
            "location_code": (7, 2),
            "branch_code": (9, 3),
        }
        for segment_name, (start, length) in bic_segments.items():
            df = df.withColumn(
                segment_name, substring(col("cc_bic"), start, length)
            )

        # --- Merchant -------------------------------------------------------
        # fraud_flag holds the literal "fraud_" prefix when present, else "".
        df = df.withColumn(
            "fraud_flag", regexp_extract(col("merchant"), "^fraud_", 0)
        )
        # merchant_name is the text after the last underscore.
        df = df.withColumn(
            "merchant_name", regexp_extract(col("merchant"), "[^_]+$", 0)
        )

        # --- Flattening the JSON --------------------------------------------
        # Schemas for the JSON strings embedded in the personal_detail column.
        address_schema = StructType([
            StructField("street", StringType()),
            StructField("city", StringType()),
            StructField("state", StringType()),
            StructField("zip", StringType()),
        ])

        personal_detail_schema = StructType([
            StructField("person_name", StringType()),
            StructField("gender", StringType()),
            # Kept as a string here and parsed separately with address_schema.
            StructField("address", StringType()),
            StructField("lat", StringType()),
            StructField("long", StringType()),
            StructField("city_pop", StringType()),
            StructField("job", StringType()),
            StructField("dob", StringType()),
        ])

        df_parsed = df.withColumn(
            "personal_detail_parsed",
            from_json("personal_detail", personal_detail_schema),
        )

        # Un-escape \" sequences before parsing the nested address JSON.
        df_final = df_parsed.withColumn(
            "address_parsed",
            from_json(
                regexp_replace(
                    col("personal_detail_parsed.address"), r'\\"', '"'
                ),
                address_schema,
            ),
        )

        # --- person_name split ----------------------------------------------
        # Every supported delimiter is first rewritten to "|", then the name
        # is split on "|".
        # NOTE(review): the last entry is two backslash characters, so the
        # escaped pattern matches a doubled backslash -- confirm this is what
        # the raw data contains.
        DELIMITERS = [",", "@", "/", "!", "\\\\"]
        delimiter_pattern = "|".join([re.escape(d) for d in DELIMITERS])

        df_final = df_final.withColumn(
            "normalized_name",
            regexp_replace(
                col("personal_detail_parsed.person_name"),
                delimiter_pattern,
                "|",
            ),
        ).withColumn(
            "name_parts",
            split(col("normalized_name"), "\\|"),
        )

        spark.conf.set("spark.sql.debug.maxToStringFields", "1000")

        name_parts_col = col("name_parts")
        dob_date = to_date(col("personal_detail_parsed.dob"))

        # Project the final flat layout. The helper columns normalized_name
        # and name_parts are simply not selected, so no drop() is required.
        flattened_df = df_final.select(
            "Unnamed: 0",
            "trans_date_trans_time_utc8",
            "cc_num",
            "merchant",
            "fraud_flag",
            "merchant_name",
            "category",
            "amt",
            _name_part(name_parts_col, 0).alias("first"),
            _name_part(name_parts_col, 1).alias("last"),
            col("personal_detail_parsed.gender").alias("gender"),
            col("address_parsed.street").alias("street"),
            col("address_parsed.city").alias("city"),
            col("address_parsed.state").alias("state"),
            col("address_parsed.zip").alias("zip"),
            col("personal_detail_parsed.lat").alias("lat"),
            col("personal_detail_parsed.long").alias("long"),
            col("personal_detail_parsed.city_pop").alias("city_pop"),
            col("personal_detail_parsed.job").alias("job"),
            year(dob_date).alias("birth_year"),
            month(dob_date).alias("birth_month"),
            dayofmonth(dob_date).alias("birth_day"),
            "trans_num",
            "merch_lat",
            "merch_long",
            "is_fraud",
            "merch_zipcode",
            "merch_eff_time_utc8",
            "merch_last_update_time_utc8",
            "cc_bic",
            "bank_code",
            "country_code",
            "location_code",
            "branch_code",
        )

        print("Flattened schema:")
        flattened_df.printSchema()

        print("\nSample flattened data:")
        flattened_df.show(10, truncate=False)
    except Exception as e:
        # Best-effort notebook cell: report the failure instead of raising,
        # but do not mislabel it as a read error.
        print(f"Error transforming transaction data: {e}")



Full schema:
root
 |-- Unnamed: 0: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- category: string (nullable = true)
 |-- cc_bic: string (nullable = true)
 |-- cc_num: string (nullable = true)
 |-- is_fraud: string (nullable = true)
 |-- merch_eff_time: string (nullable = true)
 |-- merch_last_update_time: string (nullable = true)
 |-- merch_lat: string (nullable = true)
 |-- merch_long: string (nullable = true)
 |-- merch_zipcode: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- personal_detail: string (nullable = true)
 |-- trans_date_trans_time: string (nullable = true)
 |-- trans_num: string (nullable = true)

Flattened schema:
root
 |-- Unnamed: 0: string (nullable = true)
 |-- trans_date_trans_time_utc8: timestamp (nullable = true)
 |-- cc_num: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- fraud_flag: string (nullable = true)
 |-- merchant_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt

In [None]:
# Stop the Spark session once the analysis is finished.
spark.stop()