In [0]:
# Loading all raw JSON files from my Blob Storage (Bronze layer)
# Path where JSON survey feedback files land
input_path = "dbfs:/mnt/raw-feedback/"

df_raw = spark.read.json(input_path, multiLine=True)
# Display raw structure to verify data landed correctly
display(df_raw)


In [0]:
from pyspark.sql import SparkSession

#
spark = SparkSession.builder \
    .appName("Local_Bronze_JSON_Test") \
    .master("local[*]") \
    .getOrCreate()

# sample raw JSON records 
raw_data = [
    {
        "practitioner_id": "pract-101",
        "session_id": "sess-20250101-01",
        "timestamp": "2025-01-01T08:00:00Z",
        "likert_scores": {"safety": 5, "connectivity": 4, "clarity": 5},
        "emotion_before": "stressed",
        "emotion_after": "calm",
        "comment": "Loved the stretching part.",
        "client_anonymous_id": "anon-001"
    },
    {
        "practitioner_id": "pract-101",
        "session_id": "sess-20250102-01",
        "timestamp": "2025-01-02T08:05:00Z",
        "likert_scores": {"safety": 4, "connectivity": 5, "clarity": 4},
        "emotion_before": "anxious",
        "emotion_after": "grounded",
        "comment": "Felt supported and safe.",
        "client_anonymous_id": "anon-002"
    },
    {
        "practitioner_id": "pract-102",
        "session_id": "sess-20250103-01",
        "timestamp": "2025-01-03T18:30:00Z",
        "likert_scores": {"safety": 5, "connectivity": 5, "clarity": 5},
        "emotion_before": "tired",
        "emotion_after": "energized",
        "comment": "Great pacing and clear cues.",
        "client_anonymous_id": "anon-003"
    },
    {
        "practitioner_id": "pract-102",
        "session_id": "sess-20250104-02",
        "timestamp": "2025-01-04T19:00:00Z",
        "likert_scores": {"safety": 3, "connectivity": 4, "clarity": 3},
        "emotion_before": "overwhelmed",
        "emotion_after": "calm",
        "comment": "Music helped me relax.",
        "client_anonymous_id": "anon-004"
    },
    {
        "practitioner_id": "pract-103",
        "session_id": "sess-20250105-01",
        "timestamp": "2025-01-05T07:45:00Z",
        "likert_scores": {"safety": 4, "connectivity": 3, "clarity": 4},
        "emotion_before": "sad",
        "emotion_after": "hopeful",
        "comment": "Needed more time for cool down.",
        "client_anonymous_id": "anon-005"
    },
    {
        "practitioner_id": "pract-101",
        "session_id": "sess-20250106-01",
        "timestamp": "2025-01-06T08:10:00Z",
        "likert_scores": {"safety": 5, "connectivity": 4, "clarity": 4},
        "emotion_before": "nervous",
        "emotion_after": "confident",
        "comment": "Clear instructions and good options.",
        "client_anonymous_id": "anon-006"
    },
    {
        "practitioner_id": "pract-104",
        "session_id": "sess-20250107-01",
        "timestamp": "2025-01-07T20:15:00Z",
        "likert_scores": {"safety": 2, "connectivity": 3, "clarity": 2},
        "emotion_before": "frustrated",
        "emotion_after": "okay",
        "comment": "Room was crowded; hard to follow.",
        "client_anonymous_id": "anon-007"
    },
    {
        "practitioner_id": "pract-104",
        "session_id": "sess-20250108-01",
        "timestamp": "2025-01-08T20:10:00Z",
        "likert_scores": {"safety": 4, "connectivity": 4, "clarity": 5},
        "emotion_before": "low",
        "emotion_after": "uplifted",
        "comment": "Much better today—loved the flow.",
        "client_anonymous_id": "anon-008"
    },
    {
        "practitioner_id": "pract-103",
        "session_id": "sess-20250109-02",
        "timestamp": "2025-01-09T07:55:00Z",
        "likert_scores": {"safety": 5, "connectivity": 5, "clarity": 4},
        "emotion_before": "stressed",
        "emotion_after": "peaceful",
        "comment": "Felt seen and supported.",
        "client_anonymous_id": "anon-009"
    },
    {
        "practitioner_id": "pract-102",
        "session_id": "sess-20250110-01",
        "timestamp": "2025-01-10T18:40:00Z",
        "likert_scores": {"safety": 4, "connectivity": 5, "clarity": 5},
        "emotion_before": "anxious",
        "emotion_after": "calm",
        "comment": "Breathwork was the best part.",
        "client_anonymous_id": "anon-010"
    }
]

# 3) Create Bronze DataFrame (local)
df_raw = spark.createDataFrame(raw_data)

# 4) Verify
df_raw.show(truncate=False)
df_raw.printSchema()
print("Total records:", df_raw.count())


+-------------------+------------------------------------+-------------+--------------+----------------------------------------------+---------------+----------------+--------------------+
|client_anonymous_id|comment                             |emotion_after|emotion_before|likert_scores                                 |practitioner_id|session_id      |timestamp           |
+-------------------+------------------------------------+-------------+--------------+----------------------------------------------+---------------+----------------+--------------------+
|anon-001           |Loved the stretching part.          |calm         |stressed      |{clarity -> 5, connectivity -> 4, safety -> 5}|pract-101      |sess-20250101-01|2025-01-01T08:00:00Z|
|anon-002           |Felt supported and safe.            |grounded     |anxious       |{clarity -> 4, connectivity -> 5, safety -> 4}|pract-101      |sess-20250102-01|2025-01-02T08:05:00Z|
|anon-003           |Great pacing and clear cues.      

In [0]:
from pyspark.sql.functions import explode, col
# Each survey record has multiple scores (Safety, Connectivity, Clarity)
# explode() converts each score into a separate row 
df_flat = df_raw.select(
    col("practitioner_id"),
    col("session_id"),
    col("timestamp"),
    explode("likert_scores").alias("Question", "LikertScore"),
    col("emotion_before"),
    col("emotion_after"),
    col("comment")
)

display(df_flat)


practitioner_id,session_id,timestamp,Question,LikertScore,emotion_before,emotion_after,comment
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,clarity,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,connectivity,4,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,safety,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,clarity,4,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,connectivity,5,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,safety,4,anxious,grounded,Felt supported and safe.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,clarity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,connectivity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,safety,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250104-02,2025-01-04T19:00:00Z,clarity,3,overwhelmed,calm,Music helped me relax.


In [0]:
from pyspark.sql.functions import regexp_replace
# Remove emojis
# Filter out rows where Likert score is missing (null)
df_clean = df_flat.filter(col("LikertScore").isNotNull()) \
    .withColumn("Comment_Cleaned", regexp_replace("comment", "[^\x00-\x7F]+", "")) \
    .drop("comment")

display(df_clean)


practitioner_id,session_id,timestamp,Question,LikertScore,emotion_before,emotion_after,Comment_Cleaned
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,clarity,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,connectivity,4,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,safety,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,clarity,4,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,connectivity,5,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,safety,4,anxious,grounded,Felt supported and safe.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,clarity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,connectivity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,safety,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250104-02,2025-01-04T19:00:00Z,clarity,3,overwhelmed,calm,Music helped me relax.


In [0]:
from pyspark.sql.types import IntegerType
# Some Likert values may come as strings
df_normalized = df_clean.withColumn("LikertScore", col("LikertScore").cast(IntegerType()))
display(df_normalized)


practitioner_id,session_id,timestamp,Question,LikertScore,emotion_before,emotion_after,Comment_Cleaned
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,clarity,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,connectivity,4,stressed,calm,Loved the stretching part.
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,safety,5,stressed,calm,Loved the stretching part.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,clarity,4,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,connectivity,5,anxious,grounded,Felt supported and safe.
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,safety,4,anxious,grounded,Felt supported and safe.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,clarity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,connectivity,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,safety,5,tired,energized,Great pacing and clear cues.
pract-102,sess-20250104-02,2025-01-04T19:00:00Z,clarity,3,overwhelmed,calm,Music helped me relax.


In [0]:
from pyspark.sql.functions import when
# Convert text emotions into numeric score 
# Then calculate emotional improvement after the session
df_with_delta = df_normalized \
    .withColumn("EmotionBeforeScore",
                when(col("emotion_before") == "sad", 1)
                .when(col("emotion_before") == "stressed", 2)
                .when(col("emotion_before").isin("okay", "neutral"), 3)
                .when(col("emotion_before") == "calm", 4)
                .when(col("emotion_before") == "happy", 5)) \
    .withColumn("EmotionAfterScore",
                when(col("emotion_after") == "sad", 1)
                .when(col("emotion_after") == "stressed", 2)
                .when(col("emotion_after").isin("okay", "neutral"), 3)
                .when(col("emotion_after") == "calm", 4)
                .when(col("emotion_after") == "happy", 5)) \
    .withColumn("EmotionalDelta",
                col("EmotionAfterScore") - col("EmotionBeforeScore"))

display(df_with_delta)


practitioner_id,session_id,timestamp,Question,LikertScore,emotion_before,emotion_after,Comment_Cleaned,EmotionBeforeScore,EmotionAfterScore,EmotionalDelta
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,clarity,5,stressed,calm,Loved the stretching part.,2.0,4.0,2.0
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,connectivity,4,stressed,calm,Loved the stretching part.,2.0,4.0,2.0
pract-101,sess-20250101-01,2025-01-01T08:00:00Z,safety,5,stressed,calm,Loved the stretching part.,2.0,4.0,2.0
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,clarity,4,anxious,grounded,Felt supported and safe.,,,
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,connectivity,5,anxious,grounded,Felt supported and safe.,,,
pract-101,sess-20250102-01,2025-01-02T08:05:00Z,safety,4,anxious,grounded,Felt supported and safe.,,,
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,clarity,5,tired,energized,Great pacing and clear cues.,,,
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,connectivity,5,tired,energized,Great pacing and clear cues.,,,
pract-102,sess-20250103-01,2025-01-03T18:30:00Z,safety,5,tired,energized,Great pacing and clear cues.,,,
pract-102,sess-20250104-02,2025-01-04T19:00:00Z,clarity,3,overwhelmed,calm,Music helped me relax.,,4.0,


In [0]:
# Save cleaned and enhanced dataset to Silver zone in parquet format
output_path = "dbfs:/mnt/silver-feedback/"

df_with_delta.write.mode("overwrite").parquet(output_path)
print("Silver layer saved successfully!")
