In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName("DEVICE USAGE") \
.getOrCreate()

In [2]:
raw_devices = [
("U001",{"mobile":120,"laptop":300}),
("U002","mobile:200,tablet:100"),
("U003",{"desktop":"400","mobile":"150"}),
("U004",None),
("U005","laptop-250")
]

1. Design a MapType(StringType, IntegerType) schema

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, MapType

In [4]:
device_schema = StructType([
    StructField("user_id", StringType(), True),
    StructField("usage_raw", StringType(), True)
])

In [8]:
df_device_raw = spark.createDataFrame(raw_devices, schema=device_schema)
df_device_raw.show(truncate=False)

+-------+-------------------------+
|user_id|usage_raw                |
+-------+-------------------------+
|U001   |{mobile=120, laptop=300} |
|U002   |mobile:200,tablet:100    |
|U003   |{mobile=150, desktop=400}|
|U004   |NULL                     |
|U005   |laptop-250               |
+-------+-------------------------+



2. Parse string maps into proper maps

In [9]:
from pyspark.sql.functions import (regexp_replace, split, explode, map_from_entries, struct)

In [11]:
df_normalized = df_device_raw.withColumn("usage", regexp_replace(col("usage_raw"), "-", ":"))
df_kv = df_normalized.withColumn("kv_pairs", split(col("usage"), ",\\s"))
df_kv.show(truncate=False)

+-------+-------------------------+-------------------------+---------------------------+
|user_id|usage_raw                |usage                    |kv_pairs                   |
+-------+-------------------------+-------------------------+---------------------------+
|U001   |{mobile=120, laptop=300} |{mobile=120, laptop=300} |[{mobile=120, laptop=300}] |
|U002   |mobile:200,tablet:100    |mobile:200,tablet:100    |[mobile:200,tablet:100]    |
|U003   |{mobile=150, desktop=400}|{mobile=150, desktop=400}|[{mobile=150, desktop=400}]|
|U004   |NULL                     |NULL                     |NULL                       |
|U005   |laptop-250               |laptop:250               |[laptop:250]               |
+-------+-------------------------+-------------------------+---------------------------+



3. Convert all usage values to integers

In [14]:
from pyspark.sql.functions import regexp_replace, split, struct, map_from_entries, when, col, transform
from pyspark.sql.types import MapType, StringType, IntegerType

# Prepare the string for map conversion: remove braces, standardize delimiters
usage_parsed_string = regexp_replace(regexp_replace(col("usage"), "^\\{(.*)\\}$", "$1"), "=", ":")
# Transform the split key-value strings into an array of structs with integer values
usage_kv_structs = transform(split(usage_parsed_string, ",\\s*"), lambda x: struct(split(x, ":")[0].alias("key"), split(x, ":")[1].cast(IntegerType()).alias("value")))

df_with_integer_map = df_normalized.withColumn(
    "usage_map",
    when(col("usage").isNotNull(), map_from_entries(usage_kv_structs)).otherwise(None)
)

# Show the result and its schema
df_with_integer_map.select("user_id", "usage_raw", "usage", "usage_map").show(truncate=False)
df_with_integer_map.printSchema()

+-------+-------------------------+-------------------------+-------------------------------+
|user_id|usage_raw                |usage                    |usage_map                      |
+-------+-------------------------+-------------------------+-------------------------------+
|U001   |{mobile=120, laptop=300} |{mobile=120, laptop=300} |{mobile -> 120, laptop -> 300} |
|U002   |mobile:200,tablet:100    |mobile:200,tablet:100    |{mobile -> 200, tablet -> 100} |
|U003   |{mobile=150, desktop=400}|{mobile=150, desktop=400}|{mobile -> 150, desktop -> 400}|
|U004   |NULL                     |NULL                     |NULL                           |
|U005   |laptop-250               |laptop:250               |{laptop -> 250}                |
+-------+-------------------------+-------------------------+-------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- usage_raw: string (nullable = true)
 |-- usage: string (nullable = true)
 |-- usage_map: map (nullable =

4. Handle malformed key-value pairs

In [16]:
from pyspark.sql.functions import regexp_replace, split, struct, map_from_entries, when, col, transform
from pyspark.sql.types import MapType, StringType, IntegerType
usage_parsed_string = regexp_replace(regexp_replace(col("usage"), "^\\{(.*)\\}$", "$1"), "=", ":")
usage_kv_structs = transform(
    split(usage_parsed_string, ",\\s*"),
    lambda x: struct(
        split(x, ":").getItem(0).alias("key"),
        split(x, ":").getItem(1).cast(IntegerType()).alias("value")
    )
)

df_with_integer_map_handled_malformed = df_normalized.withColumn(
    "usage_map",
    when(col("usage").isNotNull(), map_from_entries(usage_kv_structs)).otherwise(None)
)

# Show the result and its schema
df_with_integer_map_handled_malformed.select("user_id", "usage_raw", "usage", "usage_map").show(truncate=False)
df_with_integer_map_handled_malformed.printSchema()

+-------+-------------------------+-------------------------+-------------------------------+
|user_id|usage_raw                |usage                    |usage_map                      |
+-------+-------------------------+-------------------------+-------------------------------+
|U001   |{mobile=120, laptop=300} |{mobile=120, laptop=300} |{mobile -> 120, laptop -> 300} |
|U002   |mobile:200,tablet:100    |mobile:200,tablet:100    |{mobile -> 200, tablet -> 100} |
|U003   |{mobile=150, desktop=400}|{mobile=150, desktop=400}|{mobile -> 150, desktop -> 400}|
|U004   |NULL                     |NULL                     |NULL                           |
|U005   |laptop-250               |laptop:250               |{laptop -> 250}                |
+-------+-------------------------+-------------------------+-------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- usage_raw: string (nullable = true)
 |-- usage: string (nullable = true)
 |-- usage_map: map (nullable =

5. Replace missing maps with empty maps

In [18]:
from pyspark.sql.functions import map_from_arrays, array, col, when

df_map_fixed = df_with_integer_map_handled_malformed.withColumn(
    "usage_map_fixed",
    when(col("usage_map").isNull(), map_from_arrays(array(), array()))
    .otherwise(col("usage_map"))
)
df_map_fixed.select("user_id", "usage_raw", "usage_map", "usage_map_fixed").show(truncate=False)
df_map_fixed.printSchema()

+-------+-------------------------+-------------------------------+-------------------------------+
|user_id|usage_raw                |usage_map                      |usage_map_fixed                |
+-------+-------------------------+-------------------------------+-------------------------------+
|U001   |{mobile=120, laptop=300} |{mobile -> 120, laptop -> 300} |{mobile -> 120, laptop -> 300} |
|U002   |mobile:200,tablet:100    |{mobile -> 200, tablet -> 100} |{mobile -> 200, tablet -> 100} |
|U003   |{mobile=150, desktop=400}|{mobile -> 150, desktop -> 400}|{mobile -> 150, desktop -> 400}|
|U004   |NULL                     |NULL                           |{}                             |
|U005   |laptop-250               |{laptop -> 250}                |{laptop -> 250}                |
+-------+-------------------------+-------------------------------+-------------------------------+

root
 |-- user_id: string (nullable = true)
 |-- usage_raw: string (nullable = true)
 |-- usage: st

6. Extract mobile usage safely

In [20]:
from pyspark.sql.functions import element_at
df_mobile = df_map_fixed.withColumn("mobile_usage", element_at(col("usage_map"), "mobile").cast("int"))
df_mobile.show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+
|user_id|           usage_raw|               usage|           usage_map|     usage_map_fixed|mobile_usage|
+-------+--------------------+--------------------+--------------------+--------------------+------------+
|   U001|{mobile=120, lapt...|{mobile=120, lapt...|{mobile -> 120, l...|{mobile -> 120, l...|         120|
|   U002|mobile:200,tablet...|mobile:200,tablet...|{mobile -> 200, t...|{mobile -> 200, t...|         200|
|   U003|{mobile=150, desk...|{mobile=150, desk...|{mobile -> 150, d...|{mobile -> 150, d...|         150|
|   U004|                NULL|                NULL|                NULL|                  {}|        NULL|
|   U005|          laptop-250|          laptop:250|     {laptop -> 250}|     {laptop -> 250}|        NULL|
+-------+--------------------+--------------------+--------------------+--------------------+------------+



7. Identify users with usage above a threshold

In [22]:
high_usage_users = df_mobile.filter(col("mobile_usage") > 150)
high_usage_users.show()

+-------+--------------------+--------------------+--------------------+--------------------+------------+
|user_id|           usage_raw|               usage|           usage_map|     usage_map_fixed|mobile_usage|
+-------+--------------------+--------------------+--------------------+--------------------+------------+
|   U002|mobile:200,tablet...|mobile:200,tablet...|{mobile -> 200, t...|{mobile -> 200, t...|         200|
+-------+--------------------+--------------------+--------------------+--------------------+------------+

