In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
spark = SparkSession.builder \
.appName("USER PROFILE API") \
.getOrCreate()

In [2]:
raw_users = [
("U001","Amit","29","Hyderabad","50000"),
("U002","Neha","Thirty Two","Delhi","62000"),
("U003","Ravi",None,"Bangalore","45k"),
("U004","Pooja","28","Mumbai",58000),
("U005",None,"31","Chennai","")
]

In [3]:
from pyspark.sql.types import StructType,StructField,StringType

1. Design a StructType schema for this data

In [4]:
user_schema = StructType([
    StructField("user_id",StringType(),True),
    StructField("name",StringType(),True),
    StructField("age",StringType(),True),
    StructField("city",StringType(),True),
    StructField("salary",StringType(),True)
])

2. Load the data using the schema

In [5]:
df_raw = spark.createDataFrame(raw_users,user_schema)
df_raw.printSchema()
df_raw.show()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U003| Ravi|      NULL|Bangalore|   45k|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+



3. Identify records that fail type conversion

In [28]:
from pyspark.sql.functions import col, when
from pyspark.sql.types import IntegerType

df_cast = df_raw.withColumn(
    "age_int",
    when(col("age").rlike(r"^[0-9]+$"), col("age").cast("int")).otherwise(None)
).withColumn(
    "salary_str", col("salary").cast("string")
).withColumn(
    "salary_int",
    when(col("salary_str").rlike(r"^[0-9]+$"), col("salary_str").cast("int")).otherwise(None)
)

df_fail = df_cast.filter(
    (col("age").isNotNull() & col("age_int").isNull()) |
    (col("salary").isNotNull() & col("salary_int").isNull())
).select("user_id", "name", "age", "city", "salary")

df_fail.show(truncate=False)
df_raw.filter(~col("salary").rlike(r"^[0-9]+$")).show()

+-------+----+----------+---------+------+
|user_id|name|age       |city     |salary|
+-------+----+----------+---------+------+
|U002   |Neha|Thirty Two|Delhi    |62000 |
|U003   |Ravi|NULL      |Bangalore|45k   |
|U005   |NULL|31        |Chennai  |      |
+-------+----+----------+---------+------+

+-------+----+----+---------+------+
|user_id|name| age|     city|salary|
+-------+----+----+---------+------+
|   U003|Ravi|NULL|Bangalore|   45k|
|   U005|NULL|  31|  Chennai|      |
+-------+----+----+---------+------+



4. Convert age to integer safely

In [30]:
df_age_fixed = df_raw.withColumn(
    "age",
    when(col("age").rlike(r"^[0-9]+$"), col("age").cast("int")).otherwise(None)
)
df_age_fixed.show()

+-------+-----+----+---------+------+
|user_id| name| age|     city|salary|
+-------+-----+----+---------+------+
|   U001| Amit|  29|Hyderabad| 50000|
|   U002| Neha|NULL|    Delhi| 62000|
|   U003| Ravi|NULL|Bangalore|   45k|
|   U004|Pooja|  28|   Mumbai| 58000|
|   U005| NULL|  31|  Chennai|      |
+-------+-----+----+---------+------+



5. Normalize salary into integer (handle k )

In [34]:
from pyspark.sql.functions import col, when, regexp_replace
df_salary_fixed = df_age_fixed.withColumn(
    "salary",
    when(col("salary").rlike(r"^[0-9]+k$"), regexp_replace(col("salary"), "k", "").cast("int") * 1000)
    .when(col("salary").rlike(r"^[0-9]+$"), col("salary").cast("int"))
    .otherwise(None)
)
df_salary_fixed.show()

+-------+-----+----+---------+------+
|user_id| name| age|     city|salary|
+-------+-----+----+---------+------+
|   U001| Amit|  29|Hyderabad| 50000|
|   U002| Neha|NULL|    Delhi| 62000|
|   U003| Ravi|NULL|Bangalore| 45000|
|   U004|Pooja|  28|   Mumbai| 58000|
|   U005| NULL|  31|  Chennai|  NULL|
+-------+-----+----+---------+------+



6. Replace missing names with "UNKNOWN"

In [35]:
df_name_fixed = df_salary_fixed.fillna({"name": "UNKNOWN"})
df_name_fixed.show()

+-------+-------+----+---------+------+
|user_id|   name| age|     city|salary|
+-------+-------+----+---------+------+
|   U001|   Amit|  29|Hyderabad| 50000|
|   U002|   Neha|NULL|    Delhi| 62000|
|   U003|   Ravi|NULL|Bangalore| 45000|
|   U004|  Pooja|  28|   Mumbai| 58000|
|   U005|UNKNOWN|  31|  Chennai|  NULL|
+-------+-------+----+---------+------+



7. Drop records where age cannot be recovered

In [38]:
df_clean = df_name_fixed.filter(col("salary").isNotNull())
df_clean.show()

+-------+-----+----+---------+------+
|user_id| name| age|     city|salary|
+-------+-----+----+---------+------+
|   U001| Amit|  29|Hyderabad| 50000|
|   U002| Neha|NULL|    Delhi| 62000|
|   U003| Ravi|NULL|Bangalore| 45000|
|   U004|Pooja|  28|   Mumbai| 58000|
+-------+-----+----+---------+------+



8. Produce a final clean DataFrame

In [39]:
df_final = df_clean.select("user_id", "name", "age", "city", "salary")
df_final.show()

+-------+-----+----+---------+------+
|user_id| name| age|     city|salary|
+-------+-----+----+---------+------+
|   U001| Amit|  29|Hyderabad| 50000|
|   U002| Neha|NULL|    Delhi| 62000|
|   U003| Ravi|NULL|Bangalore| 45000|
|   U004|Pooja|  28|   Mumbai| 58000|
+-------+-----+----+---------+------+

