In [1]:
!pip install pyspark



# Define Explicit Schema



In [2]:
from pyspark.sql.types import StructType, StructField, StringType



In [3]:
# Every field is declared as a nullable string so that malformed values
# (for example "Thirty Two" or "45k") can be loaded first and cleaned later.
user_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in ("user_id", "name", "age", "city", "salary")
    ]
)

# Load the data using the schema

In [5]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the session built here under the app name "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [6]:
# Deliberately messy sample rows: a spelled-out age, a missing age, a salary
# with a "k" suffix, an integer salary, a missing name and an empty salary.
raw_users = [
    ("U001", "Amit", "29", "Hyderabad", "50000"),
    ("U002", "Neha", "Thirty Two", "Delhi", "62000"),
    ("U003", "Ravi", None, "Bangalore", "45k"),
    ("U004", "Pooja", "28", "Mumbai", 58000),  # salary given as an int, not a str
    ("U005", None, "31", "Chennai", ""),
]

# Load everything through the all-string schema and display the raw frame.
df = spark.createDataFrame(data=raw_users, schema=user_schema)
df.show()

+-------+-----+----------+---------+------+
|user_id| name|       age|     city|salary|
+-------+-----+----------+---------+------+
|   U001| Amit|        29|Hyderabad| 50000|
|   U002| Neha|Thirty Two|    Delhi| 62000|
|   U003| Ravi|      NULL|Bangalore|   45k|
|   U004|Pooja|        28|   Mumbai| 58000|
|   U005| NULL|        31|  Chennai|      |
+-------+-----+----------+---------+------+



# Identify records that fail type conversion

In [None]:
from pyspark.sql.functions import col, when

# A bare col("age").cast("int") is not safe on this data: when ANSI mode is
# enabled (the default since Spark 4.0) a value such as "Thirty Two" makes the
# cast raise instead of yielding NULL. Guarding the cast with a digits-only
# check gives NULL for unconvertible values regardless of the ANSI setting.
df_cast = df.withColumn(
    "age_int",
    when(col("age").rlike("^[0-9]+$"), col("age").cast("int")),
).withColumn("salary_str", col("salary").cast("string"))

df_cast.show()

# Records that fail type conversion: age was supplied but could not be
# converted (a NULL input age is missing data, not a conversion failure).
df_cast.filter(col("age").isNotNull() & col("age_int").isNull()).show()

# Convert Age to Integer Safely

In [9]:
from pyspark.sql.functions import when, regexp_extract

# Only strings made up entirely of digits are cast. Everything else has no
# matching when() branch and therefore ends up as NULL in age_int.
age_is_numeric = col("age").rlike("^[0-9]+$")
df_clean_age = df.withColumn("age_int", when(age_is_numeric, col("age").cast("int")))

# Normalize salary into an integer (handle the "k" suffix)

In [10]:
from pyspark.sql.functions import regexp_replace

salary = col("salary")

# Two accepted shapes: plain digits ("50000") and digits followed by "k" ("45k").
plain_number = salary.rlike("^[0-9]+$")
thousands_suffix = salary.rlike("^[0-9]+k$")

# For the "k" form, keep the leading digits and scale them by one thousand.
salary_in_thousands = regexp_extract(salary, "([0-9]+)", 1).cast("int") * 1000

# Anything matching neither shape (including the empty string) becomes NULL.
df_clean_salary = df_clean_age.withColumn(
    "salary_int",
    when(plain_number, salary.cast("int"))
    .when(thousands_suffix, salary_in_thousands)
    .otherwise(None),
)

# Replace Missing Names With "UNKNOWN"





In [11]:
# Keep the name when it is present; substitute the literal "UNKNOWN" when it is NULL.
name_col = col("name")
df_fix_names = df_clean_salary.withColumn(
    "name_fixed",
    when(name_col.isNotNull(), name_col).otherwise("UNKNOWN"),
)

# Drop Records Where Age Cannot Be Recovered

In [12]:
# Rows whose age could not be converted have a NULL age_int; discard them.
df_final = df_fix_names.dropna(subset=["age_int"])

# Final Clean DataFrame

In [13]:
# Show only the cleaned columns, in presentation order.
final_columns = ["user_id", "name_fixed", "age_int", "city", "salary_int"]
df_final.select(final_columns).show()

+-------+----------+-------+---------+----------+
|user_id|name_fixed|age_int|     city|salary_int|
+-------+----------+-------+---------+----------+
|   U001|      Amit|     29|Hyderabad|     50000|
|   U004|     Pooja|     28|   Mumbai|     58000|
|   U005|   UNKNOWN|     31|  Chennai|      NULL|
+-------+----------+-------+---------+----------+



#  E-COMMERCE ORDERS (ARRAY CORRUPTION)

# Define Schema

In [14]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Order records: string identifiers, an array of item names, an integer amount.
order_schema = (
    StructType()
    .add("order_id", StringType(), True)
    .add("user_id", StringType(), True)
    .add("items", ArrayType(StringType()), True)  # enforce array
    .add("amount", IntegerType(), True)
)

# Load Raw Data with Schema

In [24]:
# Orders whose items are already Python lists, matching ArrayType(StringType()).
raw_orders = [
    ("O001", "U001", ["Laptop", "Mobile", "Tablet"], 75000),
    ("O002", "U002", ["Mobile", "Tablet"], 32000),
    ("O003", "U003", ["Laptop"], 72000),
    ("O004", "U004", [], 25000),  # null → empty array
    ("O005", "U005", ["Laptop", "Mobile"], 68000),
]

# order_schema and the pyspark.sql.types names it uses are defined in the
# preceding "Define Schema" cell, so they are not defined a second time here.
df = spark.createDataFrame(raw_orders, schema=order_schema)

# Normalize All Item Values into Arrays

In [27]:
from pyspark.sql.functions import when, split, col, regexp_replace, expr

# The same orders, but with items as delimited strings: mostly comma-separated,
# one pipe-separated, and one missing entirely.
raw_orders = [
    ("O001", "U001", "Laptop,Mobile,Tablet", 75000),
    ("O002", "U002", "Mobile,Tablet", 32000),
    ("O003", "U003", "Laptop", 72000),
    ("O004", "U004", None, 25000),
    ("O005", "U005", "Laptop|Mobile", 68000),
]

df = spark.createDataFrame(raw_orders, ["order_id", "user_id", "items", "amount"])

items_col = col("items")

# Rewrite "|" as "," first so a single split on "," handles both delimiters.
comma_delimited = regexp_replace(items_col, "\\|", ",")

# A NULL items value becomes an empty array instead of a NULL array.
items_or_empty = when(items_col.isNull(), expr("array()")).otherwise(
    split(comma_delimited, ",")
)

df_norm = df.withColumn("items_array", items_or_empty)
df_norm.show(truncate=False)

+--------+-------+--------------------+------+------------------------+
|order_id|user_id|items               |amount|items_array             |
+--------+-------+--------------------+------+------------------------+
|O001    |U001   |Laptop,Mobile,Tablet|75000 |[Laptop, Mobile, Tablet]|
|O002    |U002   |Mobile,Tablet       |32000 |[Mobile, Tablet]        |
|O003    |U003   |Laptop              |72000 |[Laptop]                |
|O004    |U004   |NULL                |25000 |[]                      |
|O005    |U005   |Laptop|Mobile       |68000 |[Laptop, Mobile]        |
+--------+-------+--------------------+------+------------------------+



# Explode Items into One Row per Item

In [28]:
from pyspark.sql.functions import explode

# One output row per array element. Orders with an empty items_array produce
# no rows, which is why O004 is absent from the displayed result.
df_exploded = df_norm.select("*", explode("items_array").alias("item"))
df_exploded.show()

+--------+-------+--------------------+------+--------------------+------+
|order_id|user_id|               items|amount|         items_array|  item|
+--------+-------+--------------------+------+--------------------+------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|Laptop|
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|Mobile|
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|Tablet|
|    O002|   U002|       Mobile,Tablet| 32000|    [Mobile, Tablet]|Mobile|
|    O002|   U002|       Mobile,Tablet| 32000|    [Mobile, Tablet]|Tablet|
|    O003|   U003|              Laptop| 72000|            [Laptop]|Laptop|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|Laptop|
|    O005|   U005|       Laptop|Mobile| 68000|    [Laptop, Mobile]|Mobile|
+--------+-------+--------------------+------+--------------------+------+



# Count Frequency of Each Item


In [29]:
# Number of exploded rows per distinct item name.
df_item_freq = df_exploded.groupBy(col("item")).count()
df_item_freq.show()

+------+-----+
|  item|count|
+------+-----+
|Laptop|    3|
|Mobile|    3|
|Tablet|    2|
+------+-----+



# Identify Orders with More Than 2 Items

In [30]:
from pyspark.sql.functions import size

# Keep orders whose normalized item array holds more than two entries.
item_count = size(col("items_array"))
df_multi_items = df_norm.where(item_count > 2)
df_multi_items.show()

+--------+-------+--------------------+------+--------------------+
|order_id|user_id|               items|amount|         items_array|
+--------+-------+--------------------+------+--------------------+
|    O001|   U001|Laptop,Mobile,Tablet| 75000|[Laptop, Mobile, ...|
+--------+-------+--------------------+------+--------------------+



# Exercise 3
##  Define Schema

In [31]:
from pyspark.sql.types import StructType, StructField, StringType, MapType, IntegerType

# Per-user usage stored as a map from a string key to an integer value.
device_schema = (
    StructType()
    .add("user_id", StringType(), True)
    .add("usage", MapType(StringType(), IntegerType()), True)
)

# Load Raw Data



In [37]:
# Sample usage maps, including a user with no entries at all (U004) and
# a user without a "mobile" key (U005).
device_data = [
    ("U001", {"mobile": 120, "laptop": 300}),
    ("U002", {"mobile": 200, "tablet": 100}),
    ("U003", {"desktop": 400, "mobile": 150}),
    ("U004", {}),
    ("U005", {"laptop": 250}),
]

df_devices = spark.createDataFrame(data=device_data, schema=device_schema)
df_devices.printSchema()
df_devices.show(truncate=False)

root
 |-- user_id: string (nullable = true)
 |-- usage: map (nullable = true)
 |    |-- key: string
 |    |-- value: integer (valueContainsNull = true)

+-------+-------------------------------+
|user_id|usage                          |
+-------+-------------------------------+
|U001   |{mobile -> 120, laptop -> 300} |
|U002   |{mobile -> 200, tablet -> 100} |
|U003   |{mobile -> 150, desktop -> 400}|
|U004   |{}                             |
|U005   |{laptop -> 250}                |
+-------+-------------------------------+



# Extract mobile usage safely

In [40]:
from pyspark.sql.functions import col

# Look up the "mobile" entry of each usage map. Per the displayed output,
# users whose map lacks that key get NULL rather than an error.
mobile_lookup = col("usage").getItem("mobile")
df_mobile = df_devices.withColumn("mobile_usage", mobile_lookup)
df_mobile.show(truncate=False)

+-------+-------------------------------+------------+
|user_id|usage                          |mobile_usage|
+-------+-------------------------------+------------+
|U001   |{mobile -> 120, laptop -> 300} |120         |
|U002   |{mobile -> 200, tablet -> 100} |200         |
|U003   |{mobile -> 150, desktop -> 400}|150         |
|U004   |{}                             |NULL        |
|U005   |{laptop -> 250}                |NULL        |
+-------+-------------------------------+------------+



# Identify users with usage above a threshold

In [41]:
# Strictly greater than 150: a usage of exactly 150 and NULL usages are excluded.
above_threshold = col("mobile_usage") > 150
df_threshold = df_mobile.where(above_threshold)
df_threshold.show(truncate=False)

+-------+------------------------------+------------+
|user_id|usage                         |mobile_usage|
+-------+------------------------------+------------+
|U002   |{mobile -> 200, tablet -> 100}|200         |
+-------+------------------------------+------------+



# Dataset 4

In [42]:
from pyspark.sql.types import StructType, StructField, StringType

# Nested address struct: three nullable string fields.
address_schema = StructType(
    [StructField(part, StringType(), True) for part in ("city", "state", "pincode")]
)

# user_id is required (nullable=False); the address struct itself may be NULL.
profile_schema = (
    StructType()
    .add("user_id", StringType(), False)
    .add("address", address_schema, True)
)

# Load Raw Data


In [52]:
# The same logical address arrives in four shapes: a comma-separated string,
# a dict, a positional tuple, or nothing at all.
raw_profiles = [
    ("U001", "Hyderabad,Telangana,500081"),
    ("U002", {"city": "Delhi", "state": "Delhi", "pincode": "110001"}),
    ("U003", ("Bangalore", "Karnataka", 560001)),
    ("U004", "Mumbai,MH"),
    ("U005", None),
]

# Field order used for positional inputs (strings, tuples, lists).
ADDRESS_FIELDS = ("city", "state", "pincode")


def normalize_address(addr):
    """Coerce one raw address value into a dict keyed by ADDRESS_FIELDS.

    Args:
        addr: A comma-separated string, a dict with any subset of the address
            keys, a tuple/list in (city, state, pincode) order, or None.
            Any other type is treated as having no usable address.

    Returns:
        A dict with exactly the keys "city", "state" and "pincode". Each value
        is a whitespace-trimmed string, or None when the part is missing or
        blank.
    """
    if isinstance(addr, str):
        values = addr.split(",")
    elif isinstance(addr, dict):
        values = [addr.get(field) for field in ADDRESS_FIELDS]
    elif isinstance(addr, (tuple, list)):
        values = list(addr)
    else:
        # None or an unsupported type: nothing can be recovered.
        values = []

    cleaned = {}
    for index, field in enumerate(ADDRESS_FIELDS):
        value = values[index] if index < len(values) else None
        if value is not None:
            # Test against None rather than truthiness so a numeric 0 is kept,
            # and so a missing dict key never turns into the string "None".
            value = str(value).strip()
        # Blank strings carry no information; store them as None.
        cleaned[field] = value or None
    return cleaned


normalized_profiles = [
    (user_id, normalize_address(addr)) for user_id, addr in raw_profiles
]


In [53]:
# Build the profile frame from the normalized (user_id, address-dict) pairs.
df_profiles = spark.createDataFrame(data=normalized_profiles, schema=profile_schema)
df_profiles.printSchema()
df_profiles.show(truncate=False)

root
 |-- user_id: string (nullable = false)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- pincode: string (nullable = true)

+-------+------------------------------+
|user_id|address                       |
+-------+------------------------------+
|U001   |{Hyderabad, Telangana, 500081}|
|U002   |{Delhi, Delhi, 110001}        |
|U003   |{Bangalore, Karnataka, 560001}|
|U004   |{Mumbai, MH, NULL}            |
|U005   |{NULL, NULL, NULL}            |
+-------+------------------------------+



# Extract City, State, Pincode Safely

In [54]:
from pyspark.sql.functions import col

# Flatten the address struct: one top-level column per nested field.
address_parts = [
    col(f"address.{part}").alias(part) for part in ("city", "state", "pincode")
]
df_extracted = df_profiles.select("user_id", *address_parts)
df_extracted.show(truncate=False)

+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |NULL   |
|U005   |NULL     |NULL     |NULL   |
+-------+---------+---------+-------+



# Set Default Pincode When Missing

In [55]:
from pyspark.sql.functions import when

# Keep an existing pincode; fall back to the placeholder "000000" when it is NULL.
pincode = col("pincode")
df_fixed = df_extracted.withColumn(
    "pincode",
    when(pincode.isNotNull(), pincode).otherwise("000000"),
)
df_fixed.show(truncate=False)

+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |000000 |
|U005   |NULL     |NULL     |000000 |
+-------+---------+---------+-------+



# Drop Irrecoverable Records


In [56]:
# A record is irrecoverable when either city or state is NULL; drop those rows.
df_clean = df_fixed.dropna(how="any", subset=["city", "state"])
df_clean.show(truncate=False)

+-------+---------+---------+-------+
|user_id|city     |state    |pincode|
+-------+---------+---------+-------+
|U001   |Hyderabad|Telangana|500081 |
|U002   |Delhi    |Delhi    |110001 |
|U003   |Bangalore|Karnataka|560001 |
|U004   |Mumbai   |MH       |000000 |
+-------+---------+---------+-------+



# Raw Data



In [57]:
# Each row is (transaction id, date, amount). Dates come in several layouts
# and amounts are a mix of numeric strings, ints and free text.
raw_transactions = [
    ("T001", "2024-01-05", "45000"),
    ("T002", "05/01/2024", 52000),
    ("T003", "Jan 06 2024", "Thirty Thousand"),
    ("T004", None, 38000),
    ("T005", "2024/01/07", "42000"),
]

# Normalize All Rows to One Format

In [58]:
from datetime import datetime

# Accepted input layouts, tried in this order; the first one that parses wins.
# NOTE(review): "%b" matches month abbreviations in the current locale —
# confirm the notebook always runs under an English locale.
DATE_FORMATS = ("%Y-%m-%d", "%d/%m/%Y", "%b %d %Y", "%Y/%m/%d")


def normalize_date(value, formats=DATE_FORMATS):
    """Return value re-formatted as "YYYY-MM-DD", or None if it cannot be parsed.

    Args:
        value: The raw date; None and empty values yield None.
        formats: strptime patterns to try, in priority order.

    Returns:
        The ISO-style date string, or None when no pattern matches.
    """
    if not value:
        return None
    text = str(value).strip()
    for fmt in formats:
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            # This layout does not fit; try the next one.
            continue
    return None


def normalize_amount(value):
    """Return value as an int, or None if it is not a whole number.

    Thousands separators (",") are removed before conversion, so "45,000"
    becomes 45000. Free text such as "Thirty Thousand" yields None.
    """
    if value is None:
        return None
    try:
        return int(str(value).replace(",", ""))
    except ValueError:
        return None


normalized_transactions = [
    (tid, normalize_date(date), normalize_amount(amt))
    for tid, date, amt in raw_transactions
]

for row in normalized_transactions:
    print(row)

('T001', '2024-01-05', 45000)
('T002', '2024-01-05', 52000)
('T003', '2024-01-06', None)
('T004', None, 38000)
('T005', '2024-01-07', 42000)


# Define Schema

In [61]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType

# trans_id is required; the date and amount are nullable strings.
# NOTE(review): DateType and IntegerType are imported but unused, and the
# normalized amounts are Python ints — confirm whether "amount" was meant to
# be IntegerType (and "trans_date" cast to a date) rather than kept as strings.
transaction_schema = (
    StructType()
    .add("trans_id", StringType(), False)
    .add("trans_date", StringType(), True)
    .add("amount", StringType(), True)
)

# Create DataFrame



In [62]:
# Load the normalized (id, date, amount) tuples through the transaction schema.
df_transactions = spark.createDataFrame(
    data=normalized_transactions,
    schema=transaction_schema,
)
df_transactions.printSchema()
df_transactions.show(truncate=False)

root
 |-- trans_id: string (nullable = false)
 |-- trans_date: string (nullable = true)
 |-- amount: string (nullable = true)

+--------+----------+------+
|trans_id|trans_date|amount|
+--------+----------+------+
|T001    |2024-01-05|45000 |
|T002    |2024-01-05|52000 |
|T003    |2024-01-06|NULL  |
|T004    |NULL      |38000 |
|T005    |2024-01-07|42000 |
+--------+----------+------+



# Identify Unrecoverable Records

In [63]:
# A transaction is unrecoverable unless it has both a date and an amount.
has_date = col("trans_date").isNotNull()
has_amount = col("amount").isNotNull()
df_invalid = df_transactions.where(~(has_date & has_amount))
df_invalid.show(truncate=False)

+--------+----------+------+
|trans_id|trans_date|amount|
+--------+----------+------+
|T003    |2024-01-06|NULL  |
|T004    |NULL      |38000 |
+--------+----------+------+



# Separate Valid vs Invalid Transactions

In [64]:
# Valid transactions are those with neither the date nor the amount missing.
df_valid = df_transactions.dropna(how="any", subset=["trans_date", "amount"])
df_valid.show(truncate=False)

+--------+----------+------+
|trans_id|trans_date|amount|
+--------+----------+------+
|T001    |2024-01-05|45000 |
|T002    |2024-01-05|52000 |
|T005    |2024-01-07|42000 |
+--------+----------+------+

