In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Start (or attach to) the Spark session used by the following cells.
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

# Design an explicit schema using StructType

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType

# Explicit schema for the cleaned user rows; only user_id is declared non-nullable.
user_schema = StructType([
    StructField("user_id", StringType(), nullable=False),
    StructField("name", StringType(), nullable=True),
    StructField("age", IntegerType(), nullable=True),
    StructField("city", StringType(), nullable=True),
    StructField("skills", ArrayType(StringType()), nullable=True),
])

# Normalize age into IntegerType

In [None]:
def normalize_age(age):
    """Coerce a raw age value to an ``int``, or ``None`` when it cannot be read.

    Args:
        age: ``None``, an ``int``, or a string holding either digits
            (``"28"``) or one of the spelled-out ages in the lookup table
            (``"Thirty"``, ``"twenty-eight"``, ...).

    Returns:
        The age as an ``int``; ``None`` for ``None``, for unrecognised text
        and for any other input type.
    """
    if age is None:
        return None
    if isinstance(age, int):
        return age
    if isinstance(age, str):
        text = age.strip()
        # isdecimal() only accepts characters that int() can parse; isdigit()
        # also accepts e.g. "²", which would make int() raise ValueError.
        if text.isdecimal():
            return int(text)
        word_to_num = {
            "thirty": 30, "twenty": 20, "twenty eight": 28,
            "twenty nine": 29, "thirty one": 31
        }
        # Lower-case, treat hyphens as spaces and collapse repeated whitespace
        # so "Thirty", "THIRTY" and "twenty-eight" all hit the table.
        key = " ".join(text.lower().replace("-", " ").split())
        return word_to_num.get(key)
    return None

In [None]:
import ast

def normalize_skills(skills):
    """Turn a raw skills value into a list of trimmed, non-empty strings.

    Args:
        skills: ``None``, a list/tuple of skills, or a string that is either
            a Python literal (``"['AI','ML']"``, ``"('AI','ML')"``) or a
            comma-separated list (``"AI,Testing"``).

    Returns:
        A list of skill names with surrounding whitespace removed and empty
        entries dropped. ``None`` and unsupported input types give ``[]``.
    """
    if skills is None:
        return []
    if isinstance(skills, str):
        try:
            parsed = ast.literal_eval(skills)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal (e.g. "AI,Testing"): treat as plain text.
            parsed = None
        if isinstance(parsed, (list, tuple)):
            items = parsed
        elif isinstance(parsed, str):
            # A quoted string such as "'AI, ML'": split its content.
            items = parsed.split(",")
        else:
            items = skills.split(",")
    elif isinstance(skills, (list, tuple)):
        items = skills
    else:
        return []
    # str() keeps non-string literals (e.g. numbers) usable in a string array.
    trimmed = [str(item).strip() for item in items if item is not None]
    return [item for item in trimmed if item]

In [None]:
def normalize_name(name):
    """Return ``name`` without surrounding whitespace, or ``"Unknown"`` when it is ``None`` or blank."""
    if name is not None:
        trimmed = name.strip()
        if trimmed != "":
            return trimmed
    return "Unknown"

In [None]:
# Raw rows are (user_id, name, age, city, skills). Age and skills arrive in
# several shapes: digit strings, spelled-out words, literal-looking strings,
# real lists and None.
raw_users = [
    ("U001", "Amit", "28", "Hyderabad", "['AI','ML','Cloud']"),
    ("U002", "Neha", "Thirty", "Delhi", "AI,Testing"),
    ("U003", "Ravi", None, "Bangalore", ["Data", "Spark"]),
    ("U004", "Pooja", "29", "Mumbai", None),
    ("U005", "", "31", "Chennai", "['DevOps']"),
]

# Run every row through the normalisers so it fits user_schema.
clean_data = []
for uid, name, age, city, skills in raw_users:
    clean_data += [
        (uid, normalize_name(name), normalize_age(age), city, normalize_skills(skills))
    ]

users_df = spark.createDataFrame(data=clean_data, schema=user_schema)
users_df.show(truncate=False)

+-------+-------+----+---------+---------------+
|user_id|name   |age |city     |skills         |
+-------+-------+----+---------+---------------+
|U001   |Amit   |28  |Hyderabad|[AI, ML, Cloud]|
|U002   |Neha   |30  |Delhi    |[AI, Testing]  |
|U003   |Ravi   |NULL|Bangalore|[Data, Spark]  |
|U004   |Pooja  |29  |Mumbai   |[]             |
|U005   |Unknown|31  |Chennai  |[DevOps]       |
+-------+-------+----+---------+---------------+



# Exercise 2

In [1]:
# Raw rows are (course_id, title, metadata, price). Metadata appears as a
# tuple, a dict or a "domain|level" string; price is a string with or without
# the rupee sign, or None.
raw_courses = [
    ("C001", "PySpark Mastery", ("Data Engineering", "Advanced"), "₹9999"),
    ("C002", "AI for Testers", {"domain": "QA", "level": "Beginner"}, "8999"),
    ("C003", "ML Foundations", ("AI", "Intermediate"), None),
    ("C004", "Data Engineering Bootcamp", "Data|Advanced", "₹14999"),
]

# Define Nested StructType



In [2]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Course schema with a nested struct: metadata holds the domain and level fields.
course_schema = StructType([
    StructField("course_id", StringType(), nullable=True),
    StructField("title", StringType(), nullable=True),
    # No explicit nullable flag is given for the nested struct field.
    StructField(
        "metadata",
        StructType([
            StructField("domain", StringType(), nullable=True),
            StructField("level", StringType(), nullable=True),
        ]),
    ),
    StructField("price", IntegerType(), nullable=True),
])

# Normalize Data







In [3]:
# Same course rows as the earlier Exercise 2 cell, repeated so this cell is
# self-contained: (course_id, title, metadata, price).
raw_courses = [
    ("C001", "PySpark Mastery", ("Data Engineering", "Advanced"), "₹9999"),
    ("C002", "AI for Testers", {"domain": "QA", "level": "Beginner"}, "8999"),
    ("C003", "ML Foundations", ("AI", "Intermediate"), None),
    ("C004", "Data Engineering Bootcamp", "Data|Advanced", "₹14999"),
]

def normalize_metadata(meta):
    """Normalise course metadata to a ``{"domain": ..., "level": ...}`` dict.

    NOTE: a later cell in this notebook defines another function with the
    same name (for activity metadata); re-run this cell before re-running
    the course cells if that later definition has been executed.

    Args:
        meta: a ``(domain, level)`` tuple or list, a dict with ``domain`` /
            ``level`` keys, a ``"domain|level"`` string, or anything else.

    Returns:
        A dict with exactly the keys ``domain`` and ``level``. Parts that are
        missing (short tuple, string without ``|``, unsupported type) are
        ``None`` instead of raising ``IndexError``.
    """
    if isinstance(meta, dict):
        return {"domain": meta.get("domain"), "level": meta.get("level")}
    if isinstance(meta, str):
        # Only the first two "|"-separated parts are used; blanks become None.
        parts = [part.strip() or None for part in meta.split("|")[:2]]
    elif isinstance(meta, (tuple, list)):
        parts = list(meta[:2])
    else:
        parts = []
    # Pad so both positions always exist.
    parts += [None] * (2 - len(parts))
    return {"domain": parts[0], "level": parts[1]}

def normalize_price(price):
    """Convert a raw price to an ``int`` number of rupees.

    Args:
        price: ``None`` or a value whose string form is a whole number,
            optionally prefixed with the rupee sign and optionally using
            comma thousands separators (``"₹14,999"``).

    Returns:
        The price as an ``int``; ``0`` when ``price`` is ``None``.

    Raises:
        ValueError: if the remaining text is not a whole number.
    """
    if price is None:
        return 0
    # Drop the currency sign and grouping commas before parsing.
    digits = str(price).replace("₹", "").replace(",", "").strip()
    return int(digits)

# Apply the metadata and price normalisers to every raw course row.
fixed_courses = [
    (course_id, course_title, normalize_metadata(raw_meta), normalize_price(raw_price))
    for (course_id, course_title, raw_meta, raw_price) in raw_courses
]

# Create DataFrame


In [4]:
from pyspark.sql import SparkSession

# Start (or attach to) a session, then load the normalised course rows using
# the nested course_schema.
spark = (
    SparkSession.builder
    .appName("CoursesDF")
    .getOrCreate()
)

courses_df = spark.createDataFrame(data=fixed_courses, schema=course_schema)
courses_df.show(truncate=False)

+---------+-------------------------+----------------------------+-----+
|course_id|title                    |metadata                    |price|
+---------+-------------------------+----------------------------+-----+
|C001     |PySpark Mastery          |{Data Engineering, Advanced}|9999 |
|C002     |AI for Testers           |{QA, Beginner}              |8999 |
|C003     |ML Foundations           |{AI, Intermediate}          |0    |
|C004     |Data Engineering Bootcamp|{Data, Advanced}            |14999|
+---------+-------------------------+----------------------------+-----+



# Dataset 3

In [5]:
# Raw rows are (user_id, course_id, enrollment date). The date column mixes
# three layouts (yyyy-MM-dd, dd/MM/yyyy, yyyy/MM/dd) and one unparseable value.
raw_enrollments = [
    ("U001", "C001", "2024-01-05"),
    ("U002", "C002", "05/01/2024"),
    ("U003", "C001", "2024/01/06"),
    ("U004", "C003", "invalid_date"),
    ("U001", "C004", "2024-01-10"),
]

# Normalize Enrollment Dates


In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, when

# coalesce/expr are not among the names imported at the top of this cell.
from pyspark.sql.functions import coalesce, expr

spark = SparkSession.builder.appName("Enrollments").getOrCreate()

enrollments_df = spark.createDataFrame(raw_enrollments, ["user_id","course_id","enroll_date_raw"])

# Try each known layout in order and keep the first one that parses; rows that
# match none of them end up with a NULL enroll_date.
# try_to_timestamp (also used by the later enrollment cells) returns NULL for a
# non-matching value, and the outer to_date keeps the column a date as before.
# NOTE(review): to_date with an explicit pattern can raise on non-matching
# input when ANSI mode is enabled - confirm against the Spark version in use.
enrollments_df = enrollments_df.withColumn(
    "enroll_date",
    coalesce(*[
        to_date(expr(f"try_to_timestamp(enroll_date_raw, '{fmt}')"))
        for fmt in ("yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd")
    ])
)

# Identify Invalid Enrollments




In [10]:
from pyspark.sql import functions as F

# Enrollment rows repeated here so the cell can run on its own:
# (user_id, course_id, raw enrollment date).
raw_enrollments = [
    ("U001", "C001", "2024-01-05"),
    ("U002", "C002", "05/01/2024"),
    ("U003", "C001", "2024/01/06"),
    ("U004", "C003", "invalid_date"),
    ("U001", "C004", "2024-01-10"),
]

# Parse the raw date with each supported pattern, keep the first non-NULL
# result, and flag rows where no pattern matched.
enrollments_df = (
    spark.createDataFrame(raw_enrollments, ["user_id", "course_id", "enroll_date_raw"])
    .withColumn(
        "enroll_date",
        F.coalesce(*[
            F.expr(f"try_to_timestamp(enroll_date_raw, '{pattern}')")
            for pattern in ("yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd")
        ]),
    )
    .withColumn("is_valid", F.col("enroll_date").isNotNull())
)

enrollments_df.show(truncate=False)

+-------+---------+---------------+-------------------+--------+
|user_id|course_id|enroll_date_raw|enroll_date        |is_valid|
+-------+---------+---------------+-------------------+--------+
|U001   |C001     |2024-01-05     |2024-01-05 00:00:00|true    |
|U002   |C002     |05/01/2024     |2024-01-05 00:00:00|true    |
|U003   |C001     |2024/01/06     |2024-01-06 00:00:00|true    |
|U004   |C003     |invalid_date   |NULL               |false   |
|U001   |C004     |2024-01-10     |2024-01-10 00:00:00|true    |
+-------+---------+---------------+-------------------+--------+



# Join Enrollments with Users and Courses

In [15]:
# Small (user_id, user_name) lookup table for the join below.
# Note: this rebinds users_df, which an earlier cell assigned to the cleaned
# user frame built with user_schema.
users_df = spark.createDataFrame(
    data=[
        ("U001", "Alice"),
        ("U002", "Bob"),
        ("U003", "Charlie"),
        ("U004", "Diana"),
    ],
    schema=["user_id", "user_name"],
)


In [16]:
from pyspark.sql import functions as F

# (user_id, course_id, raw enrollment date) rows used as the left side of the join.
raw_enrollments = [
    ("U001", "C001", "2024-01-05"),
    ("U002", "C002", "05/01/2024"),
    ("U003", "C001", "2024/01/06"),
    ("U004", "C003", "invalid_date"),
    ("U001", "C004", "2024-01-10"),
]

# First pattern that parses wins; is_valid is False when none of them match.
enrollments_df = (
    spark.createDataFrame(raw_enrollments, ["user_id", "course_id", "enroll_date_raw"])
    .withColumn(
        "enroll_date",
        F.coalesce(
            F.expr("try_to_timestamp(enroll_date_raw, 'yyyy-MM-dd')"),
            F.expr("try_to_timestamp(enroll_date_raw, 'dd/MM/yyyy')"),
            F.expr("try_to_timestamp(enroll_date_raw, 'yyyy/MM/dd')"),
        ),
    )
    .withColumn("is_valid", F.col("enroll_date").isNotNull())
)

In [17]:
from pyspark.sql.functions import broadcast

# Plain left join of enrollments with users, kept under its own name.
enrollments_users = enrollments_df.join(users_df, "user_id", "left")

# Enrollments enriched with user and course columns. Both lookup frames carry
# a broadcast hint; courses_df comes from the Exercise 2 cells.
# The earlier non-broadcast assignment to full_enrollments was overwritten
# immediately, so only this version is built.
full_enrollments = (
    enrollments_df
    .join(broadcast(users_df), "user_id", "left")
    .join(broadcast(courses_df), "course_id", "left")
)

In [18]:
# Print the extended query plan for the joined frame.
full_enrollments.explain(extended=True)

== Parsed Logical Plan ==
'Join UsingJoin(LeftOuter, [course_id])
:- Project [user_id#70, course_id#71, enroll_date_raw#72, enroll_date#73, is_valid#74, user_name#69]
:  +- Join LeftOuter, (user_id#70 = user_id#68)
:     :- Project [user_id#70, course_id#71, enroll_date_raw#72, enroll_date#73, isnotnull(enroll_date#73) AS is_valid#74]
:     :  +- Project [user_id#70, course_id#71, enroll_date_raw#72, coalesce(try_to_timestamp(enroll_date_raw#72, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), try_to_timestamp(enroll_date_raw#72, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), try_to_timestamp(enroll_date_raw#72, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false)) AS enroll_date#73]
:     :     +- LogicalRDD [user_id#70, course_id#71, enroll_date_raw#72], false
:     +- ResolvedHint (strategy=broadcast)
:        +- LogicalRDD [user_id#68, user_name#69], false
+- ResolvedHint (strategy=broadcast)
   +- LogicalRDD [course_id#0, title#1, metadata#2, price#3], false

== 

# Dataset 5

In [19]:
# Raw rows are (user_id, actions, metadata, duration). Actions appear as a
# comma- or pipe-separated string, a list, or None; metadata as a dict-like
# string, "key=value;key=value" text, or None.
raw_activity = [
    ("U001", "login,watch,logout", "{'device':'mobile','ip':'1.1.1.1'}", 120),
    ("U002", ["login", "watch"], "device=laptop;ip=2.2.2.2", 90),
    ("U003", "login|logout", None, 30),
    ("U004", None, "{'device':'tablet'}", 60),
]

# Define Schema


In [20]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, MapType, IntegerType

# Schema describing the activity columns in their raw, string form.
# NOTE(review): the visible cells build activity_df from a list of column
# names instead of this schema - confirm whether it is still needed.
activity_schema = StructType([
    StructField("user_id", StringType(), nullable=True),
    StructField("actions_raw", StringType(), nullable=True),
    StructField("metadata_raw", StringType(), nullable=True),
    StructField("duration", IntegerType(), nullable=True),
])

# Normalize Actions


In [21]:
def normalize_actions(actions):
    """Turn a raw actions value into a list of action names.

    Args:
        actions: ``None``, a list/tuple of actions, or a string whose actions
            are separated by ``,`` and/or ``|``.

    Returns:
        A list of actions. String tokens are trimmed and empty tokens are
        dropped, so ``"login, watch | logout"`` gives three clean names and
        ``""`` gives ``[]``. ``None`` and unsupported types give ``[]``.
    """
    if actions is None:
        return []
    if isinstance(actions, str):
        # Treat both separators alike so mixed input is split fully.
        tokens = actions.replace("|", ",").split(",")
    elif isinstance(actions, (list, tuple)):
        tokens = actions
    else:
        return []
    cleaned = []
    for token in tokens:
        if isinstance(token, str):
            token = token.strip()
            if not token:
                continue
        cleaned.append(token)
    return cleaned

# Normalize Metadata


In [22]:
import re, ast

def normalize_metadata(meta):
    """Parse raw activity metadata into a dict.

    NOTE: this reuses the name of the course-metadata normaliser defined in
    an earlier cell and replaces it once this cell has run.

    Args:
        meta: ``None``, a dict, a dict literal written as text
            (``"{'device':'mobile'}"``), or ``key=value`` pairs separated by
            ``;`` (``"device=laptop;ip=2.2.2.2"``).

    Returns:
        A dict of the parsed pairs. ``None``, unsupported types, malformed
        dict literals and literals that are not dicts all give ``{}``.
    """
    if meta is None:
        return {}
    if isinstance(meta, dict):
        return dict(meta)
    if not isinstance(meta, str):
        return {}
    text = meta.strip()
    if text.startswith("{"):
        # literal_eval understands both quote styles, so the text is parsed
        # as-is; rewriting quotes would break values containing an apostrophe.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            return {}
        # "{1, 2}" is a set literal, not a dict.
        return parsed if isinstance(parsed, dict) else {}
    pairs = {}
    for segment in text.split(";"):
        # Split on the first "=" only so values may themselves contain "=".
        key, separator, value = segment.partition("=")
        if separator:
            pairs[key.strip()] = value.strip()
    return pairs

# Fix Raw Data




In [23]:
# Normalise the actions and metadata of every raw activity row; user id and
# duration pass through unchanged.
fixed_activity = [
    (user_id, normalize_actions(raw_actions), normalize_metadata(raw_meta), seconds)
    for (user_id, raw_actions, raw_meta, seconds) in raw_activity
]

# Create DataFrame



In [24]:
from pyspark.sql import SparkSession
# Start (or attach to) a session and load the normalised activity rows; only
# column names are supplied, so column types are inferred from the data.
spark = (
    SparkSession.builder
    .appName("ActivityDF")
    .getOrCreate()
)

activity_df = spark.createDataFrame(
    data=fixed_activity,
    schema=["user_id", "actions", "metadata", "duration"],
)

activity_df.show(truncate=False)

+-------+----------------------+---------------------------------+--------+
|user_id|actions               |metadata                         |duration|
+-------+----------------------+---------------------------------+--------+
|U001   |[login, watch, logout]|{device -> mobile, ip -> 1.1.1.1}|120     |
|U002   |[login, watch]        |{device -> laptop, ip -> 2.2.2.2}|90      |
|U003   |[login, logout]       |{}                               |30      |
|U004   |[]                    |{device -> tablet}               |60      |
+-------+----------------------+---------------------------------+--------+



# Explode Actions & Count Frequency

In [25]:
from pyspark.sql.functions import explode, col

# One output row per element of the actions array, then a count per action.
# In the captured output the counts total 7, so the user with an empty
# actions list contributes no rows.
exploded_df = activity_df.withColumn("action", explode("actions"))
action_counts = exploded_df.groupBy(col("action")).count()
action_counts.show()

+------+-----+
|action|count|
+------+-----+
| watch|    2|
|logout|    2|
| login|    3|
+------+-----+

