#Module 1: Setup & SparkSession Initialization
##Tasks:
##Install and configure PySpark in your local system or Colab.
##Initialize Spark with:

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("BotCampus PySpark Practice") \
    .master("local[*]") \
    .getOrCreate()

##Create a DataFrame from:

In [5]:
data = [
    ("Anjali", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
    ("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)

## Show schema, explain data types, and convert to RDD.

In [6]:
# Show schema
df.printSchema()
df.show()

# Convert to RDD
rdd = df.rdd

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



## Print .collect() and df.rdd.map() output.

In [7]:
print("RDD .collect():", rdd.collect())
print("RDD .map():", rdd.map(lambda x: (x.name, x.age)).collect())

RDD .collect(): [Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
RDD .map(): [('Anjali', 24), ('Ravi', 28), ('Kavya', 22), ('Meena', 25), ('Arjun', 30)]


# Module 2: RDDs & Transformations
##Scenario: You received app feedback from users in free-text

In [8]:
feedback = spark.sparkContext.parallelize([
    "Ravi from Bangalore loved the delivery",
    "Meena from Hyderabad had a late order",
    "Ajay from Pune liked the service",
    "Anjali from Delhi faced UI issues",
    "Rohit from Mumbai gave positive feedback"
])

# Tasks:
##Split each line into words (flatMap ).

In [12]:
words_rdd = feedback.flatMap(lambda line: line.split())

## Remove stop words (from , the , etc.).

In [14]:
# Stop words list
stop_words = {"from", "the", "a", "had", "with", "and", "an", "of", "to"}
# Clean & transform
words = feedback.flatMap(lambda line: line.lower().split()) \
.filter(lambda word: word not in stop_words)

## Count each word frequency using reduceByKey

In [15]:
word_count = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

##Find top 3 most frequent non-stop words.

In [16]:
top_3 = word_count.takeOrdered(3, key=lambda x: -x[1])
print("Top 3 frequent words:", top_3)

Top 3 frequent words: [('loved', 1), ('liked', 1), ('service', 1)]


#Module 3: DataFrames & Transformation (With Joins)
##DataFrames

In [27]:
from pyspark.sql.functions import col, when
students = [
    ("Amit", "10-A", 89),
    ("Kavya", "10-B", 92),
    ("Anjali", "10-A", 78),
    ("Rohit", "10-B", 85),
    ("Sneha", "10-C", 80)
]
attendance = [
    ("Amit", 24),
    ("Kavya", 22),
    ("Anjali", 20),
    ("Rohit", 25),
    ("Sneha", 19)
]
df_students = spark.createDataFrame(students, ["name", "section", "marks"])
df_attendance = spark.createDataFrame(attendance, ["name", "days_present"])

#Tasks:
##Join both DataFrames on name .

In [21]:
df_joined = df_students.join(df_attendance, on="name").show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



## Create a new column: attendance_rate = days_present / 25 .

In [19]:
df_joined = df_joined.withColumn("attendance_rate", col("days_present") / 25)

## Grade students using
when :
 A: >90, B: 80–90, C: <80.

In [37]:
from pyspark.sql.functions import col, when
df_joined = df_students.join(df_attendance, on="name", how="inner")
df_joined = df_joined.withColumn("grade",when(col("marks") > 90, "A").when(col("marks") >= 80, "B").otherwise("C"))
df_joined.show()

+------+-------+-----+------------+-----+
|  name|section|marks|days_present|grade|
+------+-------+-----+------------+-----+
|  Amit|   10-A|   89|          24|    B|
|Anjali|   10-A|   78|          20|    C|
| Kavya|   10-B|   92|          22|    A|
| Rohit|   10-B|   85|          25|    B|
| Sneha|   10-C|   80|          19|    B|
+------+-------+-----+------------+-----+



## Filter students with good grades but poor attendance (<80%).

In [38]:
# Add attendance_rate column
df_joined = df_joined.withColumn("attendance_rate", col("days_present") / 30)

# Filter based on grade and attendance_rate
df_filtered = df_joined.filter((col("grade").isin("A", "B")) & (col("attendance_rate") < 0.8))
df_filtered.show()


+-----+-------+-----+------------+-----+------------------+
| name|section|marks|days_present|grade|   attendance_rate|
+-----+-------+-----+------------+-----+------------------+
|Kavya|   10-B|   92|          22|    A|0.7333333333333333|
|Sneha|   10-C|   80|          19|    B|0.6333333333333333|
+-----+-------+-----+------------+-----+------------------+



# Module 4: Ingest CSV & JSON, Save to Parquet
## Tasks:


##Read both formats into DataFrames.


In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
spark = SparkSession.builder.appName("SingleNestedJson").getOrCreate()
data = [{
    "id": 201,
    "name": "Nandini",
    "contact": {
        "email": "nandi@example.com",
        "city": "Hyderabad"
    },
    "skills": ["Python", "Spark", "SQL"]
}]

rdd = spark.sparkContext.parallelize(data)
df_json = spark.read.json(rdd)
df_json.printSchema()


root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



##Flatten nested JSON using select , col , alias , explode .

In [56]:
df_flat = df_json.select(
    col("id"),
    col("name"),
    col("contact.email").alias("email"),
    col("contact.city").alias("city"),
    explode(col("skills")).alias("skill")
)

##Save both as Parquet files partitioned by city.

In [57]:
df_csv.write.mode("overwrite").partitionBy("city").parquet("/tmp/csv_output")
df_flat.write.mode("overwrite").partitionBy("city").parquet("/tmp/json_output")

In [58]:
!zip -r /content/csv_output.zip /tmp/csv_output
!zip -r /content/json_output.zip /tmp/json_output

  adding: tmp/csv_output/ (stored 0%)
  adding: tmp/csv_output/._SUCCESS.crc (stored 0%)
  adding: tmp/csv_output/city=Mumbai/ (stored 0%)
  adding: tmp/csv_output/city=Mumbai/.part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/csv_output/city=Mumbai/part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.parquet (deflated 47%)
  adding: tmp/csv_output/city=Bangalore/ (stored 0%)
  adding: tmp/csv_output/city=Bangalore/.part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/csv_output/city=Bangalore/part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.parquet (deflated 48%)
  adding: tmp/csv_output/_SUCCESS (stored 0%)
  adding: tmp/csv_output/city=Chennai/ (stored 0%)
  adding: tmp/csv_output/city=Chennai/.part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/csv_output/city=Chennai/part-00000-78f9ca8a-b660-415f-9b43-e0144575585a.c000.snappy.

In [59]:
from google.colab import files
files.download("/content/csv_output.zip")
files.download("/content/json_output.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Module 5: Spark SQL with Temp Views
 Tasks:
 Register the students DataFrame as students_view .
 Write and run the following queries:

In [60]:
# Register view
df_students.createOrReplaceTempView("students_view")

# a) Average marks per section
spark.sql("""select section, AVG(marks) as avg_marks
from students_view
group by section""").show()

# b) Top scorer per section
spark.sql("""select section, name, marks from(
select *, rank() over(partition by section order by marks desc) as rnk
from students_view
)where rnk = 1""").show()

# c) Count by grade
df_students = df_students.withColumn("grade", when(col("marks") > 90, "A").when((col("marks") >= 80), "B").otherwise("C"))
df_students.createOrReplaceTempView("graded_students")

spark.sql("select grade, count(*) as count from graded_students group by grade").show()

# d) Above class average
spark.sql("""
select * from students_view
where marks > (select avg(marks) from students_view)
""").show()


+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+

+-----+-----+
|grade|count|
+-----+-----+
|    B|    3|
|    A|    1|
|    C|    1|
+-----+-----+

+-----+-------+-----+
| name|section|marks|
+-----+-------+-----+
| Amit|   10-A|   89|
|Kavya|   10-B|   92|
|Rohit|   10-B|   85|
+-----+-------+-----+



#Module 6: Partitioned Data & Incremental Loading
##Step 1: Full Load

In [61]:
df_students.write.mode("overwrite").partitionBy("section").parquet("output/students/")

## Step 2: Incremental Load

In [63]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

#Tasks:
##List files in output/students/ using Python.

In [64]:
import os
print("Files in 10-A partition:")
print(os.listdir("output/students/section=10-A"))

Files in 10-A partition:
['.part-00001-6ff44c48-396d-458d-a041-8c0e9f36ac8b.c000.snappy.parquet.crc', '.part-00001-15ca460b-716b-41a1-9dbe-26fcfcecca6c.c000.snappy.parquet.crc', '.part-00000-e400fb43-5442-4a5a-af18-e4808e060233.c000.snappy.parquet.crc', 'part-00001-e400fb43-5442-4a5a-af18-e4808e060233.c000.snappy.parquet', 'part-00001-15ca460b-716b-41a1-9dbe-26fcfcecca6c.c000.snappy.parquet', '.part-00001-e400fb43-5442-4a5a-af18-e4808e060233.c000.snappy.parquet.crc', 'part-00000-e400fb43-5442-4a5a-af18-e4808e060233.c000.snappy.parquet', 'part-00001-6ff44c48-396d-458d-a041-8c0e9f36ac8b.c000.snappy.parquet']


## Read only partition 10-A and list students.
## Compare before/after counts for section 10-A .

In [65]:
df_10a = spark.read.parquet("output/students/section=10-A")
df_10a.show()
print("Total in 10-A:", df_10a.count())

+------+-----+-----+
|  name|marks|grade|
+------+-----+-----+
|Anjali|   78|    C|
|  Amit|   89|    B|
| Tejas|   91| NULL|
| Tejas|   91| NULL|
+------+-----+-----+

Total in 10-A: 4


#Module 7: ETL Pipeline – End to End

In [67]:
from google.colab import files
uploaded = files.upload()

Saving raw_emp.csv to raw_emp.csv


In [81]:
from pyspark.sql.functions import col, when

# Fix column name by trimming whitespace
df_trimmed = df_raw.withColumnRenamed("bonus ", "bonus")

# Clean: Fill missing bonus
df_filled = df_trimmed.withColumn("bonus", when(col("bonus").isNull(), 2000).otherwise(col("bonus")))

# CTC
df_ctc = df_filled.withColumn("total_ctc", col("salary") + col("bonus"))

# Filter
df_filtered = df_ctc.filter(col("total_ctc") > 65000)

# Save as JSON and Parquet
df_filtered.write.mode("overwrite").json("/tmp/final_json_output")
df_filtered.write.mode("overwrite").partitionBy("dept").parquet("/tmp/final_parquet_output")

In [82]:
!zip -r /tmp/final_json_output.zip /tmp/final_json_output
!zip -r /tmp/final_parquet_output.zip /tmp/final_parquet_output

  adding: tmp/final_json_output/ (stored 0%)
  adding: tmp/final_json_output/._SUCCESS.crc (stored 0%)
  adding: tmp/final_json_output/part-00000-d1a04de6-bd75-4029-8ea7-44ec8009d9a5-c000.json (deflated 36%)
  adding: tmp/final_json_output/_SUCCESS (stored 0%)
  adding: tmp/final_json_output/.part-00000-d1a04de6-bd75-4029-8ea7-44ec8009d9a5-c000.json.crc (stored 0%)
  adding: tmp/final_parquet_output/ (stored 0%)
  adding: tmp/final_parquet_output/._SUCCESS.crc (stored 0%)
  adding: tmp/final_parquet_output/dept=IT/ (stored 0%)
  adding: tmp/final_parquet_output/dept=IT/.part-00000-07b29636-c8f3-42d3-b409-c1939b4414ea.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/final_parquet_output/dept=IT/part-00000-07b29636-c8f3-42d3-b409-c1939b4414ea.c000.snappy.parquet (deflated 52%)
  adding: tmp/final_parquet_output/_SUCCESS (stored 0%)
  adding: tmp/final_parquet_output/dept=Finance/ (stored 0%)
  adding: tmp/final_parquet_output/dept=Finance/.part-00000-07b29636-c8f3-42d3-b409-c1939b4414ea

In [83]:
from google.colab import files
files.download("/tmp/final_json_output.zip")
files.download("/tmp/final_parquet_output.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>