#Module 1: Setup & SparkSession Initialization

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder .appName("BotCampus PySpark Practice") .master("local[*]") .getOrCreate()
sc=spark.sparkContext

In [10]:
# Create a DataFrame from:
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df=spark.createDataFrame(data,columns)
df.show()
df.printSchema()


+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [16]:
rdd=df.rdd
collected=rdd.collect()
mapped=rdd.map(lambda x:(x.name,x.age+1)).collect()
print(collected)
print(mapped)


[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
[('Anjali', 25), ('Ravi', 29), ('Kavya', 23), ('Meena', 26), ('Arjun', 31)]


# Module 2: RDDs & Transformations

In [17]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])


In [22]:
#splitting words from feedback
splitted_words=feedback.flatMap(lambda x:x.split())
splitted_words.collect()


['Ravi',
 'from',
 'Bangalore',
 'loved',
 'the',
 'delivery',
 'Meena',
 'from',
 'Hyderabad',
 'had',
 'a',
 'late',
 'order',
 'Ajay',
 'from',
 'Pune',
 'liked',
 'the',
 'service',
 'Anjali',
 'from',
 'Delhi',
 'faced',
 'UI',
 'issues',
 'Rohit',
 'from',
 'Mumbai',
 'gave',
 'positive',
 'feedback']

In [20]:
#removing stop words
stop_words = {"from", "the", "had", "a", "and", "of", "to", "in"}

remove_stop=splitted_words.filter(lambda x:x not in stop_words)
remove_stop.collect()

['Ravi',
 'Bangalore',
 'loved',
 'delivery',
 'Meena',
 'Hyderabad',
 'late',
 'order',
 'Ajay',
 'Pune',
 'liked',
 'service',
 'Anjali',
 'Delhi',
 'faced',
 'UI',
 'issues',
 'Rohit',
 'Mumbai',
 'gave',
 'positive',
 'feedback']

In [25]:
# count each word frequency using reduceKey
word_count=splitted_words.map(lambda x:(x.lower(),1)).reduceByKey(lambda x,y:x+y)
word_count.collect()

[('from', 5),
 ('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('the', 2),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('had', 1),
 ('a', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

In [30]:
#top 3 non stop words
non_stop=word_count.filter(lambda x:x[0] not in stop_words)
non_stop=non_stop.sortBy(lambda x:x[1],ascending=False).take(3)
print(non_stop)

[('loved', 1), ('liked', 1), ('service', 1)]


# Module 3: DataFrames & Transformation (With Joins)

In [31]:
students = [
("Amit", "10-A", 89),
("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]
attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]
columns2 = ["name", "days_present"]


In [37]:
sf=spark.createDataFrame(students,columns)
sf.show()
af=spark.createDataFrame(attendance,columns2)
af.show()
# Join both DataFrames on name
print("----------------")
joined_df=sf.join(af,on="name",how="left")
joined_df.show()
print("--------------")
#Create a new column: attendance_rate = days_present / 25 .
from pyspark.sql.functions import col,when
joined_df=joined_df.withColumn("attendence_rate",col('days_present')/25)
joined_df.show()
#grade students using when
print("------------------")
joined_df=joined_df.withColumn("grades",when(col('marks')>90,"A").when(col('marks')>=80,"B").otherwise("C"))
joined_df.show()
print("------------------------")
# filter students with good grades but poor attendence
filtered=joined_df.filter((col('grades').isin('A','B'))&(col('attendence_rate')<0.80))
filtered.show()

+------+-------+-----+
|  name|section|marks|
+------+-------+-----+
|  Amit|   10-A|   89|
| Kavya|   10-B|   92|
|Anjali|   10-A|   78|
| Rohit|   10-B|   85|
| Sneha|   10-C|   80|
+------+-------+-----+

+------+------------+
|  name|days_present|
+------+------------+
|  Amit|          24|
| Kavya|          22|
|Anjali|          20|
| Rohit|          25|
| Sneha|          19|
+------+------------+

----------------
+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
| Kavya|   10-B|   92|          22|
| Sneha|   10-C|   80|          19|
| Rohit|   10-B|   85|          25|
|Anjali|   10-A|   78|          20|
+------+-------+-----+------------+

--------------
+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendence_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
| Kavya|   10-B|   92|      

# Module 4: Ingest CSV & JSON, Save to Parquet

In [39]:
csv_data="""
emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000"""
with open("employee.csv",'w')as f:
  f.write(csv_data)
import json
json_data={
    "id": 201,
    "name": "Nandini",
    "contact": {
     "email": "nandi@example.com",
     "city": "Hyderabad"
     },
    "skills": ["Python", "Spark", "SQL"]
}
with open('employee.json','w')as ff:
  json.dump(json_data,ff)

In [41]:
#reading both into dataframes
csv_df=spark.read.csv("employee.csv",header=True,inferSchema=True)
json_df=spark.read.json("employee.json")
csv_df.show()
json_df.show()

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini|[Python, Spark, SQL]|
+--------------------+---+-------+--------------------+



In [45]:
#flattening json nested
from pyspark.sql.functions import explode
flat_json=json_df.select(col("id"),col("name"),col('contact.city').alias("city"),col('contact.email').alias("email"),explode(col("skills")).alias("skill"))
flat_json.show()

+---+-------+---------+-----------------+------+
| id|   name|     city|            email| skill|
+---+-------+---------+-----------------+------+
|201|Nandini|Hyderabad|nandi@example.com|Python|
|201|Nandini|Hyderabad|nandi@example.com| Spark|
|201|Nandini|Hyderabad|nandi@example.com|   SQL|
+---+-------+---------+-----------------+------+



In [48]:
#saving the files partition in city
csv_df.write.mode('overwrite').partitionBy('city').parquet("output/csv_parquet/")
flat_json.write.mode("overwrite").partitionBy("city").parquet("output/csv_parquet/")

# Module 5: Spark SQL with Temp Views

In [53]:

joined_df.createOrReplaceTempView("students_view")

spark.sql("""
    select section, avg(marks) as avg_marks
    from students_view
    group by section
""").show()

# 2. top scorer in each section
spark.sql("""
    select section, name, marks
    from (
      select *,
             row_number() over (
               partition by section
               order by marks desc
             ) as rn
      from students_view
    ) tmp
    where rn = 1
""").show()

# 3. count of students in each grade category
spark.sql("""
    select grades, count(*) as count
    from students_view
    group by grades
""").show()

# 4. students with marks above class average
spark.sql("""
    with avg_tbl as (
      select avg(marks) as class_avg
      from students_view
    )
    select s.name, s.marks
    from students_view s
    cross join avg_tbl
    where s.marks > avg_tbl.class_avg
""").show()

# 5. attendance-adjusted performance
spark.sql("""
    select
      name,
      marks * attendence_rate as adjusted_score
    from students_view
""").show()


+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-C|     80.0|
|   10-A|     83.5|
|   10-B|     88.5|
+-------+---------+

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+

+------+-----+
|grades|count|
+------+-----+
|     B|    3|
|     C|    1|
|     A|    1|
+------+-----+

+-----+-----+
| name|marks|
+-----+-----+
| Amit|   89|
|Kavya|   92|
|Rohit|   85|
+-----+-----+

+------+------------------+
|  name|    adjusted_score|
+------+------------------+
|  Amit|             85.44|
| Kavya|             80.96|
| Sneha|              60.8|
| Rohit|              85.0|
|Anjali|62.400000000000006|
+------+------------------+



# Module 6: Partitioned Data & Incremental Loading

In [None]:
#Step 1: Full Load
sf.write.partitionBy("section").parquet("output/students/")

In [60]:
# step 2 incremental load
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")


In [58]:
 #List files in output/students/ using Python.
import os
for file in os.listdir("output/students/"):
  print(file)
# Read only partition 10-A and list students.
sec_students=spark.read.parquet("output/students/section=10-A")
sec_students.show()
 #Compare before/after counts for section 10-A
df_10A = spark.read.parquet("output/students/section=10-A/")
print("Before/After Counts for 10-A:", df_10A.count())
df_10A.show()


._SUCCESS.crc
section=10-A
section=10-B
_SUCCESS
section=10-C
+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+

Before/After Counts for 10-A: 3
+------+-----+
|  name|marks|
+------+-----+
|Anjali|   78|
| Tejas|   91|
|  Amit|   89|
+------+-----+



# Module 7: ETL Pipeline – End to End


In [63]:
raw_data="""
emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,"""
with open("employee_raw.csv",'w')as f:
  f.write(raw_data)


In [75]:
raw_employee=spark.read.csv("employee_raw.csv",header=True,inferSchema=True)
raw_employee.show()
#filling bonus null with 2000
raw_employee=raw_employee.fillna(value=2000,subset=['bonus'])
raw_employee.show()
#creating total_ctc
from pyspark.sql.functions import col
raw_employee=raw_employee.withColumn("total_ctc",col('salary')+col('bonus'))
raw_employee.show()
#filter employees 65000 above ctc
filtered_employee_ctc=raw_employee.filter(col('total_ctc')>65000)
filtered_employee_ctc.show()
#saving result in json format
filtered_employee_ctc.write \
    .mode("overwrite") \
    .json("/content/filtered_ctc.json")

# write partitioned parquet (overwrite)
filtered_employee_ctc.write \
    .mode("overwrite") \
    .partitionBy("dept") \
    .parquet("/content/filtered_ctc_parquet")


+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+

+------+-----+-------+------+