In [None]:
# This notebook covers the basic steps for getting started with PySpark.

In [None]:
# Basic Tasks

In [12]:
# Task 1: Read and display data
# Build a small in-memory DataFrame and print it, so the rows used by the
# later tasks are visible right where they are created.

from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one, so
# re-running this cell does not try to start a second session.
spark = SparkSession.builder.appName('first').getOrCreate()

# Sample rows as (Name, Age) tuples; column names are supplied separately and
# the column types are inferred from the values.
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35), ("Tony", 50)]
df = spark.createDataFrame(data, ["Name", "Age"])
df.show()

In [14]:
# Task 2: Filtering data
# Keep only the rows whose Age is strictly greater than 30, then print them.
older_than_30 = df.where(df["Age"] > 30)
older_than_30.show()

+-------+---+
|   Name|Age|
+-------+---+
|Charlie| 35|
|   Tony| 50|
+-------+---+



In [19]:
# Task: Grouping and aggregation
# Count how many rows share each Age value.
# groupBy() makes no promise about the order of the result rows, so sort by
# Age to get the same printed table on every run.
df.groupBy("Age").count().orderBy("Age").show()

+---+-----+
|Age|count|
+---+-----+
| 25|    1|
| 30|    1|
| 35|    1|
| 50|    1|
+---+-----+



In [None]:
# Intermediate Tasks:

In [20]:
# Task: Joining DataFrames
# Attach each person's occupation by matching rows on the shared "Name" column.
data2 = [
    ("Alice", "Engineer"),
    ("Bob", "Doctor"),
    ("Charlie", "Teacher"),
]
df2 = spark.createDataFrame(data2, schema=["Name", "Occupation"])

# Inner join: a Name that appears in only one of the two DataFrames is left
# out of the result. Example row: Alice -> (25, Engineer)
df.join(df2, on="Name", how="inner").show()



+-------+---+----------+
|   Name|Age|Occupation|
+-------+---+----------+
|  Alice| 25|  Engineer|
|    Bob| 30|    Doctor|
|Charlie| 35|   Teacher|
+-------+---+----------+



                                                                                

In [22]:
# Task: Handling missing data -- remove every record that contains a null value
data_with_null = [
    ("Alice", 25),
    ("Bob", None),
    ("Charlie", 35),
]
df_with_null = spark.createDataFrame(data_with_null, schema=["Name", "Age"])

# With no arguments, dropna() discards a row as soon as any column is null.
df_with_null.dropna().show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|Charlie| 35|
+-------+---+



In [None]:
# Advanced Tasks:

In [25]:
# Task: User-Defined Functions (UDFs)
# Add an "UpperName" column holding the upper-cased value of the Name column.
# NOTE: Spark ships a built-in pyspark.sql.functions.upper that does the same
# thing; a Python UDF is used here only to demonstrate the mechanism.

from pyspark.sql.functions import udf


def to_upper(name):
    """Return `name` upper-cased, or None when `name` is None.

    A SQL NULL reaches a Python UDF as None; calling .upper() on it would
    raise AttributeError and fail the job, so None is passed through as-is.
    """
    if name is None:
        return None
    return name.upper()


# Declare the return type explicitly ("string" is also what udf() defaults to).
upper_udf = udf(to_upper, returnType="string")

# withColumn() returns a new DataFrame; `df` itself is left unchanged.
df.withColumn("UpperName", upper_udf(df["Name"])).show()

                                                                                

+-------+---+---------+
|   Name|Age|UpperName|
+-------+---+---------+
|  Alice| 25|    ALICE|
|    Bob| 30|      BOB|
|Charlie| 35|  CHARLIE|
|   Tony| 50|     TONY|
+-------+---+---------+



In [30]:
# Window functions: number the rows 1..N in ascending Age order
from pyspark.sql import Window
from pyspark.sql.functions import row_number

# The window is ordered but has no partitionBy(), so Spark moves all rows to a
# single partition and logs the "No Partition Defined" warning when it runs.
orderBySpec = Window.orderBy("Age")
row_num_col = row_number().over(orderBySpec)
df.withColumn("Row_num", row_num_col).show()

24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/02/01 22:39:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+---+-------+
|   Name|Age|Row_num|
+-------+---+-------+
|  Alice| 25|      1|
|    Bob| 30|      2|
|Charlie| 35|      3|
|   Tony| 50|      4|
+-------+---+-------+

