In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Obtain the Spark session for this notebook (an already-running session is
# reused; otherwise a new one named "Day6" is started).
spark = (
    SparkSession.builder
    .appName("Day6")
    .getOrCreate()
)

In [0]:
# Employee records as (name, dept, salary, city, joining_date) tuples.
# Some city strings carry leading or trailing spaces, joining_date is kept as
# a plain "YYYY-MM-DD" string, and the last record has no joining date (None).
data = [
    ("Alice", "HR", 50000, "New York", "2022-01-15"),
    ("Bob", "Engineering", 60000, "    San Francisco", "2021-11-20"),
    ("Charlie", "HR", 55000, "Los Angeles   ", "2022-02-10"),
    ("David", "Engineering", 62000, "Seattle", "2022-03-05"),
    ("Eva", "Finance", 70000, "Chicago  ", "2021-09-30"),
    ("Frank", "Finance", 75000, "Houston", "2022-04-01"),
    ("Grace", "Engineering", 65000, "Boston", "2022-01-05"),
    ("Hannah", "HR", 48000, "Miami", "2021-12-10"),
    ("Ian", "Finance", 68000, "Dallas", "2022-02-20"),
    ("Jessica", "Engineering", 63000, "Atlanta", None),
]

# Only column names are supplied here, in the same order as the tuple fields.
schema = ["name", "dept", "salary", "city", "joining_date"]

# Build the employee DataFrame and print its contents.
df = spark.createDataFrame(data, schema=schema)
df.show()



# Employment status records as (name, emp_status) tuples, used as the second
# side of the join examples. "Jess" has no counterpart in the employee data,
# and several employees have no status row.
j_data = [
    ("Alice", "Active"),
    ("Bob", "Inactive"),
    ("Charlie", "Inactive"),
    ("David", "Active"),
    ("Eva", "Active"),
    ("Frank", "Active"),
    ("Jess", "Inactive"),
]

# Only column names are supplied here, in the same order as the tuple fields.
j_schema = ["name", "emp_status"]

# Build the status DataFrame and print its contents.
j_df = spark.createDataFrame(j_data, schema=j_schema)
j_df.show()

+-------+-----------+------+-----------------+------------+
|   name|       dept|salary|             city|joining_date|
+-------+-----------+------+-----------------+------------+
|  Alice|         HR| 50000|         New York|  2022-01-15|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|
|  David|Engineering| 62000|          Seattle|  2022-03-05|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|
| Hannah|         HR| 48000|            Miami|  2021-12-10|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|
|Jessica|Engineering| 63000|          Atlanta|        null|
+-------+-----------+------+-----------------+------------+

+-------+----------+
|   name|emp_status|
+-------+----------+
|  Alice|    Active|
|    Bob|  Inactive|
|Charlie|  Inactive|
|  David|    Active|
|    Eva|   

In [0]:
# Left join on "name": every row of df is kept; emp_status is null for
# employees that have no row in j_df.
df.join(j_df, on="name", how="left").show()

+-------+-----------+------+-----------------+------------+----------+
|   name|       dept|salary|             city|joining_date|emp_status|
+-------+-----------+------+-----------------+------------+----------+
|  Alice|         HR| 50000|         New York|  2022-01-15|    Active|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|  Inactive|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|  Inactive|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|    Active|
|  David|Engineering| 62000|          Seattle|  2022-03-05|    Active|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|    Active|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|      null|
| Hannah|         HR| 48000|            Miami|  2021-12-10|      null|
|Jessica|Engineering| 63000|          Atlanta|        null|      null|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|      null|
+-------+-----------+------+-----------------+------------+----------+



In [0]:
# Right join on "name": every row of j_df is kept; the employee columns are
# null for names that do not appear in df.
df.join(j_df, on="name", how="right").show()

+-------+-----------+------+-----------------+------------+----------+
|   name|       dept|salary|             city|joining_date|emp_status|
+-------+-----------+------+-----------------+------------+----------+
|  Alice|         HR| 50000|         New York|  2022-01-15|    Active|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|  Inactive|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|  Inactive|
|  David|Engineering| 62000|          Seattle|  2022-03-05|    Active|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|    Active|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|    Active|
|   Jess|       null|  null|             null|        null|  Inactive|
+-------+-----------+------+-----------------+------------+----------+



In [0]:
# Full outer join on "name": rows from both df and j_df are kept, with nulls
# filling whichever side has no match.
df.join(j_df, on="name", how="outer").show()

+-------+-----------+------+-----------------+------------+----------+
|   name|       dept|salary|             city|joining_date|emp_status|
+-------+-----------+------+-----------------+------------+----------+
|  Alice|         HR| 50000|         New York|  2022-01-15|    Active|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|  Inactive|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|  Inactive|
|  David|Engineering| 62000|          Seattle|  2022-03-05|    Active|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|    Active|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|    Active|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|      null|
| Hannah|         HR| 48000|            Miami|  2021-12-10|      null|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|      null|
|   Jess|       null|  null|             null|        null|  Inactive|
|Jessica|Engineering| 63000|          Atlanta|        null|      null|
+-----

In [0]:
# Equivalent of SQL GROUP BY ... HAVING: count the rows per department, then
# keep only the departments whose count is below 4.
(
    df.groupBy("dept")
    .agg(F.count("*").alias("cnt"))
    .where(F.col("cnt") < 4)
    .show()
)

+-------+---+
|   dept|cnt|
+-------+---+
|     HR|  3|
|Finance|  3|
+-------+---+



In [0]:
# Derive new_sal as salary * 0.1234, then display only that value rounded to
# two decimal places.
(
    df.withColumn("new_sal", F.col("salary") * 0.1234)
    .select(F.round("new_sal", 2))
    .show()
)

+-----------------+
|round(new_sal, 2)|
+-----------------+
|           6170.0|
|           7404.0|
|           6787.0|
|           7650.8|
|           8638.0|
|           9255.0|
|           8021.0|
|           5923.2|
|           8391.2|
|           7774.2|
+-----------------+



In [0]:
# Today's date. Because it is selected from df, the same value is produced
# once for every row of df.
today = F.current_date().alias("current_date")
df.select(today).show()

+------------+
|current_date|
+------------+
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
|  2024-04-08|
+------------+



In [0]:
# Date addition: the day 10 days after each joining_date (null stays null).
df.select(
    F.date_add(F.col("joining_date"), 10).alias("after_10_day")
).show()

+------------+
|after_10_day|
+------------+
|  2022-01-25|
|  2021-11-30|
|  2022-02-20|
|  2022-03-15|
|  2021-10-10|
|  2022-04-11|
|  2022-01-15|
|  2021-12-20|
|  2022-03-02|
|        null|
+------------+



In [0]:
# Date subtraction: the day 10 days before each joining_date (null stays null).
df.select(
    F.date_sub(F.col("joining_date"), 10).alias("prev_10_day")
).show()

+-----------+
|prev_10_day|
+-----------+
| 2022-01-05|
| 2021-11-10|
| 2022-01-31|
| 2022-02-23|
| 2021-09-20|
| 2022-03-22|
| 2021-12-26|
| 2021-11-30|
| 2022-02-10|
|       null|
+-----------+



In [0]:
# Extract the year component of joining_date.
df.select(
    F.year(F.col("joining_date")).alias("year")
).show()

+----+
|year|
+----+
|2022|
|2021|
|2022|
|2022|
|2021|
|2022|
|2022|
|2021|
|2022|
|null|
+----+



In [0]:
# Extract the month number (1-12) of joining_date.
df.select(
    F.month(F.col("joining_date")).alias("month")
).show()

+-----+
|month|
+-----+
|    1|
|   11|
|    2|
|    3|
|    9|
|    4|
|    1|
|   12|
|    2|
| null|
+-----+



In [0]:
# Extract the day-of-month of joining_date.
df.select(
    F.dayofmonth(F.col("joining_date")).alias("day")
).show()

+----+
| day|
+----+
|  15|
|  20|
|  10|
|   5|
|  30|
|   1|
|   5|
|  10|
|  20|
|null|
+----+



In [0]:
# Sort employees from highest to lowest salary.
df.orderBy(F.desc("salary")).show()

+-------+-----------+------+-----------------+------------+
|   name|       dept|salary|             city|joining_date|
+-------+-----------+------+-----------------+------------+
|  Frank|    Finance| 75000|          Houston|  2022-04-01|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|
|Jessica|Engineering| 63000|          Atlanta|        null|
|  David|Engineering| 62000|          Seattle|  2022-03-05|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|
|  Alice|         HR| 50000|         New York|  2022-01-15|
| Hannah|         HR| 48000|            Miami|  2021-12-10|
+-------+-----------+------+-----------------+------------+

