In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# Obtain the Spark session for this notebook (an already-running session is
# reused, otherwise a new one named "Day4" is started).
spark = (
    SparkSession.builder
    .appName("Day4")
    .getOrCreate()
)

In [0]:
# Employee records used throughout the notebook.
# Some city values deliberately carry leading/trailing spaces, and the last
# row has no joining date, so later cells have something to trim / null-check.
data = [
    ("Alice",   "HR",          50000, "New York",          "2022-01-15"),
    ("Bob",     "Engineering", 60000, "    San Francisco", "2021-11-20"),
    ("Charlie", "HR",          55000, "Los Angeles   ",    "2022-02-10"),
    ("David",   "Engineering", 62000, "Seattle",           "2022-03-05"),
    ("Eva",     "Finance",     70000, "Chicago  ",         "2021-09-30"),
    ("Frank",   "Finance",     75000, "Houston",           "2022-04-01"),
    ("Grace",   "Engineering", 65000, "Boston",            "2022-01-05"),
    ("Hannah",  "HR",          48000, "Miami",             "2021-12-10"),
    ("Ian",     "Finance",     68000, "Dallas",            "2022-02-20"),
    ("Jessica", "Engineering", 63000, "Atlanta",           None),
]

# Column names, in the same order as the fields of each tuple above
schema = ["name", "dept", "salary", "city", "joining_date"]

# Build the DataFrame from the rows and column names, then display it
df = spark.createDataFrame(data=data, schema=schema)

df.show()

+-------+-----------+------+-----------------+------------+
|   name|       dept|salary|             city|joining_date|
+-------+-----------+------+-----------------+------------+
|  Alice|         HR| 50000|         New York|  2022-01-15|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|
|  David|Engineering| 62000|          Seattle|  2022-03-05|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|
| Hannah|         HR| 48000|            Miami|  2021-12-10|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|
|Jessica|Engineering| 63000|          Atlanta|        null|
+-------+-----------+------+-----------------+------------+



In [0]:
# Extract the year component of each joining date (null stays null)
(
    df
    .select(F.year(F.col("joining_date")).alias("joining_year"))
    .show()
)

+------------+
|joining_year|
+------------+
|        2022|
|        2021|
|        2022|
|        2022|
|        2021|
|        2022|
|        2022|
|        2021|
|        2022|
|        null|
+------------+



In [0]:
# Inequality filtering: keep employees whose joining year is not 2021.
# F.year() produces an integer column, so it is compared with the integer
# 2021 rather than the string "2021" (which only worked via an implicit cast).
# Note: the row with a null joining_date is dropped too, because comparing
# null with a value yields null and filter() keeps only rows that are true.
df.filter(F.year(F.col("joining_date")) != 2021).show()

+-------+-----------+------+--------------+------------+
|   name|       dept|salary|          city|joining_date|
+-------+-----------+------+--------------+------------+
|  Alice|         HR| 50000|      New York|  2022-01-15|
|Charlie|         HR| 55000|Los Angeles   |  2022-02-10|
|  David|Engineering| 62000|       Seattle|  2022-03-05|
|  Frank|    Finance| 75000|       Houston|  2022-04-01|
|  Grace|Engineering| 65000|        Boston|  2022-01-05|
|    Ian|    Finance| 68000|        Dallas|  2022-02-20|
+-------+-----------+------+--------------+------------+



In [0]:
# In list: employees who joined in January or February.
# F.month() produces an integer column, so the candidate values are passed
# as integers instead of the strings "1"/"2" (which relied on implicit casts).
# A null joining_date gives a null month, so that row is not returned.
df.filter(F.month(F.col("joining_date")).isin(1, 2)).show()

+-------+-----------+------+--------------+------------+
|   name|       dept|salary|          city|joining_date|
+-------+-----------+------+--------------+------------+
|  Alice|         HR| 50000|      New York|  2022-01-15|
|Charlie|         HR| 55000|Los Angeles   |  2022-02-10|
|  Grace|Engineering| 65000|        Boston|  2022-01-05|
|    Ian|    Finance| 68000|        Dallas|  2022-02-20|
+-------+-----------+------+--------------+------------+



In [0]:
# Not in list: employees who joined in any month other than January/February.
# F.month() produces an integer column, so the candidate values are passed
# as integers instead of the strings "1"/"2" (which relied on implicit casts).
# Note: negating isin() does NOT return the row with a null joining_date;
# the membership test is null for that row, and ~null is still null.
df.filter(~F.month(F.col("joining_date")).isin(1, 2)).show()

+------+-----------+------+-----------------+------------+
|  name|       dept|salary|             city|joining_date|
+------+-----------+------+-----------------+------------+
|   Bob|Engineering| 60000|    San Francisco|  2021-11-20|
| David|Engineering| 62000|          Seattle|  2022-03-05|
|   Eva|    Finance| 70000|        Chicago  |  2021-09-30|
| Frank|    Finance| 75000|          Houston|  2022-04-01|
|Hannah|         HR| 48000|            Miami|  2021-12-10|
+------+-----------+------+-----------------+------------+



In [0]:
# Rows whose joining_date is missing
(
    df
    .filter(df["joining_date"].isNull())
    .show()
)

+-------+-----------+------+-------+------------+
|   name|       dept|salary|   city|joining_date|
+-------+-----------+------+-------+------------+
|Jessica|Engineering| 63000|Atlanta|        null|
+-------+-----------+------+-------+------------+



In [0]:
# Rows that do have a joining_date
(
    df
    .filter(df["joining_date"].isNotNull())
    .show()
)

+-------+-----------+------+-----------------+------------+
|   name|       dept|salary|             city|joining_date|
+-------+-----------+------+-----------------+------------+
|  Alice|         HR| 50000|         New York|  2022-01-15|
|    Bob|Engineering| 60000|    San Francisco|  2021-11-20|
|Charlie|         HR| 55000|   Los Angeles   |  2022-02-10|
|  David|Engineering| 62000|          Seattle|  2022-03-05|
|    Eva|    Finance| 70000|        Chicago  |  2021-09-30|
|  Frank|    Finance| 75000|          Houston|  2022-04-01|
|  Grace|Engineering| 65000|           Boston|  2022-01-05|
| Hannah|         HR| 48000|            Miami|  2021-12-10|
|    Ian|    Finance| 68000|           Dallas|  2022-02-20|
+-------+-----------+------+-----------------+------------+



In [0]:
# Convert city names to upper case (surrounding spaces are left untouched)
(
    df
    .select(F.upper("city"))
    .show()
)

+-----------------+
|      upper(city)|
+-----------------+
|         NEW YORK|
|    SAN FRANCISCO|
|   LOS ANGELES   |
|          SEATTLE|
|        CHICAGO  |
|          HOUSTON|
|           BOSTON|
|            MIAMI|
|           DALLAS|
|          ATLANTA|
+-----------------+



In [0]:
# Convert city names to lower case (surrounding spaces are left untouched)
(
    df
    .select(F.lower("city"))
    .show()
)

+-----------------+
|      lower(city)|
+-----------------+
|         new york|
|    san francisco|
|   los angeles   |
|          seattle|
|        chicago  |
|          houston|
|           boston|
|            miami|
|           dallas|
|          atlanta|
+-----------------+



In [0]:
# Character count of each city value; padding spaces are included in the count
(
    df
    .select(F.length("city"))
    .show()
)

+------------+
|length(city)|
+------------+
|           8|
|          17|
|          14|
|           7|
|           9|
|           7|
|           6|
|           5|
|           6|
|           7|
+------------+



In [0]:
# trim: strip spaces from both ends of the city value
(
    df
    .select(F.trim("city"))
    .show()
)

+-------------+
|   trim(city)|
+-------------+
|     New York|
|San Francisco|
|  Los Angeles|
|      Seattle|
|      Chicago|
|      Houston|
|       Boston|
|        Miami|
|       Dallas|
|      Atlanta|
+-------------+



In [0]:
# ltrim: strip leading spaces only; trailing spaces are kept
(
    df
    .select(F.ltrim("city"))
    .show()
)

+--------------+
|   ltrim(city)|
+--------------+
|      New York|
| San Francisco|
|Los Angeles   |
|       Seattle|
|     Chicago  |
|       Houston|
|        Boston|
|         Miami|
|        Dallas|
|       Atlanta|
+--------------+



In [0]:
# rtrim: strip trailing spaces only; leading spaces are kept
(
    df
    .select(F.rtrim("city"))
    .show()
)

+-----------------+
|      rtrim(city)|
+-----------------+
|         New York|
|    San Francisco|
|      Los Angeles|
|          Seattle|
|          Chicago|
|          Houston|
|           Boston|
|            Miami|
|           Dallas|
|          Atlanta|
+-----------------+



In [0]:
# String replace: swap every space in city for an underscore.
# Padding spaces are replaced as well, since the value is not trimmed first.
(
    df
    .select(F.regexp_replace("city", " ", "_").alias("replace"))
    .show()
)

+-----------------+
|          replace|
+-----------------+
|         New_York|
|____San_Francisco|
|   Los_Angeles___|
|          Seattle|
|        Chicago__|
|          Houston|
|           Boston|
|            Miami|
|           Dallas|
|          Atlanta|
+-----------------+



In [0]:
# coalesce: fall back to today's date when joining_date is missing.
# joining_date is held as a string, so it is parsed with to_date() first;
# both coalesce() arguments are then dates and the result is a date column,
# instead of depending on implicit string/date coercion.
# Note: current_date() makes the last row depend on the day the cell is run.
df.select(
    F.coalesce(F.to_date("joining_date"), F.current_date()).alias("default_date")
).show()

+------------+
|default_date|
+------------+
|  2022-01-15|
|  2021-11-20|
|  2022-02-10|
|  2022-03-05|
|  2021-09-30|
|  2022-04-01|
|  2022-01-05|
|  2021-12-10|
|  2022-02-20|
|  2024-04-08|
+------------+



In [0]:
# Date diff: days between today and each joining date.
# A missing joining date is replaced by today, so its difference shows as 0.
# The values depend on the day the cell is run, because of current_date().
(
    df
    .select(
        F.datediff(
            F.current_date(),
            F.coalesce("joining_date", F.current_date()),
        ).alias("date_diff")
    )
    .show()
)

+---------+
|date_diff|
+---------+
|      814|
|      870|
|      788|
|      765|
|      921|
|      738|
|      824|
|      850|
|      778|
|        0|
+---------+



In [0]:
# Shift each joining date forward by two months, shown next to the original
(
    df
    .select(
        F.add_months("joining_date", 2).alias("add_months"),
        "joining_date",
    )
    .show()
)

+----------+------------+
|add_months|joining_date|
+----------+------------+
|2022-03-15|  2022-01-15|
|2022-01-20|  2021-11-20|
|2022-04-10|  2022-02-10|
|2022-05-05|  2022-03-05|
|2021-11-30|  2021-09-30|
|2022-06-01|  2022-04-01|
|2022-03-05|  2022-01-05|
|2022-02-10|  2021-12-10|
|2022-04-20|  2022-02-20|
|      null|        null|
+----------+------------+

