<a href="https://colab.research.google.com/github/alinapradhan/All-Spark-SQL-functions/blob/main/SPARKSQLAllFunctionsPyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark



In [2]:
## create sample data
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
from datetime import datetime


In [3]:
# intialise spark session
spark = SparkSession.builder.appName("Sample Data").getOrCreate()

In [4]:
spark

In [5]:
# sample data
data = [
    (1, "John Doe", "2024-08-01", 23, 1000.50),
    (2, "Jane Smith", "2024-08-02", 25, 1500.75),
    (3, "Bob Johnson", "2024-08-03", 18,3000.10),
    (4, "Alice Brown", "2024-08-04", 22, 2500.25),
    (5, "David Lee", "2024-08-05", 27, 3500.50),
    (6,"Ayush Pradhan","2024-08-06",31,2500.45),
]

In [6]:
# Create Dataframe
columns = ["id", "name", "dob", "age", "salary"]
df = spark.createDataFrame(data, columns)

In [7]:
# Spark the intial Dataframe
df.show()

+---+-------------+----------+---+-------+
| id|         name|       dob|age| salary|
+---+-------------+----------+---+-------+
|  1|     John Doe|2024-08-01| 23| 1000.5|
|  2|   Jane Smith|2024-08-02| 25|1500.75|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|
|  4|  Alice Brown|2024-08-04| 22|2500.25|
|  5|    David Lee|2024-08-05| 27| 3500.5|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|
+---+-------------+----------+---+-------+



In [8]:
# 1 .col  #Selects the "name" column.
from pyspark.sql.functions import col
df.select(col("name")).show()


+-------------+
|         name|
+-------------+
|     John Doe|
|   Jane Smith|
|  Bob Johnson|
|  Alice Brown|
|    David Lee|
|Ayush Pradhan|
+-------------+



In [9]:
# 2. lit #Adds a new column with a literal valudf_country = df.withColumn("country", lit("USA"))
df_country = df.withColumn("country", lit("USA"))
df_country.show()

+---+-------------+----------+---+-------+-------+
| id|         name|       dob|age| salary|country|
+---+-------------+----------+---+-------+-------+
|  1|     John Doe|2024-08-01| 23| 1000.5|    USA|
|  2|   Jane Smith|2024-08-02| 25|1500.75|    USA|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|    USA|
|  4|  Alice Brown|2024-08-04| 22|2500.25|    USA|
|  5|    David Lee|2024-08-05| 27| 3500.5|    USA|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|    USA|
+---+-------------+----------+---+-------+-------+



In [10]:
df.show()

+---+-------------+----------+---+-------+
| id|         name|       dob|age| salary|
+---+-------------+----------+---+-------+
|  1|     John Doe|2024-08-01| 23| 1000.5|
|  2|   Jane Smith|2024-08-02| 25|1500.75|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|
|  4|  Alice Brown|2024-08-04| 22|2500.25|
|  5|    David Lee|2024-08-05| 27| 3500.5|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|
+---+-------------+----------+---+-------+



In [11]:
# 3. adds 5 to the "age" column
from pyspark.sql.functions import expr
df_age = df.withColumn("age_plus_5", expr("age + 5"))
df_age.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|age_plus_5|
+---+-------------+----------+---+-------+----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        28|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        30|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        23|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        27|
|  5|    David Lee|2024-08-05| 27| 3500.5|        32|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        36|
+---+-------------+----------+---+-------+----------+



In [12]:
# 4. when
#Classifies people as "Adult" or "Minor".
#from pyspark.sql.functions import when
#df.withColumn("status", when(col("age") > 18, "Adult").otherwise("Minor")).show()
from pyspark.sql.functions import when
df_classification = df.withColumn("classification", when(col("age") >= 18, "Adult").otherwise("Minor"))
df_classification.show()


+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|classification|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|         Adult|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         Adult|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         Adult|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         Adult|
|  5|    David Lee|2024-08-05| 27| 3500.5|         Adult|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         Adult|
+---+-------------+----------+---+-------+--------------+



In [13]:
#5 .concat
#Concatenates first and last names with a space.
from pyspark.sql.functions import concat
df_concat = df.withColumn("full_name", concat(col("name"), lit(" ")))
df_concat.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|     full_name|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     John Doe |
|  2|   Jane Smith|2024-08-02| 25|1500.75|   Jane Smith |
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  Bob Johnson |
|  4|  Alice Brown|2024-08-04| 22|2500.25|  Alice Brown |
|  5|    David Lee|2024-08-05| 27| 3500.5|    David Lee |
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|Ayush Pradhan |
+---+-------------+----------+---+-------+--------------+



In [14]:
# 6 . substring
#Extracts the first three characters from the "name" column.
from pyspark.sql.functions import substring
df_substring = df.withColumn("first_three_chars", substring(col("name"), 1, 3))
df_substring.show()

+---+-------------+----------+---+-------+-----------------+
| id|         name|       dob|age| salary|first_three_chars|
+---+-------------+----------+---+-------+-----------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|              Joh|
|  2|   Jane Smith|2024-08-02| 25|1500.75|              Jan|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|              Bob|
|  4|  Alice Brown|2024-08-04| 22|2500.25|              Ali|
|  5|    David Lee|2024-08-05| 27| 3500.5|              Dav|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|              Ayu|
+---+-------------+----------+---+-------+-----------------+



In [15]:
# 7 . split
## splits the name column into AN ARRAY of words
from pyspark.sql.functions import split
df_split = df.withColumn("name_array", split(col("name"), " "))
df_split.show()

+---+-------------+----------+---+-------+----------------+
| id|         name|       dob|age| salary|      name_array|
+---+-------------+----------+---+-------+----------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     [John, Doe]|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   [Jane, Smith]|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  [Bob, Johnson]|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  [Alice, Brown]|
|  5|    David Lee|2024-08-05| 27| 3500.5|    [David, Lee]|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|[Ayush, Pradhan]|
+---+-------------+----------+---+-------+----------------+



In [16]:
# 8 . regexp_replace
# replaces "john" with "Jon" in the "name" column
from pyspark.sql.functions import regexp_replace
df_replace = df.withColumn("replaced_name", regexp_replace(col("name"), "John", "Jon"))
df_replace.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|replaced_name|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|      Jon Doe|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   Jane Smith|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|   Bob Jonson|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  Alice Brown|
|  5|    David Lee|2024-08-05| 27| 3500.5|    David Lee|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|Ayush Pradhan|
+---+-------------+----------+---+-------+-------------+



In [17]:
# 9. count
# Counts the number of records in the DataFrame
from pyspark.sql.functions import count
df_count = df.select(count("*").alias("record_count"))
df_count.show()

+------------+
|record_count|
+------------+
|           6|
+------------+



In [18]:
# 10. sum
#Calculates the total salary.
from pyspark.sql.functions import sum
df_sum = df.agg(sum("salary").alias("total_salary"))
df_sum.show()

+------------------+
|      total_salary|
+------------------+
|14002.550000000001|
+------------------+



In [19]:
# 11 .avg
#Calculates the average age.
from pyspark.sql.functions import avg
df_avg = df.agg(avg("age").alias("average_age"))
df_avg.show()

+------------------+
|       average_age|
+------------------+
|24.333333333333332|
+------------------+



In [20]:
# 12 max
# finds the maximum salary
from pyspark.sql.functions import max
df_max = df.agg(max("salary").alias("max_salary"))
df_max.show()

+----------+
|max_salary|
+----------+
|    3500.5|
+----------+



In [21]:
# 13 min
# finds the minimum age
from pyspark.sql.functions import min
df_min = df.agg(min("age").alias("min_age"))
df_min.show()

+-------+
|min_age|
+-------+
|     18|
+-------+



In [22]:
# 14 round
# rounds the salary to the nearest integer
from pyspark.sql.functions import round
df_round = df.withColumn("rounded_salary", round(col("salary"), 0))
df_round.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|rounded_salary|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        1001.0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        1501.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        3000.0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        2500.0|
|  5|    David Lee|2024-08-05| 27| 3500.5|        3501.0|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        2500.0|
+---+-------------+----------+---+-------+--------------+



In [23]:
# 15. data_format
# Formats the "dob ( date of birth )" columns as MM/dd/YY
from pyspark.sql.functions import date_format
df_format = df.withColumn("formatted_dob", date_format(col("dob"), "mm/dd/yyyy"))
df_format.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|formatted_dob|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|   00/01/2024|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   00/02/2024|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|   00/03/2024|
|  4|  Alice Brown|2024-08-04| 22|2500.25|   00/04/2024|
|  5|    David Lee|2024-08-05| 27| 3500.5|   00/05/2024|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|   00/06/2024|
+---+-------------+----------+---+-------+-------------+



In [24]:
# 16 current date
# adds the current date to the DatAfrAME
from pyspark.sql.functions import current_date
df_current_date = df.withColumn("current_date", current_date())
df_current_date.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|current_date|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|  2025-08-09|
|  2|   Jane Smith|2024-08-02| 25|1500.75|  2025-08-09|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  2025-08-09|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  2025-08-09|
|  5|    David Lee|2024-08-05| 27| 3500.5|  2025-08-09|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|  2025-08-09|
+---+-------------+----------+---+-------+------------+



In [25]:
# 17 CURRENT TIMESTAMP
#adds the current timestamp to the DataFrame
from pyspark.sql.functions import current_timestamp
df_current_timestamp = df.withColumn("current_timestamp", current_timestamp())
df_current_timestamp.show()

+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|   current_timestamp|
+---+-------------+----------+---+-------+--------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|2025-08-09 07:53:...|
|  2|   Jane Smith|2024-08-02| 25|1500.75|2025-08-09 07:53:...|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|2025-08-09 07:53:...|
|  4|  Alice Brown|2024-08-04| 22|2500.25|2025-08-09 07:53:...|
|  5|    David Lee|2024-08-05| 27| 3500.5|2025-08-09 07:53:...|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|2025-08-09 07:53:...|
+---+-------------+----------+---+-------+--------------------+



In [26]:
# 18 year,month,dayofmonth
# Extracts the year,month, and the day from the "dob" column
from pyspark.sql.functions import year, month, dayofmonth

In [27]:
df_year = df.withColumn("year", year(col("dob")))
df_year.show()

+---+-------------+----------+---+-------+----+
| id|         name|       dob|age| salary|year|
+---+-------------+----------+---+-------+----+
|  1|     John Doe|2024-08-01| 23| 1000.5|2024|
|  2|   Jane Smith|2024-08-02| 25|1500.75|2024|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|2024|
|  4|  Alice Brown|2024-08-04| 22|2500.25|2024|
|  5|    David Lee|2024-08-05| 27| 3500.5|2024|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|2024|
+---+-------------+----------+---+-------+----+



In [28]:
#19 date_add
#adds 10 days top the "dob" columns
from pyspark.sql.functions import date_add
df_date_add = df.withColumn("date_after_10_days", date_add(col("dob"), 10))
df_date_add.show()

+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|date_after_10_days|
+---+-------------+----------+---+-------+------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        2024-08-11|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        2024-08-12|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        2024-08-13|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        2024-08-14|
|  5|    David Lee|2024-08-05| 27| 3500.5|        2024-08-15|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        2024-08-16|
+---+-------------+----------+---+-------+------------------+



In [29]:
# 20 date_sub
# subtracts 10 days from the "dob" columns
from pyspark.sql.functions import date_sub
df_date_sub = df.withColumn("date_before_10_days", date_sub(col("dob"), 10))
df_date_sub.show()

+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|date_before_10_days|
+---+-------------+----------+---+-------+-------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|         2024-07-22|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         2024-07-23|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         2024-07-24|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         2024-07-25|
|  5|    David Lee|2024-08-05| 27| 3500.5|         2024-07-26|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         2024-07-27|
+---+-------------+----------+---+-------+-------------------+



In [31]:
 ## 21 date diff
# calculates the difference in days between the current date and the "dob"
from pyspark.sql.functions import datediff
df_date_diff = df.withColumn("days_since_dob", datediff(current_date(), col("dob")))
df_date_diff.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|days_since_dob|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|           373|
|  2|   Jane Smith|2024-08-02| 25|1500.75|           372|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|           371|
|  4|  Alice Brown|2024-08-04| 22|2500.25|           370|
|  5|    David Lee|2024-08-05| 27| 3500.5|           369|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|           368|
+---+-------------+----------+---+-------+--------------+



In [32]:
## 22 to_date
#CONVERTS THE"dob" column from string to date format
from pyspark.sql.functions import to_date
df_to_date = df.withColumn("dob_as_date", to_date(col("dob"), "yyyy-MM-dd"))
df_to_date.show()

+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|dob_as_date|
+---+-------------+----------+---+-------+-----------+
|  1|     John Doe|2024-08-01| 23| 1000.5| 2024-08-01|
|  2|   Jane Smith|2024-08-02| 25|1500.75| 2024-08-02|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 2024-08-03|
|  4|  Alice Brown|2024-08-04| 22|2500.25| 2024-08-04|
|  5|    David Lee|2024-08-05| 27| 3500.5| 2024-08-05|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45| 2024-08-06|
+---+-------------+----------+---+-------+-----------+



In [35]:
## 23 to_timestamp
# converts the "dob" column from string to timestamp format
from pyspark.sql.functions import to_timestamp
df_to_timestamp = df.withColumn("dob_as_timestamp", to_timestamp(col("dob"), "yyyy-MM-dd"))
df_to_timestamp.show()

+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|   dob_as_timestamp|
+---+-------------+----------+---+-------+-------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|2024-08-01 00:00:00|
|  2|   Jane Smith|2024-08-02| 25|1500.75|2024-08-02 00:00:00|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|2024-08-03 00:00:00|
|  4|  Alice Brown|2024-08-04| 22|2500.25|2024-08-04 00:00:00|
|  5|    David Lee|2024-08-05| 27| 3500.5|2024-08-05 00:00:00|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|2024-08-06 00:00:00|
+---+-------------+----------+---+-------+-------------------+



In [45]:
## 24 window
## Aggregates the salary over a sliding window of one day
from pyspark.sql.functions import window
df_window = df.withColumn("window",window(col("dob"),"1 day"))
df_window.show(truncate=False)

+---+-------------+----------+---+-------+------------------------------------------+
|id |name         |dob       |age|salary |window                                    |
+---+-------------+----------+---+-------+------------------------------------------+
|1  |John Doe     |2024-08-01|23 |1000.5 |{2024-08-01 00:00:00, 2024-08-02 00:00:00}|
|2  |Jane Smith   |2024-08-02|25 |1500.75|{2024-08-02 00:00:00, 2024-08-03 00:00:00}|
|3  |Bob Johnson  |2024-08-03|18 |3000.1 |{2024-08-03 00:00:00, 2024-08-04 00:00:00}|
|4  |Alice Brown  |2024-08-04|22 |2500.25|{2024-08-04 00:00:00, 2024-08-05 00:00:00}|
|5  |David Lee    |2024-08-05|27 |3500.5 |{2024-08-05 00:00:00, 2024-08-06 00:00:00}|
|6  |Ayush Pradhan|2024-08-06|31 |2500.45|{2024-08-06 00:00:00, 2024-08-07 00:00:00}|
+---+-------------+----------+---+-------+------------------------------------------+



In [53]:
# 25 rank, dense_rank , row_number
#Applies ranking functions to the salary column.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, dense_rank, row_number

# Re-create the DataFrame as it seems to have been overwritten
data = [
    (1, "John Doe", "2024-08-01", 23, 1000.50),
    (2, "Jane Smith", "2024-08-02", 25, 1500.75),
    (3, "Bob Johnson", "2024-08-03", 18,3000.10),
    (4, "Alice Brown", "2024-08-04", 22, 2500.25),
    (5, "David Lee", "2024-08-05", 27, 3500.50),
    (6,"Ayush Pradhan","2024-08-06",31,2500.45),
]
columns = ["id", "name", "dob", "age", "salary"]
df = spark.createDataFrame(data, columns)

windowSpec = Window.orderBy(col("salary").desc())
df.withColumn("rank", rank().over(windowSpec)).withColumn("dense_rank", dense_rank().over(windowSpec)).withColumn("row_number", row_number().over(windowSpec)).show()

+---+-------------+----------+---+-------+----+----------+----------+
| id|         name|       dob|age| salary|rank|dense_rank|row_number|
+---+-------------+----------+---+-------+----+----------+----------+
|  5|    David Lee|2024-08-05| 27| 3500.5|   1|         1|         1|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|   2|         2|         2|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|   3|         3|         3|
|  4|  Alice Brown|2024-08-04| 22|2500.25|   4|         4|         4|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   5|         5|         5|
|  1|     John Doe|2024-08-01| 23| 1000.5|   6|         6|         6|
+---+-------------+----------+---+-------+----+----------+----------+



In [63]:
## 26 array
## creates a new array column
from pyspark.sql.functions import array
df_array  = df.withColumn("array_column", array(col("id"),col("age")))
df_array.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|array_column|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     [1, 23]|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     [2, 25]|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     [3, 18]|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     [4, 22]|
|  5|    David Lee|2024-08-05| 27| 3500.5|     [5, 27]|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|     [6, 31]|
+---+-------------+----------+---+-------+------------+



In [64]:
## 27 array_contains
## checks if a specified element exits in an array column
from pyspark.sql.functions import array_contains
df_array_contains = df_array.withColumn("contains_id_1", array_contains(array(col("id")), 1))
df_array_contains.show()

+---+-------------+----------+---+-------+------------+-------------+
| id|         name|       dob|age| salary|array_column|contains_id_1|
+---+-------------+----------+---+-------+------------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     [1, 23]|         true|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     [2, 25]|        false|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     [3, 18]|        false|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     [4, 22]|        false|
|  5|    David Lee|2024-08-05| 27| 3500.5|     [5, 27]|        false|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|     [6, 31]|        false|
+---+-------------+----------+---+-------+------------+-------------+



In [66]:
##  28 explode
## Creates a new row for each element in the given array or map column
from pyspark.sql.functions import explode
df_explode = df_array.withColumn("exploded_array", explode(array(col("id"),col("age"))))
df_explode.show()

+---+-------------+----------+---+-------+------------+--------------+
| id|         name|       dob|age| salary|array_column|exploded_array|
+---+-------------+----------+---+-------+------------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     [1, 23]|             1|
|  1|     John Doe|2024-08-01| 23| 1000.5|     [1, 23]|            23|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     [2, 25]|             2|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     [2, 25]|            25|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     [3, 18]|             3|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     [3, 18]|            18|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     [4, 22]|             4|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     [4, 22]|            22|
|  5|    David Lee|2024-08-05| 27| 3500.5|     [5, 27]|             5|
|  5|    David Lee|2024-08-05| 27| 3500.5|     [5, 27]|            27|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|     [6, 31]|             6|
|  6|A

In [69]:
## 29 map  creates a new map column
from pyspark.sql.functions import create_map
df_map = df.withColumn("map_column", create_map(lit("name"), col("name"), lit("age"), col("age")))
df_map.show(truncate=False)

+---+-------------+----------+---+-------+----------------------------------+
|id |name         |dob       |age|salary |map_column                        |
+---+-------------+----------+---+-------+----------------------------------+
|1  |John Doe     |2024-08-01|23 |1000.5 |{name -> John Doe, age -> 23}     |
|2  |Jane Smith   |2024-08-02|25 |1500.75|{name -> Jane Smith, age -> 25}   |
|3  |Bob Johnson  |2024-08-03|18 |3000.1 |{name -> Bob Johnson, age -> 18}  |
|4  |Alice Brown  |2024-08-04|22 |2500.25|{name -> Alice Brown, age -> 22}  |
|5  |David Lee    |2024-08-05|27 |3500.5 |{name -> David Lee, age -> 27}    |
|6  |Ayush Pradhan|2024-08-06|31 |2500.45|{name -> Ayush Pradhan, age -> 31}|
+---+-------------+----------+---+-------+----------------------------------+



In [70]:
# 30 coalesce
#Returns the first non-null value among the given columns.
from pyspark.sql.functions import coalesce
df_coalesce = df.withColumn("coalesced_value", coalesce(col("name"), col("age")))
df_coalesce.show()

+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|coalesced_value|
+---+-------------+----------+---+-------+---------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|       John Doe|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     Jane Smith|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|    Bob Johnson|
|  4|  Alice Brown|2024-08-04| 22|2500.25|    Alice Brown|
|  5|    David Lee|2024-08-05| 27| 3500.5|      David Lee|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|  Ayush Pradhan|
+---+-------------+----------+---+-------+---------------+



In [71]:
#### 31 isnull
#Checks if the column is null.
from pyspark.sql.functions import isnull
df_isnull = df.withColumn("is_name_null", isnull(col("name")))
df_isnull.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|is_name_null|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|       false|
|  2|   Jane Smith|2024-08-02| 25|1500.75|       false|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|       false|
|  4|  Alice Brown|2024-08-04| 22|2500.25|       false|
|  5|    David Lee|2024-08-05| 27| 3500.5|       false|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|       false|
+---+-------------+----------+---+-------+------------+



In [72]:
## 32 isnan
## checks if the column is NaN
from pyspark.sql.functions import isnan
df_isnan = df.withColumn("is_salary_nan", isnan(col("salary")))
df_isnan.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|is_salary_nan|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        false|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        false|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        false|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        false|
|  5|    David Lee|2024-08-05| 27| 3500.5|        false|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        false|
+---+-------------+----------+---+-------+-------------+



In [73]:
## 33 sha2
## applies the SHA-2 hash functions to the column
from pyspark.sql.functions import sha2
df_sha2 = df.withColumn("sha2_hash", sha2(col("name"), 256))
df_sha2.show()

+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|           sha2_hash|
+---+-------------+----------+---+-------+--------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|6cea57c2fb6cbc2a4...|
|  2|   Jane Smith|2024-08-02| 25|1500.75|a2dd3acadb1c9dcd9...|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|63ee5af378b5f3ff4...|
|  4|  Alice Brown|2024-08-04| 22|2500.25|f86206bf359a841e1...|
|  5|    David Lee|2024-08-05| 27| 3500.5|ce4c3b259cf559b42...|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|1322aea8c7bcc50f0...|
+---+-------------+----------+---+-------+--------------------+



In [74]:
# 34 md5
#Calculates the MD5 hash of a column.
from pyspark.sql.functions import md5
df_md5 = df.withColumn("md5_hash", md5(col("name")))
df_md5.show(truncate=False)

+---+-------------+----------+---+-------+--------------------------------+
|id |name         |dob       |age|salary |md5_hash                        |
+---+-------------+----------+---+-------+--------------------------------+
|1  |John Doe     |2024-08-01|23 |1000.5 |4c2a904bafba06591225113ad17b5cec|
|2  |Jane Smith   |2024-08-02|25 |1500.75|71768b5e2a0b3697eb3c0c6d4ebbbaf8|
|3  |Bob Johnson  |2024-08-03|18 |3000.1 |1018deacd9ac4ed7b69c8d393c553459|
|4  |Alice Brown  |2024-08-04|22 |2500.25|c6d12fbafcefab0e3974d56e590d54ef|
|5  |David Lee    |2024-08-05|27 |3500.5 |972735291e9dae3b229865e06664c59f|
|6  |Ayush Pradhan|2024-08-06|31 |2500.45|e5bf61efff2fe866d96d45a0246f66a7|
+---+-------------+----------+---+-------+--------------------------------+



In [75]:
## 35 monotonically_increasing_id
## generates a unique, monotonically increasing 64-bit integer for each row
from pyspark.sql.functions import monotonically_increasing_id
df_monotonically_increasing_id = df.withColumn("monotonically_increasing_id", monotonically_increasing_id())
df_monotonically_increasing_id.show()

+---+-------------+----------+---+-------+---------------------------+
| id|         name|       dob|age| salary|monotonically_increasing_id|
+---+-------------+----------+---+-------+---------------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|                          0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|                          1|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|                          2|
|  4|  Alice Brown|2024-08-04| 22|2500.25|                 8589934592|
|  5|    David Lee|2024-08-05| 27| 3500.5|                 8589934593|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|                 8589934594|
+---+-------------+----------+---+-------+---------------------------+



In [77]:
## 36 length
## Returns the length of a string column
from pyspark.sql.functions import length
df_length = df.withColumn("name_length", length(col("name")))
df_length.show()

+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|name_length|
+---+-------------+----------+---+-------+-----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|          8|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         10|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         11|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         11|
|  5|    David Lee|2024-08-05| 27| 3500.5|          9|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         13|
+---+-------------+----------+---+-------+-----------+



In [78]:
## 37 upper and lower
## CONVERTS ALL CHARACTERS OF A STRING  COLUMN TO UPPER OR LOWER CASE
from pyspark.sql.functions import upper, lower
df_upper = df.withColumn("upper_name", upper(col("name")))
df_lower = df.withColumn("lower_name", lower(col("name")))
df_upper.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|   upper_name|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     JOHN DOE|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   JANE SMITH|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  BOB JOHNSON|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  ALICE BROWN|
|  5|    David Lee|2024-08-05| 27| 3500.5|    DAVID LEE|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|AYUSH PRADHAN|
+---+-------------+----------+---+-------+-------------+



In [79]:
### 38 trim, ltrim, rtrim
#Trims spaces from both sides, left side, or right side of a string column.

from pyspark.sql.functions import trim, ltrim, rtrim
"""
df_trim = df.withColumn("trimmed_name", trim(col("name")))
df_trim.show()

df_ltrim = df.withColumn("ltrimmed_name", ltrim(col("name")))
df_ltrim.show()

df_rtrim = df.withColumn("rtrimmed_name", rtrim(col("name")))
df_rtrim.show()
"""
from pyspark.sql.functions import trim, ltrim, rtrim
df_all = df.withColumn("name_trimmed", trim(col("name"))).withColumn("name_ltrimmed", ltrim(col("name"))).withColumn("name_rtrimmed", rtrim(col("name")))
df_all.show()

+---+-------------+----------+---+-------+-------------+-------------+-------------+
| id|         name|       dob|age| salary| name_trimmed|name_ltrimmed|name_rtrimmed|
+---+-------------+----------+---+-------+-------------+-------------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     John Doe|     John Doe|     John Doe|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   Jane Smith|   Jane Smith|   Jane Smith|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  Bob Johnson|  Bob Johnson|  Bob Johnson|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  Alice Brown|  Alice Brown|  Alice Brown|
|  5|    David Lee|2024-08-05| 27| 3500.5|    David Lee|    David Lee|    David Lee|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|Ayush Pradhan|Ayush Pradhan|Ayush Pradhan|
+---+-------------+----------+---+-------+-------------+-------------+-------------+



In [81]:

# 39 abs
#Returns the absolute value of a numeric column.
from pyspark.sql.functions import abs
#df_abs = df.withColumn("absolute_age", abs(col("age")))
#df_abs.show()
from pyspark.sql.functions import abs
df_abs = df.withColumn("abs_salary", abs(col("salary") - 3000))
df_abs.show()


+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|         abs_salary|
+---+-------------+----------+---+-------+-------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|             1999.5|
|  2|   Jane Smith|2024-08-02| 25|1500.75|            1499.25|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|0.09999999999990905|
|  4|  Alice Brown|2024-08-04| 22|2500.25|             499.75|
|  5|    David Lee|2024-08-05| 27| 3500.5|              500.5|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|  499.5500000000002|
+---+-------------+----------+---+-------+-------------------+



In [82]:
# 40 sqrt
#Returns the square root of a numeric column.
from pyspark.sql.functions import sqrt
df_sqrt = df.withColumn("sqrt_age", sqrt(col("age")))
df_sqrt.show()


+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|          sqrt_age|
+---+-------------+----------+---+-------+------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5| 4.795831523312719|
|  2|   Jane Smith|2024-08-02| 25|1500.75|               5.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 4.242640687119285|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  4.69041575982343|
|  5|    David Lee|2024-08-05| 27| 3500.5| 5.196152422706632|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|5.5677643628300215|
+---+-------------+----------+---+-------+------------------+



In [83]:
# 41 exp
#Computes the exponential of the given column.

from pyspark.sql.functions import exp

#df_exp = df.withColumn("exp_salary", exp(col("salary")))
#df_exp.show()

from pyspark.sql.functions import exp

df_exp= df.withColumn("exp_age", exp(col("age")))
df_exp.show()


+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|             exp_age|
+---+-------------+----------+---+-------+--------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5| 9.744803446248903E9|
|  2|   Jane Smith|2024-08-02| 25|1500.75|7.200489933738588E10|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 6.565996913733051E7|
|  4|  Alice Brown|2024-08-04| 22|2500.25| 3.584912846131592E9|
|  5|    David Lee|2024-08-05| 27| 3500.5|5.320482406017986...|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|2.904884966524742...|
+---+-------------+----------+---+-------+--------------------+



In [84]:
# 42 log, log10, log2
#Computes the logarithm of the column using a base of e, 10, or 2.

from pyspark.sql.functions import log, log10, log2

#df_log = df.withColumn("log_salary", log(col("salary")))
#df_log.show()

#df_log = df.withColumn("log_salary", log10(col("salary")))
#df_log.show()

#df_log = df.withColumn("log_salary", log2(col("salary")))
#df_log.show()

df_all = df.withColumn("log_age", log(col("age"))).withColumn("log10_age", log10(col("age"))).withColumn("log2_age", log2(col("age")))
df_all.show()

+---+-------------+----------+---+-------+------------------+------------------+-----------------+
| id|         name|       dob|age| salary|           log_age|         log10_age|         log2_age|
+---+-------------+----------+---+-------+------------------+------------------+-----------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|3.1354942159291497|1.3617278360175928|4.523561956057013|
|  2|   Jane Smith|2024-08-02| 25|1500.75|3.2188758248682006|1.3979400086720377|4.643856189774724|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|2.8903717578961645| 1.255272505103306|4.169925001442312|
|  4|  Alice Brown|2024-08-04| 22|2500.25| 3.091042453358316|1.3424226808222062|4.459431618637297|
|  5|    David Lee|2024-08-05| 27| 3500.5| 3.295836866004329|1.4313637641589874|4.754887502163469|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|3.4339872044851463|1.4913616938342726|4.954196310386876|
+---+-------------+----------+---+-------+------------------+------------------+-----------------+



In [85]:
# 43 greatest and least
#Returns the greatest or least value of the list of columns.

from pyspark.sql.functions import greatest, least

df_greatest = df.withColumn("greatest_value", greatest(col("id"), col("age")))
print(df_greatest.show())

df_least = df.withColumn("least_value", least(col("id"), col("age")))
print(df_least.show())

#df_all = df.withColumn("max_value", greatest(col("age"), col("salary"))).withColumn("min_value", least(col("age"), col("salary")))
#df_all.show()


+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|greatest_value|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|            23|
|  2|   Jane Smith|2024-08-02| 25|1500.75|            25|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|            18|
|  4|  Alice Brown|2024-08-04| 22|2500.25|            22|
|  5|    David Lee|2024-08-05| 27| 3500.5|            27|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|            31|
+---+-------------+----------+---+-------+--------------+

None
+---+-------------+----------+---+-------+-----------+
| id|         name|       dob|age| salary|least_value|
+---+-------------+----------+---+-------+-----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|          1|
|  2|   Jane Smith|2024-08-02| 25|1500.75|          2|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|          3|
|  4|  Alice Brown|2024-08-04| 22|2500.25|          4|
|  5|    David Lee|2024-08-05

In [86]:

# 44 pow
#Raises the value of a column to the power of another column.

from pyspark.sql.functions import pow

df_pow = df.withColumn("pow_result", pow(col("id"), col("age")))
df_pow.show(truncate=False)


+---+-------------+----------+---+-------+---------------------+
|id |name         |dob       |age|salary |pow_result           |
+---+-------------+----------+---+-------+---------------------+
|1  |John Doe     |2024-08-01|23 |1000.5 |1.0                  |
|2  |Jane Smith   |2024-08-02|25 |1500.75|3.3554432E7          |
|3  |Bob Johnson  |2024-08-03|18 |3000.1 |3.87420489E8         |
|4  |Alice Brown  |2024-08-04|22 |2500.25|1.7592186044416E13   |
|5  |David Lee    |2024-08-05|27 |3500.5 |7.4505805969238282E18|
|6  |Ayush Pradhan|2024-08-06|31 |2500.45|1.3264435183244001E24|
+---+-------------+----------+---+-------+---------------------+



In [87]:
# 45 round, bround
#Rounds the value of the column to the nearest integer or to the nearest integer with ties broken by rounding away from zero.

from pyspark.sql.functions import round, bround

df_round = df.withColumn("rounded_salary", round(col("salary"), 0))
print(df_round.show())

df_bround = df.withColumn("brounded_salary", bround(col("salary"), 0))
print(df_bround.show())

#df_all = df.withColumn("rounded_salary", round(col("salary"), 0)).withColumn("brounded_salary", bround(col("salary"), 0))
#df_all.show()

+---+-------------+----------+---+-------+--------------+
| id|         name|       dob|age| salary|rounded_salary|
+---+-------------+----------+---+-------+--------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        1001.0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        1501.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        3000.0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        2500.0|
|  5|    David Lee|2024-08-05| 27| 3500.5|        3501.0|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        2500.0|
+---+-------------+----------+---+-------+--------------+

None
+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|brounded_salary|
+---+-------------+----------+---+-------+---------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|         1000.0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         1501.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         3000.0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         2500.0|
|

In [88]:
# 46 degrees, radians
#Converts an angle from radians to degrees or from degrees to radians.

from pyspark.sql.functions import degrees, radians

df_degrees = df.withColumn("degrees_value", degrees(col("age")))
print(df_degrees.show())

df_radians = df.withColumn("radians_value", radians(col("age")))
print(df_radians.show())

#df_all = df.withColumn("degrees_value", degrees(col("age"))).withColumn("radians_value", radians(col("age")))
#df_all.show()


+---+-------------+----------+---+-------+------------------+
| id|         name|       dob|age| salary|     degrees_value|
+---+-------------+----------+---+-------+------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|1317.8029288008934|
|  2|   Jane Smith|2024-08-02| 25|1500.75|1432.3944878270581|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 1031.324031235482|
|  4|  Alice Brown|2024-08-04| 22|2500.25| 1260.507149287811|
|  5|    David Lee|2024-08-05| 27| 3500.5|1546.9860468532227|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45| 1776.169164905552|
+---+-------------+----------+---+-------+------------------+

None
+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|      radians_value|
+---+-------------+----------+---+-------+-------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5| 0.4014257279586958|
|  2|   Jane Smith|2024-08-02| 25|1500.75| 0.4363323129985824|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 0.31415926535897

In [92]:
# 47 signum
#Computes the signum of a number: -1 if it's negative, 0 if it's zero, and 1 if it's positive.
from pyspark.sql.functions import signum
df_signum = df.withColumn("signum_value", signum(col("age")))
df_signum.show()


+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|signum_value|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|         1.0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         1.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         1.0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         1.0|
|  5|    David Lee|2024-08-05| 27| 3500.5|         1.0|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         1.0|
+---+-------------+----------+---+-------+------------+



In [94]:
# 48 hex, unhex
#Converts a column to hexadecimal and back.
from pyspark.sql.functions import hex, unhex
# Create the 'hex_value' column within the 'df' DataFrame
#df = df.withColumn("hex_value", hex(col("age")))
#print(df.show())
#df_unhex = df.withColumn("unhex_value", unhex(col("hex_value")))
#print(df_unhex.show())
from pyspark.sql.functions import hex, unhex
df_all= df.withColumn("hex_name", hex(col("id"))).withColumn("unhex_name", unhex(hex(col("id"))))
df_all.show()

+---+-------------+----------+---+-------+--------+----------+
| id|         name|       dob|age| salary|hex_name|unhex_name|
+---+-------------+----------+---+-------+--------+----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|       1|      [01]|
|  2|   Jane Smith|2024-08-02| 25|1500.75|       2|      [02]|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|       3|      [03]|
|  4|  Alice Brown|2024-08-04| 22|2500.25|       4|      [04]|
|  5|    David Lee|2024-08-05| 27| 3500.5|       5|      [05]|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|       6|      [06]|
+---+-------------+----------+---+-------+--------+----------+



In [95]:
# 49  nvl, nvl2
#Replaces null values with the specified value.
from pyspark.sql.functions import nvl, nvl2, col
df_nvl = df.withColumn("nvl_age", nvl(col("age"), col("id"))) # Replace 0 with a column object
print(df_nvl.show())
df_nvl2 = df.withColumn("nvl2_age", nvl2(col("age"), col("id"), col("salary"))) # Replace 0 and 10 with column objects
print(df_nvl2.show())
# df_all = df.withColumn("nvl_column", expr("nvl(null, 'default')"))
# df_all.show()

+---+-------------+----------+---+-------+-------+
| id|         name|       dob|age| salary|nvl_age|
+---+-------------+----------+---+-------+-------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     23|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     25|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     18|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     22|
|  5|    David Lee|2024-08-05| 27| 3500.5|     27|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|     31|
+---+-------------+----------+---+-------+-------+

None
+---+-------------+----------+---+-------+--------+
| id|         name|       dob|age| salary|nvl2_age|
+---+-------------+----------+---+-------+--------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     1.0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     2.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     3.0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     4.0|
|  5|    David Lee|2024-08-05| 27| 3500.5|     5.0|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|     6.0|
+---+-----------

In [97]:
# 50 reverse
#Reverses the string in a column.
from pyspark.sql.functions import reverse
df_reverse = df.withColumn("reversed_name", reverse(col("name")))
df_reverse.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|reversed_name|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|     eoD nhoJ|
|  2|   Jane Smith|2024-08-02| 25|1500.75|   htimS enaJ|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|  nosnhoJ boB|
|  4|  Alice Brown|2024-08-04| 22|2500.25|  nworB ecilA|
|  5|    David Lee|2024-08-05| 27| 3500.5|    eeL divaD|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|nahdarP hsuyA|
+---+-------------+----------+---+-------+-------------+



In [98]:
# 51 initcap
#Converts the first letter of each word to uppercase.
from pyspark.sql.functions import initcap
df_initcap = df.withColumn("capitalized_name", initcap(col("name")))
df_initcap.show()


+---+-------------+----------+---+-------+----------------+
| id|         name|       dob|age| salary|capitalized_name|
+---+-------------+----------+---+-------+----------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        John Doe|
|  2|   Jane Smith|2024-08-02| 25|1500.75|      Jane Smith|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|     Bob Johnson|
|  4|  Alice Brown|2024-08-04| 22|2500.25|     Alice Brown|
|  5|    David Lee|2024-08-05| 27| 3500.5|       David Lee|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|   Ayush Pradhan|
+---+-------------+----------+---+-------+----------------+



In [99]:
# 52 instr
#Returns the position of the first occurrence of a substring.
from pyspark.sql.functions import instr
df_instr = df.withColumn("position_of_a", instr(col("name"), "a"))
print(df_instr.show())
print(df_instr.printSchema())
#df_instr = df.withColumn("position", instr(col("name"), "Doe"))
#df_instr.show()

+---+-------------+----------+---+-------+-------------+
| id|         name|       dob|age| salary|position_of_a|
+---+-------------+----------+---+-------+-------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|            0|
|  2|   Jane Smith|2024-08-02| 25|1500.75|            2|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|            0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|            0|
|  5|    David Lee|2024-08-05| 27| 3500.5|            2|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|            9|
+---+-------------+----------+---+-------+-------------+

None
root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: double (nullable = true)
 |-- position_of_a: integer (nullable = true)

None


In [100]:
# 53 locate
#Similar to instr, but can start the search from a specified position.
from pyspark.sql.functions import locate
df_locate = df.withColumn("locate_doe", locate("Doe", col("name")))
df_locate.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|locate_doe|
+---+-------------+----------+---+-------+----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|         6|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         0|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         0|
|  5|    David Lee|2024-08-05| 27| 3500.5|         0|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         0|
+---+-------------+----------+---+-------+----------+



In [101]:
# 54 soundex
#Converts a string to its Soundex code, useful for phonetic matching
from pyspark.sql.functions import soundex
df_soundex = df.withColumn("soundex_name", soundex(col("name")))
df_soundex.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|soundex_name|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|        J530|
|  2|   Jane Smith|2024-08-02| 25|1500.75|        J525|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|        B125|
|  4|  Alice Brown|2024-08-04| 22|2500.25|        A421|
|  5|    David Lee|2024-08-05| 27| 3500.5|        D134|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|        A216|
+---+-------------+----------+---+-------+------------+



In [102]:
# 55 levenshtein
#Computes the Levenshtein distance between two strings.
from pyspark.sql.functions import levenshtein
df_levenshtein = df.withColumn("levenshtein_distance", levenshtein(col("name"), lit("Jon Doe")))
df_levenshtein.show()

+---+-------------+----------+---+-------+--------------------+
| id|         name|       dob|age| salary|levenshtein_distance|
+---+-------------+----------+---+-------+--------------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|                   1|
|  2|   Jane Smith|2024-08-02| 25|1500.75|                   7|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|                   8|
|  4|  Alice Brown|2024-08-04| 22|2500.25|                   9|
|  5|    David Lee|2024-08-05| 27| 3500.5|                   7|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|                  12|
+---+-------------+----------+---+-------+--------------------+



In [103]:
# 56 conv
#Converts a number from one base to another.
from pyspark.sql.functions import conv
df_conv = df.withColumn("binary_value", conv(col("id"), 10, 2))
df_conv.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|binary_value|
+---+-------------+----------+---+-------+------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|           1|
|  2|   Jane Smith|2024-08-02| 25|1500.75|          10|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|          11|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         100|
|  5|    David Lee|2024-08-05| 27| 3500.5|         101|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         110|
+---+-------------+----------+---+-------+------------+



In [104]:
# 57 translate
#Replaces characters in a string with other characters.
from pyspark.sql.functions import translate
df_translate = df.withColumn("translated_name", translate(col("name"), "o", "a"))
df_translate.show()

+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|translated_name|
+---+-------------+----------+---+-------+---------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|       Jahn Dae|
|  2|   Jane Smith|2024-08-02| 25|1500.75|     Jane Smith|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|    Bab Jahnsan|
|  4|  Alice Brown|2024-08-04| 22|2500.25|    Alice Brawn|
|  5|    David Lee|2024-08-05| 27| 3500.5|      David Lee|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|  Ayush Pradhan|
+---+-------------+----------+---+-------+---------------+



In [105]:
# 58 crc32
#Computes a cyclic redundancy check value
from pyspark.sql.functions import crc32
df_crc32 = df.withColumn("crc32_name", crc32(col("name")))
df_crc32.show()

+---+-------------+----------+---+-------+----------+
| id|         name|       dob|age| salary|crc32_name|
+---+-------------+----------+---+-------+----------+
|  1|     John Doe|2024-08-01| 23| 1000.5|1782059462|
|  2|   Jane Smith|2024-08-02| 25|1500.75|3280634359|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 991314801|
|  4|  Alice Brown|2024-08-04| 22|2500.25|2917542307|
|  5|    David Lee|2024-08-05| 27| 3500.5|1227316513|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|3209413628|
+---+-------------+----------+---+-------+----------+



In [106]:
# 59 uuid
#Generates a column of UUIDs.
from pyspark.sql.functions import expr
df_uuid = df.withColumn("uuid", expr("uuid()"))
df_uuid.show(truncate = False)

+---+-------------+----------+---+-------+------------------------------------+
|id |name         |dob       |age|salary |uuid                                |
+---+-------------+----------+---+-------+------------------------------------+
|1  |John Doe     |2024-08-01|23 |1000.5 |ce8fe90a-504b-4fd1-b226-6d32fdc596b9|
|2  |Jane Smith   |2024-08-02|25 |1500.75|7b09f3b9-983b-4fa7-9c83-9a35b1b2a147|
|3  |Bob Johnson  |2024-08-03|18 |3000.1 |7e10b3e9-c8b4-402f-b6c5-337a07c2c904|
|4  |Alice Brown  |2024-08-04|22 |2500.25|88a5ffcd-cfb7-49dc-ae4d-5658369b449f|
|5  |David Lee    |2024-08-05|27 |3500.5 |4a7ac6fb-5eb4-4513-a428-610643186305|
|6  |Ayush Pradhan|2024-08-06|31 |2500.45|5b57c8ec-1b29-438d-8887-e783737ea354|
+---+-------------+----------+---+-------+------------------------------------+



In [107]:
# 60 percent_rank
#Computes the percent rank of a row within a window partition.
from pyspark.sql.functions import percent_rank
windowSpec = Window.orderBy(col("salary").desc())
df_percent_rank = df.withColumn("percent_rank", percent_rank().over(windowSpec))
df_percent_rank.show()

+---+-------------+----------+---+-------+------------+
| id|         name|       dob|age| salary|percent_rank|
+---+-------------+----------+---+-------+------------+
|  5|    David Lee|2024-08-05| 27| 3500.5|         0.0|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|         0.2|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|         0.4|
|  4|  Alice Brown|2024-08-04| 22|2500.25|         0.6|
|  2|   Jane Smith|2024-08-02| 25|1500.75|         0.8|
|  1|     John Doe|2024-08-01| 23| 1000.5|         1.0|
+---+-------------+----------+---+-------+------------+



In [108]:
# 61 cume_dist computes the cumulative distribution of a value in a group of values
from pyspark.sql.functions import cume_dist
windowSpec = Window.orderBy(col("salary").desc())
df_cume_dist = df.withColumn("cume_dist", cume_dist().over(windowSpec))
df_cume_dist.show()

+---+-------------+----------+---+-------+-------------------+
| id|         name|       dob|age| salary|          cume_dist|
+---+-------------+----------+---+-------+-------------------+
|  5|    David Lee|2024-08-05| 27| 3500.5|0.16666666666666666|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1| 0.3333333333333333|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|                0.5|
|  4|  Alice Brown|2024-08-04| 22|2500.25| 0.6666666666666666|
|  2|   Jane Smith|2024-08-02| 25|1500.75| 0.8333333333333334|
|  1|     John Doe|2024-08-01| 23| 1000.5|                1.0|
+---+-------------+----------+---+-------+-------------------+



In [109]:
# 62 percentile distributes rows of an ordered partition into a specified number of buckets
from pyspark.sql.functions import ntile
windowSpec = Window.orderBy(col("salary").desc())
df_ntile = df.withColumn("ntile", ntile(3).over(windowSpec))
df_ntile.show()

+---+-------------+----------+---+-------+-----+
| id|         name|       dob|age| salary|ntile|
+---+-------------+----------+---+-------+-----+
|  5|    David Lee|2024-08-05| 27| 3500.5|    1|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|    1|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|    2|
|  4|  Alice Brown|2024-08-04| 22|2500.25|    2|
|  2|   Jane Smith|2024-08-02| 25|1500.75|    3|
|  1|     John Doe|2024-08-01| 23| 1000.5|    3|
+---+-------------+----------+---+-------+-----+



In [111]:
# 63 flatten an array of arrays into a single array
from pyspark.sql.functions import flatten
df_flatten = df.withColumn("flattened_array", flatten(array(array(lit(1), lit(2)), array(lit(3)))))
df_flatten.show()


+---+-------------+----------+---+-------+---------------+
| id|         name|       dob|age| salary|flattened_array|
+---+-------------+----------+---+-------+---------------+
|  1|     John Doe|2024-08-01| 23| 1000.5|      [1, 2, 3]|
|  2|   Jane Smith|2024-08-02| 25|1500.75|      [1, 2, 3]|
|  3|  Bob Johnson|2024-08-03| 18| 3000.1|      [1, 2, 3]|
|  4|  Alice Brown|2024-08-04| 22|2500.25|      [1, 2, 3]|
|  5|    David Lee|2024-08-05| 27| 3500.5|      [1, 2, 3]|
|  6|Ayush Pradhan|2024-08-06| 31|2500.45|      [1, 2, 3]|
+---+-------------+----------+---+-------+---------------+



In [119]:
# 64 grouping_id returns the level of grouping applied
from pyspark.sql.functions import grouping_id
from pyspark.sql import functions  as F
df_grouping_sets = df.cube("age", "salary").agg(grouping_id().alias("grouping_id"),F.sum("salary"))
df_grouping_sets.show()

+----+-------+-----------+------------------+
| age| salary|grouping_id|       sum(salary)|
+----+-------+-----------+------------------+
|NULL|1500.75|          2|           1500.75|
|NULL| 1000.5|          2|            1000.5|
|  25|   NULL|          1|           1500.75|
|NULL| 3000.1|          2|            3000.1|
|NULL|   NULL|          3|14002.550000000001|
|  23|   NULL|          1|            1000.5|
|  25|1500.75|          0|           1500.75|
|  23| 1000.5|          0|            1000.5|
|  18| 3000.1|          0|            3000.1|
|  18|   NULL|          1|            3000.1|
|NULL| 3500.5|          2|            3500.5|
|  22|   NULL|          1|           2500.25|
|  22|2500.25|          0|           2500.25|
|NULL|2500.45|          2|           2500.45|
|  27|   NULL|          1|            3500.5|
|NULL|2500.25|          2|           2500.25|
|  31|   NULL|          1|           2500.45|
|  27| 3500.5|          0|            3500.5|
|  31|2500.45|          0|        

In [120]:
# 65 roll up
## used for multi dimensional aggregates, similar to cube but with a subset of it
# from pyspark .sql.functions import rollup
df_rollup = df.rollup("age", "salary").agg(F.sum("salary"))
df_rollup.show()


+----+-------+------------------+
| age| salary|       sum(salary)|
+----+-------+------------------+
|  25|   NULL|           1500.75|
|NULL|   NULL|14002.550000000001|
|  23|   NULL|            1000.5|
|  25|1500.75|           1500.75|
|  23| 1000.5|            1000.5|
|  18| 3000.1|            3000.1|
|  18|   NULL|            3000.1|
|  22|   NULL|           2500.25|
|  22|2500.25|           2500.25|
|  27|   NULL|            3500.5|
|  31|   NULL|           2500.45|
|  27| 3500.5|            3500.5|
|  31|2500.45|           2500.45|
+----+-------+------------------+



In [121]:
# 67 corr
## returns the Pearson correlation coefficient between two columns
from pyspark.sql.functions import corr
df_corr = df.select(corr(col("age"), col("salary")))
df_corr.show()

+-------------------+
|  corr(age, salary)|
+-------------------+
|0.04031322089496106|
+-------------------+



In [123]:
## 68 collect_list
# returns all values from an aggregated group as a list
from pyspark.sql.functions import collect_list
df_collect_list = df.groupBy("age").agg(collect_list("name").alias("names"))
df_collect_list.show()

+---+---------------+
|age|          names|
+---+---------------+
| 25|   [Jane Smith]|
| 18|  [Bob Johnson]|
| 23|     [John Doe]|
| 22|  [Alice Brown]|
| 31|[Ayush Pradhan]|
| 27|    [David Lee]|
+---+---------------+

