
# Add column

In [0]:
# Three sample employee rows, each laid out as (id, name, age, salary).
data_list = [
    (100, "Prashant", 45, 45000),
    (101, "Tarun", 36, 33000),
    (102, "David", 48, 28000),
]

# Name the columns at creation time instead of renaming them afterwards.
test_df = spark.createDataFrame(data_list, schema=["id", "name", "age", "salary"])
test_df.show()

+---+--------+---+------+
| id|    name|age|salary|
+---+--------+---+------+
|100|Prashant| 45| 45000|
|101|   Tarun| 36| 33000|
|102|   David| 48| 28000|
+---+--------+---+------+



In [0]:
from pyspark.sql.functions import expr, col


# The same 10% increment built three ways: a SQL expression string,
# a col() reference, and attribute access on the source DataFrame.
test_df1 = (
    test_df
    .withColumn("increment", expr("salary * 10/100"))
    .withColumn("increment2", col("salary") * 0.1)
    .withColumn("increment3", test_df.salary * 0.1)
)

In [0]:
# Reusing the existing column name "salary" makes withColumn replace that
# column with salary plus the increment rather than append a new one.
incremented_salary = expr("salary + increment")
test_df2 = test_df1.withColumn("salary", incremented_salary)

test_df2.show()

+---+--------+---+-------+---------+----------+----------+
| id|    name|age| salary|increment|increment2|increment3|
+---+--------+---+-------+---------+----------+----------+
|100|Prashant| 45|49500.0|   4500.0|    4500.0|    4500.0|
|101|   Tarun| 36|36300.0|   3300.0|    3300.0|    3300.0|
|102|   David| 48|30800.0|   2800.0|    2800.0|    2800.0|
+---+--------+---+-------+---------+----------+----------+




Each withColumn transformation introduces a new projection in the SQL execution plan.

Calling it many times (for example, in a loop to add many columns) can produce a very large plan, which leads to performance issues and even a `StackOverflowException`.


Hence it's better to add the columns in a single `select()`, using `expr` wherever possible, and to avoid long chains of `withColumn`.

In [0]:
# Build the raised salary and the increment in one select() rather than
# through a chain of withColumn calls.
test_df3 = test_df.select(
    "id",
    "name",
    "age",
    expr("salary + salary*10/100 as salary"),
    expr("salary*10/100 as increment"),
)

test_df3.show()

+---+--------+---+-------+---------+
| id|    name|age| salary|increment|
+---+--------+---+-------+---------+
|100|Prashant| 45|49500.0|   4500.0|
|101|   Tarun| 36|36300.0|   3300.0|
|102|   David| 48|30800.0|   2800.0|
+---+--------+---+-------+---------+



In [0]:
# Same projection with the increment listed first; the second expression
# refers to the `increment` alias defined earlier in this very select().
# NOTE(review): presumably relies on lateral column alias support in the
# Spark runtime - confirm the minimum version before reusing elsewhere.
(
    test_df
    .select(
        "id",
        "name",
        "age",
        expr("salary*10/100 as increment"),
        expr("salary + increment as salary"),
    )
    .show()
)

+---+--------+---+---------+-------+
| id|    name|age|increment| salary|
+---+--------+---+---------+-------+
|100|Prashant| 45|   4500.0|49500.0|
|101|   Tarun| 36|   3300.0|36300.0|
|102|   David| 48|   2800.0|30800.0|
+---+--------+---+---------+-------+




# Rename Column

In [0]:
# Re-display test_df2, the DataFrame the rename and drop examples below operate on.
test_df2.show()

+---+--------+---+-------+---------+----------+----------+
| id|    name|age| salary|increment|increment2|increment3|
+---+--------+---+-------+---------+----------+----------+
|100|Prashant| 45|49500.0|   4500.0|    4500.0|    4500.0|
|101|   Tarun| 36|36300.0|   3300.0|    3300.0|    3300.0|
|102|   David| 48|30800.0|   2800.0|    2800.0|    2800.0|
+---+--------+---+-------+---------+----------+----------+



In [0]:
# Rename two columns by chaining one withColumnRenamed call per column.
(
    test_df2
    .withColumnRenamed("increment", "salary_increment")
    .withColumnRenamed("salary", "incremented_salary")
    .show()
)

+---+--------+---+------------------+----------------+----------+----------+
| id|    name|age|incremented_salary|salary_increment|increment2|increment3|
+---+--------+---+------------------+----------------+----------+----------+
|100|Prashant| 45|           49500.0|          4500.0|    4500.0|    4500.0|
|101|   Tarun| 36|           36300.0|          3300.0|    3300.0|    3300.0|
|102|   David| 48|           30800.0|          2800.0|    2800.0|    2800.0|
+---+--------+---+------------------+----------------+----------+----------+




`withColumnRenamed` does not throw an error if the column doesn't exist; the call is simply a no-op in that case.

`withColumnRenamed` also adds a projection to the plan for each call.


# Drop

In [0]:
# Show test_df2 without the increment3 column; the result is only displayed,
# test_df2 itself is not reassigned.
test_df2.drop("increment3").show()

+---+--------+---+-------+---------+----------+
| id|    name|age| salary|increment|increment2|
+---+--------+---+-------+---------+----------+
|100|Prashant| 45|49500.0|   4500.0|    4500.0|
|101|   Tarun| 36|36300.0|   3300.0|    3300.0|
|102|   David| 48|30800.0|   2800.0|    2800.0|
+---+--------+---+-------+---------+----------+

