<a href="https://colab.research.google.com/github/balakumar-dataengineer/testrepo/blob/master/Pyspark_WithColumn()_function.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) a Spark session running in local mode for this demo.
spark = (
    SparkSession.builder
    .appName("Pyspark test")
    .master("local")
    .getOrCreate()
)

# Sample food orders as tuples: (orderId, name, restaurantName, orderDate, price).
data = [
    (101, "Alice Johnson", "Pizza Hut", "2024-02-10", 25.99),
    (102, "Bob Smith", "KFC", "2024-02-11", 15.49),
    (103, "Charlie Brown", "McDonald's", "2024-02-12", 12.99),
    (104, "David Lee", "Subway", "2024-02-13", 8.99),
    (105, "Emma Watson", "Domino's", "2024-02-14", 18.75),
]

# Only column names are supplied here, so Spark infers each column's type
# from the Python values in `data`.
schema = ["orderId", "name", "restaurantName", "orderDate", "price"]

df = spark.createDataFrame(data, schema)
df.show()

+-------+-------------+--------------+----------+-----+
|orderId|         name|restaurantName| orderDate|price|
+-------+-------------+--------------+----------+-----+
|    101|Alice Johnson|     Pizza Hut|2024-02-10|25.99|
|    102|    Bob Smith|           KFC|2024-02-11|15.49|
|    103|Charlie Brown|    McDonald's|2024-02-12|12.99|
|    104|    David Lee|        Subway|2024-02-13| 8.99|
|    105|  Emma Watson|      Domino's|2024-02-14|18.75|
+-------+-------------+--------------+----------+-----+



In [2]:
# withColumn with an existing column name replaces that column in the result:
# here 'price' is overwritten with the original price multiplied by 1.25.
df1 = df.withColumn("price", df.price * 1.25)

df1.show()
df1.printSchema()

+-------+-------------+--------------+----------+-------+
|orderId|         name|restaurantName| orderDate|  price|
+-------+-------------+--------------+----------+-------+
|    101|Alice Johnson|     Pizza Hut|2024-02-10|32.4875|
|    102|    Bob Smith|           KFC|2024-02-11|19.3625|
|    103|Charlie Brown|    McDonald's|2024-02-12|16.2375|
|    104|    David Lee|        Subway|2024-02-13|11.2375|
|    105|  Emma Watson|      Domino's|2024-02-14|23.4375|
+-------+-------------+--------------+----------+-------+

root
 |-- orderId: long (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurantName: string (nullable = true)
 |-- orderDate: string (nullable = true)
 |-- price: double (nullable = true)



In [3]:
# withColumn can also change a column's type: replace 'orderId' with the same
# values cast to the integer type.
df2 = df1.withColumn("orderId", df1.orderId.cast("Integer"))

df2.show()
df2.printSchema()

+-------+-------------+--------------+----------+-------+
|orderId|         name|restaurantName| orderDate|  price|
+-------+-------------+--------------+----------+-------+
|    101|Alice Johnson|     Pizza Hut|2024-02-10|32.4875|
|    102|    Bob Smith|           KFC|2024-02-11|19.3625|
|    103|Charlie Brown|    McDonald's|2024-02-12|16.2375|
|    104|    David Lee|        Subway|2024-02-13|11.2375|
|    105|  Emma Watson|      Domino's|2024-02-14|23.4375|
+-------+-------------+--------------+----------+-------+

root
 |-- orderId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurantName: string (nullable = true)
 |-- orderDate: string (nullable = true)
 |-- price: double (nullable = true)



In [4]:
# Convert the 'orderDate' strings (e.g. "2024-02-10") to the timestamp type,
# replacing the original string column.
df3 = df2.withColumn("orderDate", df2.orderDate.cast("Timestamp"))

df3.show()
df3.printSchema()

+-------+-------------+--------------+-------------------+-------+
|orderId|         name|restaurantName|          orderDate|  price|
+-------+-------------+--------------+-------------------+-------+
|    101|Alice Johnson|     Pizza Hut|2024-02-10 00:00:00|32.4875|
|    102|    Bob Smith|           KFC|2024-02-11 00:00:00|19.3625|
|    103|Charlie Brown|    McDonald's|2024-02-12 00:00:00|16.2375|
|    104|    David Lee|        Subway|2024-02-13 00:00:00|11.2375|
|    105|  Emma Watson|      Domino's|2024-02-14 00:00:00|23.4375|
+-------+-------------+--------------+-------------------+-------+

root
 |-- orderId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurantName: string (nullable = true)
 |-- orderDate: timestamp (nullable = true)
 |-- price: double (nullable = true)



In [5]:
from pyspark.sql.functions import col

# 'amount' is not an existing column, so withColumn appends it as a new one.
# Its value is the current 'price' column multiplied by 10.
df4 = df3.withColumn(
    "amount",
    col("price") * 10,
)

df4.show()
df4.printSchema()

+-------+-------------+--------------+-------------------+-------+-------+
|orderId|         name|restaurantName|          orderDate|  price| amount|
+-------+-------------+--------------+-------------------+-------+-------+
|    101|Alice Johnson|     Pizza Hut|2024-02-10 00:00:00|32.4875|324.875|
|    102|    Bob Smith|           KFC|2024-02-11 00:00:00|19.3625|193.625|
|    103|Charlie Brown|    McDonald's|2024-02-12 00:00:00|16.2375|162.375|
|    104|    David Lee|        Subway|2024-02-13 00:00:00|11.2375|112.375|
|    105|  Emma Watson|      Domino's|2024-02-14 00:00:00|23.4375|234.375|
+-------+-------------+--------------+-------------------+-------+-------+

root
 |-- orderId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurantName: string (nullable = true)
 |-- orderDate: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- amount: double (nullable = true)



In [7]:
from pyspark.sql.functions import lit


# Append a constant column: lit(1) turns the Python literal 1 into a Column
# expression, so every row gets source_id = 1.
df5 = df4.withColumn(
    "source_id",
    lit(1),
)

df5.show()
df5.printSchema()


+-------+-------------+--------------+-------------------+-------+-------+---------+
|orderId|         name|restaurantName|          orderDate|  price| amount|source_id|
+-------+-------------+--------------+-------------------+-------+-------+---------+
|    101|Alice Johnson|     Pizza Hut|2024-02-10 00:00:00|32.4875|324.875|        1|
|    102|    Bob Smith|           KFC|2024-02-11 00:00:00|19.3625|193.625|        1|
|    103|Charlie Brown|    McDonald's|2024-02-12 00:00:00|16.2375|162.375|        1|
|    104|    David Lee|        Subway|2024-02-13 00:00:00|11.2375|112.375|        1|
|    105|  Emma Watson|      Domino's|2024-02-14 00:00:00|23.4375|234.375|        1|
+-------+-------------+--------------+-------------------+-------+-------+---------+

root
 |-- orderId: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- restaurantName: string (nullable = true)
 |-- orderDate: timestamp (nullable = true)
 |-- price: double (nullable = true)
 |-- amount: double (nulla