In [None]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("Column Operations")
    .getOrCreate()
)

In [None]:
# Column names for the sample rows below, in tuple order.
columns = ["user_id", "city", "amount"]

# One (user_id, city, amount) tuple per user.
data = [
    ("U001", "New York", 620),
    ("U002", "Los Angeles", 300),
    ("U003", "Chicago", 700),
    ("U004", "Delhi", 700),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+-------+-----------+------+
|user_id|       city|amount|
+-------+-----------+------+
|   U001|   New York|   620|
|   U002|Los Angeles|   300|
|   U003|    Chicago|   700|
|   U004|      Delhi|   700|
+-------+-----------+------+



In [None]:
from pyspark.sql.functions import col, round as spark_round

# Multiplier applied to `amount` to obtain the tax-inclusive value.
TAX_MULTIPLIER = 1.15

# Multiplying by a binary float leaves artifacts such as 804.9999999999999 for
# 700 * 1.15, so the product is rounded to two decimal places before storing.
# `round` is imported under an alias so Python's builtin `round` stays usable.
df = df.withColumn(
    "amount_with_tax",
    spark_round(col("amount") * TAX_MULTIPLIER, 2),
)
df.show()

+-------+-----------+------+-----------------+
|user_id|       city|amount|  amount_with_tax|
+-------+-----------+------+-----------------+
|   U001|   New York|   620|            713.0|
|   U002|Los Angeles|   300|            345.0|
|   U003|    Chicago|   700|804.9999999999999|
|   U004|      Delhi|   700|804.9999999999999|
+-------+-----------+------+-----------------+



In [None]:
# Rename the tax-inclusive column; all other columns are left as they are.
df = df.withColumnRenamed(
    existing="amount_with_tax",
    new="total_amount",
)
df.show()

+-------+-----------+------+-----------------+
|user_id|       city|amount|     total_amount|
+-------+-----------+------+-----------------+
|   U001|   New York|   620|            713.0|
|   U002|Los Angeles|   300|            345.0|
|   U003|    Chicago|   700|804.9999999999999|
|   U004|      Delhi|   700|804.9999999999999|
+-------+-----------+------+-----------------+



In [None]:
# Swap the city value "Delhi" for "New Delhi"; `subset` restricts the
# replacement to the `city` column only.
df = df.replace(
    to_replace="Delhi",
    value="New Delhi",
    subset=["city"],
)
df.show()

+-------+-----------+------+-----------------+
|user_id|       city|amount|     total_amount|
+-------+-----------+------+-----------------+
|   U001|   New York|   620|            713.0|
|   U002|Los Angeles|   300|            345.0|
|   U003|    Chicago|   700|804.9999999999999|
|   U004|  New Delhi|   700|804.9999999999999|
+-------+-----------+------+-----------------+



In [None]:
from pyspark.sql.functions import when

# Rows whose amount is at least 500 are labelled "High"; all others "Low".
is_high_amount = col("amount") >= 500
category_label = when(is_high_amount, "High").otherwise("Low")

df = df.withColumn("amount_category", category_label)
df.show()

+-------+-----------+------+-----------------+---------------+
|user_id|       city|amount|     total_amount|amount_category|
+-------+-----------+------+-----------------+---------------+
|   U001|   New York|   620|            713.0|           High|
|   U002|Los Angeles|   300|            345.0|            Low|
|   U003|    Chicago|   700|804.9999999999999|           High|
|   U004|  New Delhi|   700|804.9999999999999|           High|
+-------+-----------+------+-----------------+---------------+



In [None]:
# Column names for the order rows below, in tuple order.
columns = ["order_id", "city", "product", "price", "order_date"]

# Sample orders; `order_date` is supplied as a plain string here.
data = [
    ("ORD001", "Delhi", "Laptop", 45000, "2024-01-05"),
    ("ORD002", "Mumbai", "Mobile", 32000, "2024-01-06"),
    ("ORD003", "Bangalore", "Tablet", 30000, "2024-01-07"),
    ("ORD004", "Delhi", "Laptop", 55000, "2024-01-08"),
    ("ORD005", "Mumbai", "Tablet", 34000, "2024-01-09"),
]

# NOTE: this rebinds `df`, replacing the frame built in the earlier cells.
df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
+--------+---------+-------+-----+----------+



In [None]:
# Persist the orders frame as ORC; "overwrite" replaces any existing output
# at this path so the cell can be re-run.
df.write.orc("data/orc/orders", mode="overwrite")

In [None]:
# Read the ORC output back in. The displayed row order can differ from the
# order in which the rows were created.
df_orc = spark.read.format("orc").load("data/orc/orders")
df_orc.show()

+--------+---------+-------+-----+----------+
|order_id|     city|product|price|order_date|
+--------+---------+-------+-----+----------+
|  ORD003|Bangalore| Tablet|30000|2024-01-07|
|  ORD004|    Delhi| Laptop|55000|2024-01-08|
|  ORD005|   Mumbai| Tablet|34000|2024-01-09|
|  ORD001|    Delhi| Laptop|45000|2024-01-05|
|  ORD002|   Mumbai| Mobile|32000|2024-01-06|
+--------+---------+-------+-----+----------+

