In [None]:
from pyspark.sql import SparkSession

# Step 1: Obtain a SparkSession (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("CSV Example")
    .getOrCreate()
)

# Step 2: Build a small in-memory DataFrame of (name, age, gender) records
data = [
    ("Aashi", 22, "F"),
    ("Raj", 30, "M"),
    ("Neha", 25, "F"),
    ("Ravi", 28, "M"),
    ("Sneha", 24, "F"),
    ("Aman", 35, "M"),
    ("Pooja", 27, "F"),
    ("Karan", 40, "M"),
    ("Simran", 23, "F"),
    ("Vikram", 31, "M"),
    ("Nisha", 26, "F"),
    ("Arjun", 29, "M"),
    ("Tina", 32, "F"),
    ("Rahul", 21, "M"),
    ("Komal", 34, "F"),
]
columns = ["name", "age", "gender"]

# Column names come from `columns`; column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
+------+---+------+



In [None]:
# Save the sample DataFrame as CSV with a header row, replacing any earlier output.
(
    df.write
    .option("header", True)
    .mode("overwrite")
    .csv("/tmp/data_formats/csv")
)


In [None]:
# Read the CSV back in, using the first row of each file as column names.
# CSV carries no type information, so without `inferSchema` every column
# (including `age`) is loaded as a string. MAX/MIN on a string column compare
# lexicographically (e.g. "9" > "40"), which only looks right here because all
# ages have two digits. Inferring the schema makes `age` numeric, at the cost
# of one extra pass over the files.
csv_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/tmp/data_formats/csv")
)


In [None]:
# Expose the CSV-backed DataFrame to Spark SQL under the name "people".
csv_df.createOrReplaceTempView(name="people")

In [None]:
# Pull every row back out of the "people" view and print it.
result = spark.sql(
    "SELECT * FROM people"
)
result.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
+------+---+------+



In [None]:
# Largest value of the `age` column in the "people" view.
maximum = spark.sql(
    "SELECT MAX(age) FROM people"
)
maximum.show()

+--------+
|max(age)|
+--------+
|      40|
+--------+



In [None]:
# Smallest value of the `age` column in the "people" view.
minimum = spark.sql(
    "SELECT MIN (age) from people"
)
minimum.show()

+--------+
|min(age)|
+--------+
|      21|
+--------+



In [None]:
# Keep only the people whose age is strictly greater than 30.
greater_age = spark.sql(
    "SELECT * FROM people WHERE age > 30"
)
greater_age.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Vikram| 31|     M|
|  Tina| 32|     F|
| Komal| 34|     F|
|  Aman| 35|     M|
+------+---+------+



In [None]:
# Save the sample DataFrame as JSON, replacing any earlier output at this path.
df.write.json("/tmp/data_formats/json", mode="overwrite")

In [None]:
# Load the JSON output back; multiline parsing is explicitly switched off,
# i.e. each line of a file is parsed as its own JSON record.
df_json = (
    spark.read
    .option("multiline", False)
    .json("/tmp/data_formats/json")
)
df_json.show()

+---+------+------+
|age|gender|  name|
+---+------+------+
| 40|     M| Karan|
| 23|     F|Simran|
| 31|     M|Vikram|
| 26|     F| Nisha|
| 29|     M| Arjun|
| 32|     F|  Tina|
| 21|     M| Rahul|
| 34|     F| Komal|
| 22|     F| Aashi|
| 30|     M|   Raj|
| 25|     F|  Neha|
| 28|     M|  Ravi|
| 24|     F| Sneha|
| 35|     M|  Aman|
| 27|     F| Pooja|
+---+------+------+



In [None]:
# Expose the JSON-backed DataFrame to Spark SQL under the name "json_table".
df_json.createOrReplaceTempView(name="json_table")

In [None]:
# Save the sample DataFrame as Parquet, replacing any earlier output at this path.
df.write.parquet("/tmp/data_formats/parquet", mode="overwrite")

In [None]:
# Load the Parquet output back into a DataFrame and print it.
df_parquet = spark.read.parquet(
    "/tmp/data_formats/parquet"
)
df_parquet.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
+------+---+------+



In [None]:
# Expose the Parquet-backed DataFrame to Spark SQL under the name "parquet_table".
df_parquet.createOrReplaceTempView(name="parquet_table")

In [None]:
# Pull every row back out of the "parquet_table" view and print it.
result = spark.sql(
    "select * from parquet_table"
)
result.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
+------+---+------+



In [None]:
# Save the sample DataFrame as ORC through the generic format/save API,
# replacing any earlier output at this path.
(
    df.write
    .mode("overwrite")
    .format("orc")
    .save("/tmp/data_formats/orc")
)

In [None]:
# Load the ORC output back through the generic format/load API and print it.
df_orc = (
    spark.read
    .format("orc")
    .load("/tmp/data_formats/orc")
)
df_orc.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
+------+---+------+



In [None]:
# Expose the ORC-backed DataFrame to Spark SQL under the name "orc_table".
df_orc.createOrReplaceTempView(name="orc_table")

In [None]:
# Pull every row back out of the "orc_table" view and print it.
result = spark.sql(
    "select * from orc_table"
)
result.show()

+------+---+------+
|  name|age|gender|
+------+---+------+
| Karan| 40|     M|
|Simran| 23|     F|
|Vikram| 31|     M|
| Nisha| 26|     F|
| Arjun| 29|     M|
|  Tina| 32|     F|
| Rahul| 21|     M|
| Komal| 34|     F|
| Aashi| 22|     F|
|   Raj| 30|     M|
|  Neha| 25|     F|
|  Ravi| 28|     M|
| Sneha| 24|     F|
|  Aman| 35|     M|
| Pooja| 27|     F|
+------+---+------+

