In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every cell below; getOrCreate() hands back
# an already-running session if there is one instead of starting another.
spark = (
    SparkSession.builder
    .appName("Read CSV Example")
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'4.0.1'

# Read and write CSV

In [3]:
# In-memory CSV sample: a header line followed by five employee records.
# Adjacent string literals are concatenated, one literal per CSV line.
data = (
    "id,name,city,age,salary\n"
    "1,Arjun,Hyderabad,25,45000\n"
    "2,Meera,Chennai,32,52000\n"
    "3,Rajesh,Bangalore,29,61000\n"
    "4,Priya,Delhi,22,38000\n"
    "5,Sanjay,Mumbai,35,72000\n"
)

In [5]:
# Persist the sample rows to disk so Spark can read them back as a CSV file.
# The encoding is stated explicitly: open() otherwise falls back to the
# platform locale, whereas Spark's CSV reader decodes UTF-8 by default.
with open("employees.csv", "w", encoding="utf-8") as csv_file:
    csv_file.write(data)

In [6]:
# Load employees.csv: the first line supplies the column names and the
# column types are inferred from the data rather than read as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("employees.csv")
)

# Print the rows, then the inferred schema.
df.show()
df.printSchema()

+---+------+---------+---+------+
| id|  name|     city|age|salary|
+---+------+---------+---+------+
|  1| Arjun|Hyderabad| 25| 45000|
|  2| Meera|  Chennai| 32| 52000|
|  3|Rajesh|Bangalore| 29| 61000|
|  4| Priya|    Delhi| 22| 38000|
|  5|Sanjay|   Mumbai| 35| 72000|
+---+------+---------+---+------+

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- salary: integer (nullable = true)



# Read and write JSON

In [7]:
# Two sample employee records; each dict becomes one row of the DataFrame.
json_data = [
    dict(id=1, name="Arjun", city="Hyderabad", dept="IT", salary=45000),
    dict(id=2, name="Krittika", city="Kolkata", dept="Development", salary=50000),
]

# Turn the records into a DataFrame and write it out as JSON, replacing any
# output left at this path by a previous run.
df_json = spark.createDataFrame(json_data)
df_json.write.json("employees.json", mode="overwrite")

In [8]:
# Read the JSON output back in; `df` is rebound here and is the frame the
# Parquet cell below writes out.
df = spark.read.format("json").load("employees.json")

# Print the rows, then the schema.
df.show()
df.printSchema()

+---------+-----------+---+--------+------+
|     city|       dept| id|    name|salary|
+---------+-----------+---+--------+------+
|  Kolkata|Development|  2|Krittika| 50000|
|Hyderabad|         IT|  1|   Arjun| 45000|
+---------+-----------+---+--------+------+

root
 |-- city: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: long (nullable = true)



# Read and write Parquet

In [9]:
# Save the current `df` as Parquet, replacing any output from a previous run.
df.write.parquet("employee.parquet", mode="overwrite")

In [10]:
# Read the Parquet output back in and print its rows.
df_parquet = spark.read.format("parquet").load("employee.parquet")
df_parquet.show()

+---------+-----------+---+--------+------+
|     city|       dept| id|    name|salary|
+---------+-----------+---+--------+------+
|  Kolkata|Development|  2|Krittika| 50000|
|Hyderabad|         IT|  1|   Arjun| 45000|
+---------+-----------+---+--------+------+

