In [0]:
# Source file on DBFS and the reader format used to parse it.
file_location = "/FileStore/tables/Diabetic_Data.csv"
file_type = "csv"

# CSV reader settings: infer column types, treat the first row as the header,
# and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These options are CSV-specific; readers for other file types ignore them.
df = (
    spark.read.format(file_type)
    .options(
        inferSchema=infer_schema,
        header=first_row_is_header,
        sep=delimiter,
    )
    .load(file_location)
)

# Render the loaded DataFrame in the notebook.
display(df)

DiastolicBloodPressure,BMI,Age,Diabetic
80,43.50972593,21,0
93,21.24057571,23,0
47,41.51152348,23,0
78,29.58219193,43,1
59,42.60453585,22,0
92,19.72416021,26,0
47,21.94135672,21,0
87,18.2777226,26,0
95,26.62492885,53,1
31,36.88957571,26,0


In [0]:
# Recursively delete any previous parquet output directory before it is rewritten.
dbutils.fs.rm("/FileStore/tables/Diabetic_Data", recurse=True)

Out[7]: True

In [0]:
# Persist the DataFrame as parquet with its current partitioning, replacing any existing output.
df.write.parquet("/FileStore/tables/Diabetic_Data", mode="overwrite")

In [0]:
%sql
-- Row count of the first part file produced by the plain parquet write.
-- Part-file names embed a tid/UUID that changes on every write, so the file is matched
-- with a glob instead of a hard-coded name that stops existing once the write is re-run.
-- Assumes the overwrite left exactly one part-00000 file in the directory.
select count(*) from parquet.`/FileStore/tables/Diabetic_Data/part-00000-*.snappy.parquet`

count(1)
10000


In [0]:
# Redistribute the data into 5 partitions and write it out as parquet,
# replacing any existing output.
(
    df.repartition(5)
    .write
    .parquet("/FileStore/tables/Diabetic_Data1", mode="overwrite")
)

In [0]:
%sql
-- Row count of the first part file written after repartition(5).
-- Part-file names embed a tid/UUID that changes on every write, so the file is matched
-- with a glob instead of a hard-coded name that stops existing once the write is re-run.
-- Assumes the overwrite left exactly one part-00000 file in the directory.
select count(*) from parquet.`/FileStore/tables/Diabetic_Data1/part-00000-*.snappy.parquet`

count(1)
2000


In [0]:
# Write parquet output split into one sub-directory per value of the Diabetic column
# (e.g. Diabetic=0/), replacing any existing output.
df.write.parquet(
    "/FileStore/tables/Diabetic_Data2",
    mode="overwrite",
    partitionBy="Diabetic",
)

In [0]:
%sql
-- Row count of the first part file inside the Diabetic=0 partition directory.
-- Part-file names embed a tid/UUID that changes on every write, so the file is matched
-- with a glob instead of a hard-coded name that stops existing once the write is re-run.
-- Assumes the overwrite left exactly one part-00000 file in this partition directory.
select count(*) from parquet.`/FileStore/tables/Diabetic_Data2/Diabetic=0/part-00000-*.snappy.parquet`

count(1)
6656


In [0]:
# Combine both techniques: redistribute into 3 partitions first, then write parquet
# split into one sub-directory per value of the Diabetic column, replacing any existing output.
(
    df.repartition(3)
    .write
    .parquet(
        "/FileStore/tables/Diabetic_Data3",
        mode="overwrite",
        partitionBy="Diabetic",
    )
)

In [0]:
%sql
-- Row count of the first part file inside the Diabetic=0 partition directory
-- of the repartition(3) + partitionBy write.
-- Part-file names embed a tid/UUID that changes on every write, so the file is matched
-- with a glob instead of a hard-coded name that stops existing once the write is re-run.
-- Assumes the overwrite left exactly one part-00000 file in this partition directory.
select count(*) from parquet.`/FileStore/tables/Diabetic_Data3/Diabetic=0/part-00000-*.snappy.parquet`

count(1)
2240
