In [0]:
dbutils.help()

In [0]:
dbutils.fs.help()

In [0]:
# read files metadata from folder
file_lst=dbutils.fs.ls("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files")

for i in file_lst:
    print(i.path)
    print(i.name)
    print(i.size)
    print(i.modificationTime)

dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files/emp.json/
emp.json/
0
0
dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files/emp.parquet/
emp.parquet/
0
0


In [0]:
#today date 
import datetime
datetime.datetime.today().strftime('%d%m%y')

Out[20]: '080424'

In [0]:
# creating spark session

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
spark=SparkSession.builder.appName("Day7").getOrCreate()

In [0]:
# dataset for csv file

name,dept,salary,city
Alice,HR,50000,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000,Seattle
Eva,Finance,70000,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000,Boston
Hannah,HR,48000,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000,Atlanta

In [0]:
# read csv file with header and schema 
# same way you can read from json ,parquet as well
df = spark.read.format("csv")\
    .option("header", "true")\
        .option("inferSchema","true")\
            .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show()

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering|  null|San Francisco|
|Charlie|         HR|  null|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance|  null|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance|  null|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+



In [0]:
# read csv file with skip 5 rows 
df = spark.read.format("csv").option("skipRows","5")\
            .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show()

+-------+-----------+-----+-------+
|    _c0|        _c1|  _c2|    _c3|
+-------+-----------+-----+-------+
|    Eva|    Finance|70000|Chicago|
|  Frank|    Finance| null|Houston|
|  Grace|Engineering|65000| Boston|
| Hannah|         HR|48000|  Miami|
|    Ian|    Finance| null| Dallas|
|Jessica|Engineering|63000|Atlanta|
+-------+-----------+-----+-------+



In [0]:
#dropping rows with missing value 
df.dropna().show()

+-------+-----------+------+--------+
|   name|       dept|salary|    city|
+-------+-----------+------+--------+
|  Alice|         HR| 50000|New York|
|  David|Engineering| 62000| Seattle|
|    Eva|    Finance| 70000| Chicago|
|  Grace|Engineering| 65000|  Boston|
| Hannah|         HR| 48000|   Miami|
|Jessica|Engineering| 63000| Atlanta|
+-------+-----------+------+--------+



In [0]:
# fill null  value 
df.fillna(0).show()
# or 
df.fillna({"salary":0}).show()

+-------+-----------+------+-------------+
|   name|       dept|salary|         city|
+-------+-----------+------+-------------+
|  Alice|         HR| 50000|     New York|
|    Bob|Engineering|     0|San Francisco|
|Charlie|         HR|     0|  Los Angeles|
|  David|Engineering| 62000|      Seattle|
|    Eva|    Finance| 70000|      Chicago|
|  Frank|    Finance|     0|      Houston|
|  Grace|Engineering| 65000|       Boston|
| Hannah|         HR| 48000|        Miami|
|    Ian|    Finance|     0|       Dallas|
|Jessica|Engineering| 63000|      Atlanta|
+-------+-----------+------+-------------+



In [0]:
# writing to parquet same way we can save csv &json
df.write.format("parquet").mode("overwrite").save("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files/emp.parquet")
a=dbutils.fs.ls("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files")
for i in a:
    print(i.path)

dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files/emp.json/
dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/outbound_files/emp.parquet/


In [0]:
#caching data 
df.cache()

Out[23]: DataFrame[name: string, dept: string, salary: int, city: string]

In [0]:
# Sample data 
j_data = [
    ("Alice", "Active"),
    ("Bob", "Inactive"),
    ("Charlie", "Inactive"),
    ("David", "Active"),
    ("Eva", "Active"),
    ("Frank", "Active"),
    ("Jess", "Inactive")
]

# Define the schema for the DataFrame
j_schema = ["name", "emp_status"]

# Create the DataFrame
j_df = spark.createDataFrame(j_data, j_schema)

j_df.show()

+-------+----------+
|   name|emp_status|
+-------+----------+
|  Alice|    Active|
|    Bob|  Inactive|
|Charlie|  Inactive|
|  David|    Active|
|    Eva|    Active|
|  Frank|    Active|
|   Jess|  Inactive|
+-------+----------+



In [0]:
#broadcaste join 
from pyspark.sql.functions import broadcast
df.join(broadcast(j_df),"name").show()

+-------+-----------+------+-------------+----------+
|   name|       dept|salary|         city|emp_status|
+-------+-----------+------+-------------+----------+
|  Alice|         HR| 50000|     New York|    Active|
|    Bob|Engineering|  null|San Francisco|  Inactive|
|Charlie|         HR|  null|  Los Angeles|  Inactive|
|  David|Engineering| 62000|      Seattle|    Active|
|    Eva|    Finance| 70000|      Chicago|    Active|
|  Frank|    Finance|  null|      Houston|    Active|
+-------+-----------+------+-------------+----------+



In [0]:
#get no of partitions 
print( df.rdd.getNumPartitions())

1


In [0]:
# increase the partition
df=df.repartition(4)
print( df.rdd.getNumPartitions())

4


In [0]:
# decrease the partition
df=df.coalesce(2)
print( df.rdd.getNumPartitions())

2


In [0]:
# repartitionByRange 
df = df.repartitionByRange(3, "salary")
print( df.rdd.getNumPartitions() )

3


In [0]:
# show the data
df.collect()

Out[29]: [Row(name='Charlie', dept='HR', salary=None, city='Los Angeles'),
 Row(name='Alice', dept='HR', salary=50000, city='New York'),
 Row(name='Ian', dept='Finance', salary=None, city='Dallas'),
 Row(name='David', dept='Engineering', salary=62000, city='Seattle'),
 Row(name='Frank', dept='Finance', salary=None, city='Houston'),
 Row(name='Bob', dept='Engineering', salary=None, city='San Francisco'),
 Row(name='Grace', dept='Engineering', salary=65000, city='Boston'),
 Row(name='Jessica', dept='Engineering', salary=63000, city='Atlanta'),
 Row(name='Hannah', dept='HR', salary=48000, city='Miami'),
 Row(name='Eva', dept='Finance', salary=70000, city='Chicago')]

In [0]:
# explain plan
df.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   Coalesce 2
   +- ShuffleQueryStage 0, Statistics(sizeInBytes=688.0 B, rowCount=10, isRuntime=true)
      +- Exchange RoundRobinPartitioning(4), REPARTITION_BY_NUM, [plan_id=400]
         +- InMemoryTableScan [name#520, dept#521, salary#522, city#523], false
               +- InMemoryRelation [name#520, dept#521, salary#522, city#523], StorageLevel(disk, memory, deserialized, 1 replicas)
                     +- FileScan csv [name#520,dept#521,salary#522,city#523] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_file..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<name:string,dept:string,salary:int,city:string>
+- == Initial Plan ==
   Coalesce 2
   +- Exchange RoundRobinPartitioning(4), REPARTITION_BY_NUM, [plan_id=400]
      +- InMemoryTableScan [name#520, dept#521, salary#522, city#523], false


In [0]:
f_schema =T.StructType(
    [
        T.StructField("name",T.StringType()),
        T.StructField("dept",T.StringType()),
        T.StructField("salary",T.IntegerType()),
        T.StructField("city",T.IntegerType()),
        T.StructField("corrupt_record",T.StringType())
    ]
)

In [0]:
# read csv file with permissivie making city as int so we can see how it is working 
# same way you can read from json ,parquet as well
df = spark.read.format("csv")\
    .option("header", "true")\
        .schema(f_schema)\
            .option("mode","PERMISSIVE")\
                .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show()

+-------+-----------+------+----+
|   name|       dept|salary|city|
+-------+-----------+------+----+
|  Alice|         HR| 50000|null|
|    Bob|Engineering|  null|null|
|Charlie|         HR|  null|null|
|  David|Engineering| 62000|null|
|    Eva|    Finance| 70000|null|
|  Frank|    Finance|  null|null|
|  Grace|Engineering| 65000|null|
| Hannah|         HR| 48000|null|
|    Ian|    Finance|  null|null|
|Jessica|Engineering| 63000|null|
+-------+-----------+------+----+



In [0]:
# read csv file with DROPMALFORMED making city as int so we can see how it is working 
# same way you can read from json ,parquet as well
df = spark.read.format("csv")\
    .option("header", "true")\
        .schema(f_schema)\
            .option("mode","DROPMALFORMED")\
                .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show()

+----+----+------+----+
|name|dept|salary|city|
+----+----+------+----+
+----+----+------+----+



In [0]:
# read csv file with FAILFAST making city as int so we can see how it is working 
# same way you can read from json ,parquet as well
df = spark.read.format("csv")\
    .option("header", "true")\
        .schema(f_schema)\
            .option("mode","FAILFAST")\
                .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-858561018502442>:8[0m
[1;32m      1[0m [38;5;66;03m# read csv file with FAILFAST making city as int so we can see how it is working [39;00m
[1;32m      2[0m [38;5;66;03m# same way you can read from json ,parquet as well[39;00m
[1;32m      3[0m df [38;5;241m=[39m spark[38;5;241m.[39mread[38;5;241m.[39mformat([38;5;124m"[39m[38;5;124mcsv[39m[38;5;124m"[39m)\
[1;32m      4[0m     [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mheader[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mtrue[39m[38;5;124m"[39m)\
[1;32m      5[0m         [38;5;241m.[39mschema(f_schema)\
[1;32m      6[0m             [38;5;241m.[39moption([38;5;124m"[39m[38;5;124mmode[39m[38;5;124m"[39m,[38;5;124m"[39m[38;5;124mFAILFAST[39m[38;5;124m"[39m)\
[1;32m      7[0m          

In [0]:
# read csv file with permissivie making city as int so we can see how it is working 
# same way you can read from json ,parquet as well
# .option("badRecordsPath", "D:/spark_practice/bad_dir")

df = spark.read.format("csv")\
    .option("header", "true")\
        .option("mode","PERMISSIVE")\
            .option("columnNameOfCorruptRecord", "corrupt_record")\
                .schema(f_schema)\
                    .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
df.show(truncate = False)

+-------+-----------+------+----+---------------------------------+
|name   |dept       |salary|city|corrupt_record                   |
+-------+-----------+------+----+---------------------------------+
|Alice  |HR         |50000 |null|Alice,HR,50000,New York          |
|Bob    |Engineering|null  |null|Bob,Engineering,,San Francisco   |
|Charlie|HR         |null  |null|Charlie,HR,,Los Angeles          |
|David  |Engineering|62000 |null|David,Engineering,62000,Seattle  |
|Eva    |Finance    |70000 |null|Eva,Finance,70000,Chicago        |
|Frank  |Finance    |null  |null|Frank,Finance,,Houston           |
|Grace  |Engineering|65000 |null|Grace,Engineering,65000,Boston   |
|Hannah |HR         |48000 |null|Hannah,HR,48000,Miami            |
|Ian    |Finance    |null  |null|Ian,Finance,,Dallas              |
|Jessica|Engineering|63000 |null|Jessica,Engineering,63000,Atlanta|
+-------+-----------+------+----+---------------------------------+



In [0]:
#explode function

# Sample data with array column
data = [
    ("Alice", ["apple", "orange", "banana"]),
    ("Bob", ["pear", "peach"]),
    ("Charlie", ["grape", "melon", "kiwi"])
]

# Create DataFrame from sample data
df = spark.createDataFrame(data, ["name", "fruits"])

# Show the DataFrame before exploding
print("DataFrame before exploding:")
df.show(truncate=False)

# Explode the 'fruits' array column into separate rows
df_exploded = df.withColumn("fruit", F.explode("fruits"))

# Show the DataFrame after exploding
print("DataFrame after exploding:")
df_exploded.show(truncate=False)

DataFrame before exploding:
+-------+-----------------------+
|name   |fruits                 |
+-------+-----------------------+
|Alice  |[apple, orange, banana]|
|Bob    |[pear, peach]          |
|Charlie|[grape, melon, kiwi]   |
+-------+-----------------------+

DataFrame after exploding:
+-------+-----------------------+------+
|name   |fruits                 |fruit |
+-------+-----------------------+------+
|Alice  |[apple, orange, banana]|apple |
|Alice  |[apple, orange, banana]|orange|
|Alice  |[apple, orange, banana]|banana|
|Bob    |[pear, peach]          |pear  |
|Bob    |[pear, peach]          |peach |
|Charlie|[grape, melon, kiwi]   |grape |
|Charlie|[grape, melon, kiwi]   |melon |
|Charlie|[grape, melon, kiwi]   |kiwi  |
+-------+-----------------------+------+



In [0]:
# Struct Field

# Sample data
data = [
    ("Alice", 34, "New York"),
    ("Bob", 28, "San Francisco"),
    ("Charlie", 30, "Los Angeles")
]

# Define the schema for the struct
schema = "name STRING, age INT, city STRING"

# Create DataFrame from sample data and schema
df = spark.createDataFrame(data, schema)

# Create a struct column using 'struct' function
struct_column = F.struct(col("name"), col("age"), col("city")).alias("person_info")

# Add the struct column to the DataFrame
df_with_struct = df.withColumn("person_details", struct_column)

# Show the DataFrame with the struct column
print("DataFrame with Struct Column:")
df_with_struct.show(truncate=False)

# Accessing struct fields
df_with_struct.select("person_details.name", "person_details.age", "person_details.city").show(truncate=False)


DataFrame with Struct Column:
+-------+---+-------------+--------------------------+
|name   |age|city         |person_details            |
+-------+---+-------------+--------------------------+
|Alice  |34 |New York     |{Alice, 34, New York}     |
|Bob    |28 |San Francisco|{Bob, 28, San Francisco}  |
|Charlie|30 |Los Angeles  |{Charlie, 30, Los Angeles}|
+-------+---+-------------+--------------------------+

+-------+---+-------------+
|name   |age|city         |
+-------+---+-------------+
|Alice  |34 |New York     |
|Bob    |28 |San Francisco|
|Charlie|30 |Los Angeles  |
+-------+---+-------------+



In [0]:
# HASH 


# Sample data representing a dimension (e.g., customer dimension)
data = [
    ("Alice", "New York"),
    ("Alice", "New Yor"),
    ("Bob", "San Francisco"),
    ("Charlie", "Los Angeles"),
    ("David", "Seattle"),
    ("Eva", "Chicago"),
    ("Frank", "Houston")
]

# Create DataFrame from sample data with column names
df = spark.createDataFrame(data, ["name", "city"])

# Concatenate columns (name and city) to form a single string for hashing
concatenated_column = concat(col("name"), col("city"))

# Calculate MD5 hash key for the concatenated column
df_with_hash = df.withColumn("hash_key", md5(concatenated_column))

# Show the DataFrame with MD5 hash key
df_with_hash.show(truncate=False)


+-------+-------------+--------------------------------+
|name   |city         |hash_key                        |
+-------+-------------+--------------------------------+
|Alice  |New York     |bbaa5cb0e7ddd36acadb2cb7e8172135|
|Alice  |New Yor      |66d233ddc5f864e9b575a2c6de94c9e9|
|Bob    |San Francisco|c653024a114755365c53712b2d1f85c0|
|Charlie|Los Angeles  |29dc2a10918d5352a28f8ed7874e7ab6|
|David  |Seattle      |f5b22633a2cf4a6b4ec908bd7da7b257|
|Eva    |Chicago      |0df5bdf5f5206b7cf31e9c32fc8c8313|
|Frank  |Houston      |3ec7c2586d901b629bdd1753acb035fa|
+-------+-------------+--------------------------------+



In [0]:
# pyspark udf
def uppercase_string(s):
    if s is not None:
        return s.upper()
    else:
        return None
# Register the Python function as a PySpark UDF
uppercase_udf = udf(uppercase_string, F.StringType())

In [0]:
# udf usecase
df.select(uppercase_udf(F.col("city"))).show()

+----------------------+
|uppercase_string(city)|
+----------------------+
|              NEW YORK|
|         SAN FRANCISCO|
|           LOS ANGELES|
+----------------------+



In [0]:
# dataset for csv file

name,dept,salary,city
Alice,HR,50000,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000,Seattle
Eva,Finance,70000,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000,Boston
Hannah,HR,48000,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000,Atlanta

In [0]:
# load data to delta table
df = spark.read.format("csv")\
    .option("header", "true")\
        .option("inferSchema","true")\
            .load("dbfs:/FileStore/shared_uploads/ayushmaurya15398@gmail.com/inbound_files/test.csv")
display(spark.sql('DROP TABLE IF EXISTS delta_table'))
df.write.saveAsTable("delta_table")

In [0]:
%sql
select * from delta_table

name,dept,salary,city
Alice,HR,50000.0,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000.0,Seattle
Eva,Finance,70000.0,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000.0,Boston
Hannah,HR,48000.0,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000.0,Atlanta


In [0]:
# describe the detail of the table
display(spark.sql('DESCRIBE DETAIL delta_table'))

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion,tableFeatures,statistics
delta,ae8e7361-bde1-47f7-b783-884679a69256,spark_catalog.default.delta_table,,dbfs:/user/hive/warehouse/delta_table,2024-04-15T04:42:24.826+0000,2024-04-15T04:42:33.000+0000,List(),1,1559,Map(),1,2,"List(appendOnly, invariants)",Map()


In [0]:
# get column details 
display(spark.sql('desc delta_table'))

col_name,data_type,comment
name,string,
dept,string,
salary,int,
city,string,


In [0]:
%sql
-- insert new row 
insert into delta_table values('Sam','IT',10000,'San Francisco')

num_affected_rows,num_inserted_rows
1,1


In [0]:
%sql 
select * from delta_table

name,dept,salary,city
Alice,HR,50000.0,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000.0,Seattle
Eva,Finance,70000.0,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000.0,Boston
Hannah,HR,48000.0,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000.0,Atlanta


In [0]:
# get the history of table 
display(spark.sql('DESCRIBE history delta_table'))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
1,2024-04-15T04:45:01.000+0000,6267682188955183,ayushmaurya15398@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(2748036996083126),0415-043653-9toqkc8i,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 1328, numOutputRows -> 1)",,Databricks-Runtime/12.2.x-scala2.12
0,2024-04-15T04:42:33.000+0000,6267682188955183,ayushmaurya15398@gmail.com,CREATE TABLE AS SELECT,"Map(description -> null, isManaged -> true, partitionBy -> [], properties -> {})",,List(2748036996083126),0415-043653-9toqkc8i,,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 1559, numOutputRows -> 10)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
%sql 
-- time travel feature 
select * from delta_table version as of 0

name,dept,salary,city
Alice,HR,50000.0,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000.0,Seattle
Eva,Finance,70000.0,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000.0,Boston
Hannah,HR,48000.0,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000.0,Atlanta


In [0]:
df2 = spark.read.format('delta').option('versionAsOf', 0).table("delta_table")
display(df2)

name,dept,salary,city
Alice,HR,50000.0,New York
Bob,Engineering,,San Francisco
Charlie,HR,,Los Angeles
David,Engineering,62000.0,Seattle
Eva,Finance,70000.0,Chicago
Frank,Finance,,Houston
Grace,Engineering,65000.0,Boston
Hannah,HR,48000.0,Miami
Ian,Finance,,Dallas
Jessica,Engineering,63000.0,Atlanta


In [0]:
%sql
--- chache a table in memory to speed up query
cache select * from  delta_table;

In [0]:
%sql
--- Analyze table to collect statistics on enitre column 
ANALYZE TABLE delta_table COMPUTE STATISTICS FOR ALL COLUMNS

In [0]:
%sql
-- optimized delta tables, bin packs tables for better performance  
optimize delta_table

path,metrics
dbfs:/user/hive/warehouse/delta_table,"List(1, 2, List(1607, 1607, 1607.0, 1, 1607), List(1328, 1559, 1443.5, 2, 2887), 0, null, 1, 2, 0, true, 0, 0, 1713156994872, 1713157005283, 8, 1, null, List(0, 0), 4, 4, 806)"


In [0]:
display(spark.sql('DESCRIBE history delta_table'))

version,timestamp,userId,userName,operation,operationParameters,job,notebook,clusterId,readVersion,isolationLevel,isBlindAppend,operationMetrics,userMetadata,engineInfo
2,2024-04-15T04:56:41.000+0000,6267682188955183,ayushmaurya15398@gmail.com,OPTIMIZE,"Map(auto -> false, batchId -> 0, predicate -> [], zOrderBy -> [])",,List(2748036996083126),0415-043653-9toqkc8i,1.0,SnapshotIsolation,False,"Map(numRemovedFiles -> 2, numRemovedBytes -> 2887, p25FileSize -> 1607, numDeletionVectorsRemoved -> 0, minFileSize -> 1607, numAddedFiles -> 1, maxFileSize -> 1607, p75FileSize -> 1607, p50FileSize -> 1607, numAddedBytes -> 1607)",,Databricks-Runtime/12.2.x-scala2.12
1,2024-04-15T04:45:01.000+0000,6267682188955183,ayushmaurya15398@gmail.com,WRITE,"Map(mode -> Append, partitionBy -> [])",,List(2748036996083126),0415-043653-9toqkc8i,0.0,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 1328, numOutputRows -> 1)",,Databricks-Runtime/12.2.x-scala2.12
0,2024-04-15T04:42:33.000+0000,6267682188955183,ayushmaurya15398@gmail.com,CREATE TABLE AS SELECT,"Map(description -> null, isManaged -> true, partitionBy -> [], properties -> {})",,List(2748036996083126),0415-043653-9toqkc8i,,WriteSerializable,True,"Map(numFiles -> 1, numOutputBytes -> 1559, numOutputRows -> 10)",,Databricks-Runtime/12.2.x-scala2.12


In [0]:
%sql
-- Optimize / Zorder 
optimize delta_table zorder by (city)

path,metrics
dbfs:/user/hive/warehouse/delta_table,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 1607), 0, List(0, 0), 0, null), 0, 1, 1, false, 0, 0, 1713157111666, 1713157115511, 8, 0, null, List(0, 0), 4, 4, 0)"


In [0]:
%sql 
--- Vacuum Delta Table to remove old files
vacuum delta_table

path
dbfs:/user/hive/warehouse/delta_table
