# Delta-rs: MERGE guide

```
merge(source: Union[pyarrow.Table, pyarrow.RecordBatch, pyarrow.RecordBatchReader, ds.Dataset, pd.DataFrame], predicate: str, source_alias: Optional[str] = None, target_alias: Optional[str] = None, error_on_type_mismatch: bool = True, writer_properties: Optional[WriterProperties] = None, large_dtypes: Optional[bool] = None, custom_metadata: Optional[Dict[str, str]] = None, post_commithook_properties: Optional[PostCommitHookProperties] = None, commit_properties: Optional[CommitProperties] = None) -> TableMerger
```

In [None]:
from deltalake import DeltaTable, write_deltalake
import pyarrow as pa

## Update

In [38]:
# Build a three-row target table on disk and open a handle to it.
# write_deltalake defaults to mode="error", which raises if "tmp_table"
# already exists (for example when this notebook is re-run), so the table is
# overwritten explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# Source rows keyed on x; both keys (2 and 3) also exist in the target.
source_data = pa.table({"x": [2, 3], "y": [5, 8]})

In [41]:
# Join target and source on x and, for every matched target row, take both
# columns from the source row.
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
merger = merger.when_matched_update(updates={"x": "source.x", "y": "source.y"})
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 2,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 1,
 'num_output_rows': 3,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 20,
 'scan_time_ms': 0,
 'rewrite_time_ms': 3}

In [43]:
# Print the table after the merge, ordered by x with a fresh 0..n-1 index.
result_df = dt.to_pandas().sort_values(by="x", ignore_index=True)
print(result_df)

   x  y
0  1  4
1  2  5
2  3  8


## Insert

In [48]:
# Reset the target table for this section.
# "tmp_table" was already created by the Update section, and write_deltalake
# defaults to mode="error" (raise if the table exists), so overwrite explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# Source keys 2 and 3 exist in the target; key 7 does not.
source_data = pa.table({"x": [2, 3, 7], "y": [4, 5, 8]})

In [49]:
# Insert-only merge: the single clause is when_not_matched_insert, so source
# rows without a matching target row (on x) are added and no clause touches
# matched rows.
insert_columns = {"x": "source.x", "y": "source.y"}
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
# Last expression of the cell: the notebook displays what execute() returns.
merger.when_not_matched_insert(updates=insert_columns).execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 0,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 0,
 'num_output_rows': 1,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 0,
 'execution_time_ms': 19,
 'scan_time_ms': 0,
 'rewrite_time_ms': 2}

In [50]:
# Print the table after the insert-only merge, ordered by x.
merged_df = dt.to_pandas()
print(merged_df.sort_values(by="x", ignore_index=True))

   x  y
0  1  4
1  2  5
2  3  6
3  7  8


## Delete

In [56]:
# Reset the target table for this section.
# "tmp_table" already exists from the earlier sections and write_deltalake
# defaults to mode="error" (raise if the table exists), so overwrite explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# The source carries a boolean "deleted" flag per key instead of a y value.
source_data = pa.table({"x": [2, 3], "deleted": [False, True]})

In [57]:
# Conditional delete: among the target rows matched on x, remove only those
# whose source row has deleted = true.
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
merger = merger.when_matched_delete(predicate="source.deleted = true")
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 2,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 0,
 'num_target_rows_deleted': 1,
 'num_target_rows_copied': 2,
 'num_output_rows': 2,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 22,
 'scan_time_ms': 0,
 'rewrite_time_ms': 2}

In [58]:
# Display the remaining rows ordered by x.
remaining = dt.to_pandas()
remaining.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5


## Upsert

In [12]:
# Reset the target table for this section.
# "tmp_table" already exists from the earlier sections and write_deltalake
# defaults to mode="error" (raise if the table exists), so overwrite explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# Source keys 2 and 3 exist in the target; key 5 is new.
source_data = pa.table({"x": [2, 3, 5], "y": [5, 8, 11]})

In [13]:
# Display the table before the merge, ordered by x.
before = dt.to_pandas()
before.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [14]:
# Upsert: matched target rows are updated from the source, and source rows
# with no match on x are inserted. Both clauses use the same column mapping.
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
merger = merger.when_matched_update(updates={"x": "source.x", "y": "source.y"})
merger = merger.when_not_matched_insert(updates={"x": "source.x", "y": "source.y"})
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 1,
 'num_output_rows': 4,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 20,
 'scan_time_ms': 0,
 'rewrite_time_ms': 3}

In [15]:
# Display the table after the upsert, ordered by x.
after = dt.to_pandas()
after.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,8
3,5,11


## Upsert with Delete

In [62]:
# Reset the target table for this section.
# "tmp_table" already exists from the earlier sections and write_deltalake
# defaults to mode="error" (raise if the table exists), so overwrite explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# Source keys 2 and 3 exist in the target, key 5 is new, and target key 1 has
# no counterpart in the source.
source_data = pa.table({"x": [2, 3, 5], "y": [5, 8, 11]})

In [63]:
# Upsert plus cleanup: update matched rows, insert unmatched source rows, and
# delete target rows that have no matching source row (x = 1 in this data).
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
merger = merger.when_matched_update(updates={"x": "source.x", "y": "source.y"})
merger = merger.when_not_matched_insert(updates={"x": "source.x", "y": "source.y"})
merger = merger.when_not_matched_by_source_delete()
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 1,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 1,
 'num_target_rows_copied': 0,
 'num_output_rows': 3,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 2,
 'num_target_files_removed': 1,
 'execution_time_ms': 21,
 'scan_time_ms': 0,
 'rewrite_time_ms': 3}

In [64]:
# Display the table after the upsert-with-delete merge, ordered by x.
synced = dt.to_pandas()
synced.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,2,5
1,3,8
2,5,11


## Multiple Matches

In [51]:
# Reset the target table for this section.
# "tmp_table" already exists from the earlier sections and write_deltalake
# defaults to mode="error" (raise if the table exists), so overwrite explicitly.
target_data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp_table", target_data, mode="overwrite")
dt = DeltaTable("tmp_table")
# Source keys 2 and 3 match target rows; key 5 matches nothing.
source_data = pa.table({"x": [2, 3, 5], "y": [5, 8, 11]})

In [52]:
# Display the table before the merge, ordered by x.
before = dt.to_pandas()
before.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,6


In [53]:
# Two "when matched" clauses that can both apply to the same rows: the delete
# predicate simply repeats the join condition.
# The metrics recorded below report 2 updated and 0 deleted rows, i.e. the
# update clause (declared first) was the one applied.
# NOTE(review): presumably clauses are evaluated in declaration order and the
# first applicable one wins - confirm against the deltalake documentation.
merger = dt.merge(
    source=source_data,
    predicate="target.x = source.x",
    source_alias="source",
    target_alias="target",
)
merger = merger.when_matched_update(updates={"x": "source.x", "y": "source.y"})
merger = merger.when_matched_delete(predicate="source.x = target.x")
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 3,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 2,
 'num_target_rows_deleted': 0,
 'num_target_rows_copied': 1,
 'num_output_rows': 3,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 21,
 'scan_time_ms': 0,
 'rewrite_time_ms': 2}

In [54]:
# Display the table after the merge, ordered by x.
after = dt.to_pandas()
after.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5
2,3,8


In [31]:
# Re-open the table pinned to version 0 and display that snapshot ordered by x.
# NOTE(review): the recorded output below lists a single row, whereas the
# table written at the top of this section has three rows - the output looks
# like it was captured in a different session; re-run to confirm.
dt = DeltaTable("tmp_table", version=0)
initial_snapshot = dt.to_pandas()
initial_snapshot.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4


### Delete on Predicate

In [1]:
from deltalake import DeltaTable, write_deltalake
import pyarrow as pa

# Build a three-row table at "tmp" and open a handle to it.
# write_deltalake defaults to mode="error", which raises if "tmp" already
# exists (for example when this notebook is re-run), so overwrite explicitly.
data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
write_deltalake("tmp", data, mode="overwrite")
dt = DeltaTable("tmp")
# The source carries a boolean "deleted" flag per key instead of a y value.
new_data = pa.table({"x": [2, 3], "deleted": [False, True]})

In [2]:
# Among the target rows matched on x, delete only those whose source row has
# deleted = true.
merger = dt.merge(
    source=new_data,
    predicate='target.x = source.x',
    source_alias='source',
    target_alias='target',
)
merger = merger.when_matched_delete(predicate="source.deleted = true")
# Last expression of the cell: the notebook displays what execute() returns.
merger.execute()

{'num_source_rows': 2,
 'num_target_rows_inserted': 0,
 'num_target_rows_updated': 0,
 'num_target_rows_deleted': 1,
 'num_target_rows_copied': 2,
 'num_output_rows': 2,
 'num_target_files_scanned': 1,
 'num_target_files_skipped_during_scan': 0,
 'num_target_files_added': 1,
 'num_target_files_removed': 1,
 'execution_time_ms': 31,
 'scan_time_ms': 0,
 'rewrite_time_ms': 4}

In [3]:
# Display the remaining rows ordered by x.
remaining = dt.to_pandas()
remaining.sort_values(by="x", ignore_index=True)

Unnamed: 0,x,y
0,1,4
1,2,5


### Delete All Matched Records

In [4]:
# Open the "tmp" table left behind by the previous section.
dt = DeltaTable("tmp")
# when_matched_delete() is called without a predicate, so every target row
# that matches a source row on x is deleted.
(
    dt.merge(
        source=new_data,
        predicate='target.x = source.x',
        source_alias='source',
        target_alias='target')
    .when_matched_delete()
    .execute()
)
# Sort by x, as the other result cells do, so the displayed row order does not
# depend on how the underlying files happen to be read.
dt.to_pandas().sort_values("x", ignore_index=True)



Unnamed: 0,x,y
0,1,4
