Skip to content

Commit

Permalink
Fix inconsistent merge (#30)
Browse files Browse the repository at this point in the history
* I suspect this was wrong

* Take into account offset

* Woops

Co-authored-by: Thomas <24695242+thomasw21@users.noreply.github.com>
  • Loading branch information
thomasw21 and Thomas committed Jul 28, 2021
1 parent fcd3836 commit 0125aaa
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions megatron/data/indexed_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,13 +298,16 @@ def merge_file_(self, another_file):
index = IndexedDataset(another_file)
assert index.dtype == self.dtype

offset = len(self.sizes)

begin = self.data_offsets[-1]
for offset in index.data_offsets[1:]:
self.data_offsets.append(begin + offset)
self.sizes.extend(index.sizes)
begin = self.dim_offsets[-1]
for dim_offset in index.dim_offsets[1:]:
self.dim_offsets.append(begin + dim_offset)
self.doc_idx.extend( (offset + index.doc_idx)[1:] )

with open(data_file_path(another_file), 'rb') as f:
while True:
Expand Down Expand Up @@ -566,8 +569,9 @@ def merge_file_(self, another_file):
total_len = len(index.sizes)+len(self._sizes)
print(f" concat {another_file} size={len(index.sizes)} for a total size of {total_len}")

for size in index.sizes:
self._sizes.append(size)
offset = len(self._sizes)
self._sizes.extend(index.sizes)
self._doc_idx.extend( (offset + index.doc_idx)[1:] )

# Concatenate data
with open(data_file_path(another_file), 'rb') as f:
Expand Down

0 comments on commit 0125aaa

Please sign in to comment.