In [1]:
import contextlib
import csv
import heapq
import os

import pandas as pd

# Locations of the unsorted input, the sorted output, and the scratch
# directory that holds the intermediate sorted chunk files.
file_read = r'D:\ba\Results\v1t3\msb_filled.csv'
file_write = r'D:\ba\Results\v1t3\msb_filled_sorted.csv'
temp_dir = r'D:\ba\Results\v1t3\temp_chunks'

# Number of CSV rows loaded, sorted and written out per chunk.
chunk_size = 5_000_000

# The scratch directory may already exist; that is not an error.
os.makedirs(temp_dir, exist_ok=True)
#---------------------------------------------------------------------------------------
# Paths of the sorted chunk files, in the order the chunks were read.
temp_files = []

# Phase 1 of the external sort: read the input `chunk_size` rows at a time,
# sort each chunk in memory on its third column, and write it to its own
# CSV file in `temp_dir`.
print("Splitting and sorting chunks...")
# parse_dates=[2] makes pandas parse the third column as timestamps.
chunk_reader = pd.read_csv(file_read, chunksize=chunk_size, parse_dates=[2])
for i, chunk in enumerate(chunk_reader):
    # mergesort is stable: rows that share a timestamp keep the relative
    # order they had in the input instead of being shuffled by the default
    # (unstable) quicksort.
    chunk_sorted = chunk.sort_values(by=chunk.columns[2], kind='mergesort')
    temp_file = os.path.join(temp_dir, f'chunk_{i}.csv')
    chunk_sorted.to_csv(temp_file, index=False)
    temp_files.append(temp_file)
    print(f"Chunk {i} sorted and saved: {temp_file}")


print("Merging sorted chunks...")


def parse_row(row):
    """Return ``(timestamp, row)`` for one CSV data line.

    ``row`` is a single line of a chunk file without its line terminator.
    The timestamp is taken from the third column and parsed with
    ``pd.Timestamp``; the line itself is returned unchanged.

    NOTE: rows are handled line by line, so a quoted field that contains an
    embedded newline is not supported.
    """
    if '"' in row:
        # A quoted field may itself contain commas, so a plain split would
        # pick the wrong column; let the csv module do the parsing.
        stamp_text = next(csv.reader([row]))[2]
    else:
        stamp_text = row.split(',', 3)[2]
    return pd.Timestamp(stamp_text), row


def _merge_key(row):
    """Sort key for a data line: by timestamp, missing timestamps last."""
    stamp = parse_row(row)[0]
    # NaT never compares as smaller or greater than anything, which would
    # corrupt the merge order. sort_values() placed missing timestamps at the
    # end of every chunk (its default), so rank them last here as well.
    return (1, None) if pd.isna(stamp) else (0, stamp)


def _data_rows(handle):
    """Yield the data lines of an open chunk file, header and newline removed."""
    next(handle, None)  # every chunk file starts with the header line
    for line in handle:
        # Only drop the line terminator: a blanket strip() would also eat
        # whitespace that belongs to the first or last field.
        yield line.rstrip('\n')


# Kept as a module-level list so later cells/steps can still refer to it; the
# handles themselves are closed by the ExitStack below, even if the merge fails.
temp_file_handles = []

# pandas wrote the chunk files as UTF-8 (its default), so read and write with
# the same encoding instead of the platform's locale encoding.
with open(file_write, 'w', encoding='utf-8') as output_file, \
        contextlib.ExitStack() as stack:
    if temp_files:
        # Copy the header from the first chunk using a separate, short-lived
        # handle. Reading it from the merge handle and then skipping a header
        # line again would silently drop that chunk's first data row.
        with open(temp_files[0], 'r', encoding='utf-8') as first_chunk:
            output_file.write(first_chunk.readline())

    for path in temp_files:
        temp_file_handles.append(
            stack.enter_context(open(path, 'r', encoding='utf-8'))
        )

    # Phase 2 of the external sort: k-way merge of the already sorted chunks.
    # heapq.merge only keeps one pending row per chunk in memory and, on equal
    # keys, emits rows from earlier chunks first.
    merged_rows = heapq.merge(*map(_data_rows, temp_file_handles), key=_merge_key)
    output_file.writelines(row + '\n' for row in merged_rows)


print("Cleaning up temporary files...")

# Release every chunk file handle first, then delete the chunk files and
# finally the (now empty) scratch directory.
for handle in temp_file_handles:
    handle.close()

for chunk_path in temp_files:
    os.remove(chunk_path)

os.rmdir(temp_dir)

print(f"Sorting completed! File saved at: {file_write}")


Splitting and sorting chunks...
Chunk 0 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_0.csv
Chunk 1 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_1.csv
Chunk 2 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_2.csv
Chunk 3 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_3.csv
Chunk 4 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_4.csv
Chunk 5 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_5.csv
Chunk 6 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_6.csv
Chunk 7 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_7.csv
Chunk 8 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_8.csv
Chunk 9 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_9.csv
Chunk 10 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_10.csv
Chunk 11 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_11.csv
Chunk 12 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_12.csv
Chunk 13 sorted and saved: D:\ba\Results\v1t3\temp_chunks\chunk_1