In [1]:
import pandas as pd
import time
import numpy as np
from scipy.sparse import csr_matrix # for creating sparse data

data_file = 'DELETE_ME_huge_file.csv' # Do NOT FORGET TO DELETE THIS AT THE END

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# chunking

In [2]:
# 1. Create a large CSV (5 million rows) # took 30 sec
rows = 50_000_000
df = pd.DataFrame({
    "id": np.arange(rows),
    "value": np.random.randint(0, 100, size=rows)
})

df.to_csv(data_file, index=False)
print(f"Successfully created {data_file}")

Successfully created DELETE_ME_huge_file.csv


In [17]:
# 2. Read at once (memory heavy)
start   = time.process_time()
df_full = pd.read_csv(data_file)
total_sum_full = df_full["value"].sum()
end     = time.process_time()
print(f"Full load:    sum={total_sum_full}, time={end - start:.2f} sec")

Full load:    sum=2474891770, time=4.83 sec


In [18]:
# 3. Read in chunks (memory light)
start = time.process_time()

total_sum_chunk = 0
chunks = pd.read_csv(data_file, chunksize=100_000)  # 100k rows at a time
for chunk in chunks:
    total_sum_chunk += chunk["value"].sum()
    
end = time.process_time()
print(f"Chunked load: sum={total_sum_chunk}, time={end - start:.2f} sec")

Chunked load: sum=2474891770, time=9.61 sec


In [19]:
# conclusion
# Both methods give the same result
# Chunking uses far less memory (you’re only holding 100k rows in memory at any given time instead of 5 million).
# The speed may be slightly better or slightly worse depending on system I/O, but the real win is memory efficiency.

In [4]:
# Now lets calculate mean value using chunking: took 12 sec

start = time.process_time()

total_sum   = 0
total_count = 0

chunks = pd.read_csv(data_file, chunksize=100_000)  # 100k rows at a time
for chunk in chunks:
    total_sum   += chunk["value"].sum()
    total_count += chunk["value"].count()   # count non-NaN values

mean_value = total_sum / total_count

end = time.process_time()

print(f"Chunked load: mean={mean_value}, time={end - start:.2f} sec")

Chunked load: mean=49.49528002, time=6.84 sec
