# Pandas CSV Reading Performance Tips

## Following code would generate data_file and then we would use this file

In [None]:
import pandas as pd
import time
import numpy as np

data_file = 'junk_data_read_performance.csv'

In [36]:
# NOTE: Took 1 minute to generate the CSB file

# Lets generate some fake data and save to a csv file for testing:

# Number of rows
n = 10000000

# Random seed for reproducibility
np.random.seed(42)

# Generate data
data = {
    "id": np.arange(1, n + 1, dtype="int32"),
    "name": np.random.choice(["Fernando", "Prakash", "Shamlodhiya", "Smith", "Patel", "Juno"], size=n),
    "amount": np.round(np.random.uniform(50, 500, size=n), 2).astype("float32"),
    "status": np.random.choice(["Paid", "Pending", "Failed"], size=n, p=[0.6, 0.3, 0.1])
}

# Create DataFrame
df = pd.DataFrame(data)

# Save as CSV
df.to_csv(data_file, index=False)

print(f"Sample dataset '{data_file}' with {n} rows created successfully!")

Sample dataset 'data_read_perfomance.csv' with 10000000 rows created successfully!


## 1. Use dtype to specify column types
- Avoid pandas guessing data
- The process is slower but is memory efficient
- Define dtypes for each column

In [37]:
# NOTE: Took 4 secs
# step1: Without specifying dtypes

start = time.process_time()
df = pd.read_csv(data_file)
end = time.process_time()

print(df.head())
print(end - start)
print(df.info(memory_usage="deep"))

   id         name  amount  status
0   1        Smith  282.05    Paid
1   2        Patel  355.10    Paid
2   3  Shamlodhiya  164.49  Failed
3   4        Patel  122.70  Failed
4   5        Patel   82.75    Paid
3.890625
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   name    object 
 2   amount  float64
 3   status  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 1.3 GB
None


In [38]:
# NOTE: Took 4.2 secs
# step2: with dtype explicitly mentioned

dtypes = {
    'id': 'int32',
    'name': 'category',
    'amount': 'float32',
    'status': 'category'
}
start = time.process_time()
df = pd.read_csv(data_file, dtype=dtypes)
end = time.process_time()

print(df.head())
print(end - start)
print(df.info(memory_usage="deep"))

   id         name      amount  status
0   1        Smith  282.049988    Paid
1   2        Patel  355.100006    Paid
2   3  Shamlodhiya  164.490005  Failed
3   4        Patel  122.699997  Failed
4   5        Patel   82.750000    Paid
4.234375
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 4 columns):
 #   Column  Dtype   
---  ------  -----   
 0   id      int32   
 1   name    category
 2   amount  float32 
 3   status  category
dtypes: category(2), float32(1), int32(1)
memory usage: 95.4 MB
None


In [39]:
# How much memry saved ?
1300/95.4

13.626834381551362

## conclusion: The process slows down little bit, but you save memory

## 2. Use usecols to load only required columns

In [40]:
# step1: Read all columns

start = time.process_time()
df = pd.read_csv(data_file)
end = time.process_time()

print(df.head())
print(end - start)
print(df.info(memory_usage="deep"))

   id         name  amount  status
0   1        Smith  282.05    Paid
1   2        Patel  355.10    Paid
2   3  Shamlodhiya  164.49  Failed
3   4        Patel  122.70  Failed
4   5        Patel   82.75    Paid
3.734375
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   name    object 
 2   amount  float64
 3   status  object 
dtypes: float64(1), int64(1), object(2)
memory usage: 1.3 GB
None


In [41]:
# step2: Read all columns

start = time.process_time()
df = pd.read_csv(data_file, usecols=['id', 'amount'])
end = time.process_time()

print(df.head())
print(end - start)
print(df.info(memory_usage="deep"))

   id  amount
0   1  282.05
1   2  355.10
2   3  164.49
3   4  122.70
4   5   82.75
3.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000000 entries, 0 to 9999999
Data columns (total 2 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   amount  float64
dtypes: float64(1), int64(1)
memory usage: 152.6 MB
None


In [42]:
# How much saving in memory
1300/152

8.552631578947368

In [None]:
## conlusion: Helps save memory

## 5. Process large files in chunks using chunksize

### When to use chunking ?

You only need to aggregate results (e.g., sum, mean, counts).

In [31]:

# 1. Create a large CSV (5 million rows)
rows = 5_000_000
df = pd.DataFrame({
    "id": np.arange(rows),
    "value": np.random.randint(0, 100, size=rows)
})
df.to_csv("big_data.csv", index=False)

# -------------------------------
# 2. Read at once (memory heavy)
start = time.process_time()
df_full = pd.read_csv("big_data.csv")
total_sum_full = df_full["value"].sum()
end = time.process_time()
print(f"Full load:    sum={total_sum_full}, time={end - start:.2f} sec")

# -------------------------------
# 3. Read in chunks (memory light)
start = time.process_time()
total_sum_chunk = 0
chunks = pd.read_csv("big_data.csv", chunksize=100_000)  # 100k rows at a time
for chunk in chunks:
    total_sum_chunk += chunk["value"].sum()
end = time.process_time()
print(f"Chunked load: sum={total_sum_chunk}, time={end - start:.2f} sec")


Full load:    sum=247479112, time=0.53 sec
Chunked load: sum=247479112, time=0.89 sec


## conlusion
- Both methods give the same result
- Chunking uses far less memory (you’re only holding 100k rows in memory at any given time instead of 5 million).
- The speed may be slightly better or slightly worse depending on system I/O, but the **real win is memory efficiency**.

## 7. (NOT CONCLUSIVE): Disable quoting if not needed (speeds up parsing)

In [46]:

import csv

start = time.perf_counter()
df = pd.read_csv(data_file)
end = time.perf_counter()

print(end - start)

4.984042200000204


In [47]:

start = time.perf_counter()
df = pd.read_csv(data_file, quoting=csv.QUOTE_NONE)
end = time.perf_counter()

print(end - start)

4.962390000000141


In [None]:
# ## 3. Parse dates efficiently using parse_dates
df = pd.read_csv(data_file, parse_dates=['created_at'])

In [None]:
# ## 6. Use compression if reading from zipped files
df = pd.read_csv('data.csv.gz', compression='gzip')

In [48]:
# ## 8. Use faster backend engines (pandas 2.0+)
df = pd.read_csv(data_file, engine='pyarrow')  # Alternative to engine='c'

ImportError: Missing optional dependency 'pyarrow'.  Use pip or conda to install pyarrow.