In [1]:
import os
import pandas as pd
import dask.dataframe as dd
import time
import psutil

file_path = "/kaggle/input/taxi-trip-data/2018_Yellow_Taxi_Trip_Data.csv"

if not os.path.exists(file_path):
    print("The data is not available")
else:
    print(f"The data is available {file_path}")
    print(f"data size {os.path.getsize(file_path) / (1024**3):.2f} GB")

The data is available /kaggle/input/taxi-trip-data/2018_Yellow_Taxi_Trip_Data.csv
data size 9.71 GB


In [2]:
start = time.time()
rows = 0

for chunk in pd.read_csv(file_path, chunksize=5000, dtype=str):
    rows += len(chunk)

end = time.time()
mem = psutil.virtual_memory().used / (1024**3)

print(f"Read {rows:,} rows")
print(f"Time taken {end - start:.2f} s")
print(f"Memory used {mem:.2f} GB")


Read 112,234,626 rows
Time taken 486.50 s
Memory used 1.30 GB


In [3]:
start = time.time()

df = dd.read_csv(file_path, assume_missing=True, dtype=str)

n_rows = df.shape[0].compute()

end = time.time()
mem = psutil.virtual_memory().used / (1024**3)

print(f"Dask {n_rows:,} rows")
print(f"Time taken {end - start:.2f} s")
print(f"Memory used {mem:.2f} GB")

Dask 112,234,626 rows
Time taken 307.57 s
Memory used 2.84 GB


In [4]:
import gzip
import shutil

gz_file = "2018_Yellow_Taxi_Trip_Data.csv.gz"

if not os.path.exists(gz_file):
    print("Compressing the file...")
    with open(file_path, 'rb') as f_in:
        with gzip.open(gz_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    print("Compression completed successfully.")

start = time.time()
rows = 0

for chunk in pd.read_csv(gz_file, chunksize=(5000), compression='gzip', dtype=str):
    rows += len(chunk)

end = time.time()

mem = psutil.virtual_memory().used / (1024**3)

print(f"Pandas + gzip {rows:,} rows")
print(f"Time taken {end - start:.2f} seconds")
print(f"Memory used {mem:.2f} GB")


Compressing the file...
Compression completed successfully.
Pandas + gzip 112,234,626 rows
Time taken 389.68 seconds
Memory used 2.84 GB
