This notebook visualizes how efficiently `pandas` and `polars` read CSV files of different sizes, measured by reading time.

To read a CSV file with `pandas`, the function used is:
```
import pandas as pd
pd.read_csv()
```

And for `polars`, it is:
```
import polars as pl
pl.read_csv()
```

In [None]:
# CSV files to benchmark, listed from smallest to largest.
# The trailing comment on each entry is the author's note of its approximate size.
all_csv_path = [
    "/kaggle/input/football-players-data/fifa_players.csv",  # 3.6 MB
    "/kaggle/input/crop-production-in-india/Crop_production.csv",  # 10 MB
    "/kaggle/input/e-commerece-sales-data-2023-24/product_details.csv",  # 20 MB
    "/kaggle/input/global-fire-burned-area/GlobalFireBurnedArea_2022.csv",  # 40 MB
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_01.csv",  # 74 MB
    "/kaggle/input/daigt-proper-train-dataset/train_drcat_04.csv",  # 103 MB
    "/kaggle/input/bitcoin-historical-data/bitstampUSD_1-min_data_2012-01-01_to_2021-03-31.csv",  # 317 MB
    "/kaggle/input/ecommerce-dataset/item_properties_part1.csv",  # 484 MB
    "/kaggle/input/product-titles-text-classification/titles_to_categories.csv",  # 788 MB
    "/kaggle/input/job-description-dataset/job_descriptions.csv",  # 1.74 GB
    "/kaggle/input/political-advertisements-from-facebook/fbpac-ads-en-US.csv",  # 3.22 GB
]

In [None]:
import itertools
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns

In [None]:
# Number of times each file is read with each library; change this to run more or fewer trials.
number_of_times_to_read = 7

# Results keyed by CSV path; filled in by the cells that follow.
all_csv = dict()

In [None]:
# Start each file's result list with its on-disk size.
# The size is bytes / (1024 * 1024), kept as a string with two decimals.
for path in all_csv_path:
    size_mb = os.path.getsize(path) / (1024 * 1024)
    all_csv[path] = ['{:.2f}'.format(size_mb)]

all_csv

**So, for every key (a CSV path), the first element of its value list is the file size in MB, stored as a `str`.**

In [None]:
def _time_read(reader, path, library):
    """Time a single ``reader(path)`` call.

    Parameters
    ----------
    reader : callable
        CSV reader to benchmark, e.g. ``pd.read_csv`` or ``pl.read_csv``.
    path : str
        Path of the CSV file to read.
    library : str
        Library name used in the error message.

    Returns
    -------
    float
        Elapsed seconds rounded to 4 decimals, or NaN if the read raised.
        Returning NaN instead of skipping the trial keeps the pandas and
        polars trial lists the same length.
    """
    start = time.perf_counter()  # monotonic, high-resolution timer
    try:
        frame = reader(path)
    except Exception as exc:  # deliberately not bare: Ctrl-C must still abort the run
        print(f"Error reading {path} with {library}: {exc!r}")
        return float("nan")
    elapsed = time.perf_counter() - start
    # Stop the clock before releasing the frame so deallocation is not timed,
    # and release it now so only one loaded frame is held at a time.
    del frame
    return round(elapsed, 4)


# Read every file `number_of_times_to_read` times with each library,
# alternating pandas and polars within each trial.
for csv_file in all_csv_path:
    trials = {
        'pandas': [],
        'polars': []
    }

    for _ in range(number_of_times_to_read):
        trials['pandas'].append(_time_read(pd.read_csv, csv_file, 'pandas'))
        trials['polars'].append(_time_read(pl.read_csv, csv_file, 'polars'))

    # Replace everything after the size entry instead of appending, so that
    # re-running this cell leaves the value as [size, trials] rather than
    # accumulating one extra trials dict per run.
    all_csv[csv_file][1:] = [trials]

In [None]:
# Display the collected results: each CSV path maps to its size entry followed by the timing trials.
all_csv

In [None]:
# Flatten the nested results into one row per (file, trial) with the columns
# `size`, `pandas_import_time` and `polars_import_time`.
stat_columns = ['size', 'pandas_import_time', 'polars_import_time']
stat_records = []

for file_info in all_csv.values():
    # The size was stored as a formatted string; convert it once per file.
    file_size_in_mb = float(file_info[0])

    pandas_trials = file_info[1]['pandas']
    polars_trials = file_info[1]['polars']

    # zip_longest pads the shorter list with NaN. A plain zip would silently
    # drop the timings of one library whenever the other recorded fewer trials.
    paired_trials = itertools.zip_longest(
        pandas_trials, polars_trials, fillvalue=float('nan')
    )
    for pandas_time, polars_time in paired_trials:
        stat_records.append(
            {
                'size': file_size_in_mb,
                'pandas_import_time': pandas_time,
                'polars_import_time': polars_time
            }
        )

# Passing `columns` keeps the expected columns even when there are no records,
# so later cells can still select them.
Stats = pd.DataFrame(stat_records, columns=stat_columns)
Stats

In [None]:
# Save the per-trial timings to a CSV file (relative path), omitting the DataFrame index.
Stats.to_csv('PandasPolarsCsvReadingTimes.csv', index=False)

In [None]:
# List the distinct file sizes present in the results.
Stats['size'].unique()

In [None]:
# Plot reading time against CSV size for both libraries on one set of axes.
plt.style.use('ggplot')

fig, ax = plt.subplots(figsize=(12, 6))

# One line per library. The repeated trials at each size are summarised by
# seaborn, with errorbar="sd" drawing a standard-deviation band around the line.
for time_column, library_label in (
    ("pandas_import_time", 'Pandas'),
    ("polars_import_time", 'Polars'),
):
    ax = sns.lineplot(
        data=Stats,
        x="size",
        y=time_column,
        errorbar="sd",
        label=library_label,
        marker='o',
        linestyle='-',
        ax=ax,
    )

# Axis labels: black text on a white box.
ax.set_xlabel('CSV Size (MB)')
ax.set_ylabel('Mean Reading Time (Seconds)')
for axis_label in (ax.xaxis.label, ax.yaxis.label):
    axis_label.set_color('black')
    axis_label.set_bbox(dict(facecolor='white', edgecolor='white'))

ax.set_title('Pandas and Polars CSV file reading time with Error Band\n')

# Legend with a visible white frame.
legend = ax.legend()
legend.set_frame_on(True)
legend_frame = legend.get_frame()
legend_frame.set_facecolor('white')

ax.grid(True)
plt.show()