<a href="https://colab.research.google.com/github/Yushamsi/Data-Ingestion-Combined-Flights/blob/main/Data-Ingestion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *Data Ingestion*

In [None]:
import pandas as pd
import dask.dataframe as dd
import modin.pandas as mpd
import os
import psutil
import time

In [None]:
# Path of the CSV file that every benchmark cell below reads.
# Placeholder value: point it at the real dataset before running the notebook.
file_path = "your_large_file.csv"

## Reading the file

In [None]:
# Function to measure read time, CPU usage and memory usage of one file read
def measure_usage(library_name, read_function, file_path):
    """Run one file read and print its wall-clock time, CPU and memory figures.

    Parameters
    ----------
    library_name : str
        Label used as the prefix of every printed line.
    read_function : callable
        Called as ``read_function(file_path)``. Its result is kept referenced
        until the memory sample has been taken and is then discarded.
    file_path : str
        Passed to ``read_function`` unchanged.

    Returns
    -------
    None
        The three measurements are printed, not returned.

    Notes
    -----
    The CPU and memory figures come from system-wide psutil counters, so
    anything else running on the machine during the read influences them.
    """
    # Baselines. With interval=None, cpu_percent() returns immediately and
    # starts a new measurement window; the value returned here is discarded.
    psutil.cpu_percent(interval=None)
    start_memory = psutil.virtual_memory().used / (1024 ** 3)  # Convert to GB
    start_time = time.perf_counter()

    # Read file. The reference is kept so the loaded data is still in memory
    # when the "after" memory sample is taken.
    df = read_function(file_path)

    # Stop the clock first, so that none of the sampling below is counted as
    # read time (a blocking CPU sample here used to add about one second).
    read_time_seconds = time.perf_counter() - start_time
    # Utilisation since the baseline call above, i.e. over the read itself.
    cpu_usage = psutil.cpu_percent(interval=None)
    end_memory = psutil.virtual_memory().used / (1024 ** 3)  # Convert to GB

    # Derived values
    memory_usage = end_memory - start_memory
    read_time_minutes = read_time_seconds / 60

    # Print results
    print(f"{library_name} read time: {read_time_seconds:.2f} seconds ({read_time_minutes:.2f} minutes)")
    print(f"{library_name} CPU usage during read: {cpu_usage:.2f}%")
    print(f"{library_name} Memory usage change: {memory_usage:.2f} GB")

Reading with Pandas

In [None]:
# Baseline: plain pandas loads the whole CSV eagerly in a single call.
measure_usage(
    library_name="Pandas",
    read_function=pd.read_csv,
    file_path=file_path,
)

Reading with Dask


In [None]:
# Dask is lazy: dd.read_csv() on its own does not load the data. The rows are
# only materialised once an action such as .compute() runs, so the read and the
# .compute() call are wrapped together in one callable and measured as a unit.
measure_usage(
    library_name="Dask",
    read_function=lambda csv_path: dd.read_csv(csv_path).compute(),
    file_path=file_path,
)

Reading with Modin (Pandas on Ray)

In [None]:
# Modin with Ray
from modin.config import Engine  # imported here so this cell runs on its own

# Kept for anything that reads the environment variable later on.
os.environ["MODIN_ENGINE"] = "ray"  # Use Ray as the backend
# modin.pandas is already imported at this point, so the engine is selected
# explicitly through Modin's config API instead of relying on the environment
# variable still being consulted.
Engine.put("Ray")
measure_usage("Modin Ray", mpd.read_csv, file_path)


Reading with Modin (Pandas on Dask)

In [None]:
# Modin with Dask
from modin.config import Engine  # imported here so this cell runs on its own

# Kept for anything that reads the environment variable later on.
os.environ["MODIN_ENGINE"] = "dask"  # Use Dask as the backend
# Once Modin has resolved an engine (an earlier Modin read does that), changing
# the environment variable alone is not expected to switch it. Engine.put()
# makes the switch explicit, so this cell really benchmarks the Dask backend.
# NOTE(review): an engine started earlier in the session may still be running
# and holding memory; restart the kernel first for a cleaner comparison.
Engine.put("Dask")
measure_usage("Modin Dask", mpd.read_csv, file_path)