In [16]:
!pip install polars



In [42]:
from functools import wraps
import time
import psutil
import pandas as pd
from google.cloud import storage
import logging
from io import StringIO
import polars as pl
from typing import List

In [33]:
def monitor_cpu_usage(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        current_process = psutil.Process()
        current_cpu_usage = current_process.cpu_percent(interval=0.5)
        print(f"Function '{func.__name__}' called with CPU usage {current_cpu_usage}%")
        return func(*args, **kwargs)

    return wrapper

In [12]:
def monitor_time_usage(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time
        print(f"Function {func.__name__} execution time: {execution_time:.4f} seconds")
        return result

    return wrapper

In [26]:
@monitor_cpu_usage
@monitor_time_usage
def read_data_from_gcs(bucket_name: str, source: str, delimiter: str) -> pd.DataFrame:
  # Explicitly use service account credentials by specifying the private key file.
  storage_client = storage.Client.from_service_account_json('/content/perqara-data-532572ce4996.json')

  # Get the bucket and blob objects
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(source)

  # Download the contents of the blob as a string
  csv_data = blob.download_as_text()

  df = pd.read_csv(StringIO(csv_data), delimiter=delimiter, low_memory=False)
  logging.info(f"Reading {source} from bucket: {bucket_name}...")

  return df

In [34]:
bucket_name = "perqara-dendrobium"
source = "raw/mongodb/csv/m_chat_messages/m_chat_messages.csv" # 355,687 rows
delimiter = "|"

# Pandas 2.0
df = read_data_from_gcs(bucket_name, source, delimiter)

Function 'read_data_from_gcs' called with CPU usage 1.0%
Function read_data_from_gcs execution time: 3.8291 seconds


In [18]:
@monitor_cpu_usage
@monitor_time_usage
def read_data_from_gcs_with_polars(bucket_name: str, source: str, delimiter: str) -> pd.DataFrame:
  # Explicitly use service account credentials by specifying the private key file.
  storage_client = storage.Client.from_service_account_json('/content/perqara-data-532572ce4996.json')

  # Get the bucket and blob objects
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(source)

  # Download the contents of the blob as a string
  csv_data = blob.download_as_text()

  df = pl.read_csv(StringIO(csv_data), separator=delimiter)
  logging.info(f"Reading {source} from bucket: {bucket_name}...")

  return df

In [38]:
# Polars Eager Mode
df_polars = read_data_from_gcs_with_polars(bucket_name, source, delimiter)

Function 'read_data_from_gcs_with_polars' called with CPU usage 0.0%
Function read_data_from_gcs_with_polars execution time: 2.4737 seconds


In [29]:
@monitor_cpu_usage
@monitor_time_usage
def read_data_from_gcs_with_polars_v2(bucket_name: str, source: str, delimiter: str) -> pd.DataFrame:
  # Explicitly use service account credentials by specifying the private key file.
  storage_client = storage.Client.from_service_account_json('/content/perqara-data-532572ce4996.json')

  # Get the bucket and blob objects
  bucket = storage_client.get_bucket(bucket_name)
  blob = bucket.blob(source)

  # Download the contents of the blob as a string
  csv_data = blob.download_as_text()

  df = pl.scan_csv(StringIO(csv_data), separator=delimiter)
  logging.info(f"Reading {source} from bucket: {bucket_name}...")

  return df

In [40]:
# Polars Lazy Mode
df_polars_v2 = read_data_from_gcs_with_polars(bucket_name, source, delimiter)

Function 'read_data_from_gcs_with_polars' called with CPU usage 0.0%
Function read_data_from_gcs_with_polars execution time: 1.7459 seconds




---



In [31]:
@monitor_cpu_usage
@monitor_time_usage
def remove_duplicates(df: pd.DataFrame, column_list: list) -> pd.DataFrame:
  for column in column_list:
      df.drop_duplicates(subset=column, keep="last", inplace=True)

  logging.info(f"Duplicated rows based on column: {column} has been successfully removed.")
  return df

In [48]:
# Pandas 2.0
result_df1 = remove_duplicates(df, ["object_id"])

Function 'remove_duplicates' called with CPU usage 1.0%
Function remove_duplicates execution time: 0.1081 seconds


In [52]:
@monitor_cpu_usage
@monitor_time_usage
def remove_duplicates_with_polars(df: pl.DataFrame, column_list: List[str]) -> pl.DataFrame:
  for column in column_list:
      df = df.unique(subset=column, keep="last")

  logging.info(f"Duplicated rows based on columns: {', '.join(column_list)} have been successfully removed.")
  return df

In [53]:
# Polars Eager Mode
result_df2 = remove_duplicates_with_polars(df_polars, ["object_id"])

Function 'remove_duplicates_with_polars' called with CPU usage 2.0%
Function remove_duplicates_with_polars execution time: 0.3603 seconds


In [54]:
@monitor_cpu_usage
@monitor_time_usage
def remove_duplicates_with_polars_v2(df: pl.DataFrame, column_list: List[str]) -> pl.DataFrame:
  for column in column_list:
      df = df.unique(subset=column, keep="last")

  logging.info(f"Duplicated rows based on columns: {', '.join(column_list)} have been successfully removed.")
  return df

In [55]:
# Polars Lazy Mode
result_df3 = remove_duplicates_with_polars_v2(df_polars_v2, ["object_id"])

Function 'remove_duplicates_with_polars_v2' called with CPU usage 0.0%
Function remove_duplicates_with_polars_v2 execution time: 0.3216 seconds
