## Preparacao

In [1]:
import os
import sys
import boto3

In [2]:
# Pin boto3's default session to an explicit profile and region; clients that
# are created through the default session afterwards inherit these settings.
aws_session_settings = {"profile_name": "default", "region_name": "us-east-1"}
boto3.setup_default_session(**aws_session_settings)

## Informacoes do sistema

In [3]:
# Report the CPU count that the operating system exposes to this kernel.
n_cores = os.cpu_count()
print("Numero de cores:", n_cores)

Numero de cores: 16


In [4]:
!lscpu | grep -iE "Modelo"

Modelo:                               141
Nome do modelo:                       11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz


In [5]:
!nvidia-smi

Sat Nov 16 17:09:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...    On  | 00000000:01:00.0  On |                  N/A |
| N/A   49C    P5              13W / 115W |    126MiB /  6144MiB |     31%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

## Definicoes

In [None]:
# size_memory = round(sys.getsizeof(df) / (1024 * 1024), 2)
# print("Tamanho em memoria com sys:    ", size_memory)
# print("Tamanho em memoria com pandas: ", round(df.memory_usage(index=True, deep=True).sum() / (1024 * 1024), 2))

# dict_resp = wr.s3.size_objects(s3_file)
# size_s3 = dict_resp[s3_file] / (1024 * 1024)

In [3]:
def calculate_size(obj):
    """Print and return the in-memory size of ``obj`` in MiB (2 decimals).

    ``sys.getsizeof`` only gives a useful figure when the object's
    ``__sizeof__`` accounts for the buffers it owns; for wrapper-style frame
    objects it can report just the size of the Python wrapper. To make the
    helper usable with every frame type loaded in this notebook, the size is
    taken from the first of these that the object provides:

    1. ``obj.memory_usage(deep=True)`` (pandas-style API; summed if it
       returns a per-column result),
    2. ``obj.estimated_size()`` (polars-style API),
    3. ``obj.nbytes`` (Arrow/NumPy-style attribute),
    4. ``sys.getsizeof(obj)`` as the generic fallback.

    Parameters
    ----------
    obj : Any
        Object whose memory footprint should be measured.

    Returns
    -------
    float
        Size in MiB, rounded to two decimal places.
    """
    if hasattr(obj, "memory_usage"):
        usage = obj.memory_usage(deep=True)
        # A frame returns one value per column; a series returns a scalar.
        size_bytes = usage.sum() if hasattr(usage, "sum") else usage
    elif hasattr(obj, "estimated_size"):
        size_bytes = obj.estimated_size()
    elif hasattr(obj, "nbytes"):
        size_bytes = obj.nbytes
    else:
        size_bytes = sys.getsizeof(obj)

    # Lazy collections hand back a deferred scalar; resolve it before converting.
    if hasattr(size_bytes, "compute"):
        size_bytes = size_bytes.compute()

    # int() normalises library scalar types so a plain Python float is returned.
    size_memory = round(int(size_bytes) / (1024 * 1024), 2)
    # The label is kept unchanged so the printed output stays comparable with
    # the results already recorded in the notebook.
    print("Tamanho em memoria com sys:", size_memory, "MB")
    return size_memory

In [4]:
# S3 URI of the Parquet file that the loaders in this notebook read.
s3_file = "s3://data-us-east-1-891377318910/datasets/loan/part-loan-2018.parquet"

# Pandas

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html#pandas.read_parquet

In [None]:
import pandas as pd

<ul>
<li>engines: <em>(default: auto)</em>
    <ul>
        <li>auto</li>
        <li>pyarrow</li>
        <li>fastparquet</li>
    </ul>
</li>
<br>

<li>dtype_backend: <em>(default: numpy_nullable)</em>
    <ul>
        <li>numpy_nullable</li>
        <li>pyarrow</li>
    </ul>
</li>
</ul>

In [30]:
# Load the benchmark file with the pyarrow engine and pyarrow-backed dtypes.
pandas_read_options = {"engine": "pyarrow", "dtype_backend": "pyarrow"}
df = pd.read_parquet(path=s3_file, **pandas_read_options)

In [40]:
size_memory = calculate_size(df)

Tamanho em memoria com sys:  541.87  MB


|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento | engine | dtype_backend |
|--------|-----------------|-----------------|-----------------------|--------|---------------|
| pandas | 60.5 MB | 1381.94 MB | 01:39 s    | pyarrow | numpy_nullable |
| pandas | 60.5 MB | 541.87 MB | 01:03 s    | pyarrow | pyarrow |
| pandas | 60.5 MB |  1381.94 MB | 00:11 s    | fastparquet | none |

# CuDF

In [None]:
# https://rapids.ai/#quick-start
# https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/

In [None]:
# pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12

# pip install \
#   --extra-index-url=https://pypi.nvidia.com \
#   cudf-cu12==24.10.* \
#   dask-cudf-cu12==24.10.* \
#   cuml-cu12==24.10.* \
#   cugraph-cu12==24.10.*

In [137]:
# Load the cudf.pandas IPython extension for this kernel.
# NOTE(review): the cudf.pandas documentation asks for the extension to be
# loaded before pandas is first imported; an earlier cell already imports
# pandas, so a kernel restart may be needed for acceleration to apply - confirm.
%load_ext cudf.pandas

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


In [None]:
import rmm
import cudf
import pandas as pd

In [144]:
# cudf.describe_option()

In [47]:
# Wrap the current RMM device memory resource in a StatisticsResourceAdaptor and
# install the wrapper as the current resource; its counters are read back later
# through `stats_mr.allocation_counts`.
stats_mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())
rmm.mr.set_current_device_resource(stats_mr)

In [36]:
pd

<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>

In [None]:
df = pd.read_parquet(path=s3_file, engine="pyarrow", dtype_backend="pyarrow")

In [44]:
# Deep memory footprint (index included), converted from bytes to MiB.
bytes_in_memory = df.memory_usage(index=True, deep=True).sum()
print("Tamanho em memoria com pandas: ", round(bytes_in_memory / (1024 * 1024), 2))

Tamanho em memoria com pandas:  1917.08


In [50]:
# Counters collected by the statistics adaptor, converted from bytes to MiB.
bytes_per_mib = 1024**2
current_mib = round(stats_mr.allocation_counts.current_bytes / bytes_per_mib, 0)
peak_mib = round(stats_mr.allocation_counts.peak_bytes / bytes_per_mib, 0)
print(f"Total memory usage: {current_mib} MB")
print(f"Peak memory usage:  {peak_mib} MB")

Total memory usage 0.0 MB
Peak memory usage 0.0 MB


|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento | engine  | dtype_backend  |
|--------|-----------------|-----------------|-----------------------|---------|----------------|
| cudf   | 60.5 MB         | 541.87 MB       | 00:48 s               | pyarrow | pyarrow        |
| cudf   | 60.5 MB         | 1917.08 MB      | 01:50 s               | pyarrow | numpy_nullable |
| cudf   | 60.5 MB         | 1917.08 MB      | 01:40 s               | fastparquet | None       |

# awswrangler (AWS SDK for pandas)

In [6]:
import pandas as pd
import awswrangler as wr

2024-11-16 14:50:52,545	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
2024-11-16 14:50:52,696	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [None]:
# Execution engine options: python, ray
wr.engine.set("python")
# wr._distributed.Engine.set("python")

In [20]:
# Memory format options: pandas, modin
wr.memory_format.set("pandas")

In [21]:
print(f"Execution Engine: {wr.engine.get()}")
print(f"Memory Format:    {wr.memory_format.get()}")

Execution Engine: EngineEnum.PYTHON
Memory Format:    MemoryFormatEnum.PANDAS


### s3 - pandas

In [None]:
# https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.s3.read_parquet.html#awswrangler.s3.read_parquet
# https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.s3.select_query.html

In [22]:
wr.engine.set("python")
wr.memory_format.set("pandas")

print(f"Execution Engine: {wr.engine.get()}")
print(f"Memory Format:    {wr.memory_format.get()}")

Execution Engine: EngineEnum.PYTHON
Memory Format:    MemoryFormatEnum.PANDAS


In [None]:
df = wr.s3.read_parquet(path=s3_file, dtype_backend="pyarrow", use_threads=True)

In [54]:
size_memory = calculate_size(df)

Tamanho em memoria com sys: 541.87 MB


### s3 - modin

In [23]:
# Switch awswrangler to Ray as the execution engine and Modin as the memory format.
wr.engine.set("ray")
wr.memory_format.set("modin")

# Read both settings back to confirm what is active.
active_engine = wr.engine.get()
active_memory_format = wr.memory_format.get()
print(f"Execution Engine: {active_engine}")
print(f"Memory Format:    {active_memory_format}")

Execution Engine: EngineEnum.RAY
Memory Format:    MemoryFormatEnum.MODIN


In [10]:
df = wr.s3.read_parquet(path=s3_file)

2024-11-16 14:51:58,963	INFO worker.py:1819 -- Started a local Ray instance.


[dataset]: Run `pip install tqdm` to enable progress reporting.


2024-11-16 14:53:58,669	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-16_14-51-57_754054_144510/logs/ray-data
2024-11-16 14:53:58,669	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet]


In [12]:
type(df)

modin.pandas.dataframe.DataFrame

In [None]:
size_memory = calculate_size(df)

|     lib     | tamanho arquivo | tamanho memoria |  api  | tempo de carregamento  | dtype_backend | threads |
|-------------|-----------------|-----------------|-------|------------------------|---------------|---------|
| awswrangler | 60.5 MB         | 541.87 MB       |  s3   | 00:27 s                | pyarrow | False |
| awswrangler | 60.5 MB         | 541.87 MB       |  s3   | 00:13 s                | pyarrow | True  |
| awswrangler | 60.5 MB         | 1907.16 MB      |  s3   | 00:11 s                | numpy_nullable | False |
| awswrangler | 60.5 MB         | 1907.16 MB      |  s3   | 00:06 s                | numpy_nullable | True  |

### athena

In [None]:
# https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.athena.read_sql_table.html
# https://aws-sdk-pandas.readthedocs.io/en/stable/stubs/awswrangler.athena.read_sql_query.html

In [87]:
df = wr.athena.read_sql_table(table="part_loan_2018_parquet", database="base")

In [91]:
df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,year


|     lib     | tamanho arquivo | tamanho memoria |  api      | tempo de carregamento  | dtype_backend | threads |
|-------------|-----------------|-----------------|-----------|------------------------|---------------|---------|
| awswrangler | 60.5 MB         | 000.00 MB       |  athena   | 00:00 s                | pyarrow | False |
| awswrangler | 60.5 MB         | 000.00 MB       |  athena   | 00:00 s                | pyarrow | True  |
| awswrangler | 60.5 MB         | 000.00 MB       |  athena   | 00:00 s                | numpy_nullable | False |
| awswrangler | 60.5 MB         | 000.00 MB       |  athena   | 00:00 s                | numpy_nullable | True  |

# Polars

In [None]:
# https://docs.pola.rs/api/python/stable/reference/api/polars.read_parquet.html
# https://docs.pola.rs/api/python/stable/reference/api/polars.scan_parquet.html

In [5]:
import polars as pl

In [6]:
# AWS credential provider for polars, bound to the "default" profile.
# NOTE(review): `creds` is not passed to any read/scan call in the cells visible
# here - confirm whether it should be supplied via `credential_provider=`.
creds = pl.CredentialProviderAWS(profile_name="default")

<ul>
<li>parallel: <em>(default: auto)</em>
    <ul>
        <li>auto</li>
        <li>columns</li>
        <li>row_groups</li>
        <li>none</li>
    </ul>
</li>
<br>
</ul>

In [None]:
# Eagerly read the whole file into a polars DataFrame.
# The result is kept eager on purpose: the next cell calls
# `df.estimated_size()`, which is a DataFrame method and is not available on
# the LazyFrame that a trailing `.lazy()` would return.
df = pl.read_parquet(
    source=s3_file,
    parallel="auto",
    use_statistics=True,
    use_pyarrow=True,
    memory_map=True,
)

In [74]:
size_memory = round(df.estimated_size() / (1024 * 1024), 2)
print("Tamanho estimado em memoria:", size_memory, "MB")

Tamanho estimado em memoria: 441.77 MB


|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento  | parallel | use_pyarrow | memory_map |
|--------|-----------------|-----------------|------------------------|----------|-------------|------------|
| polars | 60.5 MB         | 450.70 MB       | 04:44 s                | auto     | False       | none       |
| polars | 60.5 MB         | 441.77 MB       | 05:45 s                | auto     | True        | False      |
| polars | 60.5 MB         | 441.77 MB       | 00:47 s                | auto     | True        | True       |

### scan

In [77]:
df = pl.scan_parquet(source=s3_file, parallel="auto", use_statistics=True, low_memory=True)

In [78]:
df = df.collect()

In [79]:
size_memory = round(df.estimated_size() / (1024 * 1024), 2)
print("Tamanho estimado em memoria:", size_memory, "MB")

Tamanho estimado em memoria: 450.7 MB


|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento  | parallel | low_memory |
|--------|-----------------|-----------------|------------------------|----------|------------|
| polars | 60.5 MB         | 450.70 MB       | 05:28 s                | auto     | True       |
| polars | 60.5 MB         | 000.00 MB       | 00:00 s                | auto     | True       |
| polars | 60.5 MB         | 000.00 MB       | 00:00 s                | auto     | True       |

# Polars on gpu

In [None]:
# pip install -U polars
# pip install polars[gpu] --extra-index-url=https://pypi.nvidia.com

In [7]:
import rmm
import polars as pl

In [10]:
# Statistics adaptor wrapped around the current RMM device resource. In this
# cell it is not installed as the current resource; it is only handed to the
# GPU engine configuration below.
stats_mr = rmm.mr.StatisticsResourceAdaptor(rmm.mr.get_current_device_resource())

# Older spelling of the same configuration, kept for reference:
# gpu_config = pl.lazyframe.engine_config.GPUEngine(device=0, memory_resource=stats_mr, raise_on_fail=True)
# GPU engine configuration for device 0 with `stats_mr` as its memory resource.
# NOTE(review): raise_on_fail=True presumably raises instead of falling back to
# the CPU engine when a query cannot run on the GPU - confirm.
gpu_engine = pl.GPUEngine(device=0, memory_resource=stats_mr, raise_on_fail=True)

In [23]:
df = pl.scan_parquet(source=s3_file, parallel="auto", use_statistics=True, low_memory=True)

In [24]:
# df = df.collect(engine=gpu_engine)
# NOTE(review): engine="gpu" does not reference `gpu_engine`, so the `stats_mr`
# resource given to that configuration is presumably never used by this collect;
# that would explain the 0.0 MB statistics printed further down. Switch to the
# commented line above to verify.
df = df.collect(engine="gpu")

In [19]:
size_memory = round(df.estimated_size() / (1024 * 1024), 2)
print("Tamanho estimado em memoria:", size_memory, "MB")

Tamanho estimado em memoria: 450.7 MB


In [20]:
# Counters collected by the statistics adaptor, converted from bytes to MiB.
mib = 1024**2
total_mib = round(stats_mr.allocation_counts.current_bytes / mib, 0)
highest_mib = round(stats_mr.allocation_counts.peak_bytes / mib, 0)
print(f"Total memory usage: {total_mib} MB")
print(f"Peak memory usage:  {highest_mib} MB")

Total memory usage: 0.0 MB
Peak memory usage:  0.0 MB


|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento  | parallel | low_memory |
|--------|-----------------|-----------------|------------------------|----------|------------|
| polars | 60.5 MB         | 441.8 MB        | 00:24 s                | auto     | True       |
| polars | 60.5 MB         | 000.00 MB       | 00:00 s                | auto     | True       |
| polars | 60.5 MB         | 000.00 MB       | 00:00 s                | auto     | True       |

# Pyarrow

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

In [None]:
dataset = ds.dataset(source=s3_file, format="parquet")

# Duckdb

In [None]:
# https://duckdb.org/docs/data/parquet/overview.html
# https://duckdb.org/docs/guides/python/export_pandas.html
# https://duckdb.org/docs/guides/python/execute_sql.html
# https://duckdb.org/docs/guides/python/sql_on_pandas.html

In [None]:
# pip install duckdb
# pip install 'polars[pyarrow]'

In [56]:
import duckdb

In [57]:
duckdb.sql("INSTALL httpfs;")
duckdb.sql("INSTALL aws;")

In [58]:
# Load the httpfs and aws extensions (installed in the previous cell) and set
# the S3 region used by the module-level `duckdb.sql` calls.
duckdb.sql("LOAD httpfs;")
duckdb.sql("LOAD aws;")
# Alternative way of picking up credentials from a named profile:
# duckdb.sql("CALL load_aws_credentials('default');")
duckdb.sql("SET s3_region='us-east-1';")

In [59]:
# Create an S3 secret that uses the CREDENTIAL_CHAIN provider.
# Settings noted for reference (presumably optional entries of the secret
# definition - confirm against the DuckDB docs):
# REGION 'us-west-2'
# PROFILE '<name_of_your_profile>'
duckdb.sql("""
CREATE SECRET (
    TYPE S3,
    PROVIDER CREDENTIAL_CHAIN
);
""")

┌─────────┐
│ Success │
│ boolean │
├─────────┤
│ true    │
└─────────┘

In [12]:
# Dedicated in-memory DuckDB connection, prepared for reading from S3.
conn = duckdb.connect(database=":memory:", read_only=False)

# Extension and region setup, executed in order on the new connection.
for setup_sql in ("INSTALL httpfs;", "LOAD httpfs;", "SET s3_region='us-east-1';"):
    conn.execute(setup_sql)

# Register the S3 secret; this call stays the cell's last expression so the
# value it returns is still displayed as the cell output.
conn.execute("""
CREATE SECRET (
    TYPE S3,
    PROVIDER CREDENTIAL_CHAIN
);
""")

<duckdb.duckdb.DuckDBPyConnection at 0x7f1b81a5c0b0>

In [31]:
# Alternative fetch methods that were tried, with the load time noted for each:
# df = conn.execute(f"SELECT * FROM read_parquet('{s3_file}');").fetchdf() # 40 s
# df = conn.execute(f"SELECT * FROM read_parquet('{s3_file}');").fetch_df() # 12 s
# df = conn.execute(f"SELECT * FROM read_parquet('{s3_file}');").fetch_df_chunk(vectors_per_chunk=1000) # 12 s
# Fetch the full result as an Arrow table (the fastest of the variants timed here).
df = conn.execute(f"SELECT * FROM read_parquet('{s3_file}');").fetch_arrow_table() # 10 s

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [32]:
type(df)

pyarrow.lib.Table

In [34]:
df.shape

(495242, 146)

In [None]:
# conn.close()

In [93]:
# Run the query on the module-level DuckDB API and convert the result with .df().
# The commented lines are the other sql/query and df/to_df combinations whose
# timings are recorded in the DuckDB results table.
# df = duckdb.sql(f"SELECT * FROM read_parquet('{s3_file}');").df()
df = duckdb.query(f"SELECT * FROM read_parquet('{s3_file}');").df()
# df = duckdb.sql(f"SELECT * FROM read_parquet('{s3_file}');").to_df()
# df = duckdb.query(f"SELECT * FROM read_parquet('{s3_file}');").to_df()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [100]:
df = duckdb.sql(f"SELECT * FROM read_parquet('{s3_file}');").pl()
# df = duckdb.query(f"SELECT * FROM read_parquet('{s3_file}');").pl()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [101]:
type(df)

polars.dataframe.frame.DataFrame

In [102]:
# size_memory = round(df.memory_usage(index=True, deep=True).sum() / (1024 * 1024), 2)
size_memory = round(df.estimated_size() / (1024 * 1024), 2)
print("Tamanho estimado em memoria:", size_memory, "MB")

Tamanho estimado em memoria: 441.8 MB


In [None]:
duckdb.sql(f"""
CREATE OR REPLACE TABLE temp_df AS
SELECT * FROM read_parquet('{s3_file}');
""")

In [106]:
duckdb.sql("""
SELECT * FROM temp_df LIMIT 5;
""")

┌─────────┬───────────┬───────────┬─────────────┬─────────────────┬────────────┬──────────┬─────────────┬─────────┬───────────┬────────────────┬────────────┬────────────────┬────────────┬─────────────────────┬──────────┬─────────────┬────────────┬─────────┬─────────┬────────────────────┬────────────────────┬──────────┬────────────┬────────┬─────────────┬──────────────────┬────────────────┬────────────────────────┬────────────────────────┬──────────┬─────────┬───────────┬────────────┬───────────┬─────────────────────┬───────────┬───────────────┬─────────────┬─────────────────┬─────────────────┬───────────────┬────────────────────┬────────────┬─────────────────────────┬──────────────┬─────────────────┬──────────────┬────────────────────┬────────────────────────────┬─────────────────────────────┬─────────────┬──────────────────┬──────────────────┬───────────┬───────────────────────────┬────────────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬──

In [119]:
df = duckdb.read_parquet(s3_file)

In [120]:
type(df)

duckdb.duckdb.DuckDBPyRelation

In [121]:
# Alternative preview using the relation API:
# df.show(max_rows=5)
# The SQL text refers to `df` by name; presumably DuckDB resolves it to the
# Python variable `df` (the relation returned by duckdb.read_parquet) - confirm.
duckdb.sql("SELECT * FROM df LIMIT 5")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

┌─────────┬───────────┬───────────┬─────────────┬─────────────────┬────────────┬──────────┬─────────────┬─────────┬───────────┬────────────────┬────────────┬────────────────┬────────────┬─────────────────────┬──────────┬─────────────┬────────────┬─────────┬─────────┬────────────────────┬────────────────────┬──────────┬────────────┬────────┬─────────────┬──────────────────┬────────────────┬────────────────────────┬────────────────────────┬──────────┬─────────┬───────────┬────────────┬───────────┬─────────────────────┬───────────┬───────────────┬─────────────┬─────────────────┬─────────────────┬───────────────┬────────────────────┬────────────┬─────────────────────────┬──────────────┬─────────────────┬──────────────┬────────────────────┬────────────────────────────┬─────────────────────────────┬─────────────┬──────────────────┬──────────────────┬───────────┬───────────────────────────┬────────────────┬──────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬──

|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento  | metodo |
|--------|-----------------|-----------------|------------------------|--------|
| duckdb | 60.5 MB         |  536.2 MB       | 01:35 s                | sql, df     |
| duckdb | 60.5 MB         |  536.2 MB       | 00:27 s                | query, df   |
| duckdb | 60.5 MB         |  536.2 MB       | 00:33 s                | sql, to_df   |
| duckdb | 60.5 MB         |  536.2 MB       | 00:41 s                | query, to_df |
| duckdb | 60.5 MB         | 000.00 MB       | 00:29 s                | create table |
| duckdb | 60.5 MB         |  441.8 MB       | 00:14 s                | sql, pl     |
| duckdb | 60.5 MB         |  441.8 MB       | 00:26 s                | query, pl   |
| duckdb | 60.5 MB         | 000.00 MB       | 01:00 s                | read_parquet |

# Dask

In [None]:
# DOCS
# https://docs.dask.org/en/stable/
# https://docs.dask.org/en/latest/dataframe-create.html
# https://docs.dask.org/en/stable/dataframe-api.html
# https://examples.dask.org/machine-learning/scale-scikit-learn.html

# INSTALL
# pip install dask
# pip install "dask[complete]"
# pip install "dask[array]"       # Install requirements for dask array
# pip install "dask[dataframe]"   # Install requirements for dask dataframe
# pip install "dask[diagnostics]" # Install requirements for dask diagnostics
# pip install "dask[distributed]" # Install requirements for distributed dask

In [8]:
import pandas as pd
import dask
import dask.dataframe as dd
from dask.distributed import LocalCluster
from dask.distributed import Client

In [9]:
# Alternative: create the cluster explicitly and obtain a client from it.
# cluster = LocalCluster()
# client = cluster.get_client()

# Start a local Dask client with 2 worker processes, 4 threads per worker and a
# memory limit of '8GB' (presumably applied per worker - confirm).
client = Client(n_workers=2, threads_per_worker=4, processes=True, memory_limit='8GB')

In [None]:
# df = pd.read_parquet(path=s3_file)
# ddf = dd.from_pandas(df, npartitions=10)

# df = dd.read_sql_query(sql="", con=conn, npartitions=4)

In [43]:
df = dd.read_parquet(path=s3_file, engine='pyarrow', dtype_backend=None, arrow_to_pandas=None)

In [45]:
df.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,year
0,,,2500,2500,2500,36 months,13.56,84.92,C,C1,...,,Cash,N,,,,,,,2018
1,,,30000,30000,30000,60 months,18.94,777.23,D,D2,...,,Cash,N,,,,,,,2018
2,,,5000,5000,5000,36 months,17.97,180.69,D,D1,...,,Cash,N,,,,,,,2018
3,,,4000,4000,4000,36 months,18.94,146.51,D,D2,...,,Cash,N,,,,,,,2018
4,,,30000,30000,30000,60 months,16.14,731.78,C,C4,...,,Cash,N,,,,,,,2018


In [37]:
df.npartitions

1

In [42]:
df = df.repartition(npartitions=10)
df.npartitions

10

In [34]:
# size_memory = df.memory_usage(index=True, deep=True).sum()
# print("Tamanho estimado em memoria:", size_memory, "MB")

|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento | engine | dtype_backend |
|--------|-----------------|-----------------|-----------------------|--------|---------------|
| dask   | 60.5 MB         | 000.00 MB       | 00:00 s               | none   | none |
| dask   | 60.5 MB         | 000.00 MB       | 00:00 s               | pyarrow   | none |
| dask   | 60.5 MB         | 000.00 MB       | 00:00 s               | none   | none |

# Daft

In [None]:
# DOCS
# https://www.getdaft.io/projects/docs/en/latest/index.html
# https://www.getdaft.io/projects/docs/en/latest/api_docs/index.html
# https://www.getdaft.io/projects/docs/en/latest/api_docs/doc_gen/io_functions/daft.read_parquet.html
# https://www.getdaft.io/projects/docs/en/latest/api_docs/dataframe.html

# INSTALL
# pip install getdaft
# pip install getdaft[aws]
# pip install getdaft[ray]
# pip install getdaft[all]

In [46]:
import daft
from daft import DataType, udf

In [None]:
df = daft.read_parquet(path=s3_file, use_native_downloader=True)

In [48]:
df.show(5)

id Utf8,member_id Utf8,loan_amnt Int64,funded_amnt Int64,funded_amnt_inv Int64,term Utf8,int_rate Float64,installment Float64,grade Utf8,sub_grade Utf8,emp_title Utf8,emp_length Utf8,home_ownership Utf8,annual_inc Float64,verification_status Utf8,issue_d Utf8,loan_status Utf8,pymnt_plan Utf8,url Utf8,desc Utf8,purpose Utf8,title Utf8,zip_code Utf8,addr_state Utf8,dti Float64,delinq_2yrs Int64,earliest_cr_line Utf8,inq_last_6mths Int64,mths_since_last_delinq Int64,mths_since_last_record Int64,open_acc Int64,pub_rec Int64,revol_bal Int64,revol_util Float64,total_acc Int64,initial_list_status Utf8,out_prncp Float64,out_prncp_inv Float64,total_pymnt Float64,total_pymnt_inv Float64,total_rec_prncp Float64,total_rec_int Float64,total_rec_late_fee Float64,recoveries Float64,collection_recovery_fee Float64,last_pymnt_d Utf8,last_pymnt_amnt Float64,next_pymnt_d Utf8,last_credit_pull_d Utf8,collections_12_mths_ex_med Int64,mths_since_last_major_derog Int64,policy_code Int64,application_type Utf8,annual_inc_joint Float64,dti_joint Float64,verification_status_joint Utf8,acc_now_delinq Int64,tot_coll_amt Int64,tot_cur_bal Int64,open_acc_6m Int64,open_act_il Int64,open_il_12m Int64,open_il_24m Int64,mths_since_rcnt_il Int64,total_bal_il Int64,il_util Int64,open_rv_12m Int64,open_rv_24m Int64,max_bal_bc Int64,all_util Int64,total_rev_hi_lim Int64,inq_fi Int64,total_cu_tl Int64,inq_last_12m Int64,acc_open_past_24mths Int64,avg_cur_bal Int64,bc_open_to_buy Int64,bc_util Float64,chargeoff_within_12_mths Int64,delinq_amnt Int64,mo_sin_old_il_acct Int64,mo_sin_old_rev_tl_op Int64,mo_sin_rcnt_rev_tl_op Int64,mo_sin_rcnt_tl Int64,mort_acc Int64,mths_since_recent_bc Int64,mths_since_recent_bc_dlq Int64,mths_since_recent_inq Int64,mths_since_recent_revol_delinq Int64,num_accts_ever_120_pd Int64,num_actv_bc_tl Int64,num_actv_rev_tl Int64,num_bc_sats Int64,num_bc_tl Int64,num_il_tl Int64,num_op_rev_tl Int64,num_rev_accts Int64,num_rev_tl_bal_gt_0 Int64,num_sats Int64,num_tl_120dpd_2m 
Int64,num_tl_30dpd Int64,num_tl_90g_dpd_24m Int64,num_tl_op_past_12m Int64,pct_tl_nvr_dlq Float64,percent_bc_gt_75 Float64,pub_rec_bankruptcies Int64,tax_liens Int64,tot_hi_cred_lim Int64,total_bal_ex_mort Int64,total_bc_limit Int64,total_il_high_credit_limit Int64,revol_bal_joint Int64,sec_app_earliest_cr_line Utf8,sec_app_inq_last_6mths Int64,sec_app_mort_acc Int64,sec_app_open_acc Int64,sec_app_revol_util Float64,sec_app_open_act_il Int64,sec_app_num_rev_accts Int64,sec_app_chargeoff_within_12_mths Int64,sec_app_collections_12_mths_ex_med Int64,sec_app_mths_since_last_major_derog Int64,hardship_flag Utf8,hardship_type Utf8,hardship_reason Utf8,hardship_status Utf8,deferral_term Utf8,hardship_amount Utf8,hardship_start_date Utf8,hardship_end_date Utf8,payment_plan_start_date Utf8,hardship_length Utf8,hardship_dpd Utf8,hardship_loan_status Utf8,orig_projected_additional_accrued_interest Utf8,hardship_payoff_balance_amount Utf8,hardship_last_payment_amount Utf8,disbursement_method Utf8,debt_settlement_flag Utf8,debt_settlement_flag_date Utf8,settlement_status Utf8,settlement_date Utf8,settlement_amount Utf8,settlement_percentage Utf8,settlement_term Utf8,year Utf8
,,2500,2500,2500,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0,Apr-2001,1,,45.0,9,1,4341,10.3,34,w,2386.02,2386.02,167.02,167.02,113.98,53.04,0,0,0,Feb-2019,84.92,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,16901,2,2,1,2,2,12560,69,2,7,2137,28,42000,1,11,2,9,1878,34360,5.9,0,0,140,212,1,1,0,1,,2,,0,2,5,3,3,16,7,18,5,9,0,0,0,3,100.0,0,1,0,60124,16901,36500,18124,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
,,30000,30000,30000,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0,Jun-1987,0,71.0,75.0,13,1,12315,24.2,44,w,29387.75,29387.75,1507.11,1507.11,612.25,894.86,0,0,0,Feb-2019,777.23,Mar-2019,Feb-2019,0,,1,Individual,,,,0,1208,321915,4,4,2,3,3,87153,88,4,5,998,57,50800,2,15,2,10,24763,13761,8.3,0,0,163,378,4,3,3,4,,4,,0,2,4,4,9,27,8,14,4,13,0,0,0,6,95.0,0,1,0,372872,99468,15000,94072,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
,,5000,5000,5000,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0,Apr-2011,0,,,8,0,4599,19.1,13,w,4787.21,4787.21,353.89,353.89,212.79,141.1,0,0,0,Feb-2019,180.69,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,110299,0,1,0,2,14,7150,72,0,2,0,35,24100,1,5,0,4,18383,13800,0.0,0,0,87,92,15,14,2,77,,14,,0,0,3,3,3,4,6,7,3,8,0,0,0,0,100.0,0,0,0,136927,11749,13800,10000,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
,,4000,4000,4000,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,985xx,WA,16.74,0,Feb-2006,0,,,10,0,5468,78.1,13,w,3831.93,3831.93,286.71,286.71,168.07,118.64,0,0,0,Feb-2019,146.51,Mar-2019,Feb-2019,0,,1,Individual,,,,0,686,305049,1,5,3,5,5,30683,68,0,0,3761,70,7000,2,4,3,5,30505,1239,75.2,0,0,62,154,64,5,3,64,,5,,0,1,2,1,2,7,2,3,2,10,0,0,0,3,100.0,100,0,0,385183,36151,5000,44984,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
,,30000,30000,30000,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,212xx,MD,26.35,0,Dec-2000,0,,,12,0,829,3.6,26,w,29339.02,29339.02,1423.21,1423.21,660.98,762.23,0,0,0,Feb-2019,731.78,Mar-2019,Feb-2019,0,,1,Individual,,,,0,0,116007,3,5,3,5,4,28845,89,2,4,516,54,23100,1,0,0,9,9667,8471,8.9,0,0,53,216,2,2,2,2,,13,,0,2,2,3,8,9,6,15,2,12,0,0,0,5,92.3,0,0,0,157548,29674,9300,32332,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018


# Ibis

In [None]:
# DOCS
# https://ibis-project.org/
# https://ibis-project.org/install
# https://ibis-project.org/tutorials/ibis-for-pandas-users
# https://ibis-project.org/backends/duckdb
# https://ibis-project.org/backends/pyspark
# https://ibis-project.org/reference/expression-tables

# INSTALL
# pip install 'ibis-framework[duckdb]'
# pip install 'ibis-framework[polars]'
# pip install 'ibis-framework[pyspark]'

In [52]:
import ibis
import polars as pl
import duckdb

In [53]:
ibis.options.interactive = True

In [54]:
con = ibis.polars.connect()

In [56]:
df = con.read_parquet(path=s3_file, table_name="temp_df")

In [57]:
df.head(5)

|   lib  | tamanho arquivo | tamanho memoria | tempo de carregamento | engine |
|--------|-----------------|-----------------|-----------------------|--------|
| ibis   | 60.5 MB         | 000.00 MB       | 00:09 s               | polars   |
| ibis   | 60.5 MB         | 000.00 MB       | 00:00 s               | none   |
| ibis   | 60.5 MB         | 000.00 MB       | 00:00 s               | none   |

# Modin

In [None]:
# DOCS
# https://modin.readthedocs.io/en/stable/
# https://modin.readthedocs.io/en/stable/usage_guide/index.html
# https://modin.readthedocs.io/en/stable/getting_started/why_modin/why_modin.html
# https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/execution/pandas_on_dask/local/exercise_1.ipynb
# https://github.com/modin-project/modin/blob/main/examples/tutorial/jupyter/execution/pandas_on_ray/local/exercise_1.ipynb
# https://github.com/modin-project/modin/blob/main/examples/modin-scikit-learn-example.ipynb

# INSTALL
# pip install modin
# pip install "modin[all]"

In [69]:
import modin
import modin.config as cfg
# Import order matters: both statements bind the name `pd`, and the last one
# wins. Modin must come last so that `pd.read_parquet` in this section runs on
# Modin rather than on plain pandas.
import pandas as pd
import modin.pandas as pd

In [71]:
# Show Modin's current Engine and NPartitions configuration values.
print(cfg.Engine.get())
print(cfg.NPartitions.get())

Dask
2


In [None]:
# Engine options: dask, ray
cfg.Engine.put("dask")
# Alternative: select the engine through an environment variable.
# os.environ["MODIN_ENGINE"] = "dask"

In [None]:
# modin.config.NPartitions.put(16)

In [65]:
df = pd.read_parquet(path=s3_file, engine="auto")

In [66]:
df.head(5)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,year
0,,,2500,2500,2500,36 months,13.56,84.92,C,C1,...,,Cash,N,,,,,,,2018
1,,,30000,30000,30000,60 months,18.94,777.23,D,D2,...,,Cash,N,,,,,,,2018
2,,,5000,5000,5000,36 months,17.97,180.69,D,D1,...,,Cash,N,,,,,,,2018
3,,,4000,4000,4000,36 months,18.94,146.51,D,D2,...,,Cash,N,,,,,,,2018
4,,,30000,30000,30000,60 months,16.14,731.78,C,C4,...,,Cash,N,,,,,,,2018


# Ray

In [None]:
# DOCS
# https://docs.ray.io/en/latest/
# https://docs.ray.io/en/latest/data/data.html
# https://docs.ray.io/en/latest/data/loading-data.html
# https://docs.ray.io/en/latest/data/api/api.html

# INSTALL
# pip install 'ray[data]'

In [25]:
import ray
import ray.data as rd

In [26]:
# Start (or reuse) the local Ray runtime. `ignore_reinit_error=True` keeps this cell
# safe to re-run: a second plain `ray.init()` in the same kernel raises a RuntimeError.
ray.init(ignore_reinit_error=True)

2024-11-16 22:40:18,043	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.11.10
Ray version:,2.39.0
Dashboard:,http://127.0.0.1:8265


In [None]:
# ds = ray.data.from_pandas(df)
# ds = ray.data.from_dask(ddf)
# ds = ray.data.from_spark(df)
# ds = ray.data.from_modin(mdf)
# ds = ray.data.from_mars(mdf)
# ds = ray.data.from_arrow(table)
# dataset = ray.data.read_sql("SELECT * FROM movie", conn)

In [27]:
# Build a Ray Dataset over the Parquet file, going through the `rd` alias for `ray.data`.
ds = rd.read_parquet(s3_file)

Parquet Files Sample 0:   0%|          | 0.00/1.00 [00:00<?, ? file/s]

In [29]:
# Execute the read and keep the materialized result bound to `ds`.
# Without the reassignment the returned MaterializedDataset is discarded, and later
# calls on `ds` (e.g. `ds.show`) run the ReadParquet stage all over again.
ds = ds.materialize()
# Display the materialized dataset summary (block count, row count, schema).
ds

2024-11-16 22:44:39,487	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-16_22-40-16_269828_403022/logs/ray-data
2024-11-16 22:44:39,487	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(50) 1: 0.00 row [00:00, ? row/s]

MaterializedDataset(
   num_blocks=250,
   num_rows=495242,
   schema={
      id: string,
      member_id: string,
      loan_amnt: int64,
      funded_amnt: int64,
      funded_amnt_inv: int64,
      term: string,
      int_rate: double,
      installment: double,
      grade: string,
      sub_grade: string,
      emp_title: string,
      emp_length: string,
      home_ownership: string,
      annual_inc: double,
      verification_status: string,
      issue_d: string,
      loan_status: string,
      pymnt_plan: string,
      url: string,
      desc: string,
      purpose: string,
      title: string,
      zip_code: string,
      addr_state: string,
      dti: double,
      delinq_2yrs: int64,
      earliest_cr_line: string,
      inq_last_6mths: int64,
      mths_since_last_delinq: int64,
      mths_since_last_record: int64,
      open_acc: int64,
      pub_rec: int64,
      revol_bal: int64,
      revol_util: double,
      total_acc: int64,
      initial_list_status: string,
   

In [30]:
# Print a single row of the dataset.
ds.show(limit=1)

2024-11-16 22:49:38,342	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-11-16_22-40-16_269828_403022/logs/ray-data
2024-11-16 22:49:38,342	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]


Running 0: 0.00 row [00:00, ? row/s]

- ReadParquet->SplitBlocks(50) 1: 0.00 row [00:00, ? row/s]

- limit=1 2: 0.00 row [00:00, ? row/s]

{'id': None, 'member_id': None, 'loan_amnt': 2500, 'funded_amnt': 2500, 'funded_amnt_inv': 2500, 'term': ' 36 months', 'int_rate': 13.56, 'installment': 84.92, 'grade': 'C', 'sub_grade': 'C1', 'emp_title': 'Chef', 'emp_length': '10+ years', 'home_ownership': 'RENT', 'annual_inc': 55000.0, 'verification_status': 'Not Verified', 'issue_d': 'Dec-2018', 'loan_status': 'Current', 'pymnt_plan': 'n', 'url': None, 'desc': None, 'purpose': 'debt_consolidation', 'title': 'Debt consolidation', 'zip_code': '109xx', 'addr_state': 'NY', 'dti': 18.24, 'delinq_2yrs': 0, 'earliest_cr_line': 'Apr-2001', 'inq_last_6mths': 1, 'mths_since_last_delinq': None, 'mths_since_last_record': 45, 'open_acc': 9, 'pub_rec': 1, 'revol_bal': 4341, 'revol_util': 10.3, 'total_acc': 34, 'initial_list_status': 'w', 'out_prncp': 2386.02, 'out_prncp_inv': 2386.02, 'total_pymnt': 167.02, 'total_pymnt_inv': 167.02, 'total_rec_prncp': 113.98, 'total_rec_int': 53.04, 'total_rec_late_fee': 0.0, 'recoveries': 0.0, 'collection_reco

# Fugue

In [None]:
# DOCS
# https://fugue-tutorials.readthedocs.io/index.html
# https://fugue-tutorials.readthedocs.io/tutorials/quick_look/ten_minutes.html
# https://fugue-tutorials.readthedocs.io/tutorials/integrations/backends/index.html
# https://fugue-tutorials.readthedocs.io/tutorials/advanced/execution_engine.html
# https://github.com/fugue-project/tutorials/tree/master/tutorials
# https://github.com/fugue-project/tutorials/blob/master/tutorials/beginner/execution_engine.ipynb
# https://fugue.readthedocs.io/en/latest/top_api.html

# INSTALL
# pip install "fugue[sql]"
# pip install "fugue[duckdb,sql]"

In [35]:
import fugue_duckdb
import fugue.api as fa
from fugue.api import fugue_sql_flow
from fugue.api import fugue_sql
from fugue import transform
from fugue_sql import fsql
from fugue_sql import FugueSQLWorkflow
from fugue import FugueWorkflow
# from dask_sql.integrations.fugue import DaskSQLExecutionEngine
# from fugue_ibis import run_ibis
# from fugue_jupyter import setup
from fugue_notebook import setup

In [36]:
# Initialize Fugue's notebook integration (the `setup` imported from `fugue_notebook`).
# Presumably this is what makes the `%%fsql` cell magic available — confirm.
setup()

<IPython.core.display.Javascript object>

In [69]:
# Load the file at `s3_file` through Fugue's functional API, using the pandas engine.
# engine: None, pandas, dask, spark, duckdb
df = fa.load(s3_file, engine="pandas")

In [70]:
# Check which concrete DataFrame class the load returned.
type(df)

pandas.core.frame.DataFrame

In [77]:
%%fsql duck
-- Show the first 3 rows of the notebook variable `df`.
SELECT * FROM df LIMIT 3
PRINT

Unnamed: 0,id:int,member_id:int,loan_amnt:long,funded_amnt:long,funded_amnt_inv:long,term:str,int_rate:double,installment:double,grade:str,sub_grade:str,emp_title:str,emp_length:str,home_ownership:str,annual_inc:double,verification_status:str,issue_d:str,loan_status:str,pymnt_plan:str,url:int,desc:int,purpose:str,title:str,zip_code:str,addr_state:str,dti:double,delinq_2yrs:long,earliest_cr_line:str,inq_last_6mths:long,mths_since_last_delinq:double,mths_since_last_record:double,open_acc:long,pub_rec:long,revol_bal:long,revol_util:double,total_acc:long,initial_list_status:str,out_prncp:double,out_prncp_inv:double,total_pymnt:double,total_pymnt_inv:double,...,tax_liens:long,tot_hi_cred_lim:long,total_bal_ex_mort:long,total_bc_limit:long,total_il_high_credit_limit:long,revol_bal_joint:double,sec_app_earliest_cr_line:str,sec_app_inq_last_6mths:double,sec_app_mort_acc:double,sec_app_open_acc:double,sec_app_revol_util:double,sec_app_open_act_il:double,sec_app_num_rev_accts:double,sec_app_chargeoff_within_12_mths:double,sec_app_collections_12_mths_ex_med:double,sec_app_mths_since_last_major_derog:double,hardship_flag:str,hardship_type:str,hardship_reason:str,hardship_status:str,deferral_term:str,hardship_amount:str,hardship_start_date:str,hardship_end_date:str,payment_plan_start_date:str,hardship_length:str,hardship_dpd:str,hardship_loan_status:str,orig_projected_additional_accrued_interest:str,hardship_payoff_balance_amount:str,hardship_last_payment_amount:str,disbursement_method:str,debt_settlement_flag:str,debt_settlement_flag_date:str,settlement_status:str,settlement_date:str,settlement_amount:str,settlement_percentage:str,settlement_term:str,year:str
0,,,2500,2500,2500,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0,Apr-2001,1,,45.0,9,1,4341,10.3,34,w,2386.02,2386.02,167.02,167.02,...,0,60124,16901,36500,18124,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
1,,,30000,30000,30000,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0,Jun-1987,0,71.0,75.0,13,1,12315,24.2,44,w,29387.75,29387.75,1507.11,1507.11,...,0,372872,99468,15000,94072,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
2,,,5000,5000,5000,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0,Apr-2011,0,,,8,0,4599,19.1,13,w,4787.21,4787.21,353.89,353.89,...,0,136927,11749,13800,10000,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018


In [None]:
# Run a FugueSQL statement that loads the file at `s3_file` and selects every column,
# binding the result to `df`.
# NOTE(review): confirm the `duck` token after the path on the LOAD line is valid FugueSQL.
df = fugue_sql(f"""
LOAD "{s3_file}" duck
SELECT *
""",
engine=None
)

In [None]:
# Check which concrete DataFrame class the SQL call returned.
type(df)

pandas.core.frame.DataFrame

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,...,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_reason,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,year
0,,,2500,2500,2500,36 months,13.56,84.92,C,C1,Chef,10+ years,RENT,55000.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,109xx,NY,18.24,0,Apr-2001,1,,45.0,9,1,4341,10.3,34,w,2386.02,2386.02,167.02,167.02,...,0,60124,16901,36500,18124,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
1,,,30000,30000,30000,60 months,18.94,777.23,D,D2,Postmaster,10+ years,MORTGAGE,90000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,713xx,LA,26.52,0,Jun-1987,0,71.0,75.0,13,1,12315,24.2,44,w,29387.75,29387.75,1507.11,1507.11,...,0,372872,99468,15000,94072,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
2,,,5000,5000,5000,36 months,17.97,180.69,D,D1,Administrative,6 years,MORTGAGE,59280.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,490xx,MI,10.51,0,Apr-2011,0,,,8,0,4599,19.1,13,w,4787.21,4787.21,353.89,353.89,...,0,136927,11749,13800,10000,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
3,,,4000,4000,4000,36 months,18.94,146.51,D,D2,IT Supervisor,10+ years,MORTGAGE,92000.0,Source Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,985xx,WA,16.74,0,Feb-2006,0,,,10,0,5468,78.1,13,w,3831.93,3831.93,286.71,286.71,...,0,385183,36151,5000,44984,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018
4,,,30000,30000,30000,60 months,16.14,731.78,C,C4,Mechanic,10+ years,MORTGAGE,57250.0,Not Verified,Dec-2018,Current,n,,,debt_consolidation,Debt consolidation,212xx,MD,26.35,0,Dec-2000,0,,,12,0,829,3.6,26,w,29339.02,29339.02,1423.21,1423.21,...,0,157548,29674,9300,32332,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,,2018


In [None]:
# %%fsql duckdb
# data = LOAD "s3://data-us-east-1-891377318910/datasets/loan/part-loan-2018.parquet"

# df = SELECT * data LIMIT 5

In [None]:
# query = ""
# fsql(query, {"hive.src_table": df, "path": s3_file}).run("duckdb")

In [None]:
# Build a FugueSQL workflow that selects everything from `df` and prints it,
# then run it on the "duckdb" engine.
fsql("""
SELECT * FROM df
PRINT
""", df=df).run("duckdb")