In [1]:
import pandas as pd
import terality as te
import json
import pyarrow.parquet as pq
from codetiming import Timer
import logging


In [2]:
file_path = "..\\\\data\\\\"
timer = Timer(name="Terality Tests")
logging.getLogger("terality").setLevel(logging.WARNING)

In [3]:
# timer.stop()
# product_tdf.info()

In [4]:
# Loading data into dataframes using Pandas
timer.start()

contact_pdf = pd.read_parquet(file_path + "contact.parquet")
product_pdf = pd.read_json(file_path + "product.json")
txn_pdf = pd.read_csv(file_path + 'txn.csv')

timer.stop()

Elapsed time: 2.4221 seconds


2.422119200000001

In [5]:
# Loading data into dataframes using Terality
timer.start()

with te.disable_cache():
    contact_tdf = te.read_parquet(file_path + "contact.parquet")
    product_tdf = te.read_json(file_path + "product.json")
    txn_tdf = te.read_csv(file_path + 'txn.csv')

timer.stop()

..\\data\\contact.parquet: 100%|███████████████████████████████████████████████████| 6.68M/6.68M [00:12<00:00, 545kB/s]
..\\data\\product.json: 100%|██████████████████████████████████████████████████████| 79.9k/79.9k [00:00<00:00, 209kB/s]
..\\data\\txn.csv: 100%|██████████████████████████████████████████████████████████| 51.4M/51.4M [00:16<00:00, 3.06MB/s]


Elapsed time: 35.8867 seconds


35.8866762

In [6]:
# Sum a column value across the entire dataframe with about 200k rows - using Pandas
timer.start()

contact_pdf.sum()['opportunity']

timer.stop()

Elapsed time: 247.8482 seconds


247.84816089999998

In [7]:
# Sum a column value across the entire dataframe with about 200k rows - using Terality
timer.start()

contact_tdf.sum()['opportunity']

timer.stop()

Elapsed time: 10.5582 seconds


10.558202400000027

In [None]:
# Sum a column value across the entire dataframe with about 1M rows - using Pandas
timer.start()

net_txn_amount = txn_pdf.sum()['sales']
print("Net Txn Amount: ", net_txn_amount)

timer.stop()

In [None]:
# Sum a column value across the entire dataframe with about 1M rows - using Terality
timer.start()

net_txn_amount = txn_tdf.sum()['sales']
print("Net Txn Amount: ", net_txn_amount)

timer.stop()

In [8]:
# Sum a column value grouped by another column across the entire dataframe with about 1M rows - using Pandas
timer.start()

txn_pdf.groupby('contact_id').sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 0.3382 seconds


0.33816919999998163

In [9]:
# Sum a column value grouped by another column across the entire dataframe with about 1M rows - using Terality
timer.start()

txn_tdf.groupby('contact_id').sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 3.3017 seconds


3.30167449999999

In [10]:
# Describe the dataframe columns and their types - using Pandas
timer.start()

txn_pdf.describe()

timer.stop()

Elapsed time: 0.2162 seconds


0.21618570000003956

In [11]:
# Describe the dataframe columns and their types - using Terality
timer.start()

txn_tdf.describe()

timer.stop()

Elapsed time: 5.0994 seconds


5.0994470999999635

In [12]:
# Count of rows - using Pandas
timer.start()

txn_pdf.index.size

timer.stop()

Elapsed time: 0.0001 seconds


5.649999997103805e-05

In [13]:
# Count of rows - using Terality
timer.start()

txn_tdf.index.size

timer.stop()

Elapsed time: 0.7884 seconds


0.7883520999999973

In [14]:
# Sort by ticker - using Pandas
timer.start()

txn_pdf.sort_values(by = 'product_id')

timer.stop()

Elapsed time: 2.2027 seconds


2.202657399999964

In [15]:
# Sort by ticker - using Terality
timer.start()

txn_tdf.sort_values(by ='product_id')

timer.stop()

Elapsed time: 4.2181 seconds


4.218130400000064

In [16]:
# Slicing - using Pandas
timer.start()

txn_pdf[113:211]

timer.stop()

Elapsed time: 0.0004 seconds


0.00039140000001225417

In [17]:
# Slicing - using Terality
timer.start()

txn_tdf[113:211]

timer.stop()

Elapsed time: 0.8531 seconds


0.8531338000000233

In [18]:
# Selecting a single row from an un-indexed dataframe - using Pandas
timer.start()

txn_pdf[txn_pdf['txn_id'] == 423603758]

timer.stop()

Elapsed time: 0.0036 seconds


0.0036301000000094064

In [19]:
# Selecting a single row from an un-indexed dataframe - using Terality
timer.start()

txn_tdf[txn_tdf['txn_id'] == 423603758]

timer.stop()

Elapsed time: 2.3734 seconds


2.373365100000001

In [20]:
#filters
timer.start()

txn_pdf[txn_pdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]

timer.stop()

Elapsed time: 0.0993 seconds


0.09928609999997207

In [21]:
#filters
timer.start()

txn_tdf[txn_tdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]

timer.stop()

Elapsed time: 2.4200 seconds


2.420022200000062

In [22]:
# Merging two dataframes - using Pandas
timer.start()

contact_pdf.merge(txn_pdf, on='contact_id')

timer.stop()

Elapsed time: 2.0853 seconds


2.08529580000004

In [23]:
# Merging two dataframes - using Terality
timer.start()

contact_tdf.merge(txn_tdf, on='contact_id')

timer.stop()

Elapsed time: 9.0715 seconds


9.071548899999925

In [24]:
# Merging two dataframes and grouping by a single column - using Pandas
timer.start()

contact_pdf.merge(txn_pdf, on='contact_id').groupby('customer_id').sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 1.0281 seconds


1.0280862000000752

In [25]:
# Merging two dataframes and grouping by a single column - using Terality
timer.start()

contact_tdf.merge(txn_tdf, on='contact_id').groupby('customer_id').sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 5.7810 seconds


5.78103909999993

In [26]:
# Merging three dataframes and grouping by two columns - using Pandas
timer.start()

contact_pdf.merge(txn_pdf, on='contact_id').merge(product_pdf, on='product_id').groupby(['customer_id', 'product_id']).sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 6.5483 seconds


6.54834470000003

In [27]:
# Merging three dataframes and grouping by two columns - using Terality
timer.start()

contact_tdf.merge(txn_tdf, on='contact_id').merge(product_tdf, on='product_id').groupby(['customer_id', 'product_id']).sum()[['sales'] + ['redemptions']]

timer.stop()

Elapsed time: 23.8555 seconds


23.855537199999958