In [56]:
import pandas as pd
import terality as te
import json
import pyarrow.parquet as pq
from codetiming import Timer
import logging


In [57]:
raw_pdf = {}
indexed_pdf = {}
raw_tdf = {}
indexed_tdf = {}

timer = Timer(name="Terality Tests")
logging.getLogger("terality").setLevel(logging.WARNING)

In [58]:
# timer.stop()

In [60]:
f = open('..\\conf\\sales_conf.json')
conf_data = json.load(f)

1

In [37]:
def load_data(eval_str: str):
    """
    This function accepts a string representation of a data load command and executes the same and returns a dataframe
    :param eval_str: a string representation of a data load command
    :return: a Pandas dataframe
    """
    if eval_str.find("te.") < 0:    # Not a terality operation
        return eval(eval_str)
    else:
        with te.disable_cache():
            return eval(eval_str)


In [38]:
def set_key(df, index_col):
    """
    This function accepts a dataframe and a column within it and sets the column as the indexed column
    :param df: dataframe that needs its index set
    :param index_col: name of the column that should be set as indexed column
    :return: a new dataframe with its index set
    """
    return df.set_index(index_col)

In [53]:
def print_dict(df_dict):
    """
    This fucntion accepts a dictionary as input and prints its key and the type of its corresponding value
    :param df_dict: dictionary
    :return: None
    """
    for key, value in df_dict.items():
        print("Key: ", key)
        print(type(value))
    print("")


In [40]:
# Loading data into dataframes using Pandas
timer.start()

for i in conf_data["tables"]:
    raw_pdf[i["raw_df"]] = load_data(conf_data["pandas_module"] + i["load_func"] + "('" + conf_data["file_path"] + i["file_name"] + "')")

timer.stop()

Elapsed time: 0.8125 seconds


0.8125301000000036

In [41]:
# Loading data into dataframes using Terality
timer.start()

for i in conf_data["tables"]:
    raw_tdf[i["raw_df"]] = load_data(conf_data["terality_module"] + i["load_func"] + "('" + conf_data["file_path"] + i["file_name"] + "')")

timer.stop()

..\data\contact.parquet:   0%|          | 0.00/6.68M [00:00<?, ?B/s]
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
..\data\customer.parquet:   0%|          | 0.00/94.9k [00:00<?, ?B/s]
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
..\data\product.json:   0%|          | 0.00/79.9k [00:00<?, ?B/s]
INFO:terality:The result of te.read_json was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
..\data\state.parquet:   0%|          | 0.00/3.18k [00:00<?, ?B/s]
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
..\data\txn.csv:   0%|          | 0.00/51.4M [00:00<?, ?B/s]
INFO:terality:The result of te.read_csv was retrieved from cache [docs: https://docs.terality.com/getting-tera

Elapsed time: 2.5701 seconds


2.570144199999959

In [42]:
# Setting Index columns using Pandas
timer.start()

for i in conf_data["tables"]:
    indexed_pdf[i["indexed_df"]] = set_key(raw_pdf[i["raw_df"]], i["index_col"])

timer.stop()

Elapsed time: 0.0298 seconds


0.029794100000799517

In [43]:
# Setting Index columns using Terality
timer.start()

for i in conf_data["tables"]:
    indexed_tdf[i["indexed_df"]] = set_key(raw_tdf[i["raw_df"]], i["index_col"])

timer.stop()

INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe.set_index was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


Elapsed time: 1.8076 seconds


1.8076407999997173

In [54]:
print_dict(raw_pdf)
print_dict(raw_tdf)
print_dict(indexed_pdf)
print_dict(indexed_tdf)

Key:  contact_df_raw
<class 'pandas.core.frame.DataFrame'>
Key:  customer_df_raw
<class 'pandas.core.frame.DataFrame'>
Key:  product_df_raw
<class 'pandas.core.frame.DataFrame'>
Key:  state_df_raw
<class 'pandas.core.frame.DataFrame'>
Key:  txn_df_raw
<class 'pandas.core.frame.DataFrame'>
Key:  touch_df_raw
<class 'pandas.core.frame.DataFrame'>

Key:  contact_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>
Key:  customer_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>
Key:  product_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>
Key:  state_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>
Key:  txn_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>
Key:  touch_df_raw
<class 'terality._terality.terality_structures.dataframe.DataFrame'>

Key:  contact_df
<class 'pandas.core.frame.DataFrame'>
Key:  customer_df
<class 'pandas.core.frame.DataFrame'>
Key:  p

In [55]:
# Sum a column value across the entire dataframe with about 200k rows - using Pandas
timer.start()

raw_pdf['contact_df_raw'].sum()['opportunity']

timer.stop()

Elapsed time: 247.1602 seconds


247.16020070000013

In [14]:
# Sum a column value across the entire dataframe with about 200k rows - using Terality
timer.start()

raw_tdf['contact_df_raw'].sum()['opportunity']

timer.stop()



33.84886


In [15]:
# Sum a column value across the entire dataframe with about 1M rows - using Pandas
timer.start()

net_txn_amount = raw_pdf['txn_df_raw'].sum()['sales']
print("Net Txn Amount: ", net_txn_amount)

timer.stop()

Net Txn Amount:  19673120943.690998
2813.992782


In [16]:
# Sum a column value across the entire dataframe with about 1M rows - using Terality
timer.start()

net_txn_amount = raw_tdf['txn_df_raw'].sum()['sales']
print("Net Txn Amount: ", net_txn_amount)

timer.stop()

Net Txn Amount:  19673120943.691
42.772839


In [17]:
# Sum a column value grouped by another column across the entire dataframe with about 1M rows - using Pandas
timer.start()

raw_pdf['txn_df_raw'].groupby('contact_id').sum()[['sales'] + ['redemptions']]

timer.stop()

0.242996


In [18]:
# Sum a column value grouped by another column across the entire dataframe with about 1M rows - using Terality
timer.start()

raw_tdf['txn_df_raw'].groupby('contact_id').sum()[['sales'] + ['redemptions']]

timer.stop()

3.164788


In [19]:
# Describes the dataframe columns and their types
timer.start()

raw_pdf['txn_df_raw'].describe()

timer.stop()

0.13332


In [20]:
# Describes the dataframe columns and their types
timer.start()

raw_tdf['txn_df_raw'].describe()

timer.stop()

5.797104


In [20]:
# Count of rows
timer.start()

txn_pdf.index.size

timer.stop()

0.0


In [21]:
# Count of rows
timer.start()
txn_tdf.index.size


timer.stop()

0.598509


In [22]:
# Sort by ticker
timer.start()
txn_pdf.sort_values(by = 'product_id')


timer.stop()


1.625492


In [23]:
# Sort by ticker
timer.start()
txn_tdf.sort_values(by = 'product_id')


timer.stop()


5.210178


In [24]:
# Slicing
timer.start()
txn_pdf[113:211]


timer.stop()


0.0


In [25]:
# Slicing
timer.start()
txn_tdf[113:211]


timer.stop()


0.952517


In [26]:
# Selecting a single row
timer.start()
txn_pdf[txn_pdf['txn_id'] == 423603758]


timer.stop()


0.025832


In [27]:
# Selecting a single row
timer.start()
txn_tdf[txn_tdf['txn_id'] == 423603758]


timer.stop()


2.750294


In [28]:
#filters
timer.start()
txn_pdf[txn_pdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]


timer.stop()


0.086713


In [29]:
#filters
timer.start()
txn_tdf[txn_tdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]


timer.stop()


2.891353


In [None]:
txn_df1 = txn_df[0:5]
#txn_df1
type(txn_df1)

#txn_df1.head()

#txn_df2 = txn_df[6:10]
#txn_df3 = txn_df[11:15]

#txn_df123 = pd.concat(txn_df1,txn_df2,txn_df3)

In [None]:
cc_df = pd.merge(contact_df,customer_df, on = 'customer_id')
cc_df.head()

In [None]:
product_df1 = product_df.groupby(['market_cap']).sum()['tna']
product_df1.head()

In [None]:
product_df1 = product_df.groupby(['market_cap','exp_ratio']).sum()['tna']
product_df1

In [None]:
#joins 2 dfs on a the mentioned index
joint_df = contact_df.set_index('customer_id').join(customer_df.set_index('customer_id'), how = 'inner')

joint_df.head()


In [None]:

joint_df = customer_df.set_index('customer_id').join(contact_df.set_index('customer_id'), how = 'left')

joint_df.count()


In [None]:
#not specifying how gives inner join
#Txn by ms_rating
txn_product_df = txn_df.set_index('ticker').join(product_df.set_index('ticker'))
txn_product_df.head()

txn_product_df1 = txn_product_df[['ms_rating','txn_amount']]
txn_product_df2 = txn_product_df1.groupby('ms_rating').sum()['txn_amount']
txn_product_df2.head()



In [None]:
#Contact Id based txns
txn_contact_df = txn_df.set_index('contact_id').join(contact_df.set_index('contact_id'))
txn_contact_df.head()

txn_contact_df1 = txn_contact_df['txn_amount']
txn_contact_df1.head()
#txn_contact_df2 = txn_contact_df1.groupby(txn_contact_df1.index).sum()
#txn_contact_df2.head()