In [1]:
import pandas as pd
import terality as tr
import time
from datetime import datetime


In [2]:
def get_time():
    return datetime.now()

In [3]:
def get_time_diff(start_time, end_time):
    return (end_time - start_time).total_seconds()

In [4]:
def load_data(file_path, file_format = 'CSV', use_lib = 'PANDAS'):
    if file_format == 'PARQUET':
        if use_lib == 'PANDAS':
            df = pd.read_parquet(file_path)
        else:
            df = tr.read_parquet(file_path)
    else:
        if use_lib == 'PANDAS':
            df = pd.read_csv(file_path)
        else:
            df = tr.read_csv(file_path)
    return df


In [5]:
def set_key(df, index):
    df_indexed = df.set_index(index)
    return df_indexed

In [6]:
start_time = get_time()

contact_pdf = load_data('..\\data\\contact.parquet', 'PARQUET', 'PANDAS')
customer_pdf = load_data('..\\data\\customer.parquet', 'PARQUET', 'PANDAS')
product_pdf = load_data('..\\data\\product.parquet', 'PARQUET', 'PANDAS')
state_pdf = load_data('..\\data\\state.parquet', 'PARQUET', 'PANDAS')
txn_pdf = load_data('..\\data\\txn.csv', 'CSV', 'PANDAS')
touch_pdf = load_data('..\\data\\touch.parquet', 'PARQUET', 'PANDAS')

end_time = get_time()
print(get_time_diff(start_time, end_time))

0.826269


In [7]:
start_time = get_time()

contact_tdf = load_data('..\\data\\contact.parquet', 'PARQUET', 'TERALITY')
customer_tdf = load_data('..\\data\\customer.parquet', 'PARQUET', 'TERALITY')
product_tdf = load_data('..\\data\\product.parquet', 'PARQUET', 'TERALITY')
state_tdf = load_data('..\\data\\state.parquet', 'PARQUET', 'TERALITY')
txn_tdf = load_data('..\\data\\txn.csv', 'CSV', 'TERALITY')
touch_tdf = load_data('..\\data\\touch.parquet', 'PARQUET', 'TERALITY')

end_time = get_time()
print(get_time_diff(start_time, end_time))

INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.read_csv was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.read_parquet was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


4.866121


In [8]:
start_time = get_time()
contact_pdf.sum()['opportunity']
end_time = get_time()

print(get_time_diff(start_time, end_time))

107.046883


In [9]:
start_time = get_time()
contact_tdf.sum()['opportunity']
end_time = get_time()

print(get_time_diff(start_time, end_time))

INFO:terality:The result of te.dataframe.sum was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


0.366718


In [10]:
start_time = get_time()
net_txn_amount = txn_pdf.sum()['sales']
print("Net Txn Amount: ", net_txn_amount)
end_time = get_time()

print(get_time_diff(start_time, end_time))

Net Txn Amount:  19673120943.690998
3476.233505


In [11]:
start_time = get_time()
net_txn_amount = txn_tdf.sum()['sales']
print("Net Txn Amount: ", net_txn_amount)
end_time = get_time()

print(get_time_diff(start_time, end_time))

INFO:terality:The result of te.dataframe.sum was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


Net Txn Amount:  19673120943.691
23.19572


In [12]:
start_time = get_time()
txn_pdf.groupby('contact_id').sum()[['sales'] + ['redemptions']]
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.676331


In [13]:
start_time = get_time()
txn_tdf.groupby('contact_id').sum()[['sales'] + ['redemptions']]
end_time = get_time()

print(get_time_diff(start_time, end_time))

INFO:terality:The result of te.dataframe.groupby was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].
INFO:terality:The result of te.dataframe_groupby.sum was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


0.942542


In [14]:
start_time = get_time()
# Make a copy of a df
txn_pdf_copy = txn_pdf
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.0


In [15]:
start_time = get_time()
# Make a copy of a df
txn_tdf_copy = txn_tdf
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.0


In [16]:
# Describes the dataframe columns and their types
start_time = get_time()
txn_pdf.info()
end_time = get_time()

print(get_time_diff(start_time, end_time))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   txn_id       1048575 non-null  int64  
 1   txn_date     1048575 non-null  object 
 2   contact_id   1048575 non-null  int64  
 3   product_id   1048575 non-null  object 
 4   sales        1048575 non-null  float64
 5   redemptions  1048575 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 48.0+ MB
0.155092


In [17]:
# Describes the dataframe columns and their types
start_time = get_time()
txn_tdf.info()
end_time = get_time()

print(get_time_diff(start_time, end_time))

INFO:terality:The result of te.dataframe.info was retrieved from cache [docs: https://docs.terality.com/getting-terality/user-guide/caching].


<class 'terality.DataFrame'>
Index: 1048575 entries, 0 to 1048574
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   txn_id       1048575 non-null  int64
 1   txn_date     1048575 non-null  object
 2   contact_id   1048575 non-null  int64
 3   product_id   1048575 non-null  object
 4   sales        1048575 non-null  float64
 5   redemptions  1048575 non-null  float64
dtypes: int64(2), object(2), float64(2)
memory usage: 167.9 MB (run with deep=True)
0.2267


In [18]:
# Describes the dataframe columns and their types
start_time = get_time()
txn_pdf.describe()
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.259977


In [19]:
# Describes the dataframe columns and their types
start_time = get_time()
txn_tdf.describe()
end_time = get_time()

print(get_time_diff(start_time, end_time))



17.205624


In [20]:
# Count of rows
start_time = get_time()
txn_pdf.index.size
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.0


In [21]:
# Count of rows
start_time = get_time()
txn_tdf.index.size
end_time = get_time()

print(get_time_diff(start_time, end_time))

0.598509


In [22]:
# Sort by ticker
start_time = get_time()
txn_pdf.sort_values(by = 'product_id')
end_time = get_time()

print(get_time_diff(start_time, end_time))


1.625492


In [23]:
# Sort by ticker
start_time = get_time()
txn_tdf.sort_values(by = 'product_id')
end_time = get_time()

print(get_time_diff(start_time, end_time))


5.210178


In [24]:
# Slicing
start_time = get_time()
txn_pdf[113:211]
end_time = get_time()

print(get_time_diff(start_time, end_time))


0.0


In [25]:
# Slicing
start_time = get_time()
txn_tdf[113:211]
end_time = get_time()

print(get_time_diff(start_time, end_time))


0.952517


In [26]:
# Selecting a single row
start_time = get_time()
txn_pdf[txn_pdf['txn_id'] == 423603758]
end_time = get_time()

print(get_time_diff(start_time, end_time))


0.025832


In [27]:
# Selecting a single row
start_time = get_time()
txn_tdf[txn_tdf['txn_id'] == 423603758]
end_time = get_time()

print(get_time_diff(start_time, end_time))


2.750294


In [28]:
#filters
start_time = get_time()
txn_pdf[txn_pdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]
end_time = get_time()

print(get_time_diff(start_time, end_time))


0.086713


In [29]:
#filters
start_time = get_time()
txn_tdf[txn_tdf['product_id'].isin(['FPHAX', 'VSIAX', 'PRNHX', 'FKTFX', 'VEIEX'])]
end_time = get_time()

print(get_time_diff(start_time, end_time))


2.891353


In [None]:
txn_df1 = txn_df[0:5]
#txn_df1
type(txn_df1)

#txn_df1.head()

#txn_df2 = txn_df[6:10]
#txn_df3 = txn_df[11:15]

#txn_df123 = pd.concat(txn_df1,txn_df2,txn_df3)

In [None]:
cc_df = pd.merge(contact_df,customer_df, on = 'customer_id')
cc_df.head()

In [None]:
product_df1 = product_df.groupby(['market_cap']).sum()['tna']
product_df1.head()

In [None]:
product_df1 = product_df.groupby(['market_cap','exp_ratio']).sum()['tna']
product_df1

In [None]:
#joins 2 dfs on a the mentioned index
joint_df = contact_df.set_index('customer_id').join(customer_df.set_index('customer_id'), how = 'inner')

joint_df.head()


In [None]:

joint_df = customer_df.set_index('customer_id').join(contact_df.set_index('customer_id'), how = 'left')

joint_df.count()


In [None]:
#not specifying how gives inner join
#Txn by ms_rating
txn_product_df = txn_df.set_index('ticker').join(product_df.set_index('ticker'))
txn_product_df.head()

txn_product_df1 = txn_product_df[['ms_rating','txn_amount']]
txn_product_df2 = txn_product_df1.groupby('ms_rating').sum()['txn_amount']
txn_product_df2.head()



In [None]:
#Contact Id based txns
txn_contact_df = txn_df.set_index('contact_id').join(contact_df.set_index('contact_id'))
txn_contact_df.head()

txn_contact_df1 = txn_contact_df['txn_amount']
txn_contact_df1.head()
#txn_contact_df2 = txn_contact_df1.groupby(txn_contact_df1.index).sum()
#txn_contact_df2.head()