<a href="https://colab.research.google.com/github/bytehub-ai/code-examples/blob/main/bytehub_featurestore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bytehub>=0.2.1

[31mERROR: distributed 2021.2.0 has requirement cloudpickle>=1.5.0, but you'll have cloudpickle 1.3.0 which is incompatible.[0m


In [2]:
import pandas as pd
import bytehub as bh
import shutil
import os

In [3]:
# Start from a clean slate: drop the local database file and the data directory
# left behind by an earlier run. A missing target is expected on a first run and
# is ignored; any other error still propagates.
for remove_path, stale_path in (
    (os.remove, 'bytehub.db'),
    (shutil.rmtree, '/tmp/test-data'),
):
    try:
        remove_path(stale_path)
    except FileNotFoundError:
        pass

In [4]:
# Create a Feature Store using the library defaults; `fs` is the handle every
# later cell uses.
# NOTE(review): presumably this is backed by the local 'bytehub.db' file that the
# cleanup cell removes - confirm against the ByteHub docs.
fs = bh.FeatureStore()

In [5]:
# Register the 'test' namespace, with /tmp/test-data as its storage URL
fs.create_namespace(
    'test',
    url='/tmp/test-data',
    description='Test data',
)

In [6]:
# Show the registered namespaces as a table (the new 'test' namespace should be listed)
fs.list_namespaces()

Unnamed: 0,name,version,description,meta,storage_options,url
0,test,1,Test data,{},{},/tmp/test-data


In [8]:
# Get some Bitcoin price data: daily closing prices from the CoinDesk API
import requests

# A timeout keeps the notebook from hanging forever if the API is unreachable
response = requests.get(
    'https://api.coindesk.com/v1/bpi/historical/close.json?start=2015-01-01&end=2021-01-01',
    timeout=30,
)
response.raise_for_status()

# Decode the JSON body once. 'bpi' maps date strings to closing prices; indexing
# with ['bpi'] gives a clear KeyError if the payload has an unexpected shape.
bpi = response.json()['bpi']
dates = pd.to_datetime(list(bpi.keys()))

# Build a frame indexed by 'time', with a 'created_time' column (here the same
# dates) and the price in 'value'
df_close = pd.DataFrame(
    {
        'time': dates,
        'created_time': dates,
        'value': list(bpi.values()),
    }
).set_index('time')
df_close.head()

Unnamed: 0_level_0,created_time,value
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,2015-01-01,313.9247
2015-01-02,2015-01-02,314.5916
2015-01-03,2015-01-03,279.8507
2015-01-04,2015-01-04,263.6343
2015-01-05,2015-01-05,272.9486


In [9]:
# Register the closing-price feature (partitioned by year), then write the
# downloaded data into it
close_feature = 'test/bitcoin.close'
fs.create_feature(close_feature, partition='year')
fs.save_dataframe(df_close, close_feature)

In [10]:
# Compute exponentially weighted moving averages of the closing price, one
# column per half-life, named after the feature each will be stored as.
#
# Only the numeric 'value' column is selected: df_close also carries a datetime
# 'created_time' column, which cannot be averaged. Older pandas silently dropped
# it; pandas 2.x raises instead.
close_values = df_close[['value']]
df_averages = pd.concat(
    [
        close_values.ewm(halflife=5).mean().rename(columns={'value': 'test/bitcoin.ewma-5'}),
        close_values.ewm(halflife=20).mean().rename(columns={'value': 'test/bitcoin.ewma-20'}),
    ],
    axis=1,
)
df_averages.tail()

Unnamed: 0_level_0,test/bitcoin.ewma-5,test/bitcoin.ewma-20
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-12-28,23791.513397,19433.880895
2020-12-29,24252.672472,19703.668488
2020-12-30,24852.459289,20016.45377
2020-12-31,25383.694626,20320.97656
2021-01-01,25902.538373,20629.961254


In [11]:
# Register one year-partitioned feature per moving average, then save the whole
# frame in a single call (its column names are the feature names)
for average_feature in ('test/bitcoin.ewma-5', 'test/bitcoin.ewma-20'):
    fs.create_feature(average_feature, partition='year')
fs.save_dataframe(df_averages)

In [12]:
# List the features whose name matches the regex (here: the two moving averages).
# Note the dots are unescaped, so each matches any single character.
fs.list_features(regex=r'.ewma.')

Unnamed: 0,namespace,name,version,description,meta,partition
1,test,bitcoin.ewma-5,1,,{},year
2,test,bitcoin.ewma-20,1,,{},year


In [13]:
# Load two features side by side, restricted to a date window
df = fs.load_dataframe(
    ['test/bitcoin.close', 'test/bitcoin.ewma-5'],
    from_date='2020-01-01',
    to_date='2021-12-31',
)
df.head()

Unnamed: 0_level_0,test/bitcoin.close,test/bitcoin.ewma-5
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01,7188.4633,7244.504693
2020-01-02,6961.5683,7207.878736
2020-01-03,7346.58,7225.833537
2020-01-04,7355.855,7242.664742
2020-01-05,7356.3117,7257.376277


In [16]:
# Load every feature matching the regex over the same date window, resampled to
# a monthly frequency
df = fs.load_dataframe(
    fs.list_features(regex=r'bitcoin\..'),
    from_date='2020-01-01',
    to_date='2021-12-31',
    freq='1M',
)
df.head()

Unnamed: 0,test/bitcoin.close,test/bitcoin.ewma-5,test/bitcoin.ewma-20
2020-01-31,9346.1333,8887.704013,8238.51966
2020-02-29,8538.3983,9291.578593,9086.141735
2020-03-31,6449.95,6423.066033,7490.873582
2020-04-30,8740.75,7632.54603,7383.347519
2020-05-31,9545.15,9279.717876,8623.681782


In [18]:
# Get the last value of each Bitcoin feature.
# list_features() selects the features by regex; fs.last() returns a dict keyed
# by feature name. `last_values` is reused by the deletion cell that follows.
bitcoin_features = fs.list_features(regex=r'bitcoin\..')
last_values = fs.last(bitcoin_features)
last_values

{'test/bitcoin.close': 29391.775,
 'test/bitcoin.ewma-20': 20629.961254095204,
 'test/bitcoin.ewma-5': 25902.538372937153}

In [19]:
# Clean up: delete each feature named in last_values (a dict keyed by feature name)
for feature_name in last_values:
    fs.delete_feature(feature_name)

In [20]:
# Delete the 'test' namespace now that its features have been removed
fs.delete_namespace('test')