# Workflow
First, we'll want to get our data using the Coingecko API. We'll import everything we need for data collection & processing.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pycoingecko import CoinGeckoAPI
import datetime as dt
from sklearn.preprocessing import minmax_scale

cg = CoinGeckoAPI()

Next, we'll set the dates we want data from, and query the Coingecko API for it.

In [2]:
# set dates
end_date = dt.datetime(2021, 4, 16)
duration = 30
start_date = end_date - dt.timedelta(duration)
start_utc = start_date.replace(tzinfo=dt.timezone.utc).timestamp()
end_utc = end_date.replace(tzinfo=dt.timezone.utc).timestamp()

# in case we figure out minutely data
# interval_date = start_date
# dates = []
# while interval_date.timestamp() < end_utc.timestamp():
#     dates.append(interval_date.timestamp())
#     interval_date += dt.timedelta(1)
# dates.append(end_utc.timestamp())

coins = cg.get_coins_list()
mi = cg.get_coin_market_chart_range_by_id(id='bitcoin', 
                                          vs_currency='usd', 
                                          from_timestamp=start_utc, 
                                          to_timestamp=end_utc)
len(mi['prices'])

721

### Data preprocessing & normalization
Here we format and scale each feature to between 0 and 1.  
**NOTE:** All incoming data will have to be normalized as well, so save this!! It's probably worth figuring out how to make this happen using the Estimator API, so that it's possible to save the normalization method.

In [3]:
mi_data = np.array([
    np.array(mi['prices'])[:, 1],
    np.array(mi['market_caps'])[:, 1],
    np.array(mi['total_volumes'])[:, 1]
])
mi_scaled = minmax_scale(mi_data, axis=1)
df_dict = {'prices': mi_scaled[0], 'market_caps': mi_scaled[1], 'total_volumes': mi_scaled[2]}
df = pd.DataFrame(df_dict)
df.to_csv('data/raw.csv', index=False)

### Data labelling
We'll use a sliding window to group & label the data with "buy", "sell", or "hold" with $\{1, -1, 0\}$ respectively

In [4]:
# hold tolerance of 0, window size of 24 hours
tol = 0
window_size = 24

X = np.zeros((len(df) - window_size, 3, window_size))
Y = np.zeros((len(df) - window_size))
for i in range(len(df) - window_size):
    
    X[i][0] = df.shift(-i)['prices'][:window_size].values
    X[i][1] = df.shift(-i)['market_caps'][:window_size].values
    X[i][2] = df.shift(-i)['total_volumes'][:window_size].values
    diff = df.shift(-i)['prices'][window_size] - X[i][0][-1]
    
    if diff > tol:
        Y[i] = 1
    elif diff < -tol:
        Y[i] = -1
    else:
        Y[i] = 0
    
    
dfNew = pd.DataFrame(X, columns=['prices', 'market_caps', 'total_volumes', 'label'])

ValueError: Must pass 2-d input

Finally, we'll save the data for our models to use.

In [None]:
# best to save the data in numpy format, since the list data gets messed up
dfNew.head()

In [None]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols = list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
    # put it all together
    agg = pd.concat(cols, axis=1)
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg.values

series_to_supervised(mi_data).shape[1]