In [58]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_error

pd.set_option('display.max_rows', 500)

In [2]:
# load the data
data = pd.read_csv('../../btcusd.csv')

### Timestamp Conversion

In [3]:
data['time'] = pd.to_datetime(data['time'], unit = 'ms')

In [4]:
data.set_index('time', inplace = True)

In [5]:
data

Unnamed: 0_level_0,open,close,high,low,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-04-01 00:07:00,93.25,93.30,93.30,93.25,93.300000
2013-04-01 00:08:00,100.00,100.00,100.00,100.00,93.300000
2013-04-01 00:09:00,93.30,93.30,93.30,93.30,33.676862
2013-04-01 00:11:00,93.35,93.47,93.47,93.35,20.000000
2013-04-01 00:12:00,93.47,93.47,93.47,93.47,2.021627
...,...,...,...,...,...
2020-02-07 18:19:00,9764.00,9762.50,9764.00,9762.00,0.170632
2020-02-07 18:20:00,9762.00,9767.10,9769.40,9761.90,7.720155
2020-02-07 18:21:00,9768.40,9769.40,9769.40,9766.60,2.257836
2020-02-07 18:22:00,9768.00,9767.20,9769.20,9767.20,0.555650


### Null Values

In [6]:
data.columns

Index(['open', 'close', 'high', 'low', 'volume'], dtype='object')

In [7]:
data.isnull().sum()

open      0
close     0
high      0
low       0
volume    0
dtype: int64

### Imputing missing data points

In [8]:
data = data.resample('1min').first().ffill()

### Rolling up data

In [9]:
def rollup(data, interval):
    
        temp = data.copy()

        temp['close'] = temp['close'].resample(interval, label='right', closed = 'right').ohlc()['close']
        temp['high'] = temp['high'].resample(interval, label='right', closed = 'right').ohlc()['high']
        temp['low'] = temp['low'].resample(interval, label='right', closed = 'right').ohlc()['low']
        temp['open'] = temp['open'].resample(interval, label='right', closed = 'right').ohlc()['open']
        temp['volume'] = temp['volume'].resample(interval, label='right', closed = 'right').sum()
        
        return temp

In [10]:
data_5min = rollup(data, '5min').dropna()

##### '10min', '15min', '30min', etc. can be other time windows that we can try.

### Creating Features

In [17]:
import ta

In [21]:
data_5min_ta = ta.add_all_ta_features(data_5min, open="open", high="high", low="low", close="close", volume="volume")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{colprefix}trend_vortex_ind_pos'] = indicator.vortex_indicator_pos()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{colprefix}trend_vortex_ind_neg'] = indicator.vortex_indicator_neg()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{colprefix}trend_vortex_ind_diff'] = indicator.vorte

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  high=df[high], low=df[low], close=df[close], volume=df[volume], n=14, fillna=fillna).money_flow_index()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f'{colprefix}momentum_tsi'] = TSIIndicator(close=df[close], r=25, s=13, fillna=fillna).tsi()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filln

### Create the Target Column

In [31]:
# create the target feature
data_5min_ta['nextClosingPrice'] = data_5min_ta['close'].shift(-1)

# drop the rows with 'None' in target column
data_5min_ta = data_5min_ta.dropna(subset=['nextClosingPrice'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Save csv

In [32]:
data_5min_ta.to_csv('data_5min_ta.csv')