# Preprocess BTC Data

In [70]:
import pandas as pd
import pandas_ta as ta

In [72]:
df = pd.read_csv("btc_usdt_ohlcv_1h_5years.csv")

## Time related features

In [75]:
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y-%m-%d %H:%M:%S')

df['hour'] = df['timestamp'].dt.hour
df['day_of_week'] = df['timestamp'].dt.dayofweek
df['month'] = df['timestamp'].dt.month

## Add technical indicators

In [78]:
# 1. Last 100 candlesticks average between high and low
df['avg_100_candles'] = (df['high'] + df['low']).rolling(window=100).mean()

# 2. Standard Deviation
df['std'] = (df['high'] + df['low']).rolling(window=100).std()

# 3. EMAs
df['ema_20'] = ta.ema(df['close'], length=20)
df['ema_100'] = ta.ema(df['close'], length=100)

# 4. RSI
df['rsi_20'] = ta.rsi(df['close'], length=20)

# 5. MACD
macd = ta.macd(df['close'], fast=12, slow=26, signal=9)
df['macd_line'] = macd['MACD_12_26_9']
df['macd_signal'] = macd['MACDs_12_26_9']

# 6. Bollinger Bands
bollinger = ta.bbands(df['close'], length=20, std=2)
df['bollinger_upper'] = bollinger['BBU_20_2.0']
df['bollinger_lower'] = bollinger['BBL_20_2.0']
df['bollinger_bandwidth'] = bollinger['BBB_20_2.0']

# 7. Volume over last 100 candles
df['volume_100'] = df['volume'].rolling(window=100).sum()


## Add the data of the next candle for testing

In [81]:
# Creating columns for the next candle's open, high, low, and close
df['next_open'] = df['open'].shift(-1)
df['next_high'] = df['high'].shift(-1)
df['next_low'] = df['low'].shift(-1)
df['next_close'] = df['close'].shift(-1)

# Optional: Drop the last row, as it will have NaN values for these new columns
df = df.dropna().reset_index(drop=True)

In [83]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,hour,day_of_week,month,avg_100_candles,...,macd_line,macd_signal,bollinger_upper,bollinger_lower,bollinger_bandwidth,volume_100,next_open,next_high,next_low,next_close
0,2019-09-12 20:00:00,10301.82,10360.75,10300.85,10320.73,608.006,20,3,9,20401.2434,...,56.231704,43.98768,10394.39804,10000.93896,3.858324,51487.151,10321.45,10400.0,10321.45,10388.87
1,2019-09-12 21:00:00,10321.45,10400.0,10321.45,10388.87,655.809,21,3,9,20408.4579,...,62.091584,47.60846,10421.232309,10000.536691,4.12007,52142.958,10390.01,10450.13,10374.96,10385.45
2,2019-09-12 22:00:00,10390.01,10450.13,10374.96,10385.45,701.318,22,3,9,20416.7088,...,65.702241,51.227216,10442.804259,10006.333741,4.26884,52844.276,10386.76,10415.13,10357.51,10415.13
3,2019-09-12 23:00:00,10386.76,10415.13,10357.51,10415.13,652.897,23,3,9,20417.4856,...,70.149995,55.011772,10466.609332,10012.698668,4.432871,53025.514,10414.96,10440.55,10368.98,10387.34
4,2019-09-13 00:00:00,10414.96,10440.55,10368.98,10387.34,636.735,0,4,9,20418.5491,...,70.618406,58.133099,10478.068469,10032.136531,4.348391,53078.978,10387.34,10387.5,10329.3,10335.68


In [85]:
df.columns

Index(['timestamp', 'open', 'high', 'low', 'close', 'volume', 'hour',
       'day_of_week', 'month', 'avg_100_candles', 'std', 'ema_20', 'ema_100',
       'rsi_20', 'macd_line', 'macd_signal', 'bollinger_upper',
       'bollinger_lower', 'bollinger_bandwidth', 'volume_100', 'next_open',
       'next_high', 'next_low', 'next_close'],
      dtype='object')

## Store the current dataframe

In [88]:
# Save the data to a CSV file
df.to_csv('preprocessed_ohlcv.csv', index=False)

## Standardize the data

In [90]:
# Columns to standardize
cols_to_standardize = ["open", "high", "low", "close", "volume", "hour", "day_of_week", "month", "avg_100_candles", "std", 
                       "ema_20", "ema_100", "rsi_20", "macd_line", "macd_signal", "bollinger_upper", "bollinger_lower", 
                       "bollinger_bandwidth", "volume_100", "next_open", "next_high", "next_low", "next_close"]


### Store the mean and std for each column so that we can convert them back to the original values

In [91]:
import json

# Dictionary to store the mean and std for each column
scaling_params = {}

# Calculate and store the mean and std for each column
for col in cols_to_standardize:
    scaling_params[col] = {
        'mean': df[col].mean(),
        'std': df[col].std()
    }

# Store the scaling parameters in a JSON file
with open("scaling_params.json", "w") as json_file:
    json.dump(scaling_params, json_file, indent=4)

print("Scaling parameters saved to 'scaling_params.json'.")


Scaling parameters saved to 'scaling_params.json'.


In [92]:
# Standardize columns
df[cols_to_standardize] = df[cols_to_standardize].apply(lambda x: (x - x.mean()) / x.std(), axis=0)

In [93]:
df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,hour,day_of_week,month,avg_100_candles,...,macd_line,macd_signal,bollinger_upper,bollinger_lower,bollinger_bandwidth,volume_100,next_open,next_high,next_low,next_close
0,2019-09-12 20:00:00,-1.139323,-1.138999,-1.136794,-1.138429,-0.860748,1.228007,-0.001456,0.651222,-1.145749,...,0.147399,0.111984,-1.144554,-1.143727,0.036895,-1.761497,-1.138395,-1.137151,-1.135815,-1.135205
1,2019-09-12 21:00:00,-1.138394,-1.13715,-1.135815,-1.135205,-0.85785,1.372468,-0.001456,0.651222,-1.145578,...,0.167432,0.125127,-1.143307,-1.143746,0.117051,-1.760688,-1.135151,-1.13479,-1.133272,-1.135367
2,2019-09-12 22:00:00,-1.135151,-1.134789,-1.133272,-1.135367,-0.855091,1.516929,-0.001456,0.651222,-1.145382,...,0.179775,0.138263,-1.142304,-1.143467,0.16261,-1.759823,-1.135305,-1.136438,-1.134101,-1.133963
3,2019-09-12 23:00:00,-1.135304,-1.136438,-1.134101,-1.133963,-0.858026,1.661391,-0.001456,0.651222,-1.145363,...,0.19498,0.152001,-1.141198,-1.14316,0.212842,-1.759599,-1.133971,-1.135241,-1.133556,-1.135278
4,2019-09-13 00:00:00,-1.13397,-1.13524,-1.133556,-1.135278,-0.859006,-1.661216,0.498594,0.651222,-1.145338,...,0.196582,0.163332,-1.140665,-1.142222,0.186971,-1.759533,-1.135278,-1.13774,-1.135442,-1.137721


# Split data into testing and training dataframes

In [102]:
# Shuffle the dataframe to randomize the order of the rows
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into training (80%) and testing (20%) sets
train_size = int(0.8 * len(df))  # 80% for training

train_df = df[:train_size]  # First 80% of the data
test_df = df[train_size:]  # Remaining 20% for testing

# Print the shapes of the resulting datasets
print(f"Training set shape: {train_df.shape}")
print(f"Testing set shape: {test_df.shape}")

Training set shape: (37088, 24)
Testing set shape: (9272, 24)


In [104]:
# Save the data to a CSV file
train_df.to_csv('train.csv', index=False)
test_df.to_csv('test.csv', index=False)