In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error


In [2]:
# Load the raw BTC-USD price table; `stock_data` itself is left untouched.
stock_data = pd.read_csv('/kaggle/input/btc-usd/BTC-USD.csv')

# Working frame indexed by the 'Date' column (set_index returns a new frame,
# so no explicit copy or in-place mutation is needed).
btc = stock_data.set_index('Date')

In [3]:
# Preview the Date-indexed working frame.
btc

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-01-01,430.721008,436.246002,427.515015,434.334015,434.334015,3.627890e+07
2016-01-02,434.622009,436.062012,431.869995,433.437988,433.437988,3.009660e+07
2016-01-03,433.578003,433.743011,424.705994,430.010986,430.010986,3.963380e+07
2016-01-04,430.061005,434.516998,429.084015,433.091003,433.091003,3.847750e+07
2016-01-05,433.069000,434.182007,429.675995,431.959991,431.959991,3.452260e+07
...,...,...,...,...,...,...
2023-11-23,37420.433594,37643.917969,36923.863281,37289.621094,37289.621094,1.421495e+10
2023-11-24,37296.316406,38415.339844,37261.605469,37720.281250,37720.281250,2.292296e+10
2023-11-25,37721.414063,37892.429688,37617.417969,37796.792969,37796.792969,9.099571e+09
2023-11-26,,,,,,


# Part 2: Feature Engineering

In [4]:
# Feature engineering: indicators derived from the Close and Volume columns.

# Fractional change of Close relative to the previous row's Close.
btc['Daily_Return'] = btc['Close'].pct_change()

# Simple rolling means of Close over 7-row and 30-row windows.
for ma_window in (7, 30):
    btc[f'MA_{ma_window}'] = btc['Close'].rolling(ma_window).mean()

# Exponential moving averages of Close with spans 12 and 26 (recursive form).
for ema_span in (12, 26):
    btc[f'EMA_{ema_span}'] = btc['Close'].ewm(span=ema_span, adjust=False).mean()

# Volume features: 20-row rolling mean and row-over-row fractional change.
btc['Avg_Volume'] = btc['Volume'].rolling(20).mean()
btc['Volume_Change'] = btc['Volume'].pct_change()

In [5]:
# Feature engineering - Relative Strength Index (RSI), using 14-row simple
# rolling means of the up-moves and down-moves of Close.
delta = btc['Close'].diff()

# Up-moves keep positive deltas, down-moves keep the magnitude of negative
# deltas; every other row (including NaN deltas) contributes 0.
gain = delta.where(delta.gt(0), 0)
loss = delta.where(delta.lt(0), 0).mul(-1)

avg_gain = gain.rolling(14).mean()
avg_loss = loss.rolling(14).mean()

# RS is the ratio of average gain to average loss; RSI rescales it to 0-100.
btc['RS'] = avg_gain.div(avg_loss)
btc['RSI'] = 100 - 100 / (btc['RS'] + 1)

# Part 3: Data Cleaning & Pre-Processing

In [6]:
# Data imputation - handling missing (NaN) values.
#
# The filled columns are assigned back to `btc` rather than calling
# `btc[col].fillna(..., inplace=True)`: an in-place fill on a selected column
# is chained assignment, which is not guaranteed to write through to `btc`
# (it stops doing so under pandas Copy-on-Write), and `fillna(method=...)`
# is deprecated in favour of `.bfill()`.

# Row-over-row change features have nothing to compare against where they are
# NaN, so those entries become 0.
change_columns = ['Daily_Return', 'Volume_Change']
btc[change_columns] = btc[change_columns].fillna(0)

# Window-based features are NaN until their window fills; back-fill them with
# the next available value. Note that this copies later values into earlier rows.
window_columns = ['Avg_Volume', 'MA_7', 'MA_30', 'RS', 'RSI']
btc[window_columns] = btc[window_columns].bfill()

In [7]:
# Data normalization: min-max scale the selected columns into [0, 1].

from sklearn.preprocessing import MinMaxScaler

columns_to_normalize = [
    'Open', 'High', 'Low', 'Volume',
    'MA_7', 'MA_30', 'RSI',
    'EMA_12', 'EMA_26',
    'Avg_Volume', 'Volume_Change',
    'Close',
]

# The scaled values are turned into sequences in a later cell.
# NOTE(review): the scaler is fitted on every row of `btc`, including rows that
# later fall into the validation split, so validation min/max values influence
# the scaling of the training data.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(btc[columns_to_normalize])
btc_norm = scaler.transform(btc[columns_to_normalize])

In [8]:
# Wrap the scaled matrix in a labelled frame sharing btc's index, then append
# the unscaled Daily_Return column (aligned on that index).
btc_n = pd.DataFrame(data=btc_norm, columns=columns_to_normalize, index=btc.index)
btc_n = btc_n.assign(Daily_Return=btc['Daily_Return'])

In [9]:
# Remove the two trailing dates from the normalised frame.
# Re-binding the name with errors='ignore' (instead of inplace=True) keeps the
# cell idempotent: running it again no longer raises KeyError once the rows
# are already gone.
btc_n = btc_n.drop(index=['2023-11-26', '2023-11-27'], errors='ignore')

In [10]:
# Trim the scaled matrix to the rows kept in btc_n (the two trailing dates were
# dropped from it). Slicing to len(btc_n) instead of [:-2] keeps the cell
# idempotent: re-running it does not remove two further rows each time.
btc_norm = btc_norm[:len(btc_n), :]

In [11]:
# Sanity check: (rows, columns) of the trimmed scaled matrix.
btc_norm.shape

(2886, 12)

In [12]:
# Daily returns without the last two rows, matching the trimmed scaled matrix.
daily_returns = btc['Daily_Return'].iloc[:-2]

# Prepend the returns as column 0; the scaled columns follow in their original
# order, so 'Close' remains the last column of the combined matrix.
btc_norm_fin = np.hstack((daily_returns.to_numpy()[:, None], btc_norm))

In [13]:
# Preview the normalised feature frame (Daily_Return is the only unscaled column).
btc_n

Unnamed: 0_level_0,Open,High,Low,Volume,MA_7,MA_30,RSI,EMA_12,EMA_26,Avg_Volume,Volume_Change,Close,Daily_Return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-01,0.000977,0.000896,0.001100,0.000022,0.000934,0.000450,0.446124,0.000859,0.000756,0.000154,0.137787,0.001042,0.000000
2016-01-02,0.001035,0.000893,0.001166,0.000005,0.000934,0.000450,0.446124,0.000856,0.000755,0.000154,0.110773,0.001028,-0.002063
2016-01-03,0.001020,0.000859,0.001057,0.000032,0.000934,0.000450,0.446124,0.000846,0.000750,0.000154,0.188021,0.000977,-0.007907
2016-01-04,0.000967,0.000871,0.001123,0.000028,0.000934,0.000450,0.446124,0.000845,0.000749,0.000154,0.133162,0.001023,0.007163
2016-01-05,0.001012,0.000866,0.001132,0.000017,0.000934,0.000450,0.446124,0.000841,0.000746,0.000154,0.121493,0.001006,-0.002611
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-11-21,0.552270,0.544564,0.537035,0.071647,0.559863,0.563676,0.494676,0.566049,0.564022,0.215129,0.170299,0.527502,-0.044378
2023-11-22,0.526779,0.547865,0.534872,0.069439,0.558880,0.565997,0.553571,0.568309,0.566489,0.217036,0.132907,0.551587,0.045193
2023-11-23,0.551545,0.544751,0.553847,0.040424,0.561369,0.567807,0.504441,0.569877,0.568602,0.215384,0.071626,0.549463,-0.003813
2023-11-24,0.549698,0.556027,0.558962,0.065238,0.563834,0.569525,0.495865,0.572242,0.571073,0.222881,0.234898,0.555871,0.011549


In [14]:
# Only the last expression of a cell is displayed: the shape below is evaluated
# but not shown, and the cell's output is the type of the combined matrix.
btc_norm_fin.shape
type(btc_norm_fin)

numpy.ndarray

# Part 4: Sequence Creation

In [15]:
# Sliding-window samples: each input is `sequence_length` consecutive rows of
# btc_norm_fin, and its target is the LAST column of the row that immediately
# follows the window. The last column is the scaled 'Close' (Daily_Return was
# stacked in front of the scaled columns).
sequence_length = 10

window_starts = range(len(btc_norm_fin) - sequence_length)
sequences = [btc_norm_fin[start : start + sequence_length] for start in window_starts]
targets = [btc_norm_fin[start + sequence_length, -1] for start in window_starts]

X = np.array(sequences)
y = np.array(targets)

In [16]:
# Ordered 80/20 split without shuffling: the first 80% of the windows are used
# for training, the remaining windows for validation.
train_size = int(0.80 * len(sequences))
X_train, X_valid = X[:train_size], X[train_size:]
y_train, y_valid = y[:train_size], y[train_size:]

# Part 5: Model Architecture

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import keras

# One LSTM layer over (time steps, features) windows, followed by a single-unit
# dense layer that outputs the predicted value.
model = Sequential()
# The input shape is (window length, number of feature columns per row).
model.add(LSTM(50, activation='relu', input_shape=(sequence_length, X[0].shape[1])))
model.add(Dense(1))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and is rejected by newer Keras releases.
optimizer = keras.optimizers.Adam(learning_rate=0.005)
model.compile(optimizer=optimizer, loss='mse')

# Part 6: Training

In [18]:
# Fit on the training windows only; X_train is already 3-D
# (samples, time steps, features), so no reshape is needed.
model.fit(
    X_train,
    y_train,
    epochs=25,
    batch_size=32,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x786be0657610>

In [19]:
# Score the validation windows: mean squared error between the model's
# predictions and the scaled targets (so the loss is in scaled units).
pred = model.predict(X_valid)
validation_loss = mean_squared_error(y_true=y_valid, y_pred=pred)
validation_loss



0.00014269783263742277

In [20]:
# Display the validation predictions returned by model.predict.
# NOTE(review): this prints the whole array; a slice such as pred[:10] would
# keep the output short.
pred

array([[0.5810292 ],
       [0.5603229 ],
       [0.54682374],
       [0.5604278 ],
       [0.5580978 ],
       [0.5426396 ],
       [0.5754842 ],
       [0.5287689 ],
       [0.5240231 ],
       [0.5153011 ],
       [0.49594167],
       [0.44673112],
       [0.45747164],
       [0.42500663],
       [0.42178443],
       [0.42299932],
       [0.43368357],
       [0.4530355 ],
       [0.4334626 ],
       [0.44277462],
       [0.4160578 ],
       [0.44482708],
       [0.42638683],
       [0.43029532],
       [0.44798434],
       [0.43507537],
       [0.44359058],
       [0.44461393],
       [0.44061294],
       [0.42848304],
       [0.43062568],
       [0.43612522],
       [0.47620338],
       [0.472163  ],
       [0.44442838],
       [0.45359287],
       [0.44101536],
       [0.44229844],
       [0.44174877],
       [0.4687863 ],
       [0.46079987],
       [0.4453819 ],
       [0.4443031 ],
       [0.43013778],
       [0.41909897],
       [0.3944566 ],
       [0.3326628 ],
       [0.323