## Import libraries

In [19]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import Isomap
import umap
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Load the data

In [14]:
# Settings that select which candle file to read:
# <data directory>/<exchange>/<coin>/<interval>.csv
path_to_data_directory = '../data/'
exchange = 'binance'
coin = 'BTC'
interval = '1h'

# Assemble the CSV path from the settings above and load it into a DataFrame.
csv_path = f"{path_to_data_directory}{exchange}/{coin}/{interval}.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ignore
0,2024-09-17 19:00:00,60540.17,60696.48,59618.26,59967.99,2112.0532,2024-09-17 19:59:59.999,127061200.0,322417,881.13029,52996740.0,0
1,2024-09-17 20:00:00,59967.98,60342.85,59715.42,60120.0,1665.06151,2024-09-17 20:59:59.999,99969200.0,208296,828.74091,49757090.0,0
2,2024-09-17 21:00:00,60119.99,60424.25,60105.01,60334.07,662.68788,2024-09-17 21:59:59.999,39973260.0,85064,356.50218,21502870.0,0
3,2024-09-17 22:00:00,60334.07,60343.99,60122.5,60240.01,424.37655,2024-09-17 22:59:59.999,25561030.0,94677,173.20763,10431710.0,0
4,2024-09-17 23:00:00,60240.01,60324.0,60005.31,60313.99,489.52738,2024-09-17 23:59:59.999,29457380.0,73556,267.6605,16109520.0,0


## Preprocessing

Split the data into the feature columns and the target value (`Close`)

In [15]:
# Predictors: every column except the target ('Close') and the two timestamp
# columns.
# 'Unnamed: 0' is also removed: it is the row counter that was saved into the
# CSV (0, 1, 2, ...), not market data. Because the rows are in time order, it
# acts as a proxy for time and lets a tree model look up the price by row
# position. errors='ignore' keeps the cell working if the CSV is later
# re-exported without that index column.
# NOTE(review): Open/High/Low come from the same candle as Close, so they may
# also leak the target - confirm this is the intended prediction setup.
features = (
    data
    .drop(columns=['Close', 'Open Time', 'Close Time'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Target: the 'Close' column of each row.
target = data['Close']

features.head()

Unnamed: 0,Open,High,Low,Volume,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ignore
0,60540.17,60696.48,59618.26,2112.0532,127061200.0,322417,881.13029,52996740.0,0
1,59967.98,60342.85,59715.42,1665.06151,99969200.0,208296,828.74091,49757090.0,0
2,60119.99,60424.25,60105.01,662.68788,39973260.0,85064,356.50218,21502870.0,0
3,60334.07,60343.99,60122.5,424.37655,25561030.0,94677,173.20763,10431710.0,0
4,60240.01,60324.0,60005.31,489.52738,29457380.0,73556,267.6605,16109520.0,0


Scale the features with `StandardScaler`

In [16]:
# Fit a StandardScaler on the feature table, then apply it to the same table.
# The fitted `scaler` object is kept so the same scaling can be reused later.
# NOTE(review): the scaler is fitted on all rows, before any train/test split,
# so test-set statistics influence the scaling - confirm this is acceptable.
scaler = StandardScaler().fit(features)
scaled_features = scaler.transform(features)

Dimensionality reduction using Isomap and UMAP

Adjust the n_components and n_neighbors parameters

TODO:
Use cross-validation with a grid search (e.g. `GridSearchCV`) to select the best values of `n_components` and `n_neighbors`

In [17]:
# Embed the scaled features with two manifold-learning methods so the
# downstream models can be compared on each embedding.
# NOTE(review): n_components=10 is not below the number of feature columns,
# so neither transformer actually reduces the dimensionality - confirm the
# intended value.
# NOTE(review): both transformers are fitted on all rows, before the
# train/test split performed in the modelling cells.

# Isomap
isomap_transformer = Isomap(n_components=10, n_neighbors=5)  # adjust n_components and n_neighbors
isomap_features = isomap_transformer.fit_transform(scaled_features)


# Umap
# UMAP is stochastic: without a seed the embedding (and every metric computed
# from it) changes on each run. random_state=42 matches the seed used for the
# split and the models. Per the UMAP docs, fixing the seed makes the fit run
# single-threaded.
umap_transformer = umap.UMAP(
    n_components=10,
    n_neighbors=5,
    metric='euclidean',
    random_state=42,
)
umap_features = umap_transformer.fit_transform(scaled_features)

## Model Training with GBR and RF

### USING ISOMAP

In [20]:
# split the data (80% train / 20% test, fixed seed)
# NOTE(review): train_test_split shuffles by default while the rows appear to
# be consecutive hourly candles, so test rows sit between training rows in
# time - consider a chronological split (shuffle=False) for a forecasting
# evaluation.
X_train, X_test, y_train, y_test = train_test_split(isomap_features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regression
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
print("GBR RMSE:", mean_squared_error(y_test, y_pred_gbr) ** 0.5)
print("GBR R^2:", r2_score(y_test, y_pred_gbr))

# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("RF RMSE:", mean_squared_error(y_test, y_pred_rf) ** 0.5)
print("RF R^2:", r2_score(y_test, y_pred_rf))

GBR RMSE: 345.1837922274646
GBR R^2: 0.9609407782166599
RF RMSE: 482.3944395933263
RF R^2: 0.9237170672869859




### USING UMAP

In [21]:
# split the data (80% train / 20% test, fixed seed)
# NOTE(review): train_test_split shuffles by default while the rows appear to
# be consecutive hourly candles, so test rows sit between training rows in
# time - consider a chronological split (shuffle=False) for a forecasting
# evaluation.
X_train, X_test, y_train, y_test = train_test_split(umap_features, target, test_size=0.2, random_state=42)

# Gradient Boosting Regression
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbr.fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)
# RMSE is computed as sqrt(MSE): the `squared=False` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
print("GBR RMSE:", mean_squared_error(y_test, y_pred_gbr) ** 0.5)
print("GBR R^2:", r2_score(y_test, y_pred_gbr))

# Random Forest Regression
rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("RF RMSE:", mean_squared_error(y_test, y_pred_rf) ** 0.5)
print("RF R^2:", r2_score(y_test, y_pred_rf))

GBR RMSE: 391.040335992028
GBR R^2: 0.9498736699159006
RF RMSE: 465.8948619776528
RF R^2: 0.9288461118496464


