In [None]:
#!pip install xgboost

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
import matplotlib.pyplot as plt


In [None]:
# Assuming you have your dataset in a CSV file named 'Binance_BTCUSDT_d_sorted.csv'
data = pd.read_csv('Binance_BTCUSDT_d_sorted.csv')

# Display the first few rows of the dataset
print(data.head())


In [None]:
# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort data by Date
data = data.sort_values('Date')

# Reset index
data.reset_index(drop=True, inplace=True)

# Create additional features based on technical indicators
def add_indicators(df):
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_10'] = df['Close'].rolling(window=10).mean()
    df['EMA_5'] = df['Close'].ewm(span=21, adjust=False).mean()
    df['EMA_10'] = df['Close'].ewm(span=34, adjust=False).mean()
    df['RSI'] = compute_rsi(df['Close'], 14)
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = compute_macd(df['Close'])
    df['Bollinger_Upper'], df['Bollinger_Lower'] = compute_bollinger_bands(df['Close'])
    df['ATR'] = compute_atr(df['High'], df['Low'], df['Close'], 14)
    df['Volume_BTC'] = df['Volume BTC']
    df['Volume_USDT'] = df['Volume USDT']
    return df

def compute_rsi(series, period):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_macd(series, short_period=12, long_period=26, signal_period=9):
    short_ema = series.ewm(span=short_period, adjust=False).mean()
    long_ema = series.ewm(span=long_period, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    hist = macd - signal
    return macd, signal, hist

def compute_bollinger_bands(series, window=20, no_of_std=2):
    rolling_mean = series.rolling(window).mean()
    rolling_std = series.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * no_of_std)
    lower_band = rolling_mean - (rolling_std * no_of_std)
    return upper_band, lower_band

def compute_atr(high, low, close, period):
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.DataFrame({'TR1': tr1, 'TR2': tr2, 'TR3': tr3}).max(axis=1)
    return true_range.rolling(window=period).mean()

data = add_indicators(data)

# Drop rows with NaN values created by rolling windows
data.dropna(inplace=True)

# Display the first few rows of the processed dataset
print(data.head())


In [None]:
# Define the features and target variable
X = data[['SMA_5', 'SMA_10', 'EMA_5', 'EMA_10', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Volume_BTC', 'Volume_USDT']]
y = data['Close']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Reshape input data for LSTM [samples, timesteps, features]
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(units=50, return_sequences=True, input_shape=(1, X_train.shape[1])))
model_lstm.add(LSTM(units=50, return_sequences=False))
model_lstm.add(Dense(units=1))

# Compile the model
model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=2)

In [None]:
from keras.layers import LSTM, Dense, Input

# Reshape input data for LSTM [samples, timesteps, features]
X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
model_lstm = Sequential()
model_lstm.add(Input(shape=(1, X_train.shape[1])))
model_lstm.add(LSTM(units=50, return_sequences=True))
model_lstm.add(LSTM(units=50, return_sequences=False))
model_lstm.add(Dense(units=1))

# Compile the model
model_lstm.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=32, verbose=2)


In [None]:
# Reshape input data for CNN [samples, timesteps, features]
X_train_cnn = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

# Define CNN model
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(units=50, activation='relu'))
model_cnn.add(Dense(units=1))

# Compile the model
model_cnn.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model_cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, verbose=2)


In [None]:
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Input

# Reshape input data for CNN [samples, timesteps, features]
X_train_cnn = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

# Define CNN model
model_cnn = Sequential()
model_cnn.add(Input(shape=(X_train.shape[1], 1)))
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(units=50, activation='relu'))
model_cnn.add(Dense(units=1))

# Compile the model
model_cnn.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model_cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, verbose=2)


In [None]:
# Define the XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.01)

# Train the model
model_xgb.fit(X_train, y_train)

# Make predictions
predictions_xgb = model_xgb.predict(X_test)


In [None]:
# Evaluate LSTM model
predictions_lstm = model_lstm.predict(X_test_lstm)
rmse_lstm = np.sqrt(mean_squared_error(y_test, predictions_lstm))
print('LSTM RMSE:', rmse_lstm)

# Evaluate CNN model
predictions_cnn = model_cnn.predict(X_test_cnn)
rmse_cnn = np.sqrt(mean_squared_error(y_test, predictions_cnn))
print('CNN RMSE:', rmse_cnn)

# Evaluate XGBoost model
rmse_xgb = np.sqrt(mean_squared_error(y_test, predictions_xgb))
print('XGBoost RMSE:', rmse_xgb)

# Plot results
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
#plt.plot(predictions_lstm, label='LSTM Predictions')
#plt.plot(predictions_cnn, label='CNN Predictions')
plt.plot(predictions_xgb, label='XGBoost Predictions')
plt.legend()
plt.show()


In [None]:
import numpy as np
import tensorflow as tf
import google.protobuf

print("NumPy version:", np.__version__)
print("TensorFlow version:", tf.__version__)
print("Protobuf version:", google.protobuf.__version__)


# NumPy version: 1.19.5
# TensorFlow version: 2.6.0
# Protobuf version: 3.20.3

In [None]:
import numpy as np
import tensorflow as tf
import google.protobuf

print("NumPy version:", np.__version__)
print("TensorFlow version:", tf.__version__)
print("Protobuf version:", google.protobuf.__version__)


In [None]:
# Revise predicting the daily change percent

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
import joblib


In [None]:
# Assuming you have your dataset in a CSV file named 'Binance_BTCUSDT_d_sorted.csv'
data = pd.read_csv('Bittrex_BTCUSDT_1h_sorted.csv')

# Display the first few rows of the dataset
print(data.head())


In [None]:
import pandas as pd

# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort data by Date
data = data.sort_values('Date')

# Reset index
data.reset_index(drop=True, inplace=True)

# Calculate daily change and percentage change
data['Daily_Change'] = data['Close'].diff()
data['Daily_Change_Percentage'] = data['Daily_Change'] / data['Close'].shift(1) * 100

# Drop the first row with NaN values from the shift operation
data.dropna(inplace=True)

# Add direction column
data['Direction'] = data['Daily_Change'].apply(lambda x: 1 if x > 0 else 0)

# Create additional features based on technical indicators
def add_indicators(df):
    df['SMA_50'] = df['Close'].rolling(window=50).mean()
    df['SMA_200'] = df['Close'].rolling(window=200).mean()
    df['EMA_21'] = df['Close'].ewm(span=21, adjust=False).mean()
    df['EMA_34'] = df['Close'].ewm(span=34, adjust=False).mean()
    df['RSI'] = compute_rsi(df['Close'], 14)
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = compute_macd(df['Close'])
    df['Bollinger_Upper'], df['Bollinger_Lower'] = compute_bollinger_bands(df['Close'])
    df['ATR'] = compute_atr(df['High'], df['Low'], df['Close'], 14)
    return df

def compute_rsi(series, period):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_macd(series, short_period=12, long_period=26, signal_period=9):
    short_ema = series.ewm(span=short_period, adjust=False).mean()
    long_ema = series.ewm(span=long_period, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    hist = macd - signal
    return macd, signal, hist

def compute_bollinger_bands(series, window=20, no_of_std=2):
    rolling_mean = series.rolling(window).mean()
    rolling_std = series.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * no_of_std)
    lower_band = rolling_mean - (rolling_std * no_of_std)
    return upper_band, lower_band

def compute_atr(high, low, close, period):
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.DataFrame({'TR1': tr1, 'TR2': tr2, 'TR3': tr3}).max(axis=1)
    return true_range.rolling(window=period).mean()

data = add_indicators(data)

# Drop rows with NaN values created by rolling windows
data.dropna(inplace=True)

# Display the first few rows of the processed dataset
print(data.head())


In [None]:
# Define the features and target variable
X = data[['Low', 'High', 'SMA_50', 'SMA_200', 'EMA_21', 'EMA_34', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Daily_Change_Percentage']]
#y = data['Daily_Change_Percentage']
y = data['Daily Change']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
    

from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Input

 Reshape input data for CNN [samples, timesteps, features]
X_train_cnn = np.reshape(X_train.values, (X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = np.reshape(X_test.values, (X_test.shape[0], X_test.shape[1], 1))

 Define CNN model
model_cnn = Sequential()
model_cnn.add(Input(shape=(X_train.shape[1], 1)))
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(units=50, activation='relu'))
model_cnn.add(Dense(units=1))

 Compile the model
model_cnn.compile(optimizer='adam', loss='mean_squared_error')

 Train the model
model_cnn.fit(X_train_cnn, y_train, epochs=50, batch_size=32, verbose=2)


 Evaluate LSTM model
predictions_cnn = model_cnn.predict(X_test_cnn)
rmse_cnn = np.sqrt(mean_squared_error(y_test, predictions_cnn))
print('CNN RMSE:', rmse_cnn)

 Plot training & validation loss values
plt.figure(figsize=(14, 7))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()

 Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
plt.plot(predictions_cnn, label='CNN Predictions')
plt.legend()
plt.show()


In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd

# Reshape input data for XGBoost
X_train_xgb = X_train.values
X_test_xgb = X_test.values

# Define XGBoost model with max_depth
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=2000, learning_rate=0.1, max_depth=5)

# Train the model
model_xgb.fit(X_train_xgb, y_train)


In [None]:
# Evaluate XGBoost model
predictions_xgb = model_xgb.predict(X_test_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, predictions_xgb))
print('XGBoost RMSE:', rmse_xgb)

# Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
#plt.plot(predictions_xgb, label='XGBoost Predictions')
plt.legend()
plt.show()


In [None]:
# Save results to an Excel file
results_df = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions_xgb
})

# Define the file path
file_path = 'xgboost_predictions1.xlsx'

# Save to Excel
results_df.to_excel(file_path, index=False)
print(f'Results saved to {file_path}')

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Reshape input data for XGBoost
X_train_xgb = X_train.values
X_test_xgb = X_test.values

# Define XGBoost model
model_xgb = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=2000, learning_rate=0.01, max_depth=5)

# Train the model
model_xgb.fit(X_train_xgb, y_train)

# Evaluate XGBoost model
predictions_xgb = model_xgb.predict(X_test_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, predictions_xgb))
print('XGBoost RMSE:', rmse_xgb)

# Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
plt.plot(predictions_xgb, label='XGBoost Predictions')
plt.legend()
plt.show()

In [None]:
# Save results to an Excel file
results_df = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions_xgb
})

# Define the file path
file_path = 'xgboost_predictions2nd.xlsx'

# Save to Excel
results_df.to_excel(file_path, index=False)
print(f'Results saved to {file_path}')

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd

# Define the features and target variable
X = data[['Low', 'High', 'SMA_50', 'SMA_200', 'EMA_21', 'EMA_34', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Daily_Change', 'Daily_Change_Percentage']]
y = data['Direction']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Reshape input data for XGBoost
X_train_xgb = X_train.values
X_test_xgb = X_test.values

In [None]:
# Define XGBoost model with max_depth for classification
model_xgb = xgb.XGBClassifier(n_estimators=2000, learning_rate=0.1, max_depth=5, objective='binary:logistic')

# Train the model
model_xgb.fit(X_train_xgb, y_train)



In [None]:
# Evaluate XGBoost model
predictions_xgb_proba = model_xgb.predict_proba(X_test_xgb)[:, 1]
predictions_xgb = (predictions_xgb_proba >= 0.5).astype(int)

# Calculate accuracy
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
print('XGBoost Accuracy:', accuracy_xgb)


In [None]:
# Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
plt.plot(predictions_xgb, label='XGBoost Predictions')
plt.legend()
plt.show()

In [None]:
# Save results to an Excel file
results_df = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions_xgb
})

# Define the file path
file_path = 'xgboost_predictions1.xlsx'

# Save to Excel
results_df.to_excel(file_path, index=False)
print(f'Results saved to {file_path}')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Define the features and target variable
X = data[['Low', 'High', 'SMA_50', 'SMA_200', 'EMA_21', 'EMA_34', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Daily_Change', 'Daily_Change_Percentage']].values
y = data['Direction'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Reshape input data for CNN
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Define CNN model
model_cnn = Sequential()
model_cnn.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
model_cnn.add(MaxPooling1D(pool_size=2))
model_cnn.add(Flatten())
model_cnn.add(Dense(50, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(1, activation='sigmoid'))

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model_cnn.fit(X_train_cnn, y_train, epochs=10, batch_size=32, verbose=2)

# Evaluate the model
predictions_cnn_proba = model_cnn.predict(X_test_cnn)
predictions_cnn = (predictions_cnn_proba >= 0.5).astype(int)

# Calculate accuracy
accuracy_cnn = accuracy_score(y_test, predictions_cnn)
print('CNN Accuracy:', accuracy_cnn)

# Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test, label='True')
plt.plot(predictions_cnn, label='CNN Predictions')
plt.legend()
plt.show()

# Save results to an Excel file
results_df_cnn = pd.DataFrame({
    'True Values': y_test,
    'Predictions': predictions_cnn.flatten()
})

# Define the file path
file_path_cnn = 'cnn_predictions.xlsx'

# Save to Excel
results_df_cnn.to_excel(file_path_cnn, index=False)
print(f'Results saved to {file_path_cnn}')


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Define the features and target variable
X = data[['Low', 'High', 'SMA_50', 'SMA_200', 'EMA_21', 'EMA_34', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Daily_Change', 'Daily_Change_Percentage']].values
y = data['Direction'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Reshape input data for LSTM
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Define LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(1, X_train.shape[1])))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation='sigmoid'))

# Compile the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model_lstm.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=2)

# Evaluate the model
predictions_lstm_proba = model_lstm.predict(X_test_lstm)
predictions_lstm = (predictions_lstm_proba >= 0.5).astype(int)

# Calculate accuracy
accuracy_lstm = accuracy_score(y_test, predictions_lstm)
print('LSTM Accuracy:', accuracy_lstm)

# Plot true vs predicted values
plt.figure(figsize=(14, 7))
plt.plot(y_test, label='True')
plt.plot(predictions_lstm, label='LSTM Predictions')
plt.legend()
plt.show()

# Save results to an Excel file
results_df_lstm = pd.DataFrame({
    'True Values': y_test,
    'Predictions': predictions_lstm.flatten()
})

# Define the file path
file_path_lstm = 'lstm_predictions.xlsx'

# Save to Excel
results_df_lstm.to_excel(file_path_lstm, index=False)
print(f'Results saved to {file_path_lstm}')


In [None]:
# When it is a previous day to predict next day

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
import matplotlib.pyplot as plt
import joblib
from sklearn.metrics import accuracy_score
import xgboost as xgb

In [None]:
# Assuming you have your dataset in a CSV file named 'Binance_BTCUSDT_d_sorted.csv'
data = pd.read_csv('Bittrex_BTCUSDT_1h_sorted.csv')

# Display the first few rows of the dataset
print(data.head())


In [None]:
import pandas as pd
import numpy as np

# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort data by Date
data = data.sort_values('Date')

# Reset index
data.reset_index(drop=True, inplace=True)

# Calculate daily change and percentage change
data['Daily_Change'] = data['Close'].diff()
data['Daily_Change_Percentage'] = data['Daily_Change'] / data['Close'].shift(1) * 100

# Drop the first row with NaN values from the shift operation
data.dropna(inplace=True)

# Add direction column
data['Direction'] = data['Daily_Change'].apply(lambda x: 1 if x > 0 else 0)

# Create additional features based on technical indicators
def add_indicators(df):
    df['SMA_21'] = df['Close'].rolling(window=21).mean()
    df['SMA_55'] = df['Close'].rolling(window=55).mean()
    df['EMA_13'] = df['Close'].ewm(span=34, adjust=False).mean()
    df['EMA_89'] = df['Close'].ewm(span=89, adjust=False).mean()
    df['RSI'] = compute_rsi(df['Close'], 14)
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = compute_macd(df['Close'])
    df['Bollinger_Upper'], df['Bollinger_Lower'] = compute_bollinger_bands(df['Close'])
    df['ATR'] = compute_atr(df['High'], df['Low'], df['Close'], 14)
    df['Stochastic_K'], df['Stochastic_D'] = compute_stochastic(df['High'], df['Low'], df['Close'])
    df['Williams_%R'] = compute_williams_r(df['High'], df['Low'], df['Close'])
    df['PSAR'] = psar(df['High'], df['Low'])
    df['CCI'] = compute_cci(df['High'], df['Low'], df['Close'], 20)
    ichimoku = compute_ichimoku(df['High'], df['Low'], df['Close'])
    df['Ichimoku_Conversion_Line'] = ichimoku['Conversion_Line']
    df['Ichimoku_Base_Line'] = ichimoku['Base_Line']
    df['Ichimoku_Leading_Span_A'] = ichimoku['Leading_Span_A']
    df['Ichimoku_Leading_Span_B'] = ichimoku['Leading_Span_B']
    df['Ichimoku_Lagging_Span'] = ichimoku['Lagging_Span']
    df['VWAP'] = compute_vwap(df['Close'], df['Volume'])
    #df['OBV'] = compute_obv(df['Close'], df['Volume'])
    df['CMF'] = compute_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
    df['TSI'] = compute_tsi(df['Close'])
    return df

def compute_rsi(series, period):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_macd(series, short_period=12, long_period=26, signal_period=9):
    short_ema = series.ewm(span=short_period, adjust=False).mean()
    long_ema = series.ewm(span=long_period, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    hist = macd - signal
    return macd, signal, hist

def compute_bollinger_bands(series, window=21, no_of_std=2):
    rolling_mean = series.rolling(window).mean()
    rolling_std = series.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * no_of_std)
    lower_band = rolling_mean - (rolling_std * no_of_std)
    return upper_band, lower_band

def compute_atr(high, low, close, period):
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.DataFrame({'TR1': tr1, 'TR2': tr2, 'TR3': tr3}).max(axis=1)
    return true_range.rolling(window=period).mean()

def compute_stochastic(high, low, close, k_period=14, d_period=3):
    low_min = low.rolling(window=k_period).min()
    high_max = high.rolling(window=k_period).max()
    stoch_k = 100 * ((close - low_min) / (high_max - low_min))
    stoch_d = stoch_k.rolling(window=d_period).mean()
    return stoch_k, stoch_d

def compute_williams_r(high, low, close, period=14):
    highest_high = high.rolling(window=period).max()
    lowest_low = low.rolling(window=period).min()
    williams_r = (highest_high - close) / (highest_high - lowest_low) * -100
    return williams_r
    
def psar(high, low):
    psar = [0] * len(high)
    psar[0] = low.iloc[0]  # Use .iloc instead of direct indexing
    acceleration = 0.02
    max_acceleration = 0.2
    for i in range(1, len(high)):
        if high.iloc[i] > psar[i-1]:  # Use .iloc instead of direct indexing
            psar[i] = psar[i-1] + acceleration * (high.iloc[i] - psar[i-1])  # Use .iloc instead of direct indexing
        else:
            psar[i] = psar[i-1] - acceleration * (psar[i-1] - low.iloc[i])  # Use .iloc instead of direct indexing
        if psar[i] > high.iloc[i]:  # Use .iloc instead of direct indexing
            psar[i] = high.iloc[i]
        elif psar[i] < low.iloc[i]:  # Use .iloc instead of direct indexing
            psar[i] = low.iloc[i]
        acceleration = min(max_acceleration, acceleration + 0.02)
    return psar
    

def compute_cci(high, low, close, period=20):
    tp = (high + low + close) / 3
    tp_sma = tp.rolling(window=period).mean()
    mad = tp.rolling(window=period).apply(lambda x: np.mean(np.abs(x - np.mean(x))))
    cci = (tp - tp_sma) / (0.015 * mad)
    return cci

def compute_ichimoku(high, low, close):
    nine_period_high = high.rolling(window=9).max()
    nine_period_low = low.rolling(window=9).min()
    period26_high = high.rolling(window=26).max()
    period26_low = low.rolling(window=26).min()
    period52_high = high.rolling(window=52).max()
    period52_low = low.rolling(window=52).min()
    ichimoku_cloud = {
        'Conversion_Line': (nine_period_high + nine_period_low) / 2,
        'Base_Line': (period26_high + period26_low) / 2,
        'Leading_Span_A': ((nine_period_high + nine_period_low) / 2 + (period26_high + period26_low) / 2) / 2,
        'Leading_Span_B': (period52_high + period52_low) / 2,
        'Lagging_Span': close.shift(-26)
    }
    return ichimoku_cloud

def compute_vwap(close, volume):
    return (close * volume).cumsum() / volume.cumsum()

def compute_obv(close, volume):
    obv = volume.copy()
    obv[1:] = np.where(close[1:] > close[:-1], volume[1:], np.where(close[1:] < close[:-1], -volume[1:], 0))
    return obv.cumsum()

def compute_cmf(high, low, close, volume, period=20):
    mfv = ((close - low) - (high - close)) / (high - low) * volume
    cmf = mfv.rolling(window=period).sum() / volume.rolling(window=period).sum()
    return cmf

def compute_tsi(close, r=25, s=13):
    m25 = close.diff(1)
    abs_m25 = abs(m25)
    m25s = m25.ewm(span=r, adjust=False).mean()
    abs_m25s = abs_m25.ewm(span=r, adjust=False).mean()
    m25s = m25s.ewm(span=s, adjust=False).mean()
    abs_m25s = abs_m25s.ewm(span=s, adjust=False).mean()
    tsi = m25s / abs_m25s * 100
    return tsi

# Apply the indicators
data = add_indicators(data)

# Drop rows with NaN values created by rolling windows
data.dropna(inplace=True)

# Display the first few rows of the processed dataset
print(data.head())


In [None]:
# Convert PSAR to binary values based on Close prices
data['PSAR_Binary'] = data.apply(lambda row: 1 if row['PSAR'] > row['Close'] else 0, axis=1)


In [None]:
# Define the file path
file_path = 'data_indicators2nd.csv'

# Save to Excel
data.to_csv(file_path, index=True)
print(f'Results saved to {file_path}')

In [None]:
# Shift the features by 1 day

X = data[['SMA_21', 'SMA_55', 'EMA_13', 'EMA_89', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 
          'Bollinger_Lower', 'ATR', 'Daily_Change', 'Daily_Change_Percentage', 'Stochastic_K', 'Stochastic_D', 
          'Williams_%R', 'CCI', 'Ichimoku_Conversion_Line', 'Ichimoku_Base_Line', 
          'Ichimoku_Leading_Span_A', 'Ichimoku_Leading_Span_B', 'Ichimoku_Lagging_Span', 'VWAP', 'CMF', 'TSI', 'PSAR', 'PSAR_Binary']].shift(1)
y = data['Direction']


# Drop the first row since it will have NaN values due to shifting
X = X.dropna()
y = y.iloc[1:]

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)

# Reshape input data for XGBoost
X_train_xgb = X_train.values
X_test_xgb = X_test.values



In [None]:
# Define XGBoost model for classification
model_xgb = xgb.XGBClassifier(n_estimators=2000, learning_rate=0.01, max_depth=1, objective='binary:logistic')

# Train the model
model_xgb.fit(X_train_xgb, y_train)

# Evaluate XGBoost model
predictions_xgb_proba = model_xgb.predict_proba(X_test_xgb)[:, 1]
predictions_xgb = (predictions_xgb_proba >= 0.5).astype(int)
    


In [None]:
# Calculate accuracy
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
print('XGBoost Accuracy:', accuracy_xgb)

# Plot true vs predicted values
#plt.figure(figsize=(14, 7))
#plt.plot(y_test.values, label='True')
#plt.plot(predictions_xgb, label='XGBoost Predictions')
#plt.legend()
#plt.show()



In [None]:
# Save results to an Excel file
results_df_xgb = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions_xgb
})

# Define the file path
file_path_xgb = 'xgboost_predictionsfinal.xlsx'

# Save to Excel
results_df_xgb.to_excel(file_path_xgb, index=False)
print(f'Results saved to {file_path_xgb}')

In [None]:
#!pip install gplearn

In [None]:
#Other models

# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from gplearn.genetic import SymbolicRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import matplotlib.pyplot as plt

In [None]:
from sklearn.linear_model import LogisticRegression

# Define and train Logistic Regression model with increased iterations and different solver
log_reg = LogisticRegression(max_iter=1000, solver='liblinear')
log_reg.fit(X_train, y_train)

# Predict probabilities
probs = log_reg.predict_proba(X_test)

# Convert probabilities to binary predictions (0 or 1)
predictions_log_reg = (probs[:, 1] >= 0.5).astype(int)

# Calculate accuracy
accuracy_log_reg = accuracy_score(y_test, predictions_log_reg)
print('Logistic Regression Accuracy:', accuracy_log_reg)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import time

# Assuming X_train, X_test, y_train, y_test are already defined

# Record start time and train Logistic Regression
start_time = time.time()
log_reg = LogisticRegression(max_iter=2000, solver='liblinear')
log_reg.fit(X_train, y_train)
log_reg_time = time.time() - start_time

# Record start time and train Decision Tree Classifier
start_time = time.time()
dt_clf = DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42)
dt_clf.fit(X_train, y_train)
dt_clf_time = time.time() - start_time

# Record start time and train Random Forest Classifier
start_time = time.time()
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42)
rf_clf.fit(X_train, y_train)
rf_clf_time = time.time() - start_time

# Record start time and train XGBoost Classifier
start_time = time.time()
xgb_clf = XGBClassifier(n_estimators=2000, learning_rate=0.01, max_depth=1, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', random_state=42, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
xgb_clf_time = time.time() - start_time

# Record start time and train Gradient Boosting Classifier
start_time = time.time()
gb_clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=1, min_samples_split=10, min_samples_leaf=5, random_state=42)
gb_clf.fit(X_train, y_train)
gb_clf_time = time.time() - start_time

# Predict on test data
log_reg_pred = log_reg.predict(X_test)
dt_clf_pred = dt_clf.predict(X_test)
rf_clf_pred = rf_clf.predict(X_test)
xgb_clf_pred = xgb_clf.predict(X_test)
gb_clf_pred = gb_clf.predict(X_test)

# Calculate accuracy
log_reg_acc = accuracy_score(y_test, log_reg_pred)
dt_clf_acc = accuracy_score(y_test, dt_clf_pred)
rf_clf_acc = accuracy_score(y_test, rf_clf_pred)
xgb_clf_acc = accuracy_score(y_test, xgb_clf_pred)
gb_clf_acc = accuracy_score(y_test, gb_clf_pred)

# Print accuracy
print("Logistic Regression Accuracy:", log_reg_acc)
print("Decision Tree Classifier Accuracy:", dt_clf_acc)
print("Random Forest Classifier Accuracy:", rf_clf_acc)
print("XGBoost Classifier Accuracy:", xgb_clf_acc)
print("Gradient Boosting Classifier Accuracy:", gb_clf_acc)

# Print training times
print("\nTraining Times (in seconds):")
print("Logistic Regression Training Time:", log_reg_time)
print("Decision Tree Classifier Training Time:", dt_clf_time)
print("Random Forest Classifier Training Time:", rf_clf_time)
print("XGBoost Classifier Training Time:", xgb_clf_time)
print("Gradient Boosting Classifier Training Time:", gb_clf_time)

# Combine predictions using weighted majority vote
log_reg_proba = log_reg.predict_proba(X_test)[:, 1]
dt_clf_proba = dt_clf.predict_proba(X_test)[:, 1]
rf_clf_proba = rf_clf.predict_proba(X_test)[:, 1]
xgb_clf_proba = xgb_clf.predict_proba(X_test)[:, 1]
gb_clf_proba = gb_clf.predict_proba(X_test)[:, 1]

combined_proba = (log_reg_proba + dt_clf_proba + rf_clf_proba + xgb_clf_proba + gb_clf_proba) / 5
combined_pred = (combined_proba >= 0.5).astype(int)

# Calculate and print combined accuracy
combined_acc = accuracy_score(y_test, combined_pred)
print("Combined Classifier Accuracy:", combined_acc)

# Display the percentage confidence scores for each classifier
print("\nLogistic Regression Confidence Scores:")
print(log_reg_proba)

print("\nDecision Tree Classifier Confidence Scores:")
print(dt_clf_proba)

print("\nRandom Forest Classifier Confidence Scores:")
print(rf_clf_proba)

print("\nXGBoost Classifier Confidence Scores:")
print(xgb_clf_proba)

print("\nGradient Boosting Classifier Confidence Scores:")
print(gb_clf_proba)


In [None]:
# Calculate accuracy
log_reg_acc = accuracy_score(y_test, log_reg_pred)
dt_clf_acc = accuracy_score(y_test, dt_clf_pred)
rf_clf_acc = accuracy_score(y_test, rf_clf_pred)
xgb_clf_acc = accuracy_score(y_test, xgb_clf_pred)
gb_clf_acc = accuracy_score(y_test, gb_clf_pred)

# Print accuracy
print("Logistic Regression Accuracy:", log_reg_acc)
print("Decision Tree Classifier Accuracy:", dt_clf_acc)
print("Random Forest Classifier Accuracy:", rf_clf_acc)
print("XGBoost Classifier Accuracy:", xgb_clf_acc)
print("Gradient Boosting Classifier Accuracy:", gb_clf_acc)

# Print training times
print("\nTraining Times (in seconds):")
print("Logistic Regression Training Time:", log_reg_time)
print("Decision Tree Classifier Training Time:", dt_clf_time)
print("Random Forest Classifier Training Time:", rf_clf_time)
print("XGBoost Classifier Training Time:", xgb_clf_time)
print("Gradient Boosting Classifier Training Time:", gb_clf_time)

# Combine predictions using weighted majority vote
combined_proba = (log_reg_proba + dt_clf_proba + rf_clf_proba + xgb_clf_proba + gb_clf_proba) / 5
combined_pred = (combined_proba >= 0.5).astype(int)

# Calculate and print combined accuracy
combined_acc = accuracy_score(y_test, combined_pred)
print("Combined Classifier Accuracy:", combined_acc)

# Create a DataFrame to store true values, predictions, and confidence scores
results_df = pd.DataFrame({
    'True Values': y_test,
    'Logistic Regression Prediction': log_reg_pred,
    'Logistic Regression Confidence': log_reg_proba,
    'Decision Tree Prediction': dt_clf_pred,
    'Decision Tree Confidence': dt_clf_proba,
    'Random Forest Prediction': rf_clf_pred,
    'Random Forest Confidence': rf_clf_proba,
    'XGBoost Prediction': xgb_clf_pred,
    'XGBoost Confidence': xgb_clf_proba,
    'Gradient Boosting Prediction': gb_clf_pred,
    'Gradient Boosting Confidence': gb_clf_proba,
    'Combined Prediction': combined_pred,
    'Combined Confidence': combined_proba
})

# Export the DataFrame to a CSV file
results_df.to_csv('model_predictions_with_confidence.csv', index=False)

In [None]:
import pandas as pd

# Create a dictionary to store the results
results = {
    'True Values': y_test.values,
    'Logistic Regression': log_reg_pred,
    'Decision Tree Classifier': dt_clf_pred,
    'Random Forest Classifier': rf_clf_pred,
    'GradB': gb_clf_pred,
    'XGBoost Classifier': xgb_clf_pred
}

# Create a DataFrame from the dictionary
results_df = pd.DataFrame(results)

# Define the file path
file_path = 'all_models_predictions3final3.xlsx'

# Save to Excel
results_df.to_excel(file_path, index=False)
print(f'Results saved to {file_path}')

In [None]:
# Create a DataFrame from the dictionary
results_df = pd.DataFrame(results)

# Calculate overall correlation
correlation = results_df.corr()['True Values'][1:]

print("Overall Correlation:")
print(correlation)

# Calculate absolute differences per row
abs_diff = results_df.iloc[:, 1:].apply(lambda col: (results_df['True Values'] - col).abs())

# Find the model with the smallest difference per row
min_diff_model = abs_diff.idxmin(axis=1)

# Combine results with the original DataFrame
result_df = results_df.copy()
result_df['Min Diff Model'] = min_diff_model

print("\nPer-Row Best Model:")
print(result_df)

In [None]:
import seaborn as sns

# Calculate correlation matrix
correlation_matrix = results_df.corr()

# Convert correlation values to percentages
correlation_matrix_percentage = correlation_matrix * 100

print("Correlation Matrix in Percentages:")
print(correlation_matrix_percentage)

# Plot correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix_percentage, annot=True, fmt=".2f", cmap='coolwarm', cbar_kws={'format': '%.0f%%'})
plt.title('Correlation Matrix in Percentages')
plt.show()

In [None]:
import itertools

# Calculate individual accuracies
accuracies = {}
for col in results_df.columns[1:]:
    accuracies[col] = (results_df['True Values'] == results_df[col]).mean()
    
print("Individual Accuracies:")
for model, accuracy in accuracies.items():
    print(f"{model}: {accuracy:.2f}")

# Function to calculate accuracy of majority voting
def majority_vote_accuracy(results_df, model_combination):
    votes = results_df[list(model_combination)].mode(axis=1)[0]
    return (votes == results_df['True Values']).mean()

# Get all combinations of 2 and 3 models
models = results_df.columns[1:]
combinations_2 = list(itertools.combinations(models, 3))
combinations_3 = list(itertools.combinations(models, 3))

# Calculate accuracies for all combinations
combo_accuracies = {}
for combo in combinations_2 + combinations_3:
    accuracy = majority_vote_accuracy(results_df, combo)
    combo_accuracies[combo] = accuracy

# Find the best combination
best_combination = max(combo_accuracies, key=combo_accuracies.get)
best_accuracy = combo_accuracies[best_combination]

print("\nBest Combination:")
print(best_combination)
print(f"Accuracy: {best_accuracy:.2f}")


In [None]:
# Define and train Symbolic Regression model
sym_reg = SymbolicRegressor(population_size=500, generations=50)
sym_reg.fit(X_train, y_train)
predictions_sym = sym_reg.predict(X_test)
accuracy_sym = accuracy_score(y_test, predictions_sym)
print('Symbolic Regression Accuracy:', accuracy_sym)


In [None]:
# Plot true vs predicted values for each model
plt.figure(figsize=(14, 7))
plt.plot(y_test.values, label='True')
plt.plot(predictions_xgb, label='XGBoost Predictions')
plt.plot(predictions_mlr, label='MLR Predictions')
plt.plot(predictions_sym, label='Symbolic Regression Predictions')
plt.plot(predictions_eml, label='EML Predictions')
plt.legend()
plt.show()

In [None]:
# Function to predict the next 24 days using the XGBoost model
def predict_next_days_xgb(model, X_last_day, num_days=24):
    predictions = []
    current_input = X_last_day.reshape((1, X_last_day.shape[0]))

    for _ in range(num_days):
        prediction_proba = model.predict_proba(current_input)[:, 1]
        prediction = (prediction_proba >= 0.5).astype(int)
        predictions.append(prediction[0])

        # Update current_input with the latest prediction for the next day's input
        current_input = np.roll(current_input, -1)
        current_input[0, -1] = prediction

    return predictions

# Get the last day's data from the training set to start the predictions
X_last_day_xgb = X_train.iloc[-1].values

# Predict the next 24 days using XGBoost model
next_24_days_predictions_xgb = predict_next_days_xgb(model_xgb, X_last_day_xgb, num_days=24)

print('Next 24 days predictions (XGBoost):', next_24_days_predictions_xgb)


In [None]:
#Hypertune parameters

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
import xgboost as xgb
from scipy.stats import uniform, randint

# Define the features and target variable
X = data[['Low', 'High', 'SMA_50', 'SMA_200', 'EMA_21', 'EMA_34', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Daily_Change', 'Daily_Change_Percentage']].shift(1)
y = data['Direction']

# Drop the first row since it will have NaN values due to shifting
X = X.dropna()
y = y.iloc[1:]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'n_estimators': randint(200, 1000, 2000),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(1, 5, 10),
    'subsample': uniform(0.7, 0.3),
    'colsample_bytree': uniform(0.7, 0.3),
    'gamma': uniform(0, 0.5)
}

# Initialize the XGBoost model
model_xgb = xgb.XGBClassifier(objective='binary:logistic')

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    model_xgb, param_distributions=param_grid, n_iter=100,
    scoring='accuracy', n_jobs=-1, cv=5, verbose=2, random_state=42
)

# Fit the model with the random search
random_search.fit(X_train, y_train)

# Get the best parameters
best_params = random_search.best_params_
print("Best parameters found: ", best_params)

# Train the XGBoost model with the best parameters
best_model_xgb = random_search.best_estimator_

# Evaluate the model
predictions_xgb_proba = best_model_xgb.predict_proba(X_test)[:, 1]
predictions_xgb = (predictions_xgb_proba >= 0.5).astype(int)

# Calculate accuracy
accuracy_xgb = accuracy_score(y_test, predictions_xgb)
print('XGBoost Accuracy with best parameters:', accuracy_xgb)





In [None]:
# Save results to an Excel file
results_df_xgb = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions_xgb
})

# Define the file path
file_path_xgb = 'xgboost_predictions_tuned.xlsx'

# Save to Excel
results_df_xgb.to_excel(file_path_xgb, index=False)
print(f'Results saved to {file_path_xgb}')

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Define the parameter grid
param_grid_xgb = {
    'n_estimators': [100, 200, 300,400,500,1000,2000],
    'learning_rate': [0.01, 0.1,1.0],
    'max_depth': [1,3, 5, 7,9,11]
}

# Create a GridSearchCV object
grid_xgb = GridSearchCV(estimator=xgb.XGBRegressor(objective='reg:squarederror'), param_grid=param_grid_xgb, scoring='neg_mean_squared_error', cv=3)

# Fit the grid search to the data
grid_result_xgb = grid_xgb.fit(X_train_xgb, y_train)

# Summarize results
print(f"Best: {grid_result_xgb.best_score_} using {grid_result_xgb.best_params_}")


# The combined models Class n Regress

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from xgboost import XGBClassifier, XGBRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv1D, MaxPooling1D, Flatten
import matplotlib.pyplot as plt

# Assuming you have your dataset in a CSV file named 'Bittrex_BTCUSDT_d_sorted.csv'
data = pd.read_excel('GBPUSD_M30_2019_01_2024_06_sorted_exc.xlsx')

data.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Spread
0,2019.01.02,00:00:00,1.27459,1.27459,1.27392,1.27392,3,76
1,2019.01.02,00:30:00,1.27392,1.2741,1.27319,1.27409,43,60
2,2019.01.02,01:00:00,1.2741,1.27537,1.27385,1.27475,772,13
3,2019.01.02,01:30:00,1.27475,1.27528,1.27464,1.27506,307,10
4,2019.01.02,02:00:00,1.27507,1.27508,1.27452,1.27468,1053,5


In [13]:
# Convert 'Date' column to datetime format
data['Date'] = pd.to_datetime(data['Date'])

# Sort data by Date
data = data.sort_values('Date')

# Reset index
data.reset_index(drop=True, inplace=True)

# Calculate daily change and percentage change
data['Daily_Change'] = data['Close'].diff()
data['Daily_Change_Percentage'] = data['Daily_Change'] / data['Close'].shift(1) * 100

# Drop the first row with NaN values from the shift operation
data.dropna(inplace=True)

# Add direction column
data['Direction'] = data['Daily_Change'].apply(lambda x: 1 if x > 0 else 0)

# Create additional features based on technical indicators
def add_indicators(df):
    df['SMA_21'] = df['Close'].rolling(window=21).mean()
    df['SMA_55'] = df['Close'].rolling(window=55).mean()
    df['EMA_13'] = df['Close'].ewm(span=34, adjust=False).mean()
    df['EMA_89'] = df['Close'].ewm(span=89, adjust=False).mean()
    df['RSI'] = compute_rsi(df['Close'], 14)
    df['MACD'], df['MACD_Signal'], df['MACD_Hist'] = compute_macd(df['Close'])
    df['Bollinger_Upper'], df['Bollinger_Lower'] = compute_bollinger_bands(df['Close'])
    df['ATR'] = compute_atr(df['High'], df['Low'], df['Close'], 14)
    df['Stochastic_K'], df['Stochastic_D'] = compute_stochastic(df['High'], df['Low'], df['Close'])
    df['Williams_%R'] = compute_williams_r(df['High'], df['Low'], df['Close'])
    df['PSAR'] = psar(df['High'], df['Low'])
    df['CCI'] = compute_cci(df['High'], df['Low'], df['Close'], 20)
    ichimoku = compute_ichimoku(df['High'], df['Low'], df['Close'])
    df['Ichimoku_Conversion_Line'] = ichimoku['Conversion_Line']
    df['Ichimoku_Base_Line'] = ichimoku['Base_Line']
    df['Ichimoku_Leading_Span_A'] = ichimoku['Leading_Span_A']
    df['Ichimoku_Leading_Span_B'] = ichimoku['Leading_Span_B']
    df['Ichimoku_Lagging_Span'] = ichimoku['Lagging_Span']
    df['VWAP'] = compute_vwap(df['Close'], df['Volume'])
    #df['OBV'] = compute_obv(df['Close'], df['Volume'])
    df['CMF'] = compute_cmf(df['High'], df['Low'], df['Close'], df['Volume'])
    df['TSI'] = compute_tsi(df['Close'])
    return df

def compute_rsi(series, period):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def compute_macd(series, short_period=12, long_period=26, signal_period=9):
    short_ema = series.ewm(span=short_period, adjust=False).mean()
    long_ema = series.ewm(span=long_period, adjust=False).mean()
    macd = short_ema - long_ema
    signal = macd.ewm(span=signal_period, adjust=False).mean()
    hist = macd - signal
    return macd, signal, hist

def compute_bollinger_bands(series, window=21, no_of_std=2):
    rolling_mean = series.rolling(window).mean()
    rolling_std = series.rolling(window).std()
    upper_band = rolling_mean + (rolling_std * no_of_std)
    lower_band = rolling_mean - (rolling_std * no_of_std)
    return upper_band, lower_band

def compute_atr(high, low, close, period):
    tr1 = high - low
    tr2 = abs(high - close.shift(1))
    tr3 = abs(low - close.shift(1))
    true_range = pd.DataFrame({'TR1': tr1, 'TR2': tr2, 'TR3': tr3}).max(axis=1)
    return true_range.rolling(window=period).mean()

def compute_stochastic(high, low, close, k_period=14, d_period=3):
    low_min = low.rolling(window=k_period).min()
    high_max = high.rolling(window=k_period).max()
    stoch_k = 100 * ((close - low_min) / (high_max - low_min))
    stoch_d = stoch_k.rolling(window=d_period).mean()
    return stoch_k, stoch_d

def compute_williams_r(high, low, close, period=14):
    highest_high = high.rolling(window=period).max()
    lowest_low = low.rolling(window=period).min()
    williams_r = (highest_high - close) / (highest_high - lowest_low) * -100
    return williams_r
    
def psar(high, low):
    psar = [0] * len(high)
    psar[0] = low.iloc[0]  # Use .iloc instead of direct indexing
    acceleration = 0.02
    max_acceleration = 0.2
    for i in range(1, len(high)):
        if high.iloc[i] > psar[i-1]:  # Use .iloc instead of direct indexing
            psar[i] = psar[i-1] + acceleration * (high.iloc[i] - psar[i-1])  # Use .iloc instead of direct indexing
        else:
            psar[i] = psar[i-1] - acceleration * (psar[i-1] - low.iloc[i])  # Use .iloc instead of direct indexing
        if psar[i] > high.iloc[i]:  # Use .iloc instead of direct indexing
            psar[i] = high.iloc[i]
        elif psar[i] < low.iloc[i]:  # Use .iloc instead of direct indexing
            psar[i] = low.iloc[i]
        acceleration = min(max_acceleration, acceleration + 0.02)
    return psar
    

def compute_cci(high, low, close, period=20):
    tp = (high + low + close) / 3
    tp_sma = tp.rolling(window=period).mean()
    mad = tp.rolling(window=period).apply(lambda x: np.mean(np.abs(x - np.mean(x))))
    cci = (tp - tp_sma) / (0.015 * mad)
    return cci

def compute_ichimoku(high, low, close):
    nine_period_high = high.rolling(window=9).max()
    nine_period_low = low.rolling(window=9).min()
    period26_high = high.rolling(window=26).max()
    period26_low = low.rolling(window=26).min()
    period52_high = high.rolling(window=52).max()
    period52_low = low.rolling(window=52).min()
    ichimoku_cloud = {
        'Conversion_Line': (nine_period_high + nine_period_low) / 2,
        'Base_Line': (period26_high + period26_low) / 2,
        'Leading_Span_A': ((nine_period_high + nine_period_low) / 2 + (period26_high + period26_low) / 2) / 2,
        'Leading_Span_B': (period52_high + period52_low) / 2,
        'Lagging_Span': close.shift(-26)
    }
    return ichimoku_cloud

def compute_vwap(close, volume):
    return (close * volume).cumsum() / volume.cumsum()

def compute_obv(close, volume):
    obv = volume.copy()
    obv[1:] = np.where(close.iloc[1:] > close.iloc[:-1], volume.iloc[1:], np.where(close.iloc[1:] < close.iloc[:-1], -volume.iloc[1:], 0))
    return obv.cumsum()

def compute_cmf(high, low, close, volume, period=20):
    mfv = ((close - low) - (high - close)) / (high - low) * volume
    cmf = mfv.rolling(window=period).sum() / volume.rolling(window=period).sum()
    return cmf

def compute_tsi(close, r=25, s=13):
    m25 = close.diff(1)
    abs_m25 = abs(m25)
    m25s = m25.ewm(span=r, adjust=False).mean()
    abs_m25s = abs_m25.ewm(span=r, adjust=False).mean()
    m25s = m25s.ewm(span=s, adjust=False).mean()
    abs_m25s = abs_m25s.ewm(span=s, adjust=False).mean()
    tsi = m25s / abs_m25s * 100
    return tsi
    
# Shift the daily change to get the next day's change as the action for the current day
data['Next_Day_Daily_Change'] = data['Daily_Change'].shift(-1)

# Drop the last row with NaN values in 'Next_Day_Daily_Change'
data.dropna(inplace=True)

# Add action column based on the next day's daily change
def determine_action(daily_change):
    if daily_change > 0:
        return 1
    elif daily_change < 0:
        return 0
    else:
        return 2

data['Action'] = data['Next_Day_Daily_Change'].apply(determine_action)
    
# Apply the indicators
data = add_indicators(data)

# Drop rows with NaN values created by rolling windows
data.dropna(inplace=True)

# Convert PSAR to binary values based on Close prices
data['PSAR_Binary'] = data.apply(lambda row: 1 if row['PSAR'] > row['Close'] else 0, axis=1)

data.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,Spread,Daily_Change,Daily_Change_Percentage,...,Ichimoku_Conversion_Line,Ichimoku_Base_Line,Ichimoku_Leading_Span_A,Ichimoku_Leading_Span_B,Ichimoku_Lagging_Span,VWAP,CMF,TSI,PSAR_Binary,Next_Day_Daily_Change
55,2019-01-08,15:30:00,1.27424,1.27445,1.27272,1.27343,2608,6,-0.00129,-0.101199,...,1.274675,1.27539,1.275033,1.27539,1.27421,1.27533,-0.15291,-8.615436,1,0.0008
56,2019-01-08,15:00:00,1.27472,1.27544,1.27404,1.27423,2604,0,0.0008,0.062822,...,1.274675,1.27539,1.275033,1.27539,1.27348,1.275288,-0.201882,-8.272316,1,0.00151
57,2019-01-08,12:30:00,1.27542,1.27645,1.27512,1.27574,2305,5,0.00151,0.118503,...,1.274675,1.27539,1.275033,1.27539,1.27382,1.275303,-0.173135,-6.607499,0,0.00133
58,2019-01-08,14:00:00,1.27744,1.27778,1.27703,1.27707,1492,6,0.00133,0.104253,...,1.274675,1.27539,1.275033,1.27539,1.27392,1.275339,-0.212891,-4.128372,0,0.00037
59,2019-01-08,13:30:00,1.27534,1.27793,1.27507,1.27744,2473,7,0.00037,0.028973,...,1.274515,1.27539,1.274952,1.27539,1.27388,1.275408,-0.12462,-1.872221,0,-0.0021


In [14]:
# Define features and target, shift the features by 1 day
X = data[['Open', 'High', 'Low', 'Close', 'Volume', 'SMA_21', 'SMA_55', 'EMA_13', 'EMA_89', 'RSI', 'MACD', 
            'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Stochastic_K', 
            'Stochastic_D', 'Williams_%R', 'CCI', 'Ichimoku_Conversion_Line', 'Ichimoku_Base_Line', 
            'Ichimoku_Leading_Span_A', 'Ichimoku_Leading_Span_B', 'Ichimoku_Lagging_Span', 'PSAR', 
            'VWAP', 'CMF', 'TSI', 'Daily_Change_Percentage', 'PSAR_Binary', 'Next_Day_Daily_Change']].shift(1)
y_class = data['Action']
y_reg = data['Daily_Change']

# Drop the rows with NaN values created by shifting
X.dropna(inplace=True)
y_class = y_class.iloc[1:]
y_reg = y_reg.iloc[1:]

#X.fillna(0, inplace=True)

#y_class = y_class.iloc[X.index]
#y_reg = y_reg.iloc[X.index]

In [15]:
X.head()

Unnamed: 0,Open,High,Low,Close,Volume,SMA_21,SMA_55,EMA_13,EMA_89,RSI,...,Ichimoku_Leading_Span_A,Ichimoku_Leading_Span_B,Ichimoku_Lagging_Span,PSAR,VWAP,CMF,TSI,Daily_Change_Percentage,PSAR_Binary,Next_Day_Daily_Change
56,1.27424,1.27445,1.27272,1.27343,2608.0,1.27588,1.275302,1.275438,1.275297,40.567854,...,1.275033,1.27539,1.27421,1.274042,1.27533,-0.15291,-8.615436,-0.101199,1.0,0.0008
57,1.27472,1.27544,1.27404,1.27423,2604.0,1.27576,1.27529,1.275369,1.275273,44.439095,...,1.275033,1.27539,1.27348,1.274322,1.275288,-0.201882,-8.272316,0.062822,1.0,0.00151
58,1.27542,1.27645,1.27512,1.27574,2305.0,1.275625,1.275312,1.27539,1.275283,49.676226,...,1.275033,1.27539,1.27382,1.27512,1.275303,-0.173135,-6.607499,0.118503,0.0,0.00133
59,1.27744,1.27778,1.27703,1.27707,1492.0,1.275649,1.275365,1.275486,1.275323,52.592593,...,1.275033,1.27539,1.27392,1.27703,1.275339,-0.212891,-4.128372,0.104253,0.0,0.00037
60,1.27534,1.27793,1.27507,1.27744,2473.0,1.275713,1.275407,1.275598,1.27537,48.896195,...,1.274952,1.27539,1.27388,1.27721,1.275408,-0.12462,-1.872221,0.028973,0.0,-0.0021


In [16]:
y_class.head()

56    1
57    1
58    1
59    0
60    0
Name: Action, dtype: int64

In [17]:
# Split data
X_train, X_test, y_train_class, y_test_class = train_test_split(X, y_class, test_size=0.1, shuffle=False)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X, y_reg, test_size=0.1, shuffle=False)

# Train classifiers and get their predictions
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, solver='liblinear'),
    'Decision Tree': DecisionTreeClassifier(max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=500, max_depth=5, min_samples_split=10, min_samples_leaf=5, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=2000, learning_rate=0.01, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', random_state=42, eval_metric='logloss'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, min_samples_split=10, min_samples_leaf=5, random_state=42)
}
#CatBoostClassifier: AUC Score=0.511 AdaBoostClassifier

class_preds_train = pd.DataFrame(index=X_train.index)
class_preds_test = pd.DataFrame(index=X_test.index)

for name, clf in classifiers.items():
    clf.fit(X_train, y_train_class)
    class_preds_train[name] = clf.predict(X_train)
    class_preds_test[name] = clf.predict(X_test)
    
    # Print the accuracy for each classifier
    train_accuracy = accuracy_score(y_train_class, class_preds_train[name])
    test_accuracy = accuracy_score(y_test_class, class_preds_test[name])
    print(f'{name} Train Accuracy: {train_accuracy:.2f}')
    print(f'{name} Test Accuracy: {test_accuracy:.2f}')

Logistic Regression Train Accuracy: 0.58
Logistic Regression Test Accuracy: 0.55
Decision Tree Train Accuracy: 0.65
Decision Tree Test Accuracy: 0.64
Random Forest Train Accuracy: 0.65
Random Forest Test Accuracy: 0.65
XGBoost Train Accuracy: 0.71
XGBoost Test Accuracy: 0.66
Gradient Boosting Train Accuracy: 0.67
Gradient Boosting Test Accuracy: 0.66


In [18]:
# Add the classification predictions as features for regression
X_train_reg = pd.concat([X_train.reset_index(drop=True), class_preds_train.reset_index(drop=True)], axis=1)
X_test_reg = pd.concat([X_test.reset_index(drop=True), class_preds_test.reset_index(drop=True)], axis=1)

# Align the target variable with the training and testing sets
y_train_reg = y_train_reg.iloc[:len(X_train_reg)]
y_test_reg = y_test_reg.iloc[:len(X_test_reg)]

# Reshape data for CNN and LSTM
X_train_seq = np.expand_dims(X_train_reg.values, axis=1)
X_test_seq = np.expand_dims(X_test_reg.values, axis=1)

# Define and train traditional regression models
regressors = {
    'Linear Regression': XGBRegressor(n_estimators=1000, max_depth=5, learning_rate=0.01),
    'XGBoost': XGBRegressor(n_estimators=2000, max_depth=1, learning_rate=0.01),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=500, max_depth=5, learning_rate=0.01)
}

reg_metrics = {}
confidence_scores = []

for name, reg in regressors.items():
    reg.fit(X_train_reg, y_train_reg)
    y_pred_reg = reg.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    reg_metrics[name] = mse
    
    # Calculate confidence score using ensemble predictions
    y_pred_ensemble = []
    for i in range(10):  # Ensemble size
        reg.fit(X_train_reg, y_train_reg)
        y_pred_ensemble.append(reg.predict(X_test_reg))
    y_pred_ensemble = np.array(y_pred_ensemble)
    mean_preds = y_pred_ensemble.mean(axis=0)
    std_preds = y_pred_ensemble.std(axis=0)
    confidence_scores.append(std_preds)

# Print regression metrics
for name, mse in reg_metrics.items():
    print(f'{name}: MSE = {mse}')

# Save results to an Excel file
results_df_xgb = pd.DataFrame({
    'True Values': y_test_reg.values,
    'Predictions': y_pred_reg,
    'Confidence Scores': np.mean(confidence_scores, axis=0)  # Average confidence scores across models
})

# Define the file path
file_path_xgb = 'combo_models4thi.xlsx'

# Save to Excel
results_df_xgb.to_excel(file_path_xgb, index=True)
print(f'Results saved to {file_path_xgb}')


Linear Regression: MSE = 1.2144696656681133e-08
XGBoost: MSE = 1.5696083607647845e-08
Gradient Boosting: MSE = 1.8581104593459692e-10
Results saved to combo_models4thi.xlsx


In [None]:
# Define CNN model
cnn_model = Sequential([
    Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1)
])
cnn_model.compile(optimizer='adam', loss='mse')

# Define LSTM model
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(X_train_seq.shape[1], X_train_seq.shape[2])),
    Dense(1)
])
lstm_model.compile(optimizer='adam', loss='mse')

# Train CNN and LSTM models
cnn_model.fit(X_train_seq, y_train_reg, epochs=50, verbose=1)
lstm_model.fit(X_train_seq, y_train_reg, epochs=50, verbose=1)

# Predict and evaluate CNN and LSTM models
y_pred_cnn = cnn_model.predict(X_test_seq)
y_pred_lstm = lstm_model.predict(X_test_seq)

cnn_mse = mean_squared_error(y_test_reg, y_pred_cnn)
lstm_mse = mean_squared_error(y_test_reg, y_pred_lstm)

print(f'CNN: MSE = {cnn_mse}')
print(f'LSTM: MSE = {lstm_mse}')

# Plot predictions
plt.figure(figsize=(10, 5))
plt.plot(y_test_reg.index, y_test_reg, label='True Daily Change')
plt.plot(y_test_reg.index, regressors['XGBoost'].predict(X_test_reg), label='XGBoost Predicted Daily Change')
plt.plot(y_test_reg.index, y_pred_cnn, label='CNN Predicted Daily Change')
plt.plot(y_test_reg.index, y_pred_lstm, label='LSTM Predicted Daily Change')
plt.xlabel('Date')
plt.ylabel('Daily Change')
plt.title('True vs Predicted Daily Change')
plt.legend()
plt.show()

In [None]:
# Prepare data for LSTM
def create_sequences(data, seq_length):
    X = []
    y = []
    for i in range(len(data) - seq_length):
        X.append(data.iloc[i:(i + seq_length)].values)
        y.append(data.iloc[i + seq_length])
    return np.array(X), np.array(y)

seq_length = 10
X_lstm, y_lstm = create_sequences(data[X.columns.tolist() + ['Direction']], seq_length)
y_lstm_reg = data['Daily_Change_Percentage'][seq_length:]

X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm, y_train_lstm_reg, y_test_lstm_reg = train_test_split(
    X_lstm, y_lstm, y_lstm_reg, test_size=0.2, shuffle=False)

# LSTM Model for Classification
model_lstm = Sequential([
    LSTM(50, return_sequences=True, input_shape=(seq_length, X_train_lstm.shape[2])),
    LSTM(50),
    Dense(1, activation='sigmoid')
])
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history_lstm = model_lstm.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_split=0.2)
lstm_accuracy = model_lstm.evaluate(X_test_lstm, y_test_lstm)[1]
print(f'LSTM Accuracy: {lstm_accuracy:.2f}')

# CNN Model for Classification
model_cnn = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(seq_length, X_train_lstm.shape[2])),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid')
])
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
history_cnn = model_cnn.fit(X_train_lstm, y_train_lstm, epochs=10, batch_size=32, validation_split=0.2)
cnn_accuracy = model_cnn.evaluate(X_test_lstm, y_test_lstm)[1]
print(f'CNN Accuracy: {cnn_accuracy:.2f}')

# Visualization of Deep Learning Models Training History
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(history_lstm.history['accuracy'], label='LSTM Train Accuracy')
plt.plot(history_lstm.history['val_accuracy'], label='LSTM Val Accuracy')
plt.title('LSTM Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history_cnn.history['accuracy'], label='CNN Train Accuracy')
plt.plot(history_cnn.history['val_accuracy'], label='CNN Val Accuracy')
plt.title('CNN Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.show()

In [None]:
## Step 7: Fine-tune the Hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor

def build_model(units=50, optimizer='adam'):
    model = Sequential()
    model.add(LSTM(units=units, return_sequences=True, input_shape=(1, X_train.shape[1])))
    model.add(LSTM(units=units, return_sequences=False))
    model.add(Dense(units=1))
    model.compile(optimizer=optimizer, loss='mean_squared_error')
    return model

model = KerasRegressor(build_fn=build_model, epochs=50, batch_size=32, verbose=2)
param_grid = {
    'units': [50, 100],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [16, 32]
}

grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')
grid_result = grid.fit(X_train_lstm, y_train)

# Summarize results
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")


In [None]:
## Step 8: Deploy and Monitor the Model

In [None]:
# Save the Model
model_lstm.save('best_lstm_model.h5')


In [None]:
# Load and Use the Model for Predictions

from keras.models import load_model

# Load the saved model
model = load_model('best_lstm_model.h5')

# Function to make daily predictions
def make_prediction(new_data):
    new_data = np.reshape(new_data.values, (new_data.shape[0], 1, new_data.shape[1]))
    prediction = model.predict(new_data)
    return prediction

# Example: Predicting the next day's change
new_data = X_test.iloc[-1:]  # Replace with new daily data
predicted_change = make_prediction(new_data)
print(f"Predicted Daily Change Percentage: {predicted_change[0][0]}")


In [None]:
# Continuously Monitor and Update the Model
# You can set up a script to periodically fetch new data, update your dataset, retrain the model, and save the updated model.

In [None]:
# Function to retrain the model
def retrain_model(new_data):
    global data, X, y, X_train, X_test, y_train, y_test, X_train_lstm, X_test_lstm
    
    # Append new data to the existing dataset
    data = data.append(new_data, ignore_index=True)
    data.dropna(inplace=True)  # Ensure no NaN values
    
    # Update features and target
    X = data[['SMA_5', 'SMA_10', 'EMA_5', 'EMA_10', 'RSI', 'MACD', 'MACD_Signal', 'MACD_Hist', 'Bollinger_Upper', 'Bollinger_Lower', 'ATR', 'Volume_BTC', 'Volume_USDT']]
    y = data['Daily_Change_Percentage']
    
    # Split the updated data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Reshape input data for LSTM
    X_train_lstm = np.reshape(X_train.values, (X_train.shape[0], 1, X_train.shape[1]))
    X_test_lstm = np.reshape(X_test.values, (X_test.shape[0], 1, X_test.shape[1]))
    
    # Retrain the model
    model_lstm.fit(X_train_lstm, y_train, epochs=50, batch_size=32, validation_data=(X_test_lstm, y_test), verbose=2)
    
    # Save the updated model
    model_lstm.save('best_lstm_model.h5')

# Example: Retraining with new daily data
new_daily_data = {
    'Unix': 1.5031E+12,
    'Date': '8/20/2017',
    'Symbol': 'BTCUSDT',
    'Open': 4139.98,
    'High': 4200,
    'Low': 4100,
    'Close': 4150,
    'Volume BTC': 400,
    'Volume USDT': 1600000,
    'tradecount': 2200,
    'Daily_Change': 10,
    'Daily_Change_Percentage': 0.24
}
new_daily_data = pd.DataFrame([new_daily_data])
retrain_model(new_daily_data)
