In [None]:
import pandas as pd
import pickle
import joblib
import os
import tempfile
from sklearn.preprocessing import MinMaxScaler

import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.svm import SVR

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
def load_data(path):
    """Read the CSV at ``path`` and split it into feature and target frames.

    The file is read with its ``date`` column as the index, and that index is
    then converted with ``pd.to_datetime``.

    Args:
        path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        A ``(features, target)`` tuple of DataFrames that share the datetime
        index. ``features`` holds the ten model input columns and ``target``
        is a one-column frame holding ``target``.
    """
    feature_columns = [
        'vwap', 'dema', 'tema', 'williams', 'rsi',
        'minus_10_price', 'minus_5_price', 'minus_4_price',
        'minus_3_price', 'minus_2_price',
    ]

    frame = pd.read_csv(path, index_col='date')
    frame.index = pd.to_datetime(frame.index)

    # Double brackets keep the target as a one-column DataFrame, not a Series.
    return frame[feature_columns], frame[['target']]

def normalize_data(train_features, test_features,
                   scaler_dir="../scalers", filename="alphabet_normalizer.pkl"):
    """Min-max scale both feature frames using a scaler fitted on the training set.

    A ``MinMaxScaler`` is fitted on ``train_features`` only, pickled to
    ``scaler_dir/filename``, and then applied to both frames. Because the
    scaler never sees ``test_features`` while fitting, scaled test values may
    fall outside the [0, 1] range.

    Args:
        train_features: DataFrame the scaler is fitted on.
        test_features: DataFrame transformed with the already-fitted scaler.
        scaler_dir: Directory the fitted scaler is pickled into. It is created
            if it does not exist. Defaults to the original ``../scalers``.
        filename: File name of the pickled scaler inside ``scaler_dir``.

    Returns:
        ``(train_features_df, test_features_df)``: the scaled data as
        DataFrames carrying the same columns and index as the inputs.
    """
    normalizer = MinMaxScaler()
    normalizer.fit(train_features)

    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError before any scaling has happened.
    os.makedirs(scaler_dir, exist_ok=True)
    with open(os.path.join(scaler_dir, filename), "wb") as file:
        pickle.dump(normalizer, file)

    def _scale(frame):
        # transform() returns a bare ndarray; restore the original labels.
        return pd.DataFrame(normalizer.transform(frame), columns=frame.columns, index=frame.index)

    return _scale(train_features), _scale(test_features)


def save_model(model, filename, dir_path):
    """Persist ``model`` to ``dir_path/filename`` with joblib via a verified temp file.

    The model is dumped to a temporary file inside ``dir_path``, loaded back to
    check that the dump is readable, and only then moved over the final path
    with ``os.replace``. Verifying before the move means a failed or corrupt
    dump never overwrites an existing file at the final path.

    Errors are reported with ``print`` rather than raised, matching the
    best-effort behaviour callers already rely on.

    Args:
        model: Object to serialize with ``joblib.dump``.
        filename: Name of the final file inside ``dir_path``.
        dir_path: Target directory; created if it does not exist.

    Returns:
        ``True`` when the model was written and verified, ``False`` when any
        step failed.
    """
    temp_path = None
    try:
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)

        # Create the temp file in the target directory so the later
        # os.replace stays on the same filesystem.
        temp_file = tempfile.NamedTemporaryFile(dir=dir_path, delete=False)
        temp_path = temp_file.name
        temp_file.close()  # Close the handle so joblib can write to the path

        joblib.dump(model, temp_path)

        # Verify the dump is loadable before it replaces anything.
        joblib.load(temp_path)
        print("Model verified successfully")

        final_path = os.path.join(dir_path, filename)
        os.replace(temp_path, final_path)
        print(f"Model saved successfully to {final_path}")
        return True

    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    finally:
        # After a successful replace the temp path no longer exists; on any
        # failure this removes the leftover temporary file.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

In [None]:
def objective(trial):
    """Optuna objective: fit a stacking regressor with sampled hyperparameters.

    Loads the training CSV, makes a chronological 80/20 split
    (``shuffle=False``), scales the features, and fits a ``StackingRegressor``
    whose base models are a linear regression, an XGBoost regressor and an SVR,
    with a linear regression as the meta-model.

    Args:
        trial: Optuna trial used to sample the XGBoost and SVR hyperparameters.

    Returns:
        Root mean squared error of the stacked model on the held-out 20 %.
    """
    # NOTE(review): the CSV is re-read and the scaler re-pickled on every
    # trial; the data does not depend on `trial`, so this is repeated work.
    features, target = load_data('../data/clean/alphabet_training_data.csv')

    # NOTE(review): the final-training cell evaluates on this same hold-out
    # split, so its reported metrics are tuned-on and likely optimistic.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, shuffle=False)
    X_train, X_test = normalize_data(X_train, X_test)

    # Base models. The suggest names below are the keys later read back from
    # trial.params, so they must stay stable.
    lin_reg = LinearRegression()

    xgb_reg = xgb.XGBRegressor(
        n_estimators=trial.suggest_int('xgb_n_estimators', 50, 500),
        max_depth=trial.suggest_int('xgb_max_depth', 3, 9),
        learning_rate=trial.suggest_float('learning_rate', 1e-3, 0.3),
        subsample=trial.suggest_float('subsample', 0.2, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.2, 1.0),
        random_state=42
    )

    svr = SVR(
        C=trial.suggest_float('svr_C', 0.1, 10.0),
        epsilon=trial.suggest_float('svr_epsilon', 0.01, 1.0),
        kernel=trial.suggest_categorical('svr_kernel', ['linear', 'poly', 'rbf'])
    )

    # Meta-model that combines the base-model predictions
    meta_model = LinearRegression()

    stacking_reg = StackingRegressor(
        estimators=[
            ('lin_reg', lin_reg),
            ('xgb', xgb_reg),
            ('svr', svr)
        ],
        final_estimator=meta_model
    )

    # The target is a one-column DataFrame; the estimators expect a 1-D array.
    stacking_reg.fit(X_train, y_train.values.ravel())

    y_pred = stacking_reg.predict(X_test)

    # Previously this was mean_squared_error stored under the name `rmse`.
    # RMSE is a monotonic transform of MSE, so the best trial is unchanged,
    # but the reported study value is now in the units of the target.
    rmse = root_mean_squared_error(y_test, y_pred)

    return rmse

# Suppress Optuna's per-trial logging output
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Create and optimize the study. TPE is Optuna's default single-objective
# sampler; passing it explicitly with a seed makes the sequence of suggested
# hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Stops after 100 trials or 600 seconds, whichever comes first, so the number
# of completed trials can still vary with machine speed.
study.optimize(objective, n_trials=100, timeout=600)

# `trial` is read by later cells to fetch the winning hyperparameters.
trial = study.best_trial
print(f'Best trial: {trial.number}')
print(f'  Value: {trial.value}')
print(f'  Params: {trial.params}')


In [None]:
# Hyperparameters of the best Optuna trial, shown as this cell's output.
# The following cell assigns `best_params` again from the same source.
best_params = trial.params
best_params

In [None]:
# Inputs for the final fit: the training CSV and the tuned hyperparameters.
path = '../data/clean/alphabet_training_data.csv'
best_params = trial.params

def prep_data(path):
    """Load the CSV at ``path`` and return scaled, chronologically split data.

    Args:
        path: CSV location handed to ``load_data``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``: the first 80 % of rows for
        training and the last 20 % for testing (``shuffle=False``), with both
        feature frames passed through ``normalize_data``.
    """
    inputs, labels = load_data(path)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(
        inputs, labels, test_size=0.2, shuffle=False
    )
    train_scaled, test_scaled = normalize_data(train_inputs, test_inputs)
    return train_scaled, test_scaled, train_labels, test_labels

X_train, X_test, y_train, y_test = prep_data(path)

# Base models, configured with the hyperparameters of the best Optuna trial
lin_reg = LinearRegression()

xgb_reg = xgb.XGBRegressor(
    n_estimators=best_params['xgb_n_estimators'],
    max_depth=best_params['xgb_max_depth'],
    learning_rate=best_params['learning_rate'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    random_state=42
)

svr = SVR(
    C=best_params['svr_C'],
    epsilon=best_params['svr_epsilon'],
    kernel=best_params['svr_kernel']
)

# Define meta-model
meta_model = LinearRegression()

# Create the stacking regressor
stacking_reg = StackingRegressor(
    estimators=[
        ('lin_reg', lin_reg),
        ('xgb', xgb_reg),
        ('svr', svr)
    ],
    final_estimator=meta_model
)

# Train the stacking regressor; the target is a one-column DataFrame, so it is
# flattened to the 1-D array the estimators expect.
stacking_reg.fit(X_train, y_train.values.ravel())

# Save the model
model_filename = 'alphabet_stacking_regressor_model.pkl'
dir_path = "../models"
save_model(stacking_reg, model_filename, dir_path)

# Make predictions
y_pred = stacking_reg.predict(X_test)

# Evaluate the model. Each metric is computed once, in sklearn's
# (y_true, y_pred) argument order. `rmse` previously held the MSE; it now
# holds the actual root mean squared error.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = stacking_reg.score(X_test, y_test)

print(f"MSE: {mse: .2f}")
print(f"MAE: {mae: .2f}")
print(f"RMSE: {rmse: .2f}")
print(f"R2 score:  {r2: .2f}")

In [None]:
# Rank the model inputs by the importance scores of the fitted XGBoost base
# estimator. The labels are listed in the same order as the load_data columns.
features = [
    'vwap', 'dema', 'tema', 'williams', 'rsi',
    'minus_10_price', 'minus_5_price', 'minus_4_price',
    'minus_3_price', 'minus_2_price',
]
values = stacking_reg.named_estimators_['xgb'].feature_importances_
pd.DataFrame(data=values, index=features).sort_values(by=0, ascending=False)

In [None]:
# Compare held-out targets with the model's predictions on a shared date axis.
actual = y_test
# predict() returns a bare array; give it the same column name and index as
# y_test so both series line up on the x-axis.
predictions = pd.DataFrame(y_pred, columns=y_test.columns, index=y_test.index)

# Single-panel figure. make_subplots is kept so the "Prices" panel title is
# rendered; the previous throwaway `go.Figure()` assignment was overwritten
# immediately and has been dropped.
fig = make_subplots(rows=1, cols=1, shared_xaxes=False, vertical_spacing=0.1,
                    subplot_titles=("Prices", ""))

fig.add_trace(
    go.Scatter(x=actual.index, y=actual.target, name='actual'),
    row=1, col=1, secondary_y=False)

fig.add_trace(
    go.Scatter(x=predictions.index, y=predictions.target, name='predictions'),
    row=1, col=1, secondary_y=False)

fig.update_layout(paper_bgcolor='#e4ecf6')

fig.show()
fig.write_image("../images/alphabet_pred_actual.png")

In [None]:
# Reload the training CSV for exploration. Unlike load_data, no index column
# is set here, so every CSV column stays a regular column of `df`.
path = '../data/clean/alphabet_training_data.csv'
df = pd.read_csv(path)
df.head()

In [None]:
# Columns for the normalized box plot. Unlike the list in load_data, this one
# also includes 'ratingScore'. Rebinds the name `features` used by earlier cells.
features = ['vwap', 'dema', 'tema', 'williams', 'rsi', 'ratingScore', 'minus_10_price', 'minus_5_price', 'minus_4_price', 'minus_3_price', 'minus_2_price']

In [None]:
# Restrict the frame to the columns listed in `features`.
df_numerical = df[features]

In [None]:
# Min-max scale every selected column over the whole frame (exploration only),
# then rewrap the resulting array with the original row and column labels.
normalizer = MinMaxScaler().fit(df_numerical)
df_norm = pd.DataFrame(
    normalizer.transform(df_numerical),
    columns=df_numerical.columns,
    index=df_numerical.index,
)
df_norm.head()

In [None]:
# One box per normalized column, so spread and outliers can be compared on a
# common scale. update_layout returns the figure, so it can be chained.
fig = px.box(df_norm, y=df_norm.columns).update_layout(paper_bgcolor='#e4ecf6')
fig.show()
fig.write_image("../images/boxplot_normalized.png")