In [1]:
import numpy as np
import pandas as pd
import datetime
import warnings
import yfinance as yf
from sklearn.metrics import (fbeta_score, accuracy_score, f1_score, 
                             confusion_matrix, balanced_accuracy_score, recall_score, matthews_corrcoef, precision_score)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, StratifiedKFold
from xgboost import XGBClassifier
import pickle
import gc
pd.options.mode.chained_assignment = None 
warnings.filterwarnings(
    "ignore",
    message="A worker stopped while some jobs were given to the executor",
    category=UserWarning,
    module="joblib.externals.loky.process_executor"
)

from sklearn.inspection import permutation_importance
import math
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# For importing universal scripts
import sys
import os
# Go up two levels from the subfolder
sys.path.append(os.path.abspath("../.."))
from indicators_returns import final_df #Universal script for indicator set and actuals

In [3]:
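# Helpers for recording, tuning, saving and printing model results.
# NOTE: record_validation_metrics relies on a notebook-level `validation_results` list, which is not created in this cell.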
def record_validation_metrics(metrics, arch, horizon, model_name, ticker='QQQ', results=None):
    """
    Flatten a per-threshold metrics dictionary into row dictionaries and
    append them to an accumulator list.

    Parameters
    ----------
    metrics : dict
        Maps a threshold to a dict of metric name -> value.
    arch : value stored under the 'arch' key of every row.
    horizon : value stored under the 'horizon' key of every row.
    model_name : value stored under the 'model' key of every row.
    ticker : value stored under the 'ticker' key of every row.
        Defaults to 'QQQ', the value that was previously hard-coded.
    results : list, optional
        List the rows are appended to. When omitted, the notebook-level
        ``validation_results`` list is used, which must already exist
        (a NameError is raised otherwise).

    Returns
    -------
    None. The accumulator list is modified in place.
    """
    if results is None:
        # Fall back to the notebook-level accumulator for existing callers.
        results = validation_results

    for t, vals in metrics.items():
        row = {
            'arch': arch,
            'threshold': t,
            'ticker': ticker,
            'horizon': horizon,
            'model': model_name,
        }
        # Metric values are merged last, so a metric named like one of the
        # identifying keys above would overwrite it.
        row.update(vals)
        results.append(row)

def optimize_tests(df_indicators, df_predict, thresh, opt, depth, scale_pos_weight, min_child_weight, r, name, arch, date, return_metrics=False, split_random_state=None):
    """
    Tune an XGBoost classifier with a randomized hyper-parameter search and
    score the best estimator at several probability-confidence thresholds
    on a held-out split. The metrics are printed via ``print_metrics``.

    Parameters
    ----------
    df_indicators : feature table, used as X in ``train_test_split``.
    df_predict : labels, used as y in ``train_test_split``. The metrics are
        computed with ``pos_label`` 0 and 1, so binary 0/1 labels are expected.
    thresh : iterable of probability cut-offs. For each cut-off only the
        test rows whose class-0 or class-1 probability exceeds it are scored.
    opt : forwarded unchanged to ``RandomizedSearchCV(scoring=...)``.
    depth, min_child_weight, r, name : accepted for call compatibility but
        not used by the active code (the search grids below hard-code their
        own ``max_depth`` / ``min_child_weight`` candidates).
    scale_pos_weight : the single ``scale_pos_weight`` candidate in the grid.
    arch : 'shallow' or 'moderate' select the matching grid; any other value
        selects the deep grid.
    date : when equal to 'lag', the first 500 rows (by position) are removed
        before a 90/10 split and then appended to the test set; otherwise a
        plain 70/30 split is used.
        NOTE(review): this presumably keeps the most recent observations out
        of training, which depends on the row order of the inputs - confirm.
    return_metrics : selects the return shape (see Returns).
    split_random_state : seed forwarded to ``train_test_split``. The default
        ``None`` keeps the original behaviour (a different split on each
        call); pass an int for a reproducible split.

    Returns
    -------
    ``(metrics, best_model)`` when ``return_metrics`` is true, otherwise
    ``(best_model, X_train, X_test, y_train, y_test)``.
    """

    # Row used when no test sample is confident enough at a threshold. Its
    # keys match the populated rows so downstream tables get uniform columns.
    empty_metrics = {
        'PosF1': 0, 'NegF1': 0,
        "PosID'd": 0, 'PosPrec': 0,
        "NegID'd": 0, 'NegPrec': 0,
        'PosCnt': 0, 'NegCnt': 0,
    }

    def score_at_threshold(y_true, y_prob, cutoff):
        """Metrics over the test rows whose winning-class probability exceeds ``cutoff``."""
        p_neg, p_pos = y_prob[:, 0], y_prob[:, 1]
        confident = (p_pos > cutoff) | (p_neg > cutoff)
        if not confident.any():
            return dict(empty_metrics)

        y_true_sel = y_true[confident]
        # Predict the more probable class. For cut-offs >= 0.5 this equals
        # "class 1 iff p_pos > cutoff"; for cut-offs < 0.5 it avoids forcing
        # class 0 on rows where class 1 is actually more likely.
        y_pred_sel = (p_pos > p_neg)[confident].astype(int)

        # zero_division=0 returns 0 for undefined scores instead of warning;
        # using f1_score for the negative class avoids a manual division by
        # (recall + precision), which can be zero.
        return {
            'PosF1': round(f1_score(y_true_sel, y_pred_sel, pos_label=1, zero_division=0), 3),
            'NegF1': round(f1_score(y_true_sel, y_pred_sel, pos_label=0, zero_division=0), 3),
            "PosID'd": round(recall_score(y_true_sel, y_pred_sel, pos_label=1, zero_division=0), 3),  # % of total positives identified
            'PosPrec': round(precision_score(y_true_sel, y_pred_sel, pos_label=1, zero_division=0), 3),  # % of positive predictions that were actually positive
            "NegID'd": round(recall_score(y_true_sel, y_pred_sel, pos_label=0, zero_division=0), 3),  # % of total negatives identified
            'NegPrec': round(precision_score(y_true_sel, y_pred_sel, pos_label=0, zero_division=0), 3),  # % of negative predictions that were actually negative
            'PosCnt': int((y_pred_sel == 1).sum()),
            'NegCnt': int((y_pred_sel == 0).sum()),
        }

    def train_and_evaluate(model, param_grid, X_train, X_test, y_train, y_test, opt, thresh):
        """Run the randomized search on the training split and score the best model on the test split."""
        stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        random_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            scoring=opt,
            cv=stratified_kfold,
            n_jobs=-1,
            n_iter=50,  # number of sampled hyper-parameter combinations
            random_state=42  # reproducible candidate sampling
        )
        random_search.fit(X_train, y_train)
        best_model = random_search.best_estimator_

        # Column 0 holds the class-0 probability, column 1 the class-1 probability.
        y_prob = best_model.predict_proba(X_test)
        metrics = {t: score_at_threshold(y_test, y_prob, t) for t in thresh}

        # Release the search object (it holds every fitted candidate's results).
        del random_search
        gc.collect()

        return metrics, best_model

    # Candidates shared by every architecture.
    xgboost_hyperparameters = {
        'scale_pos_weight': [scale_pos_weight],
        'learning_rate': [0.01],
        'alpha': [0.1, 1],
        'lambda': [1, 2, 5],
    }

    if arch == 'shallow':
        xgboost_hyperparameters.update({
            'n_estimators': [200, 300],
            'max_depth': [5, 7],
            'subsample': [0.65],
            'colsample_bytree': [0.6],
            'gamma': [0.2, 0.4],
            'min_child_weight': [12, 15],
        })
    elif arch == 'moderate':
        xgboost_hyperparameters.update({
            'n_estimators': [300, 400],
            'max_depth': [7, 9],
            'subsample': [0.65, .75],
            'colsample_bytree': [0.6, 0.7],
            'gamma': [0.2, 0.3],
            'min_child_weight': [9, 11],
        })
    else:
        # Deep: also the fallback for any unrecognised ``arch`` value.
        xgboost_hyperparameters.update({
            'n_estimators': [300, 400, 500],
            'max_depth': [8, 10, 12],
            'subsample': [0.75, .85],
            'colsample_bytree': [0.75, 0.85],
            'gamma': [0.1, 0.2],
            'min_child_weight': [5, 7],
        })

    if date == 'lag':
        holdout_rows = 500
        # Keep the first rows out of training entirely ...
        df_ind_rec = df_indicators.iloc[:holdout_rows].copy()
        df_pred_rec = df_predict.iloc[:holdout_rows].copy()
        df_indicators = df_indicators.iloc[holdout_rows:]
        df_predict = df_predict.iloc[holdout_rows:]

        X_train, X_test, y_train, y_test = train_test_split(
            df_indicators, df_predict, test_size=0.1, random_state=split_random_state, shuffle=True
        )
        # ... and evaluate on them together with the random 10% test split.
        X_test = pd.concat([X_test, df_ind_rec], axis=0)
        y_test = pd.concat([y_test, df_pred_rec], axis=0)
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            df_indicators, df_predict, test_size=0.3, random_state=split_random_state, shuffle=True
        )

    # Train and evaluate models
    xg_metrics, best_xg_model = train_and_evaluate(
        XGBClassifier(random_state=42), xgboost_hyperparameters,
        X_train, X_test, y_train, y_test, opt, thresh
    )
    #record_validation_metrics(xg_metrics, arch=arch, horizon=r, model_name=name)
    #savearch(best_xg_model, r, name, arch)
    print_metrics(xg_metrics)

    if return_metrics:
        return xg_metrics, best_xg_model
    return best_xg_model, X_train, X_test, y_train, y_test
    
def savenew(best_model, days, model, arch, ticker, date, lb):
    """
    Pickle ``best_model`` into ``./New_Models/Ensemble_<ticker>`` (created
    on demand, relative to the current working directory).

    For ticker 'QQQ' the file is named ``<model>_xgboost_<days><arch>_<lb>.pkl``;
    for any other ticker the ``_<lb>`` suffix is left out. ``date`` does not
    influence the file name - it only appears in the confirmation message
    printed at the end.
    """
    target_dir = f'./New_Models/Ensemble_{ticker}'
    os.makedirs(target_dir, exist_ok=True)

    if ticker == 'QQQ':
        # Same file name whether or not ``date`` is 'current'.
        filename = f'{model}_xgboost_{days}{arch}_{lb}.pkl'
    else:
        filename = ''.join([str(model), '_xgboost_', str(days), str(arch), '.pkl'])

    with open(os.path.join(target_dir, filename), 'wb') as handle:
        pickle.dump(best_model, handle)

    message_parts = ['Models Saved for ', ticker, '_', str(model), ' ', str(days), '_', str(date), ' Returns']
    print(''.join(message_parts))

def print_metrics(metrics):
    """Print one indented line per threshold with that threshold's metric dictionary."""
    for cutoff in metrics:
        scores = metrics[cutoff]
        print(f"  Threshold {cutoff}: {scores}")

In [5]:
# Run configuration; all three values are handed to final_df below.
ticker, lb = 'QQQ', 10
returns = [5, 10, 20]  # presumably the return horizons to build targets for - confirm against indicators_returns
# final_df comes from the shared indicators_returns script imported in the first cell.
df = final_df(ticker, returns, lb)