In [1]:
import numpy as np
import pandas as pd
import datetime
import yfinance as yf
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

# For importing universal scripts
import sys
import os
# Go up two levels from the subfolder
sys.path.append(os.path.abspath("../.."))
from indicators_returns import final_df #Universal script for indicator set and actuals

In [3]:
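# Helper functions for model tuning. NOTE: record_validation_metrics appends to a list named `validation_results`, which this cell does not create — define it before calling.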
def record_validation_metrics(metrics, arch, horizon, model_name, ticker='QQQ', results=None):
    """
    Flatten a per-threshold metrics dictionary into rows and store them.

    Parameters
    ----------
    metrics : dict
        Maps each threshold to a dict of metric name -> value.
    arch, horizon, model_name
        Stored in each row under the keys 'arch', 'horizon' and 'model'.
    ticker : str, default 'QQQ'
        Stored in each row under 'ticker'. The default keeps the value that
        was previously hard-coded.
    results : list, optional
        List the rows are appended to. When omitted, rows go to the
        module-level ``validation_results`` list, which must already exist.

    Returns
    -------
    None
    """
    for threshold, vals in metrics.items():
        row = {
            'arch': arch,
            'threshold': threshold,
            'ticker': ticker,
            'horizon': horizon,
            'model': model_name,
        }
        # Metric values are merged last, so they win on a key collision.
        row.update(vals)
        # The global is only looked up when there is a row to store, so an
        # empty `metrics` never touches `validation_results`.
        sink = validation_results if results is None else results
        sink.append(row)

def optimize_tests(df_indicators, df_predict, thresh, opt, depth, scale_pos_weight, min_child_weight, r, name, arch, date, return_metrics=False):
    """
    Tune an XGBoost classifier with a randomized search and score it on a hold-out set.

    Parameters
    ----------
    df_indicators : feature table passed to ``train_test_split``.
    df_predict : labels aligned row-for-row with ``df_indicators``.
    thresh : iterable of probability thresholds; metrics are computed per threshold.
    opt : value forwarded as ``scoring`` to ``RandomizedSearchCV``.
    depth, min_child_weight, r, name : accepted for interface compatibility;
        not used by this function.
    scale_pos_weight : single value placed in every hyperparameter grid.
    arch : 'shallow' or 'moderate' select the matching grid; any other value
        selects the deep grid.
    date : 'lag' holds out the first 500 rows and appends them to a 10% random
        test split of the remaining rows; any other value uses a 70/30 random split.
    return_metrics : when True return ``(metrics, best_model)``; otherwise
        return ``(best_model, X_train, X_test, y_train, y_test)``.

    Notes
    -----
    * The train/test split uses ``random_state=None``, so it differs between runs.
    * The per-threshold metrics are always printed through ``print_metrics``.
    * Relies on StratifiedKFold, RandomizedSearchCV, the sklearn metric
      functions, XGBClassifier, train_test_split and gc being imported
      elsewhere in the notebook (not visible in this cell — verify).
    """

    def train_and_evaluate(model, param_grid, X_train, X_test, y_train, y_test, opt, thresh):
        """Run the randomized search, then score the best estimator at every threshold."""
        metric_keys = ('PosF1', 'NegF1', "PosID'd", 'PosPrec', "NegID'd", 'NegPrec', 'PosCnt', 'NegCnt')

        # Create a Stratified K-Fold object
        stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        # Perform Random Search
        random_search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_grid,
            scoring=opt,
            cv=stratified_kfold,
            n_jobs=-1,
            n_iter=50,  # Number of random parameter samples to try
            random_state=42  # Ensures reproducibility of the search
        )

        random_search.fit(X_train, y_train)
        best_model = random_search.best_estimator_
        # Predict probabilities; column 0 / column 1 are treated as the
        # negative / positive class (assumes labels are 0 and 1 — TODO confirm).
        y_prob = best_model.predict_proba(X_test)

        # Evaluate metrics for each threshold
        metrics = {}
        for t in thresh:
            confident_pos = y_prob[:, 1] > t
            confident_neg = y_prob[:, 0] > t
            y_pred_thresh = confident_pos.astype(int)
            # When t < 0.5 both columns can exceed t; such rows are predicted 0.
            y_pred_thresh[confident_neg] = 0
            # Only rows where one of the classes clears the threshold are scored.
            filtered_indices = confident_pos | confident_neg

            if filtered_indices.any():
                y_test_valid = y_test[filtered_indices]
                y_pred_valid = y_pred_thresh[filtered_indices]
                pos_recall = recall_score(y_test_valid, y_pred_valid, pos_label=1)
                pos_precision = precision_score(y_test_valid, y_pred_valid, pos_label=1)
                neg_recall = recall_score(y_test_valid, y_pred_valid, pos_label=0)
                neg_precision = precision_score(y_test_valid, y_pred_valid, pos_label=0)
                metrics[t] = {
                    'PosF1': round(f1_score(y_test_valid, y_pred_valid), 3),
                    # f1_score is computed from unrounded precision/recall and
                    # handles the zero-denominator case itself.
                    'NegF1': round(f1_score(y_test_valid, y_pred_valid, pos_label=0), 3),
                    "PosID'd": round(pos_recall, 3),  # % of total positives identified
                    'PosPrec': round(pos_precision, 3),  # % of positive predictions that were actually positive
                    "NegID'd": round(neg_recall, 3),  # % of total negatives identified
                    'NegPrec': round(neg_precision, 3),  # % of negative predictions that were actually negative
                    'PosCnt': int((y_pred_valid == 1).sum()),
                    'NegCnt': int((y_pred_valid == 0).sum()),
                }
            else:
                # No confident predictions: same keys as above so that rows
                # built from these dicts share one schema.
                metrics[t] = dict.fromkeys(metric_keys, 0)

        # Free the search object (it holds the fitted CV estimators).
        del random_search
        gc.collect()

        return metrics, best_model

    if arch == 'shallow':
        # Shallow
        xgboost_hyperparameters = {
            'scale_pos_weight': [scale_pos_weight],
            'n_estimators': [200, 300],
            'max_depth': [5, 7],
            'learning_rate': [0.01],
            'subsample': [0.65],
            'colsample_bytree': [0.6],
            'gamma': [0.2, 0.4],
            'alpha': [0.1, 1],
            'lambda': [1, 2, 5],
            'min_child_weight': [12, 15]
        }

    elif arch == 'moderate':
        # Moderate
        xgboost_hyperparameters = {
            'scale_pos_weight': [scale_pos_weight],
            'n_estimators': [300, 400],
            'max_depth': [7, 9],
            'learning_rate': [0.01],
            'subsample': [0.65, .75],
            'colsample_bytree': [0.6, 0.7],
            'gamma': [0.2, 0.3],
            'alpha': [0.1, 1],
            'lambda': [1, 2, 5],
            'min_child_weight': [9, 11]
        }

    else:
        # Deep
        xgboost_hyperparameters = {
            'scale_pos_weight': [scale_pos_weight],
            'n_estimators': [300, 400, 500],
            'max_depth': [8, 10, 12],
            'learning_rate': [0.01],
            'subsample': [0.75, .85],
            'colsample_bytree': [0.75, 0.85],
            'gamma': [0.1, 0.2],
            'alpha': [0.1, 1],
            'lambda': [1, 2, 5],
            'min_child_weight': [5, 7]
        }

    if date == 'lag':
        # Hold out the first p rows entirely from training and add them to the test set.
        p = 500
        df_ind_rec = df_indicators.iloc[:p].copy()
        df_pred_rec = df_predict.iloc[:p].copy()
        df_indicators = df_indicators.iloc[p:]
        df_predict = df_predict.iloc[p:]

        # Split the remaining data once
        X_train, X_test, y_train, y_test = train_test_split(df_indicators, df_predict, test_size=0.1, random_state=None, shuffle=True)
        X_test = pd.concat([X_test, df_ind_rec], axis=0)
        y_test = pd.concat([y_test, df_pred_rec], axis=0)

    else:
        X_train, X_test, y_train, y_test = train_test_split(df_indicators, df_predict, test_size=0.3, random_state=None, shuffle=True)

    # Train and evaluate models
    xg_metrics, best_xg_model = train_and_evaluate(XGBClassifier(random_state=42), xgboost_hyperparameters, X_train, X_test, y_train, y_test, opt, thresh)
    print_metrics(xg_metrics)

    if return_metrics:
        return xg_metrics, best_xg_model
    return best_xg_model, X_train, X_test, y_train, y_test
    
def savenew(best_model, days, model, arch, ticker, date, lb):
    """
    Pickle ``best_model`` under ./New_Models/Ensemble_<ticker> and print a confirmation.

    For ticker 'QQQ' the file is '<model>_xgboost_<days><arch>_<lb>.pkl';
    for any other ticker it is '<model>_xgboost_<days><arch>.pkl'.
    ``date`` only appears in the printed message.
    """
    out_dir = f'./New_Models/Ensemble_{ticker}'  # Relative to the current working directory
    os.makedirs(out_dir, exist_ok=True)

    # NOTE(review): `date` does not influence the file name, so a 'current'
    # model and a non-'current' model with the same other arguments share a
    # path — confirm this is intended.
    if ticker == 'QQQ':
        file_name = f'{model}_xgboost_{days}{arch}_{lb}.pkl'
    else:
        file_name = str(model) + '_xgboost_' + str(days) + str(arch) + '.pkl'

    with open(os.path.join(out_dir, file_name), 'wb') as file_object:
        pickle.dump(best_model, file_object)

    print('Models Saved for ' + ticker + '_' + str(model) + ' ' + str(days) + '_' + str(date) + ' Returns')

def print_metrics(metrics):
    """Print one indented line per threshold showing that threshold's metric dict."""
    for cutoff, values in metrics.items():
        line = f"  Threshold {cutoff}: {values}"
        print(line)

In [30]:
# Inputs for the shared indicator/returns builder.
ticker = 'QQQ'
returns = [5, 10, 20, 30, 45, 60, 90]
lb = 20

# Build the table, drop its last 101 rows and replace +/-inf with 0.
# NOTE(review): the reason for trimming exactly 101 rows is not visible here.
df = (
    final_df(ticker, returns, lb)
    .iloc[:-101]
    .replace([np.inf, -np.inf], 0)
)

# Logistic Regression Univariate Evaluation

In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

def evaluate_indicator_randomsplit(indicator_series, target_series, r, test_size=0.3, random_state=42):
    """
    Fit a one-feature logistic regression on a random train/test split.

    Rows where the indicator or the target is NaN are discarded first.

    Returns
    -------
    None when fewer than 100 rows remain or the target has a single class;
    otherwise a dict with 'r', 'auc_rs', 'log_loss_rs' and 'coef_rs'
    (AUC and log-loss on the test split, plus the fitted coefficient).
    """
    features = indicator_series.values.reshape(-1, 1)
    labels = target_series.values

    # Keep only rows where both the feature and the label are present.
    keep = ~(np.isnan(features.flatten()) | np.isnan(labels))
    features, labels = features[keep], labels[keep]

    # Not enough samples, or nothing to discriminate between.
    if len(labels) < 100 or len(np.unique(labels)) < 2:
        return None

    feat_fit, feat_eval, lab_fit, lab_eval = train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )

    clf = LogisticRegression()
    clf.fit(feat_fit, lab_fit)

    # Probability of the positive class for every test row.
    pos_proba = clf.predict_proba(feat_eval)[:, 1]

    stats = {'r': r}
    stats['auc_rs'] = roc_auc_score(lab_eval, pos_proba)
    stats['log_loss_rs'] = log_loss(lab_eval, pos_proba)
    stats['coef_rs'] = clf.coef_[0][0]
    return stats

def evaluate_indicator_recent500(indicator_series, target_series, r, p=500):
    """
    Fit a one-feature logistic regression using a positional hold-out split.

    After rows with a NaN indicator or target are dropped, the first ``p``
    rows form the test set and every remaining row is used for training.
    (The function name suggests the first rows are the most recent ones —
    this depends on the caller's row order; verify.)

    Returns
    -------
    None when the data cannot be evaluated:
      * fewer than 100 usable rows or a single target class overall, or
      * the training or the test slice is empty or contains a single class
        (fitting or AUC would otherwise raise).
    Otherwise a dict with 'r', 'auc_ts', 'log_loss_ts' and 'coef_ts'.
    """
    X = indicator_series.values.reshape(-1, 1)
    y = target_series.values

    # Drop NA
    mask = ~np.isnan(X.flatten()) & ~np.isnan(y)
    X = X[mask]
    y = y[mask]

    if len(np.unique(y)) < 2 or len(y) < 100:
        return None  # not enough signal or samples

    X_test, y_test = X[:p].copy(), y[:p].copy()
    X_train, y_train = X[p:].copy(), y[p:].copy()

    # The overall check above does not guarantee that each slice is usable:
    # with <= p rows the training slice is empty, and either slice may hold
    # only one class.
    if len(np.unique(y_train)) < 2 or len(np.unique(y_test)) < 2:
        return None

    model = LogisticRegression()
    model.fit(X_train, y_train)

    y_proba = model.predict_proba(X_test)[:, 1]

    return {
        'r': r,
        'auc_ts': roc_auc_score(y_test, y_proba),
        'log_loss_ts': log_loss(y_test, y_proba),
        'coef_ts': model.coef_[0][0]
    }

# Univariate screen: every indicator is scored against every return horizon
# with both evaluation schemes.

# Results for random split
results_rs = []
# Results for temporal split
results_ts = []

# The candidate columns do not depend on the horizon, so build them once:
# everything except raw price/volume fields and the Return_* targets.
df_test = df.copy()
indicator_cols = [col for col in df_test.columns
                  if col not in ['Date', 'Close', 'High', 'Low', 'Volume']
                  and not col.startswith('Return')]

for r in returns:
    return_col = f'Return_{r}'
    # Remove records without actuals (one copy per horizon, not one per indicator).
    df_eval = df.iloc[r:].copy()

    for col in indicator_cols:
        result = evaluate_indicator_randomsplit(df_eval[col], df_eval[return_col], r)
        if result:
            result['indicator'] = col
            results_rs.append(result)

        result = evaluate_indicator_recent500(df_eval[col], df_eval[return_col], r)
        if result:
            result['indicator'] = col
            results_ts.append(result)

# Build each result table once, after all horizons have been evaluated.
logistic_results_rs = pd.DataFrame(results_rs)
logistic_results_ts = pd.DataFrame(results_ts)


In [32]:
# Join the random-split and temporal-split results side by side, one row per
# (indicator, horizon); the outer join keeps pairs present in only one table.
merged_df = logistic_results_rs.merge(
    logistic_results_ts,
    on=['indicator', 'r'],
    how='outer',
    suffixes=('_rs', '_ts'),
)
order = ['r', 'indicator', 'auc_rs', 'auc_ts', 'log_loss_rs', 'log_loss_ts', 'coef_rs', 'coef_ts']

merged_df[order].sort_values(by=['indicator', 'r'])

Unnamed: 0,r,indicator,auc_rs,auc_ts,log_loss_rs,log_loss_ts,coef_rs,coef_ts
0,5,100_EMA_200,0.494549,0.424242,0.676573,0.675429,-0.517016,0.021185
1,10,100_EMA_200,0.503271,0.395961,0.658263,0.653156,-0.072074,0.385507
2,20,100_EMA_200,0.527863,0.654937,0.642633,0.616307,-0.009967,-0.081769
3,30,100_EMA_200,0.544469,0.701299,0.626834,0.620562,-0.316896,-0.300490
4,45,100_EMA_200,0.512505,0.669870,0.605028,0.596595,-0.648517,-0.240014
...,...,...,...,...,...,...,...,...
4559,20,vol_5_MA50,0.515089,0.499478,0.641647,0.614579,0.148698,0.136398
4560,30,vol_5_MA50,0.474390,0.477918,0.631771,0.620239,0.221457,0.145014
4561,45,vol_5_MA50,0.516029,0.555040,0.604317,0.593068,0.143019,0.111876
4562,60,vol_5_MA50,0.534538,0.533136,0.578696,0.550503,0.174174,0.162983


In [27]:
# Random-split results ranked from highest to lowest test AUC.
logistic_results_rs.sort_values(by=['auc_rs'], ascending=[False])

Unnamed: 0,r,auc_rs,log_loss_rs,coef_rs,indicator
1100,10,0.566707,0.658217,-0.002055,OBV_EA50
1055,10,0.566707,0.658220,-0.002862,OBV_EA25
920,10,0.566088,0.658178,-0.004472,OBV_MA50
875,10,0.564670,0.658228,-0.001607,OBV_MA25
613,5,0.555991,0.676087,0.332358,OBV_ROC10_slope50
...,...,...,...,...,...
1840,20,0.455939,0.642818,0.389071,CMF_20_slope10
170,5,0.455228,0.680637,-0.530728,RSI_21_MA10
1878,20,0.453385,0.645249,1.470880,OBV_Z5_slope25
1884,20,0.450302,0.642653,-0.027919,Vol_Spike_20_slope25


In [19]:
# Temporal-split results ranked from highest to lowest test AUC.
logistic_results_ts.sort_values(by=['auc_ts'], ascending=[False])

Unnamed: 0,r,auc_ts,log_loss_ts,coef_ts,indicator
572,5,0.636745,0.675288,-1.103359,OBV_ROC5_slope25
9,5,0.621900,0.670458,-0.000908,num_days_100
261,5,0.618384,0.675393,-0.000090,MACD_MA50
190,5,0.616593,0.674833,0.034059,Vol_Ratio_25_MA10
214,5,0.609900,0.674831,0.025461,RSI_14_MA25
...,...,...,...,...,...
574,5,0.380115,0.679536,0.920744,OBV_Z5_slope25
503,5,0.380065,0.676883,0.173709,Vol_Ratio_25_slope5
573,5,0.363222,0.675470,0.296422,OBV_ROC10_slope25
582,5,0.359531,0.675411,0.000016,Vol_Ratio_10_slope25
