In [1]:
import numpy as np
import pandas as pd

# Silence every warning category for the rest of the kernel session.
# This also hides deprecation warnings, so remove it when debugging odd results.
from warnings import simplefilter
simplefilter('ignore')

# Project-level analysis entry points.
# NOTE(review): none of these three names is referenced in the cells visible here — confirm they are still needed.
from performStockAnalysis.main import select_kode_to_model, develop_models_for_selected_kode, make_forecasts_using_the_developed_models

# Raise pandas' display limits so wider/longer frames are shown without truncation.
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

In [None]:
from dataPreparation.main import prepare_data_for_modelling

# Build one modelling frame per stock code; every call shares the same
# remaining arguments, so only the code itself varies.
bbca, bbri, bmri, bbni = (
    prepare_data_for_modelling(kode, '2021-01-01', '', 'Close', [10, 15])
    for kode in ('BBCA', 'BBRI', 'BMRI', 'BBNI')
)

In [None]:
# Stack the four per-stock frames row-wise into a single modelling table.
# ignore_index is left at its default (False), so each frame keeps its own index labels.
data = pd.concat((bbca, bbri, bmri, bbni))

In [None]:
# Flag rows whose 10-day trend R-squared is at least 0.7 (vectorised comparison).
data['Good Fit'] = data['Upcoming 10 Days Trend RSquared'] >= 0.7

In [None]:
# Show how many rows fall on each side of the 'Good Fit' flag (class balance).
data['Good Fit'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: columns are picked by position (from the 7th column up to,
# but excluding, the last five).
# NOTE(review): positional slicing silently changes meaning if the column
# layout of `data` changes — confirm the slice covers only feature columns.
X = data[data.columns[6:-5]].values

# Keep the target 1-D. scikit-learn-style estimators expect y of shape
# (n_samples,) and emit a DataConversionWarning when given an (n_samples, 1)
# column vector, so the previous reshape(-1, 1) is dropped.
y = data['Good Fit'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,        # 20% of the data will be used for testing
    random_state=42,      # Ensures the split is the same every time
    stratify=y,           # Keep the True/False ratio the same in both splits
)

In [None]:
import lightgbm as lgb
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

# Base estimator handed to the search. It is deliberately NOT fitted here:
# BayesSearchCV clones the estimator for every candidate and, with its default
# refit=True, refits the best configuration itself, so a separate
# model.fit(...) beforehand is wasted work.
model = lgb.LGBMClassifier(random_state=42)

# Hyperparameter ranges explored by the Bayesian optimiser.
search_space = {
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'n_estimators': Integer(100, 1000),
    'num_leaves': Integer(2, 100),
    'max_depth': Integer(-1, 50), # -1 means no limit
    'reg_alpha': Real(0.0, 1.0), # L1 regularization
    'reg_lambda': Real(0.0, 1.0), # L2 regularization
    'min_child_samples': Integer(1, 50),
}

bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=search_space,
    n_iter=10,  # Increase for more thorough search
    cv=3,
    n_jobs=-1,  # Use all available CPU cores
    verbose=0,
    random_state=42
)

# Runs the search and leaves the refitted best estimator available through
# bayes_search.predict / bayes_search.predict_proba.
bayes_search.fit(X_train, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

def calculate_gini(target_true: np.ndarray, target_pred_proba: np.ndarray) -> float:
    """
    Calculates the Gini coefficient from the model's predicted scores.

    The Gini coefficient is derived from the Area Under the ROC Curve (AUC).
    Formula: Gini = 2 * AUC - 1

    Args:
        target_true (np.ndarray): The true labels of the target variable.
        target_pred_proba (np.ndarray): Predicted scores, passed unchanged to
            roc_auc_score. For a binary target this should be the
            positive-class probability (e.g. ``predict_proba(X)[:, 1]``),
            not the full per-class probability matrix.

    Returns:
        float: The calculated Gini coefficient, or 0.0 if AUC cannot be calculated.
    """
    # Imported locally because `logging` is not imported in the visible
    # notebook cells; without it the fallback branch below raised NameError
    # instead of returning 0.0.
    import logging

    try:
        auc = roc_auc_score(target_true, target_pred_proba)
    except (ValueError, IndexError):
        # Handle cases where AUC cannot be computed (e.g., only one class present).
        logging.warning("Could not calculate Gini coefficient (likely only one class in target). Returning 0.0.")
        return 0.0
    return 2 * auc - 1

In [None]:
from sklearn.metrics import accuracy_score

# Hard class predictions from the search's refitted best estimator.
y_train_pred = bayes_search.predict(X_train)
y_test_pred = bayes_search.predict(X_test)

# Column index 1 of predict_proba, i.e. the probability of the second class.
y_train_pred_proba = bayes_search.predict_proba(X_train)[:, 1]
y_test_pred_proba = bayes_search.predict_proba(X_test)[:, 1]

accuracy_train = accuracy_score(y_true=y_train, y_pred=y_train_pred)
accuracy_test = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Train accuracy first, then test accuracy, one value per line.
print(accuracy_train, accuracy_test, sep='\n')

In [None]:
# Keep only the rows flagged as a good fit (boolean-mask row selection, all columns).
data_fit = data[data['Good Fit']]

In [None]:
# Distribution of the 10-day trend label among the good-fit rows.
data_fit['Upcoming 10 Days Trend'].value_counts()

In [2]:
# The export below is kept commented out; it appears to be how 'test.csv' was
# produced from the in-memory data_fit — TODO confirm.
# data_fit.to_csv('test.csv', index=False)
# Rebinds data_fit to the contents of 'test.csv', replacing any frame built in earlier cells.
data_fit = pd.read_csv('test.csv')

In [3]:
from modelDevelopment.main import develop_model

# Run the project's model-development routine on data_fit for the 10-day trend
# target. It returns three values; judging by the names they are the fitted
# model plus train and test metric collections.
model_10_days, train_metrics_10_days, test_metrics_10_days = develop_model(data_fit, 'Upcoming 10 Days Trend')

2025-08-10 15:00:18 - INFO - --- Starting Model Development for Target: 'Upcoming 10 Days Trend' ---
2025-08-10 15:00:18 - INFO - Loading feature names from 'modelDevelopment/technical_indicator_features.txt'.
2025-08-10 15:00:18 - INFO - Loaded 34 features.
2025-08-10 15:00:18 - INFO - Splitting data into training, validation, testing, and forecast sets.
2025-08-10 15:00:18 - INFO - Found 0 data points for future forecasting.
2025-08-10 15:00:18 - INFO - Training set size: 573, Test set size: 30.
2025-08-10 15:00:18 - INFO - Created PredefinedSplit for time-series cross-validation.
2025-08-10 15:00:18 - INFO - Initializing CatBoost model and starting hyperparameter tuning with BayesSearchCV.
2025-08-10 15:00:18 - INFO - Hyperparameter search space defined: {'depth': Integer(low=1, high=5, prior='uniform', transform='identity'), 'learning_rate': Real(low=0.01, high=0.1, prior='log-uniform', transform='identity'), 'iterations': Integer(low=150, high=300, prior='uniform', transform='iden

In [4]:
# Display the training-set metrics returned by develop_model.
train_metrics_10_days

{'Accuracy': [0.7382198952879581],
 'Precision Up Trend': [0.7108433734939759],
 'Precision Down Trend': [0.92],
 'Recall Up Trend': [0.9833333333333333],
 'Recall Down Trend': [0.323943661971831],
 'Gini': [0.6554773082942096]}

In [5]:
# Display the test-set metrics returned by develop_model.
# NOTE(review): the recorded output shows Accuracy 0.5 and Gini 0.0 on a
# 30-row test set (per the run log), well below the training figures — worth investigating.
test_metrics_10_days

{'Accuracy': [0.5],
 'Precision Up Trend': [0.4],
 'Precision Down Trend': [1.0],
 'Recall Up Trend': [1.0],
 'Recall Down Trend': [0.25],
 'Gini': [0.0]}

In [13]:
from datetime import datetime, timedelta

# Cut-off = latest date in the data minus 30 calendar days, rendered back to
# 'YYYY-MM-DD' text. strptime requires a string, so 'Date' holds text here and
# the comparisons below are lexicographic string comparisons.
latest_date = datetime.strptime(data_fit['Date'].max(), '%Y-%m-%d')
min_date = (latest_date - timedelta(days=30)).strftime('%Y-%m-%d')

# Rows strictly before the cut-off train the model; the rest are held out.
train_data = data_fit.loc[data_fit['Date'] < min_date]
test_data = data_fit.loc[data_fit['Date'] >= min_date]

In [19]:
from modelDevelopment.main import initialize_and_fit_model, measure_model_performance

feature_file = 'modelDevelopment/technical_indicator_features.txt'
with open(feature_file, "r") as file:
    # One feature name per line. Blank lines (e.g. a trailing newline at the
    # end of the file) are skipped so they cannot turn into an empty column
    # name and a KeyError in the selections below.
    feature_columns = [line.strip() for line in file if line.strip()]

# Plain NumPy arrays for the date-based train/test split.
train_feature = train_data[feature_columns].values
train_target = train_data['Upcoming 10 Days Trend'].values
test_feature = test_data[feature_columns].values
test_target = test_data['Upcoming 10 Days Trend'].values

# --- Step 3: Initialize, Tune, and Fit Model ---
# NOTE(review): the recorded run of this cell ended in
# "NameError: name 'cv_split' is not defined". The raising line is not visible
# here; the module's own log mentions creating a PredefinedSplit for
# cross-validation, so initialize_and_fit_model presumably needs that split
# supplied or defined — confirm against modelDevelopment.main.
model = initialize_and_fit_model(train_feature, train_target)

# --- Step 4: Evaluate Final Model ---
train_metrics, test_metrics = measure_model_performance(
    model, train_feature, train_target, test_feature, test_target
)


NameError: name 'cv_split' is not defined