# Import Packages

In [1]:
from warnings import simplefilter
simplefilter('ignore')

import numpy as np
import pandas as pd

from dataPreparation.main import prepare_data_for_modelling
from modelDevelopment.main import split_data_to_train_and_test, initialize_and_fit_model, measure_model_performance

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 20)

# Load Data

In [2]:
data = pd.read_csv('dataPreparation/BBCA.csv')
prepared_data = prepare_data_for_modelling(data, '', '', 'Close', [10, 15])

In [3]:
prepared_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Upcoming 10 Days Trend,Upcoming 15 Days Trend,Higher Price Resistance Trend SAR,Lower Price Resistance Trend SAR,Change Price Resistance Trend SAR,Higher Price Support Trend SAR,Lower Price Support Trend SAR,Change Price Support Trend SAR,Aroon Up,Aroon Down,Aaron,Price Resistance Trend,Price Support Trend,Volume Resistance Trend,Volume Suppport Trend,RSI Overbought,RSI Oversold,RSI Tendencies,ADX Uptrend 14,ADX Downtrend 14,ADX Weaktrend 14,5 Days Historical AD Uptrend,5 Days Historical AD Downtrend,5 Days Historical AD Sideways,5 Days Historical Uptrend,5 Days Historical Downtrend,5 Days Historical Sideways,10 Days Historical AD Uptrend,10 Days Historical AD Downtrend,10 Days Historical AD Sideways,10 Days Historical Uptrend,10 Days Historical Downtrend,10 Days Historical Sideways,MACD Upwards,MACD Downwards,MACD Sideways
248,2021-01-12,7345.0,7380.0,7160.0,7160.0,6585.039062,95235000,Down Trend,Down Trend,1.0,0.0,0.0,1.0,0.0,0.0,0.04,1.0,-0.96,1.0,1.0,0.0,1.0,0.0,0.0,0.425279,1.0,0.0,0.0,1,0,0,1,0,0,0,1,1,0,1,1,1,0,0
249,2021-01-13,7245.0,7275.0,7070.0,7120.0,6548.250977,76581500,Down Trend,Down Trend,1.0,0.0,0.0,1.0,0.0,0.0,0.04,1.0,-0.96,1.0,1.0,0.0,0.0,0.0,0.0,0.640187,0.0,1.0,0.0,1,0,0,1,0,0,0,1,1,0,1,1,1,0,0
250,2021-01-14,7050.0,7090.0,6995.0,7020.0,6456.281738,89319500,Down Trend,Down Trend,1.0,0.0,0.0,1.0,0.0,0.0,0.08,0.96,-0.88,1.0,1.0,0.0,1.0,0.0,0.0,0.376751,0.0,1.0,0.0,0,1,1,0,1,1,0,1,1,0,1,1,1,0,0
251,2021-01-15,7095.0,7100.0,6840.0,6955.0,6396.500488,89853500,Down Trend,Down Trend,1.0,0.0,0.0,1.0,0.0,0.0,0.12,1.0,-0.88,1.0,0.0,0.0,1.0,0.0,0.0,0.490899,0.0,1.0,0.0,0,1,0,0,1,1,0,1,1,0,1,1,1,0,0
252,2021-01-18,6955.0,7125.0,6845.0,7120.0,6548.250977,86343000,Down Trend,Down Trend,1.0,0.0,0.0,1.0,0.0,0.0,0.16,0.96,-0.8,1.0,1.0,0.0,1.0,0.0,0.0,0.70041,1.0,0.0,0.0,0,1,0,0,1,0,0,1,1,0,1,1,1,0,0


In [4]:
prepared_data[['Upcoming 10 Days Trend', 'Upcoming 15 Days Trend']].describe()

Unnamed: 0,Upcoming 10 Days Trend,Upcoming 15 Days Trend
count,854,854
unique,2,2
top,Up Trend,Up Trend
freq,460,473


# Modelling Experiments

In [5]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 10 Days Trend'

train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column)
model = initialize_and_fit_model(train_feature, train_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best parameters found by RandomizedSearchCV:
{'depth': 3, 'iterations': 155, 'l2_leaf_reg': 1.6935667971372248, 'learning_rate': 0.10694704332753689}

Best cross-validated accuracy: 0.4830
------------------------------

Model performance on training data

              precision    recall  f1-score   support

  Down Trend       0.83      0.80      0.81       386
    Up Trend       0.83      0.85      0.84       438

    accuracy                           0.83       824
   macro avg       0.83      0.83      0.83       824
weighted avg       0.83      0.83      0.83       824

    Gini:  0.8327891735869593

Model performance on testing data

              precision    recall  f1-score   support

  Down Trend       0.44      1.00      0.62         8
    Up Trend       1.00      0.55      0.71        22

    accuracy                           0.67        30
   macro avg       0.72      0.77      0.66        30
weighted avg  

In [6]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 15 Days Trend'

train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column)
model = initialize_and_fit_model(train_feature, train_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
Best parameters found by RandomizedSearchCV:
{'depth': 1, 'iterations': 210, 'l2_leaf_reg': 1.0768670564260123, 'learning_rate': 0.014086861626647885}

Best cross-validated accuracy: 0.4768
------------------------------

Model performance on training data

              precision    recall  f1-score   support

  Down Trend       0.62      0.17      0.27       376
    Up Trend       0.57      0.91      0.70       448

    accuracy                           0.57       824
   macro avg       0.59      0.54      0.48       824
weighted avg       0.59      0.57      0.50       824

    Gini:  0.2611607142857144

Model performance on testing data

              precision    recall  f1-score   support

  Down Trend       0.25      0.20      0.22         5
    Up Trend       0.85      0.88      0.86        25

    accuracy                           0.77        30
   macro avg       0.55      0.54      0.54        30
weighted avg 

In [7]:
import lightgbm as lgb

def initialize_and_fit_model(train_feature, train_target, test_feature, test_target):
    lgbm = lgb.LGBMClassifier(
        objective='binary', 
        metric='gini', 
        n_estimators=500, 
        learning_rate=0.01,
        random_state=42
    )
    
    lgbm.fit(
        train_feature, train_target,
        eval_set=[(test_feature, test_target)],
        eval_metric='gini'
    )

    return lgbm

In [8]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 10 Days Trend'
test_size = 0.2
train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column, test_size)
model = initialize_and_fit_model(train_feature, train_target, test_feature, test_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)

TypeError: split_data_to_train_and_test() takes 3 positional arguments but 4 were given

In [None]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 15 Days Trend'
test_size = 0.2
train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column, test_size)
model = initialize_and_fit_model(train_feature, train_target, test_feature, test_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)

In [None]:
from sklearn.ensemble import RandomForestClassifier

def initialize_and_fit_model(train_feature, train_target):
    rf = RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    )

    rf.fit(train_feature, train_target)
    
    return rf

In [None]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 10 Days Trend'
test_size = 0.2
train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column, test_size)
model = initialize_and_fit_model(train_feature, train_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)

In [None]:
feature_columns = prepared_data.columns[9:]
target_column = 'Upcoming 15 Days Trend'
test_size = 0.2
train_feature, train_target, test_feature, test_target = split_data_to_train_and_test(prepared_data, feature_columns, target_column, test_size)
model = initialize_and_fit_model(train_feature, train_target)
measure_model_performance(model, train_feature, train_target, test_feature, test_target)