This notebook contains the model training and prediction workflow.

## Import everything here

In [None]:
# Internal imports
from src.utils.config_reader import read_config, Configrations, read_predictor_config
from src.generic.run_trainer import TrainerPipeline
from src.generic.predictor import Predictor

# Library imports
# sklearn imports
import lightgbm
import sklearn.linear_model
import sklearn
#pandas imports
from pandas import DataFrame
import ibis
import duckdb



## Define the config here

In [None]:
# Training configuration: project name, input data, per-model parameters and
# the optional hyper-parameter tuning section.
config = {
    "project_name": "statistella",
    "training": {
        # Table name that the read_*_input functions place in their SELECT query.
        "input_data_source": "main_staging.stg_full",
        # The target column.
        "target": "TARGET",
    },
    "models": {
        "LGBMClassifier": {
            "experiment_name": "an experiment",
            "model_params": {
                "boosting_type": "gbdt",
                "num_leaves": 31,
                "max_depth": -1,
                "learning_rate": 0.1,
                "n_estimators": 100,
                "subsample_for_bin": 200000,
                "objective": "binary",
                "class_weight": "balanced",
                "min_split_gain": 0.0,
                "min_child_weight": 0.001,
                "min_child_samples": 20,
                "subsample": 1.0,
                "subsample_freq": 0,
                "colsample_bytree": 1.0,
                "reg_alpha": 0.0,
                "reg_lambda": 0.0,
                "random_state": 42,
                "n_jobs": -1,
                "importance_type": "split",
            },
        },
        "LogisticRegression": {
            "model_params": {
                "penalty": "l2",
                "tol": 0.0001,
                "C": 1.0,
                "fit_intercept": True,
                "intercept_scaling": 1,
                "class_weight": "balanced",
                "random_state": 42,
                "solver": "lbfgs",
                "max_iter": 100,
                "n_jobs": -1,
            },
        },
    },
    # Optional section; per the original author's note, parameter tuning is
    # performed when it is present. NOTE(review): 'perform' is False here --
    # confirm which of the two actually switches tuning on.
    "hyper_parameter_tuning": {
        "perform": False,
        "scoring": "f1",
        "cv": 5,
        "factor": 2,
    },
}

# Registered model name and version handed to read_predictor_config.
pred_config = {
    "registered_model": "lgbm",
    "version": 2,
}

# Turn the raw dictionaries into the config objects used by the cells below.
final_config = read_config(config=config)
final_pred_config = read_predictor_config(config=pred_config)

## Training workflow 

#### Read input

In [None]:
def read_training_input() -> DataFrame:
    """
    Input table retrieval flow -> write any custom input table code here.

    Selects every row with ``use_for_training=1`` from the table named by
    ``final_config.training.input_data_source``, keeps only the numeric
    columns and replaces missing values with 0.

    Returns:
        DataFrame: numeric columns of the training rows, with NaNs filled by 0.
    """
    input_table = final_config.training.input_data_source

    # The context manager closes the DuckDB connection as soon as the result
    # has been materialised into a DataFrame, so the connection is not leaked.
    with duckdb.connect("/app/database/ml_db.duckdb") as con:
        # The table name comes from the notebook's own config; identifiers
        # cannot be bound as query parameters, hence the f-string.
        df = con.sql(f"SELECT * FROM {input_table} WHERE use_for_training=1").df()

    # Non-numeric columns are dropped before the missing values are filled.
    df_num = df.select_dtypes(include=['number'])
    df_num = df_num.fillna(0)

    return df_num

def read_testing_input() -> DataFrame:
    """
    Input table retrieval flow -> write any custom input table code here.

    Selects every row with ``use_for_training=0`` from the table named by
    ``final_config.training.input_data_source``, keeps only the numeric
    columns and replaces missing values with 0.

    Returns:
        DataFrame: numeric columns of the testing rows, with NaNs filled by 0.
    """
    input_table = final_config.training.input_data_source

    # The context manager closes the DuckDB connection as soon as the result
    # has been materialised into a DataFrame, so the connection is not leaked.
    with duckdb.connect("/app/database/ml_db.duckdb") as con:
        # The table name comes from the notebook's own config; identifiers
        # cannot be bound as query parameters, hence the f-string.
        df = con.sql(f"SELECT * FROM {input_table} WHERE use_for_training=0").df()

    # Non-numeric columns are dropped before the missing values are filled.
    df_num = df.select_dtypes(include=['number'])
    df_num = df_num.fillna(0)

    return df_num

## Training model / models

In [None]:
# Load the training rows and hand them, with the parsed config, to the trainer.
input_df = read_training_input()
trainer = TrainerPipeline(config=final_config, df=input_df)

# Run the training pipeline.
trainer.pipeline_run()

## Prediction workflow

In [None]:
# Build the predictor from the parsed prediction config.
pred_model = Predictor(config=final_pred_config)

# Load the rows flagged use_for_training=0 and pass them to the predictor.
test_df = read_testing_input()
out = pred_model.predict(df=test_df)