# Clasificador OP

In [11]:
import json
import os
from pathlib import Path
import shutil
from datetime import datetime
from flow import preprocessing, training, testing, preprocessing_predict, predict
## Base parameters
# Study identifier; blanks are replaced so it can be used as a folder name.
STUDY_NAME = "lgbm_kappa_bbdd_2".replace(" ", "_")
# Every path below is anchored on the directory the notebook is started from.
BASEDIR = os.getcwd()

# Top-level folders: <BASEDIR>/work/<STUDY_NAME> and <BASEDIR>/flow
PATH_WORK = os.path.join(BASEDIR, "work", STUDY_NAME)
PATH_FLOW = os.path.join(BASEDIR, "flow")

# Per-study sub-folders living under PATH_WORK
PATH_PREPROCESSING = os.path.join(PATH_WORK, "preprocessing")
PATH_TRAINING = os.path.join(PATH_WORK, "training")
PATH_MODELS = os.path.join(PATH_WORK, "models")
PATH_PREDICT = os.path.join(PATH_WORK, "predict")
PATH_RESULTS = os.path.join(PATH_WORK, "results")

# Create the whole folder tree; os.makedirs also creates the missing parents
# ("work" and "work/<STUDY_NAME>") and is a no-op for folders that exist.
for _directory in (
    PATH_FLOW,
    PATH_WORK,
    PATH_PREPROCESSING,
    PATH_TRAINING,
    PATH_MODELS,
    PATH_PREDICT,
    PATH_RESULTS,
):
    os.makedirs(_directory, exist_ok=True)

# Optuna storage: one SQLite file directly under "work", expressed as a
# database URL built from the POSIX form of the path.
bbdd_name = "optuna.sqlite3"
path_optuna = os.path.join(BASEDIR, "work", bbdd_name)
BBDD_OPTUNA = "sqlite:///" + Path(path_optuna).as_posix()

# File name of the base dataset (joined with BASEDIR when stored in params_base).
DATASET_BASE = "dt_train.xlsx"

# Base column list, stored under params_base["FEATURES_BASE"].
FEATURES_BASE = [
    'corrent',
    'nom_entidad1',
    'cod_cta_banco',
    'nom_cta',
    'clase_reg',
    'cod_banco',
    'clase_cta',
    'descripcion',
    'clase_gasto',
    'codigo_acr1',
    'codfte1',
    'tipo_comprobante2',
    'cuit',
    'glosa1',
    'clase',
]

# Categorical variables for one-hot encoding
FEATURES_CATEG = [
    'tipo_comprobante2',
    'clase_reg',
    'clase_gasto',
    'clase_cta',
]

# Configuration dictionary for the study. It must stay JSON-serializable
# because it is dumped to disk further down in this cell; note that json
# writes the (low, high) tuples below as arrays.

# preprocess: JSON file for the preprocessing parameters, kept in the study folder
path_preprocess_params = os.path.join(PATH_WORK, 'params_preprocessing.json')

params_base = {
    # study
    "STUDY_NAME": STUDY_NAME,
    "BBDD_OPTUNA": BBDD_OPTUNA,
    "TABLE": "mesa_entrada_clase",
    "TABLE_DEUDA": "mesa_entrada_deuda",
    # folders
    "BASEDIR": BASEDIR,
    "PATH_WORK": PATH_WORK,
    "PATH_FLOW": PATH_FLOW,
    "PATH_PREPROCESSING": PATH_PREPROCESSING,
    "PATH_TRAINING": PATH_TRAINING,
    "PATH_MODELS": PATH_MODELS,
    # datasets
    "DATASET_BASE": os.path.join(BASEDIR, DATASET_BASE),
    "DATASET_PROCESSED": os.path.join(PATH_TRAINING, 'dt_train_processed.csv'),
    "FEATURES_BASE": FEATURES_BASE,
    "FEATURES_CATEG": FEATURES_CATEG,
    "DATASET_PREDICT": os.path.join(PATH_PREDICT, 'dt_pred_orig.csv'),
    "DATASET_PREDICT_PROCESSED": os.path.join(PATH_PREDICT, 'dt_prep_processed.csv'),
    # preprocess
    "PREPROCESSING_PARAMS": path_preprocess_params,
    # training
    # Built with os.path.join like every other path so the separator always
    # matches the host OS (previously a hard-coded "/").
    "MODEL": os.path.join(PATH_MODELS, f'model_{STUDY_NAME}.pkl'),
    "SEED": 12345,
    "TEST_SIZE": 0.2,
    "N_TRIALS": 100,
    "PARAMS_INT": ['num_leaves', 'max_depth', 'min_child_samples'],
    # NOTE(review): 'num_leaves' and 'max_depth' are also listed in PARAMS_INT,
    # while the float ranges 'subsample' and 'colsample_bytree' appear in none
    # of the PARAMS_* lists. Confirm against flow.training whether this list
    # should name those two instead. Left unchanged here.
    "PARAMS_UNIFORM": ['num_leaves', 'max_depth'],
    "PARAMS_LOGUNIFORM": ['learning_rate', 'reg_alpha', 'reg_lambda'],
    # Hyper-parameters: scalars are fixed values; 2-tuples are presumably
    # (low, high) search bounds consumed by flow.training (TODO confirm).
    "HP": {
        'objective': 'multiclass',
        'metric': 'multi_logloss',  # Only used by LightGBM itself; the optimization metric will be kappa
        'boosting_type': 'gbdt',
        'learning_rate': (1e-3, 1e-1),
        'num_leaves': (31, 256),
        'max_depth': (-1, 15),
        'min_child_samples': (5, 100),
        'subsample': (0.5, 1.0),
        'colsample_bytree': (0.5, 1.0),
        'reg_alpha': (1e-8, 1.0),
        'reg_lambda': (1e-8, 1.0),
        'verbose': -1,
    },
}


# Persist the configuration inside the study folder. The path is anchored on
# PATH_WORK (absolute) instead of being relative to the current working
# directory, so the cells that receive path_params_base keep working even if
# the working directory changes after this cell has run.
# The file name 'paramas_base.json' is kept unchanged so existing study
# folders remain valid.
path_params_base = os.path.join(PATH_WORK, 'paramas_base.json')
with open(path_params_base, 'w', encoding='utf-8') as params_file:
    json.dump(params_base, params_file, indent=4)

# Create a .bat launcher for optuna-dashboard in the "work" folder.
# The database URL only contains the file name, so the launcher is expected
# to be started from that folder, where the SQLite file lives.
with open(os.path.join(BASEDIR, "work", "optuna.bat"), 'w', encoding='utf-8') as optunabat:
    optunabat.write(f"optuna-dashboard sqlite:///{bbdd_name}")

In [6]:
## Preprocessing
# Copy the base dataset into the study's preprocessing folder.
# shutil.copy returns the path of the file it created, which avoids building
# the destination by hand with a hard-coded Windows "\\" separator.
# The source is anchored on BASEDIR, matching params_base["DATASET_BASE"].
path_train_local = shutil.copy(os.path.join(BASEDIR, DATASET_BASE), PATH_PREPROCESSING)
preprocessing.preprocess(path_params_base=path_params_base, path_train_local=path_train_local)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fcarreno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Se cargan datos de las BBDD
Se cargan datos locales
Onehot encoding finalizado
Procesando texto
Procesamiento de texto finalizado
Parametros de preprocesamiento guardados
Dataframe preprocesado guardado


'Preprocesamiento finalizado'

In [None]:
## Training
## NOTE: with 100 trials (the N_TRIALS value set in params_base) this takes about 1 hour
# Runs flow.training.train, handing it the path of the JSON configuration file.
training.train(path_params_base=path_params_base)

In [None]:
# Test step: runs flow.testing.test, handing it the path of the JSON configuration file.
testing.test(path_params_base=path_params_base)


In [3]:

# Preprocessing step for prediction: runs flow.preprocessing_predict.preprocess_predict
# with the JSON configuration file (presumably it reads the DATASET_PREDICT* entries;
# confirm in flow.preprocessing_predict).
preprocessing_predict.preprocess_predict(path_params_base=path_params_base)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fcarreno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Se cargan datos de las BBDD
Onehot encoding finalizado
Procesando texto
Procesamiento de texto finalizado
Dataframe procesado guardado


'Preprocesamiento finalizado'

In [12]:
# Final prediction
import importlib

# Re-import flow.predict so edits made to the module since the first import
# are picked up without restarting the kernel.
importlib.reload(predict)

# Time-stamp (day-month-year hourminute) used to name the results file.
now = datetime.now().strftime("%d-%m-%Y %H%M")
dataset_final = os.path.join(PATH_RESULTS, f'dt_final {now}.csv')
predict.predict(path_params_base=path_params_base, dataset_final=dataset_final)

modelo lgbm_kappa_bbdd_2 cargado
Proceso Terminado
