In [1]:
import os
import sys
import glob
from dotenv import load_dotenv, find_dotenv
import pandas as pd
import numpy as np
# Load the .env file
load_dotenv(find_dotenv())

package_path = os.getenv('PACKAGE_PATH')
# package_path = '/home/dwna/projects/domain_class'
sys.path.append(package_path)


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval


In [2]:

# Load data
data = pd.read_csv(package_path + '/data/processed/profiles/1/ver_1_len_1000_rate_0.01.csv')
data = data.dropna()
# Split data into features and target
X = data.drop(columns=['col_name', 'datatype', 'domain' ])
y = data['domain']

# Define continuous and binary columns
continuous_cols = [col for col in X.columns if col[0].islower()]
binary_cols = [col for col in X.columns if col[0].isupper()]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [20]:
# Extend the model_spaces dictionary with hyperparameter spaces for SVM, LightGBM, and CatBoost
model_spaces = {
    'logistic_regression': {
        'preprocessing': hp.choice('lr_preprocessing', ['standard', 'minmax']),
        'C': hp.loguniform('lr_C', -4, 4)
    },
    'random_forest': {
        'preprocessing': hp.choice('rf_preprocessing', ['standard', 'minmax']),
        'n_estimators': hp.choice('rf_n_estimators', [10, 50, 100, 200]),
        'max_depth': hp.choice('rf_max_depth', [5, 10, 20, None])
    },
    'svm': {
        'preprocessing': hp.choice('svm_preprocessing', ['standard', 'minmax']),
        'C': hp.loguniform('svm_C', -4, 4),
        'gamma': hp.loguniform('svm_gamma', -4, 4)
    },
    'lgbm': {
        'preprocessing': hp.choice('lgbm_preprocessing', ['standard', 'minmax']),
        'learning_rate': hp.loguniform('lgbm_learning_rate', -4, 0),
        'n_estimators': hp.choice('lgbm_n_estimators', [10, 50, 100, 200]),
        'num_leaves': hp.choice('lgbm_num_leaves', [15, 31, 63, 127]), 
        'max_depth': hp.choice('lgbm_max_depth', [5, 10, 20, -1])
    },
    'catboost': {
        'preprocessing': hp.choice('catboost_preprocessing', ['standard', 'minmax']),
        'learning_rate': hp.loguniform('catboost_learning_rate', -4, 0),
        'iterations': hp.choice('catboost_iterations', [10, 50, 100, 200]),
        'depth': hp.choice('catboost_depth', [4, 6, 8, 10])
    }
}

# Update the create_model function
def create_model(model_name, params):
    if model_name == 'logistic_regression':
        model = LogisticRegression(C=params['C'])
    elif model_name == 'random_forest':
        model = RandomForestClassifier(n_estimators=params['n_estimators'], max_depth=params['max_depth'])
    elif model_name == 'svm':
        model = SVC(C=params['C'], gamma=params['gamma'])
    elif model_name == 'lgbm':
        model = LGBMClassifier(learning_rate=params['learning_rate'], n_estimators=params['n_estimators'], max_depth=params['max_depth'])
    elif model_name == 'catboost':
        model = CatBoostClassifier(learning_rate=params['learning_rate'], iterations=params['iterations'], depth=params['depth'], verbose=0)
    return model

# Function to choose preprocessing based on the parameter
def choose_preprocessing(preprocessing_choice):
    if preprocessing_choice == 'standard':
        return StandardScaler()
    elif preprocessing_choice == 'minmax':
        return MinMaxScaler()
    # Add more choices if needed

# Objective function for optimization
def objective(params, model_name):
    preprocessing = choose_preprocessing(params['preprocessing'])
    model = create_model(model_name, params)

    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    acc = pipeline.score(X_test, y_test)
    #     # Parallelize cross-validation
    # score = cross_val_score(pipeline, X, y, cv=3, n_jobs=-1).mean()
    # return {'loss': -score, 'status': STATUS_OK}
    return {'loss': -acc, 'status': STATUS_OK}

# Perform optimization for each model
# best_params = {}
# for model_name, space in model_spaces.items():
#     trials = Trials()
#     best = fmin(fn=lambda params: objective(params, model_name),
#                 space=space,
#                 algo=tpe.suggest,
#                 max_evals=50,  # Adjust as needed
#                 trials=trials)
#     best_params[model_name] = best
#     print(f"Best parameters for {model_name}: {best}")

# best_params contains the best parameters for each model

In [5]:
model_spaces.keys()

dict_keys(['logistic_regression', 'random_forest', 'svm', 'lgbm', 'catboost'])

In [6]:
best_params = {}


In [21]:
model_name, space = 'lgbm', model_spaces['lgbm']

trials = Trials()
best = fmin(fn=lambda params: objective(params, model_name),
            space=space,
            algo=tpe.suggest,
            max_evals=50,  # Adjust as needed
            trials=trials)
best_params[model_name] = best
print(f"Best parameters for {model_name}: {best}")

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003963 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2617                     
[LightGBM] [Info] Number of data points in the train set: 6905, number of used features: 21
[LightGBM] [Info] Start training from score -4.762464 
[LightGBM] [Info] Start training from score -1.384124 
[LightGBM] [Info] Start training from score -1.066827 
[LightGBM] [Info] Start training from score -1.969948 
[LightGBM] [Info] Start training from score -2.402249 
[LightGBM] [Info] Start training from score -5.011360 
[LightGBM] [Info] Start training from score -1.831496 
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000947 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2251                                                
[LightGBM] [Info] Number of data points in the train set: 6905

In [17]:
# best_params

# {'lgbm': {'lgbm_learning_rate': 0.05297169275142796,
#   'lgbm_max_depth': 2,
#   'lgbm_n_estimators': 1,
#   'lgbm_preprocessing': 0}}

{'lgbm': {'lgbm_learning_rate': 0.05297169275142796,
  'lgbm_max_depth': 2,
  'lgbm_n_estimators': 1,
  'lgbm_preprocessing': 0}}

In [19]:
trained_models = {}

# Function to strip the model name prefix from the hyperparameters
# def strip_model_name_prefix(params, model_name):
#     return {k[len(model_name) + 1:]: v for k, v in params.items() if k.startswith(model_name)}


for model_name, best_param in best_params.items():
    # Remove model name prefix from parameter keys
    # model_params = strip_model_name_prefix(best_param, model_name)
    # Reconstruct the best model
    model_params = space_eval(model_spaces[model_name], best_param)
    
    preprocessing_choice = model_params.pop('preprocessing') if 'preprocessing' in model_params else 'standard'
    preprocessing = choose_preprocessing(preprocessing_choice)
    # Create and train the pipeline with the best model
    model = create_model(model_name, model_params)   
    pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)

    # Store the trained model
    trained_models[model_name] = pipeline

    # Evaluate on the test set
    test_score = pipeline.score(X_test, y_test)
    print(f"{model_name} Test Accuracy: {test_score:.2f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001016 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2251
[LightGBM] [Info] Number of data points in the train set: 6905, number of used features: 21
[LightGBM] [Info] Start training from score -4.762464
[LightGBM] [Info] Start training from score -1.384124
[LightGBM] [Info] Start training from score -1.066827
[LightGBM] [Info] Start training from score -1.969948
[LightGBM] [Info] Start training from score -2.402249
[LightGBM] [Info] Start training from score -5.011360
[LightGBM] [Info] Start training from score -1.831496
lgbm Test Accuracy: 0.89


In [22]:
model_params

{'learning_rate': 0.05297169275142796, 'max_depth': 20, 'n_estimators': 50}

In [1]:
rf = {
  "금액": {
    "precision": 0.8,
    "recall": 0.7272727272727273,
    "f1-score": 0.761904761904762,
    "support": 11
  },
  "날짜": {
    "precision": 0.9698275862068966,
    "recall": 0.9574468085106383,
    "f1-score": 0.9635974304068524,
    "support": 235
  },
  "번호": {
    "precision": 0.8613861386138614,
    "recall": 0.9157894736842105,
    "f1-score": 0.8877551020408163,
    "support": 285
  },
  "수량": {
    "precision": 0.7570093457943925,
    "recall": 0.6923076923076923,
    "f1-score": 0.7232142857142857,
    "support": 117
  },
  "여부": {
    "precision": 0.9487179487179487,
    "recall": 0.9367088607594937,
    "f1-score": 0.9426751592356688,
    "support": 79
  },
  "율": {
    "precision": 0.5,
    "recall": 0.2,
    "f1-score": 0.28571428571428575,
    "support": 5
  },
  "코드": {
    "precision": 0.8222222222222222,
    "recall": 0.8222222222222222,
    "f1-score": 0.8222222222222222,
    "support": 135
  },
  "accuracy": 0.8777393310265282,
  "macro avg": {
    "precision": 0.8084518916507601,
    "recall": 0.750249683536712,
    "f1-score": 0.7695833210341274,
    "support": 867
  },
  "weighted avg": {
    "precision": 0.8756901311075352,
    "recall": 0.8777393310265282,
    "f1-score": 0.8758391384601569,
    "support": 867
  }
}

In [3]:
catboost = {
  "금액": {
    "precision": 1,
    "recall": 0.7272727272727273,
    "f1-score": 0.8421052631578948,
    "support": 11
  },
  "날짜": {
    "precision": 0.9615384615384616,
    "recall": 0.9574468085106383,
    "f1-score": 0.9594882729211087,
    "support": 235
  },
  "번호": {
    "precision": 0.864406779661017,
    "recall": 0.8947368421052632,
    "f1-score": 0.8793103448275862,
    "support": 285
  },
  "수량": {
    "precision": 0.7368421052631579,
    "recall": 0.717948717948718,
    "f1-score": 0.7272727272727273,
    "support": 117
  },
  "여부": {
    "precision": 0.9506172839506173,
    "recall": 0.9746835443037974,
    "f1-score": 0.9625,
    "support": 79
  },
  "율": {
    "precision": 0.5,
    "recall": 0.2,
    "f1-score": 0.28571428571428575,
    "support": 5
  },
  "코드": {
    "precision": 0.8646616541353384,
    "recall": 0.8518518518518519,
    "f1-score": 0.8582089552238805,
    "support": 135
  },
  "accuracy": 0.8823529411764706,
  "macro avg": {
    "precision": 0.8397237549355132,
    "recall": 0.760562927427571,
    "f1-score": 0.7877999784453548,
    "support": 867
  },
  "weighted avg": {
    "precision": 0.8810335475445066,
    "recall": 0.8823529411764706,
    "f1-score": 0.8809244980181821,
    "support": 867
  }
}

In [4]:
lgbm = {
  "금액": {
    "precision": 1,
    "recall": 0.6363636363636364,
    "f1-score": 0.7777777777777778,
    "support": 11
  },
  "날짜": {
    "precision": 0.9537815126050421,
    "recall": 0.9659574468085106,
    "f1-score": 0.9598308668076109,
    "support": 235
  },
  "번호": {
    "precision": 0.8719723183391004,
    "recall": 0.8842105263157894,
    "f1-score": 0.8780487804878048,
    "support": 285
  },
  "수량": {
    "precision": 0.7407407407407407,
    "recall": 0.6837606837606838,
    "f1-score": 0.7111111111111111,
    "support": 117
  },
  "여부": {
    "precision": 0.9615384615384616,
    "recall": 0.9493670886075949,
    "f1-score": 0.9554140127388535,
    "support": 79
  },
  "율": {
    "precision": 0.25,
    "recall": 0.2,
    "f1-score": 0.22222222222222224,
    "support": 5
  },
  "코드": {
    "precision": 0.8321678321678322,
    "recall": 0.8814814814814815,
    "f1-score": 0.8561151079136691,
    "support": 135
  },
  "accuracy": 0.8777393310265282,
  "macro avg": {
    "precision": 0.8014572664844539,
    "recall": 0.7430201233339566,
    "f1-score": 0.7657885541512928,
    "support": 867
  },
  "weighted avg": {
    "precision": 0.8764378646593898,
    "recall": 0.8777393310265282,
    "f1-score": 0.8762676694119889,
    "support": 867
  }
}

In [5]:
svm = {
  "금액": {
    "precision": 0.8888888888888888,
    "recall": 0.7272727272727273,
    "f1-score": 0.7999999999999999,
    "support": 11
  },
  "날짜": {
    "precision": 0.9572649572649573,
    "recall": 0.9531914893617022,
    "f1-score": 0.9552238805970149,
    "support": 235
  },
  "번호": {
    "precision": 0.8610169491525423,
    "recall": 0.8912280701754386,
    "f1-score": 0.8758620689655173,
    "support": 285
  },
  "수량": {
    "precision": 0.7407407407407407,
    "recall": 0.6837606837606838,
    "f1-score": 0.7111111111111111,
    "support": 117
  },
  "여부": {
    "precision": 0.9746835443037974,
    "recall": 0.9746835443037974,
    "f1-score": 0.9746835443037974,
    "support": 79
  },
  "율": {
    "precision": 0.6666666666666666,
    "recall": 0.4,
    "f1-score": 0.5,
    "support": 5
  },
  "코드": {
    "precision": 0.8489208633093526,
    "recall": 0.8740740740740741,
    "f1-score": 0.8613138686131386,
    "support": 135
  },
  "accuracy": 0.8800461361014994,
  "macro avg": {
    "precision": 0.848311801475278,
    "recall": 0.7863157984212034,
    "f1-score": 0.8111706390843685,
    "support": 867
  },
  "weighted avg": {
    "precision": 0.8785803803809457,
    "recall": 0.8800461361014994,
    "f1-score": 0.8787493354766375,
    "support": 867
  }
}

In [6]:
logistic = {
  "금액": {
    "precision": 1,
    "recall": 0.7272727272727273,
    "f1-score": 0.8421052631578948,
    "support": 11
  },
  "날짜": {
    "precision": 0.8007380073800738,
    "recall": 0.9234042553191489,
    "f1-score": 0.8577075098814229,
    "support": 235
  },
  "번호": {
    "precision": 0.7894736842105263,
    "recall": 0.7368421052631579,
    "f1-score": 0.7622504537205083,
    "support": 285
  },
  "수량": {
    "precision": 0.5943396226415094,
    "recall": 0.5384615384615384,
    "f1-score": 0.5650224215246635,
    "support": 117
  },
  "여부": {
    "precision": 0.9382716049382716,
    "recall": 0.9620253164556962,
    "f1-score": 0.95,
    "support": 79
  },
  "율": {
    "precision": 0.5,
    "recall": 0.6,
    "f1-score": 0.5454545454545454,
    "support": 5
  },
  "코드": {
    "precision": 0.7984496124031008,
    "recall": 0.762962962962963,
    "f1-score": 0.7803030303030303,
    "support": 135
  },
  "accuracy": 0.7843137254901961,
  "macro avg": {
    "precision": 0.7744675045104973,
    "recall": 0.750138415105033,
    "f1-score": 0.7575490320060093,
    "support": 867
  },
  "weighted avg": {
    "precision": 0.782151467183294,
    "recall": 0.7843137254901961,
    "f1-score": 0.7811898583203962,
    "support": 867
  }
}