# 안개 발생 진단 분류 모델 AutoML

## 1. 데이터 설명

## 2. 데이터 불러오기

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle
import optuna
from optuna.samplers import TPESampler

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import Pool, CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import CategoricalNB

from sklearn.model_selection import train_test_split

# from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
# from sklearn.metrics import make_scorer


import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def get_local_path(train_dataset_name, val_dataset_name, test_dataset_name):
    """Build the paths of the train/val/test files inside ``../fog_data``.

    ``fog_data`` is located in the parent of the current working directory
    and is created when it does not exist yet.

    The working directory is never changed. The parent directory is derived
    from ``os.getcwd()`` instead of ``os.chdir("..")``, so a failure while
    creating the directory cannot leave the process in the wrong directory.

    Args:
        train_dataset_name: File name of the training dataset.
        val_dataset_name: File name of the validation dataset.
        test_dataset_name: File name of the test dataset.

    Returns:
        A ``(train_path, val_path, test_path)`` tuple of absolute paths. On
        Windows the backslashes are replaced with forward slashes.

    Raises:
        OSError: If ``fog_data`` cannot be created, for example because a
            regular file with that name already exists.
    """
    # Parent of the working directory; at a filesystem root this is the
    # root itself, which matches what chdir("..") would have produced.
    parent_dir = os.path.dirname(os.getcwd())
    data_dir = os.path.join(parent_dir, "fog_data")

    # exist_ok=True makes this a no-op when the directory is already there.
    os.makedirs(data_dir, exist_ok=True)

    dataset_names = (train_dataset_name, val_dataset_name, test_dataset_name)
    paths = tuple(os.path.join(data_dir, name) for name in dataset_names)

    # Windows paths are normalised to forward slashes.
    if os.name == "nt":
        paths = tuple(path.replace("\\", "/") for path in paths)

    return paths

In [3]:
# File names of the splits produced by preprocessing, with NaN rows dropped
# and outliers removed.
dataset_names = (
    "fog_train_dropped_no_outlier.csv",
    "fog_val_dropped_no_outlier.csv",
    "fog_test_dropped_no_outlier.csv",
)
train_path, val_path, test_path = get_local_path(*dataset_names)

In [4]:
# Load the three splits from the CSV paths returned by get_local_path.
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

## 3. AutoGluon

In [5]:
import autogluon.core as ag
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.core.metrics import make_scorer

In [11]:
# Critical Success Index (CSI) score function
def csi_index(y_true, y_pred, negative_label=None):
    """Compute the Critical Success Index of multi-class predictions.

    One label is the "negative" (non-event) class. Every other label is an
    event class. The score is::

        CSI = H / (H + F + M)

    where ``H`` counts samples whose event class was predicted exactly, and
    ``H + F + M`` counts every sample except those that are negative in both
    ``y_true`` and ``y_pred``.

    For inputs containing exactly four labels this equals indexing a 4x4
    confusion matrix with the last row/column as the negative class. Counting
    directly also works when a batch lacks some class, where a fixed 4x4
    lookup would raise ``IndexError``.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, same length as ``y_true``.
        negative_label: Label of the non-event class. When ``None`` the
            largest label found in ``y_true``/``y_pred`` is used. Pass it
            explicitly if a batch may not contain the negative class at all,
            otherwise an event class would be mistaken for it.

    Returns:
        The CSI as a float in ``[0, 1]``. ``0.0`` is returned when the input
        is empty or every sample is a correct negative, because the ratio is
        undefined there and NaN would break model ranking.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{y_true.size} and {y_pred.size}"
        )
    if y_true.size == 0:
        return 0.0

    if negative_label is None:
        # union1d returns the sorted unique labels; the last one plays the
        # role of the last confusion-matrix row/column.
        negative_label = np.union1d(y_true, y_pred)[-1]

    # Correct negatives are excluded from both numerator and denominator.
    correct_negative = (y_true == negative_label) & (y_pred == negative_label)
    hits = np.count_nonzero((y_true == y_pred) & ~correct_negative)
    scored = y_true.size - np.count_nonzero(correct_negative)

    if scored == 0:
        return 0.0
    return hits / scored

# Wrap csi_index as an AutoGluon scorer so it can be passed as eval_metric.
# Higher is better and 1 is the best attainable value.
# NOTE(review): needs_class=True presumably makes AutoGluon hand predicted
# class labels (not probabilities) to the score function - confirm in the docs.
ag_csi_scorer = make_scorer(
    score_func=csi_index,
    name='CSI',
    needs_class=True,
    greater_is_better=True,
    optimum=1,
)

In [28]:
# Label column.
target = "class"

# Columns left out of modelling; the same list applies to every split.
excluded_cols = ["year", "vis1", "is_fog", "ws10_deg", "dew_point"]

# Feature columns plus the label (train/val) and feature columns only (test).
used_cols_target = train_df.drop(columns=excluded_cols).columns
used_cols = train_df.drop(columns=[target] + excluded_cols).columns

# train and val keep the label column; test is restricted to the features.
train = train_df[used_cols_target]
val = val_df[used_cols_target]
test = test_df[used_cols]

In [None]:
from autogluon.tabular.configs.hyperparameter_configs import get_hyperparameter_config


In [20]:
# Fit on the training split, ranking models by the custom CSI scorer.
# A time_limit (seconds) and ag_args_fit={'num_gpus': 1} were tried earlier
# and are left out here; the GPU is requested through num_gpus instead.
fit_kwargs = {
    "train_data": train,
    "presets": 'best_quality',
    "num_gpus": 1,
}
predictor = TabularPredictor(label="class", eval_metric=ag_csi_scorer).fit(**fit_kwargs)


No path specified. Models will be saved in: "AutogluonModels\ag-20240624_184625"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.0
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          6
Memory Avail:       7.29 GB / 15.95 GB (45.7%)
Disk Space Avail:   1593.22 GB / 1862.39 GB (85.5%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation data is used to detect stacked overfit

In [21]:
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                    model  score_val eval_metric  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  NeuralNetFastAI_BAG_L2   0.147103         CSI     286.521387  2249.585996               13.392524         773.743898            2       True          6
1     WeightedEnsemble_L3   0.147103         CSI     286.785384  2268.081046                0.263997          18.495050            3       True          7
2   KNeighborsDist_BAG_L1   0.111520         CSI     139.653692    31.104553              139.653692          31.104553            1       True          2
3     WeightedEnsemble_L2   0.111520         CSI     139.911692    46.174592                0.258001          15.070039            2       True          5
4   KNeighborsUnif_BAG_L1   0.099642         CSI     119.749321    31.082488              119.749321          31.082488            1       True          1
5  Neura

{'model_types': {'KNeighborsUnif_BAG_L1': 'StackerEnsembleModel_KNN',
  'KNeighborsDist_BAG_L1': 'StackerEnsembleModel_KNN',
  'NeuralNetFastAI_BAG_L1': 'StackerEnsembleModel_NNFastAiTabular',
  'LightGBMXT_BAG_L1': 'StackerEnsembleModel_LGB',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel',
  'NeuralNetFastAI_BAG_L2': 'StackerEnsembleModel_NNFastAiTabular',
  'WeightedEnsemble_L3': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif_BAG_L1': 0.09964184731385485,
  'KNeighborsDist_BAG_L1': 0.11152013019723331,
  'NeuralNetFastAI_BAG_L1': 0.06201066133599867,
  'LightGBMXT_BAG_L1': 0.0017868538608806638,
  'WeightedEnsemble_L2': 0.11152013019723331,
  'NeuralNetFastAI_BAG_L2': 0.14710276600040378,
  'WeightedEnsemble_L3': 0.14710276600040378},
 'model_best': 'WeightedEnsemble_L3',
 'model_paths': {'KNeighborsUnif_BAG_L1': ['KNeighborsUnif_BAG_L1'],
  'KNeighborsDist_BAG_L1': ['KNeighborsDist_BAG_L1'],
  'NeuralNetFastAI_BAG_L1': ['NeuralNetFastAI_BAG_L1'],
  'LightGBMXT_BA

In [22]:
# Leaderboard scored on `train`, the same frame that was passed to fit(),
# so score_test here is an in-sample score, not a generalisation estimate.
ld_board1 = predictor.leaderboard(train, silent=True)
ld_board1

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,KNeighborsDist_BAG_L1,1.0,0.11152,CSI,129.177168,139.653692,31.104553,129.177168,139.653692,31.104553,1,True,2
1,WeightedEnsemble_L2,1.0,0.11152,CSI,129.265444,139.911692,46.174592,0.088276,0.258001,15.070039,2,True,5
2,NeuralNetFastAI_BAG_L2,0.890147,0.147103,CSI,448.814812,286.521387,2249.585996,109.944973,13.392524,773.743898,2,True,6
3,WeightedEnsemble_L3,0.890147,0.147103,CSI,448.877813,286.785384,2268.081046,0.063001,0.263997,18.49505,3,True,7
4,KNeighborsUnif_BAG_L1,0.248162,0.099642,CSI,102.678838,119.749321,31.082488,102.678838,119.749321,31.082488,1,True,1
5,NeuralNetFastAI_BAG_L1,0.060813,0.062011,CSI,101.534393,13.054852,1294.839772,101.534393,13.054852,1294.839772,1,True,3
6,LightGBMXT_BAG_L1,0.0,0.001787,CSI,5.47944,0.670999,118.815284,5.47944,0.670999,118.815284,1,True,4


In [29]:
# Leaderboard scored on `val`, the split built from val_df (not used in fit()).
ld_board2 = predictor.leaderboard(val, silent=True)
ld_board2

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L2,0.146188,0.147103,CSI,173.377162,286.521387,2249.585996,40.465162,13.392524,773.743898,2,True,6
1,WeightedEnsemble_L3,0.146188,0.147103,CSI,173.408161,286.785384,2268.081046,0.031,0.263997,18.49505,3,True,7
2,KNeighborsDist_BAG_L1,0.104877,0.11152,CSI,46.357697,139.653692,31.104553,46.357697,139.653692,31.104553,1,True,2
3,WeightedEnsemble_L2,0.104877,0.11152,CSI,46.394694,139.911692,46.174592,0.036998,0.258001,15.070039,2,True,5
4,KNeighborsUnif_BAG_L1,0.096025,0.099642,CSI,46.547238,119.749321,31.082488,46.547238,119.749321,31.082488,1,True,1
5,NeuralNetFastAI_BAG_L1,0.060917,0.062011,CSI,37.943581,13.054852,1294.839772,37.943581,13.054852,1294.839772,1,True,3
6,LightGBMXT_BAG_L1,0.0,0.001787,CSI,2.063483,0.670999,118.815284,2.063483,0.670999,118.815284,1,True,4


In [30]:
ld_board2

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetFastAI_BAG_L2,0.146188,0.147103,CSI,173.377162,286.521387,2249.585996,40.465162,13.392524,773.743898,2,True,6
1,WeightedEnsemble_L3,0.146188,0.147103,CSI,173.408161,286.785384,2268.081046,0.031,0.263997,18.49505,3,True,7
2,KNeighborsDist_BAG_L1,0.104877,0.11152,CSI,46.357697,139.653692,31.104553,46.357697,139.653692,31.104553,1,True,2
3,WeightedEnsemble_L2,0.104877,0.11152,CSI,46.394694,139.911692,46.174592,0.036998,0.258001,15.070039,2,True,5
4,KNeighborsUnif_BAG_L1,0.096025,0.099642,CSI,46.547238,119.749321,31.082488,46.547238,119.749321,31.082488,1,True,1
5,NeuralNetFastAI_BAG_L1,0.060917,0.062011,CSI,37.943581,13.054852,1294.839772,37.943581,13.054852,1294.839772,1,True,3
6,LightGBMXT_BAG_L1,0.0,0.001787,CSI,2.063483,0.670999,118.815284,2.063483,0.670999,118.815284,1,True,4


## AutoGluon 연습

In [7]:
from autogluon.tabular import TabularDataset, TabularPredictor


In [8]:
# Download the tutorial training set and keep a fixed random sample of it
# (random_state=0 makes the sample reproducible).
train_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/train.csv')
subsample_size = 500  # subsample subset of data for faster demo, try setting this to much larger values
train_data = train_data.sample(n=subsample_size, random_state=0)
train_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
6118,51,Private,39264,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,>50K
23204,58,Private,51662,10th,6,Married-civ-spouse,Other-service,Wife,White,Female,0,0,8,United-States,<=50K
29590,40,Private,326310,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,44,United-States,<=50K
18116,37,Private,222450,HS-grad,9,Never-married,Sales,Not-in-family,White,Male,0,2339,40,El-Salvador,<=50K
33964,62,Private,109190,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,40,United-States,>50K


In [9]:
# Name of the column to predict in the tutorial data, followed by a quick
# look at its distribution.
label = 'class'
print("Summary of class variable: \n", train_data[label].describe())

Summary of class variable: 
 count        500
unique         2
top        <=50K
freq         365
Name: class, dtype: object


In [10]:
# Fit on the tutorial sample without presets or a custom metric.
# NOTE: this rebinds `predictor`, the same name the fog-model cell assigns.
predictor = TabularPredictor(label=label).fit(
    train_data=train_data,
    num_gpus=1
)

No path specified. Models will be saved in: "AutogluonModels\ag-20240624_182441"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.0
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          6
Memory Avail:       8.23 GB / 15.95 GB (51.6%)
Disk Space Avail:   1594.22 GB / 1862.39 GB (85.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluo

In [20]:
# Download the tutorial test set and split it into labels and an
# unlabeled feature frame.
test_data = TabularDataset('https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv')
y_test = test_data[label]  # values to predict
test_data_nolab = test_data.drop(columns=[label])  # delete label column to prove we're not cheating
test_data_nolab.head()

Loaded data from: https://autogluon.s3.amazonaws.com/datasets/Inc/test.csv | Columns = 15 / 15 | Rows = 9769 -> 9769


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,31,Private,169085,11th,7,Married-civ-spouse,Sales,Wife,White,Female,0,0,20,United-States
1,17,Self-emp-not-inc,226203,12th,8,Never-married,Sales,Own-child,White,Male,0,0,45,United-States
2,47,Private,54260,Assoc-voc,11,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,60,United-States
3,21,Private,176262,Some-college,10,Never-married,Exec-managerial,Own-child,White,Female,0,0,30,United-States
4,17,Private,241185,12th,8,Never-married,Prof-specialty,Own-child,White,Male,0,0,20,United-States


In [21]:


# Predict on the unlabeled test frame, then score the predictions against
# the held-back labels; the result of the evaluation is kept in `perf`.
# NOTE(review): auxiliary_metrics=True presumably reports extra metrics
# besides the predictor's eval metric - confirm in the AutoGluon docs.
y_pred = predictor.predict(test_data_nolab)
print("Predictions:  \n", y_pred)
perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Predictions:  
 0        <=50K
1        <=50K
2         >50K
3        <=50K
4        <=50K
         ...  
9764     <=50K
9765     <=50K
9766     <=50K
9767     <=50K
9768     <=50K
Name: class, Length: 9769, dtype: object


In [22]:
from autogluon.tabular import TabularPredictor

# Refit on the tutorial sample with the 'toy' hyperparameter preset
# (presumably a small, fast configuration - confirm in the AutoGluon docs).
# NOTE: this rebinds `predictor` again.
predictor = TabularPredictor(label=label).fit(train_data, hyperparameters='toy')

# Leaderboard scored on the labeled tutorial test set.
predictor.leaderboard(test_data, silent=True)

No path specified. Models will be saved in: "AutogluonModels\ag-20240624_164420"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.11.0
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          6
Memory Avail:       5.26 GB / 15.95 GB (33.0%)
Disk Space Avail:   1601.48 GB / 1862.39 GB (86.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluo

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,XGBoost,0.827618,0.83,accuracy,0.024001,0.006003,0.035997,0.024001,0.006003,0.035997,1,True,3
1,CatBoost,0.818098,0.84,accuracy,0.023998,0.004,0.424512,0.023998,0.004,0.424512,1,True,2
2,WeightedEnsemble_L2,0.818098,0.84,accuracy,0.025998,0.005,0.461511,0.001999,0.001,0.036999,2,True,5
3,LightGBM,0.796806,0.78,accuracy,0.067,0.004,0.34865,0.067,0.004,0.34865,1,True,1
4,NeuralNetTorch,0.786263,0.75,accuracy,0.060001,0.01,0.451998,0.060001,0.01,0.451998,1,True,4


In [None]:
1+1

In [31]:
# Check which devices TensorFlow detects
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 15946767276091886305
 xla_global_id: -1]

In [32]:
import time
from sklearn.datasets import make_regression
from xgboost import XGBRegressor

def model_test(model_name, model, n_samples=100000, n_features=100, random_state=0):
    """Fit ``model`` on a synthetic regression problem and report the fit time.

    The data comes from ``make_regression`` with a fixed seed, so successive
    calls (for example a CPU run and a GPU run) are timed on identical data.
    Only ``model.fit`` is timed; data generation is excluded.

    Args:
        model_name: Label placed at the start of the returned message.
        model: Estimator exposing ``fit(x, y)``.
        n_samples: Number of rows to generate.
        n_features: Number of feature columns to generate.
        random_state: Seed forwarded to ``make_regression``; pass ``None``
            for a different dataset on every call.

    Returns:
        A string of the form ``"<model_name>: 소요시간: <seconds> 초"``.
    """
    x, y = make_regression(
        n_samples=n_samples,
        n_features=n_features,
        random_state=random_state,
    )

    # perf_counter is monotonic, so the measurement cannot be distorted by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    model.fit(x, y)
    elapsed = time.perf_counter() - start_time
    return f'{model_name}: 소요시간: {elapsed} 초'

# Hyperparameters shared by both runs, so the CPU and GPU models differ
# only in tree_method.
common_params = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": 'reg:squarederror',
}

# CPU run: tree_method is left at the library default.
xgb = XGBRegressor(**common_params)
print(model_test('xgb (cpu)', xgb))

# GPU run: same settings with the GPU histogram tree method.
xgb = XGBRegressor(**common_params, tree_method='gpu_hist')
print(model_test('xgb (gpu)', xgb))

xgb (cpu): 소요시간: 18.49636197090149 초
xgb (gpu): 소요시간: 7.452618598937988 초
