### 데이터셋 불러오기

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from datetime import datetime

# from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('fog_train.csv', index_col=0).rename(columns=lambda x: x.split('.')[1])
test = pd.read_csv('fog_test.csv', index_col=0).rename(columns=lambda x: x.split('.')[1])

### 데이터 전처리

In [3]:
train['stn_group'] = train['stn_id'].str[0]
train['stn_group_binary'] = train['stn_group'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6})

test['stn_group'] = test['stn_id'].str[0]
test['stn_group_binary'] = test['stn_group'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6})

```stn_id```의 경우 앞 문자만 따서 Label Encoding 해주었다.

In [4]:
drop_columns = ['year', 'month', 'day', 'time', 'minute', 'class'] + ['stn_id', 'stn_group']
train.loc[train['class'] == -99, 'class'] = np.nan

train_drop = train.dropna().drop(drop_columns, axis=1).reset_index(drop=True)
test_drop = test.drop(drop_columns, axis=1).reset_index(drop=True)

train_drop.shape, test_drop.shape

((3133943, 9), (262800, 8))

```class```가 -99인 row들은 결측치로 간주하여 drop했으며 그 외에 쓰지 않을 컬럼들(datetime 관련)을 날렸다.

In [5]:
X = train_drop.drop('vis1', axis=1)
y = train_drop['vis1']

X_test = test_drop

X_train_temp, X_val, y_train_temp, y_val = train_test_split(X, y, random_state=42, train_size=0.75)
X_train_temp.shape, X_val.shape, y_train_temp.shape, y_val.shape

((2350457, 8), (783486, 8), (2350457,), (783486,))

별도의 스케일링 없이 train과 validation으로 나눠줬다.

In [6]:
X_train = X_train_temp[y_train_temp < 20000]
y_train = y_train_temp[y_train_temp < 20000]

X_train.shape, y_train.shape

((1053640, 8), (1053640,))

이때 train set은 ```vis1``` 값이 **20,000 미만**인 것들만 취급하기로 했다.

In [58]:
models = {
    'DecisionTree': DecisionTreeRegressor(random_state = 42),
    'XGBoost': XGBRegressor(tree_method = "hist", device = "cuda", random_state = 42),
    'LightGBM': LGBMRegressor(verbosity=-1, device_type='gpu', random_state = 42),
    'CatBoost': CatBoostRegressor(verbose=0, task_type='GPU', random_state = 42)
}

model_results = {}

for name, model in models.items():
    start_time = datetime.now()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    model_results[name] = rmse
    time_delta = datetime.now() - start_time
    with open(f'''{name}(튜닝X)_{start_time.strftime('%Y%m%d%H%M%S')}.pkl''', 'wb') as file:
        pickle.dump(model, file)
    print(f"{name} 학습 완료! RMSE: {np.round(rmse, 2)}, 학습 시간: {time_delta.seconds}.{time_delta.microseconds // 10000}s")

DecisionTree 학습 완료! RMSE: 13583.33, 학습 시간: 7.64s
XGBoost 학습 완료! RMSE: 13120.4, 학습 시간: 1.37s
LightGBM 학습 완료! RMSE: 13217.89, 학습 시간: 1.84s
CatBoost 학습 완료! RMSE: 13116.81, 학습 시간: 7.68s


4가지 모델에 대해 파라미터 튜닝 없이 돌려봤다. 성능 지표 RMSE 기준으로 **CatBoost >= XGBoost > LightGBM > DecisionTree** 순서로 나왔다.

In [59]:
cat_boost = CatBoostRegressor(verbose=0, task_type='GPU', random_state=42)
cat_boost.fit(X_train, y_train)
y_test = cat_boost.predict(X_test)
test_drop['vis1'] = y_test

가장 높은 성능을 보였던 CatBoost 모델 이용하여 test set 예측을 해봤다.

In [60]:
test_drop['class'] = test_drop['vis1'].apply(
    lambda x: 0 if x <= 0
        else 1 if x < 200
        else 2 if x < 500
        else 3 if x < 1000
        else 4
)

np.round(
    test_drop['class'].value_counts(normalize=True).to_frame().reset_index().sort_values('index').set_index('index') * 100
, 3).astype(str) + '%'

Unnamed: 0_level_0,class
index,Unnamed: 1_level_1
0,0.014%
1,0.002%
2,0.008%
3,0.014%
4,99.963%


예측 결과 안개로 예측되는 비율이 굉장히 낮다.

In [61]:
np.round(
    train.dropna()['class'].value_counts(normalize=True).to_frame().reset_index().sort_values('index').set_index('index') * 100
, 3).astype(str) + '%'

Unnamed: 0_level_0,class
index,Unnamed: 1_level_1
1.0,0.251%
2.0,0.386%
3.0,0.389%
4.0,98.975%


기존 train set에서는 대략 이 정도 나왔다.

---

In [None]:
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'bagging_temperature': [0, 1, 2],
}

cat_boost = CatBoostRegressor(verbose=0, task_type='GPU', random_state=42)
grid_search = GridSearchCV(estimator=cat_boost, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error')

grid_search.fit(X_train, y_train)
print("Best parameters found: ", grid_search.best_params_)
print("Lowest RMSE found: ", (-grid_search.best_score_)**0.5)

In [62]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'RMSE(val): {rmse}')

KeyboardInterrupt: 

In [None]:
with open(f'''CatBoost(GridSearch)_{datetime.now().strftime('%Y%m%d%H%M%S')}.pkl''', 'wb') as file:
    pickle.dump(model, file)