## 회귀 모델 가능성 검토 / Feature Importance 확인

### Library import

In [1]:
import os
from typing import Dict
import numpy as np
import pandas as pd

### Data Load

In [28]:
data_path: str = "../../data"
df: pd.DataFrame = pd.read_csv(os.path.join(data_path,"raw.csv"))
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))

### EDA

In [3]:
df.shape

(11552, 255)

In [5]:
# 수치형 열과 target의 상관관계
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

correlation_with_target = df[numerical_cols].corrwith(df['target'])
correlation_with_target_sorted = correlation_with_target.sort_values(ascending=False)

print(correlation_with_target_sorted[:30])

target                                                                           1.000000
hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd         0.055130
hourly_market-data_liquidations_binance_all_symbol_long_liquidations             0.055063
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations               0.054655
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations_usd           0.054421
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                0.052132
hourly_market-data_liquidations_binance_btc_usd_long_liquidations_usd            0.051910
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                 0.049724
hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio            0.049467
hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations        0.041983
hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd    0.041855
hourly_mar

In [6]:
pd.set_option('display.max_rows', None)
print(correlation_with_target_sorted)
pd.reset_option('display.max_rows')

target                                                                            1.000000
hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd          0.055130
hourly_market-data_liquidations_binance_all_symbol_long_liquidations              0.055063
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations                0.054655
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations_usd            0.054421
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                 0.052132
hourly_market-data_liquidations_binance_btc_usd_long_liquidations_usd             0.051910
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                  0.049724
hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio             0.049467
hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations         0.041983
hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd     0.041855

In [7]:
# 현재값과 다음값의 차이를 구해서 새로운 열 생성
df['spot_closed_difference'] = df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'].diff().shift(-1)

# 변동률을 퍼센트로 계산해서 새로운 열 생성
df['spot_closed_percent'] = df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'].pct_change().shift(-1) * 100

# 결과 확인
print(df[['ID', 'target','hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 'spot_closed_difference', 'spot_closed_percent']].head())

                    ID  target  \
0  2023-01-01 00:00:00     2.0   
1  2023-01-01 01:00:00     1.0   
2  2023-01-01 02:00:00     1.0   
3  2023-01-01 03:00:00     1.0   
4  2023-01-01 04:00:00     2.0   

   hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close  \
0                                       16536.747967                
1                                       16557.136536                
2                                       16548.149805                
3                                       16533.632875                
4                                       16524.712159                

   spot_closed_difference  spot_closed_percent  
0               20.388568             0.123292  
1               -8.986731            -0.054277  
2              -14.516930            -0.087725  
3               -8.920715            -0.053955  
4                4.695906             0.028417  


  df['spot_closed_percent'] = df['hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close'].pct_change().shift(-1) * 100


In [8]:
# 기준에 맞는지 확인하는 함수 정의
def check_target_percent(row):
    percent = row['spot_closed_percent']
    target = row['target']
    
    if target == 0:
        return percent < -0.5
    elif target == 1:
        return -0.5 <= percent < 0
    elif target == 2:
        return 0 <= percent < 0.5
    elif target == 3:
        return percent >= 0.5
    else:
        return False

# 각 행에 대해 기준에 맞는지 확인
df['is_valid'] = df.apply(check_target_percent, axis=1)

# is_valid 열에서 False 값이 있는지 확인
has_false = df['is_valid'].any() == False

# 결과 출력
if has_false:
    print("값 확인 필요")
else:
    print("값 이상 없음")

값 이상 없음


In [9]:
# 'spot_closed_difference'에서 가장 작은 값 10개 추출
bottom_10_difference = df.nsmallest(10, 'spot_closed_difference')[['ID', 'spot_closed_difference']]

# 'spot_closed_percent'에서 가장 작은 값 10개 추출
bottom_10_percent = df.nsmallest(10, 'spot_closed_percent')[['ID', 'spot_closed_percent']]

# 결과 출력
print("Bottom 10 'spot_closed_difference':")
print(bottom_10_difference)

print("\nBottom 10 'spot_closed_percent':")
print(bottom_10_percent)

Bottom 10 'spot_closed_difference':
                       ID  spot_closed_difference
2778  2023-04-26 18:00:00            -1847.558628
4546  2023-07-09 10:00:00            -1500.551073
4402  2023-07-03 10:00:00            -1500.300167
4521  2023-07-08 09:00:00            -1484.727527
4539  2023-07-09 03:00:00            -1462.594247
4419  2023-07-04 03:00:00            -1427.547812
4684  2023-07-15 04:00:00            -1421.906526
4485  2023-07-06 21:00:00            -1354.215113
1938  2023-03-22 18:00:00            -1341.401905
4565  2023-07-10 05:00:00            -1340.076369

Bottom 10 'spot_closed_percent':
                       ID  spot_closed_percent
2778  2023-04-26 18:00:00            -6.213531
1464  2023-03-03 00:00:00            -5.427712
4546  2023-07-09 10:00:00            -4.976665
4521  2023-07-08 09:00:00            -4.909297
4539  2023-07-09 03:00:00            -4.828643
5492  2023-08-17 20:00:00            -4.822073
1938  2023-03-22 18:00:00            -4.792632
4684

In [10]:
# 수치형 열과 target의 상관관계
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

correlation_with_target = df[numerical_cols].corrwith(df['spot_closed_difference'])
correlation_with_target_sorted = correlation_with_target.sort_values(ascending=False)

print(correlation_with_target_sorted[:30])

spot_closed_difference                                                           1.000000
spot_closed_percent                                                              0.988604
target                                                                           0.749042
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                0.033069
hourly_market-data_liquidations_binance_btc_usd_long_liquidations_usd            0.032378
hourly_market-data_liquidations_binance_all_symbol_long_liquidations             0.031976
hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd         0.031638
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations               0.030876
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations_usd           0.030385
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                 0.029638
hourly_market-data_liquidations_binance_btc_busd_long_liquidations               0.026852
hourly_mar

In [11]:
# 수치형 열과 target의 상관관계
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

correlation_with_target = df[numerical_cols].corrwith(df['spot_closed_percent'])
correlation_with_target_sorted = correlation_with_target.sort_values(ascending=False)

print(correlation_with_target_sorted[:30])

spot_closed_percent                                                              1.000000
spot_closed_difference                                                           0.988604
target                                                                           0.754323
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                0.032666
hourly_market-data_coinbase-premium-index_coinbase_premium_index                 0.032339
hourly_market-data_liquidations_binance_btc_busd_long_liquidations               0.031955
hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd           0.031055
hourly_market-data_liquidations_binance_all_symbol_long_liquidations             0.030328
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations               0.027581
hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume                  0.024657
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                 0.023778
hourly_mar

In [12]:
pd.set_option('display.max_rows', None)
print(correlation_with_target_sorted)
pd.reset_option('display.max_rows')

spot_closed_percent                                                               1.000000
spot_closed_difference                                                            0.988604
target                                                                            0.754323
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                 0.032666
hourly_market-data_coinbase-premium-index_coinbase_premium_index                  0.032339
hourly_market-data_liquidations_binance_btc_busd_long_liquidations                0.031955
hourly_market-data_liquidations_binance_btc_busd_long_liquidations_usd            0.031055
hourly_market-data_liquidations_binance_all_symbol_long_liquidations              0.030328
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations                0.027581
hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_volume                   0.024657
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                  0.023778

In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df['diff_stand'] = scaler.fit_transform(df[['spot_closed_difference']])

# 결과 확인
print(df[['ID', 'spot_closed_difference', 'diff_stand']].head())

                    ID  spot_closed_difference  diff_stand
0  2023-01-01 00:00:00               20.388568    0.093497
1  2023-01-01 01:00:00               -8.986731   -0.063906
2  2023-01-01 02:00:00              -14.516930   -0.093539
3  2023-01-01 03:00:00               -8.920715   -0.063553
4  2023-01-01 04:00:00                4.695906    0.009410


In [14]:
# 수치형 열과 target의 상관관계
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

correlation_with_target = df[numerical_cols].corrwith(df['diff_stand'])
correlation_with_target_sorted = correlation_with_target.sort_values(ascending=False)

pd.set_option('display.max_rows', None)
print(correlation_with_target_sorted)
pd.reset_option('display.max_rows')

diff_stand                                                                        1.000000
spot_closed_difference                                                            1.000000
spot_closed_percent                                                               0.988604
target                                                                            0.749042
hourly_market-data_liquidations_binance_btc_usd_long_liquidations                 0.033069
hourly_market-data_liquidations_binance_btc_usd_long_liquidations_usd             0.032378
hourly_market-data_liquidations_binance_all_symbol_long_liquidations              0.031976
hourly_market-data_liquidations_binance_all_symbol_long_liquidations_usd          0.031638
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations                0.030876
hourly_market-data_liquidations_binance_btc_usdt_long_liquidations_usd            0.030385
hourly_market-data_taker-buy-sell-stats_binance_taker_sell_ratio                  0.029638

In [15]:
# df[numerical_cols]의 분산을 계산하고 내림차순으로 정렬하여 출력
variances = df[numerical_cols].var().sort_values(ascending=False)
pd.set_option('display.max_rows', None)
print(variances)
pd.reset_option('display.max_rows')

hourly_network-data_difficulty_difficulty                                         2.063032e+26
hourly_network-data_hashrate_hashrate                                             4.246257e+22
hourly_market-data_open-interest_all_exchange_all_symbol_open_interest            7.613689e+18
hourly_market-data_open-interest_binance_all_symbol_open_interest                 1.337461e+18
hourly_market-data_open-interest_bybit_all_symbol_open_interest                   9.071239e+17
hourly_market-data_open-interest_binance_btc_usdt_open_interest                   8.241512e+17
hourly_market-data_open-interest_bybit_btc_usdt_open_interest                     6.243740e+17
hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume            1.484597e+17
hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume             1.444660e+17
hourly_market-data_open-interest_okx_all_symbol_open_interest                     1.380794e+17
hourly_market-data_open-interest_binance_btc_usd_o

In [16]:
eda_columns = [
    'ID', 'target', 'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close', 
    'spot_closed_difference', 'spot_closed_percent', 
    'hourly_network-data_hashrate_hashrate', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume', 
    'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations', 
    'hourly_market-data_coinbase-premium-index_coinbase_premium_index', 
    'hourly_market-data_funding-rates_all_exchange_funding_rates'
]

eda_df = df[eda_columns]

print(eda_df.columns)

Index(['ID', 'target',
       'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close',
       'spot_closed_difference', 'spot_closed_percent',
       'hourly_network-data_hashrate_hashrate',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume',
       'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume',
       'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest',
       'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations',
       'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations',
       'hourly_market-data_coinbase-premium-index_coinbase_premium_index',
       'hourly_market-data_funding-rates_all_exchange_funding_rates'],
      dtype='object')


In [17]:
cols_dict: Dict[str, str] = {
    'ID': 'ID',
    'target': 'target',
    'hourly_market-data_price-ohlcv_all_exchange_spot_btc_usd_close' : 'price',
    'spot_closed_difference' : 'difference', 
    'spot_closed_percent' : 'percent', 
    'hourly_network-data_hashrate_hashrate' : 'hashrate', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio' : 'taker_buy_sell_ratio', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume' : 'taker_sell_volume', 
    'hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume' : 'taker_buy_volume', 
    'hourly_market-data_open-interest_all_exchange_all_symbol_open_interest' : 'open_interest', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations' : 'long_liquidations', 
    'hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations' : 'short_liquidations', 
    'hourly_market-data_coinbase-premium-index_coinbase_premium_index' : 'coinbase_premium_index', 
    'hourly_market-data_funding-rates_all_exchange_funding_rates' : 'funding_rates'
}

eda_df = eda_df[list(cols_dict.keys())].rename(cols_dict, axis=1)
eda_df.head()


Unnamed: 0,ID,target,price,difference,percent,hashrate,taker_buy_sell_ratio,taker_sell_volume,taker_buy_volume,open_interest,long_liquidations,short_liquidations,coinbase_premium_index,funding_rates
0,2023-01-01 00:00:00,2.0,16536.747967,20.388568,0.123292,506291700000.0,0.904774,51375370.0,46483090.0,6271344000.0,0.012,0.0,-0.05965,0.005049
1,2023-01-01 01:00:00,1.0,16557.136536,-8.986731,-0.054277,168763900000.0,1.655721,24011290.0,39755990.0,6288683000.0,0.0,0.712,-0.053047,0.005049
2,2023-01-01 02:00:00,1.0,16548.149805,-14.51693,-0.087725,337527800000.0,1.027512,23409950.0,24054020.0,6286796000.0,0.0,0.0,-0.057952,0.005049
3,2023-01-01 03:00:00,1.0,16533.632875,-8.920715,-0.053955,210954900000.0,0.874477,32707730.0,28602150.0,6284575000.0,0.593,0.0,-0.058912,0.005067
4,2023-01-01 04:00:00,2.0,16524.712159,4.695906,0.028417,295336800000.0,0.966796,31937660.0,30877200.0,6291582000.0,0.361,0.0,-0.061373,0.00621


In [18]:
from sklearn.preprocessing import StandardScaler

standard_scaler = StandardScaler()

eda_df[['hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']] = standard_scaler.fit_transform(
    eda_df[['hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']]
)

print(eda_df[['ID', 'hashrate', 'taker_buy_sell_ratio', 'open_interest', 'funding_rates']].head())

                    ID  hashrate  taker_buy_sell_ratio  open_interest  \
0  2023-01-01 00:00:00  0.389863             -0.552604      -1.268021   
1  2023-01-01 01:00:00 -1.248180              3.479284      -1.261736   
2  2023-01-01 02:00:00 -0.429158              0.106389      -1.262420   
3  2023-01-01 03:00:00 -1.043424             -0.715270      -1.263225   
4  2023-01-01 04:00:00 -0.633914             -0.219603      -1.260686   

   funding_rates  
0      -0.343185  
1      -0.343193  
2      -0.343247  
3      -0.341936  
4      -0.259929  


In [19]:
eda_df['coinbase_premium_index'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  eda_df['coinbase_premium_index'].fillna(0, inplace=True)


In [20]:
missing_values_df = eda_df[eda_df.drop(columns=['target', 'difference', 'percent']).isnull().any(axis=1)]
print(missing_values_df)

                        ID  target  price  difference  percent  hashrate  \
8760   2024-01-01 00:00:00     NaN    NaN         NaN      0.0 -0.399560   
8761   2024-01-01 01:00:00     NaN    NaN         NaN      0.0  2.518812   
8762   2024-01-01 02:00:00     NaN    NaN         NaN      0.0  0.851171   
8763   2024-01-01 03:00:00     NaN    NaN         NaN      0.0  0.434261   
8764   2024-01-01 04:00:00     NaN    NaN         NaN      0.0  1.268081   
...                    ...     ...    ...         ...      ...       ...   
11547  2024-04-26 03:00:00     NaN    NaN         NaN      0.0 -0.536851   
11548  2024-04-26 04:00:00     NaN    NaN         NaN      0.0  0.993500   
11549  2024-04-26 05:00:00     NaN    NaN         NaN      0.0  0.993500   
11550  2024-04-26 06:00:00     NaN    NaN         NaN      0.0 -0.026734   
11551  2024-04-26 07:00:00     NaN    NaN         NaN      NaN -0.536851   

       taker_buy_sell_ratio  taker_sell_volume  taker_buy_volume  \
8760               

In [21]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

numeric_columns = eda_df.select_dtypes(include=[np.number]).columns

mean_values = eda_df.groupby('target')[numeric_columns].mean()


fig = make_subplots(
    rows=6, 
    cols=2,
    shared_xaxes=True,
    vertical_spacing=0.1,  
    subplot_titles=(
        "Difference",
        "Percent",
        "Hashrate",
        "Taker Buy/Sell Ratio",
        "Taker Sell Volume",
        "Taker Buy Volume",
        "Open Interest",
        "Long Liquidations",
        "Short Liquidations",
        "Coinbase Premium Index",
        "Funding Rates"
    ),
)


fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['difference']), row=1, col=1)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['percent']), row=1, col=2)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['hashrate']), row=2, col=1)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['taker_buy_sell_ratio']), row=2, col=2)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['taker_sell_volume']), row=3, col=1)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['taker_buy_volume']), row=3, col=2)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['open_interest']), row=4, col=1)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['long_liquidations']), row=4, col=2)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['short_liquidations']), row=5, col=1)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['coinbase_premium_index']), row=5, col=2)
fig.add_trace(go.Bar(x=mean_values.index, y=mean_values['funding_rates']), row=6, col=1)


fig.update_layout(
    title_text="Mean Values by Target",
    showlegend=False,
    height=2000,  
    width=1500
)

fig.update_xaxes(title_text="Target", row=6, col=1)
fig.show()


### Model Training

In [22]:
eda_df['_type'] = eda_df['ID'].apply(lambda x: 'train' if x < '2024-01-01 00:00:00' else 'test')

print(eda_df[['ID', '_type']].tail())

                        ID _type
11547  2024-04-26 03:00:00  test
11548  2024-04-26 04:00:00  test
11549  2024-04-26 05:00:00  test
11550  2024-04-26 06:00:00  test
11551  2024-04-26 07:00:00  test


In [23]:
eda_df.drop(columns=['difference', 'percent', 'price'], inplace=True)

In [24]:
eda_df.columns

Index(['ID', 'target', 'hashrate', 'taker_buy_sell_ratio', 'taker_sell_volume',
       'taker_buy_volume', 'open_interest', 'long_liquidations',
       'short_liquidations', 'coinbase_premium_index', 'funding_rates',
       '_type'],
      dtype='object')

In [25]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

train_df = eda_df.loc[df["_type"]=="train"].drop(columns=["_type"])
test_df = eda_df.loc[df["_type"]=="test"].drop(columns=["_type"])

# 특징과 타겟 변수 정의
features = ['hashrate', 'taker_buy_sell_ratio', 'taker_sell_volume',
       'taker_buy_volume', 'open_interest', 'long_liquidations',
       'short_liquidations', 'coinbase_premium_index', 'funding_rates']

X_train = train_df[features]
y_train = train_df['target']
X_test = test_df[features]

# 데이터 분할 (훈련 데이터와 검증 데이터)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# XGBoost DMatrix 생성
train_data = xgb.DMatrix(X_train_split, label=y_train_split)
val_data = xgb.DMatrix(X_val_split, label=y_val_split)
test_data = xgb.DMatrix(X_test)

# XGBoost 파라미터 설정
params = {
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'num_class': 4,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.9
}

# 모델 훈련
watchlist = [(train_data, 'train'), (val_data, 'eval')]
model = xgb.train(params, train_data, evals=watchlist)


# 검증 데이터에 대한 예측
val_preds = model.predict(val_data)
val_preds_class = val_preds.argmax(axis=1)

# 검증 데이터의 정확도 계산
val_accuracy = accuracy_score(y_val_split, val_preds_class)
print(f"Validation Accuracy: {val_accuracy:.4f}")


[0]	train-mlogloss:1.36694	eval-mlogloss:1.37115
[1]	train-mlogloss:1.34885	eval-mlogloss:1.35688
[2]	train-mlogloss:1.33140	eval-mlogloss:1.34374
[3]	train-mlogloss:1.31512	eval-mlogloss:1.33131
[4]	train-mlogloss:1.29936	eval-mlogloss:1.31977
[5]	train-mlogloss:1.28479	eval-mlogloss:1.30845
[6]	train-mlogloss:1.27087	eval-mlogloss:1.29871
[7]	train-mlogloss:1.25768	eval-mlogloss:1.28942
[8]	train-mlogloss:1.24505	eval-mlogloss:1.28113
[9]	train-mlogloss:1.23274	eval-mlogloss:1.27284
Validation Accuracy: 0.4235


In [30]:
# 테스트 데이터에 대한 예측
test_preds = model.predict(test_data)
test_preds_class = test_preds.argmax(axis=1)

# 예측 결과를 submission_df에 추가
submission_df['target'] = test_preds_class
submission_df.to_csv("output_xgboost_2.csv", index=False)

In [31]:
import plotly.express as px
import pandas as pd
import os

data_path: str = "../../data"

# target의 범주 비율 계산
target_counts = submission_df['target'].value_counts(normalize=True).reset_index()
target_counts.columns = ['Category', 'Proportion']

# 시각화
fig = px.bar(target_counts, x='Category', y='Proportion', title='Proportion of Categories in submission_df["target"]')
fig.show()

In [32]:
import plotly.express as px
# Feature importance 평가
importance = model.get_score(importance_type='weight')
importance_df = pd.DataFrame({
    'Feature': [k for k in importance.keys()],
    'Importance': importance.values()
}).sort_values(by='Importance', ascending=False)

# 시각화
fig = px.bar(importance_df, x='Feature', y='Importance', title='Feature Importance')
fig.show()