In [None]:
# Download the competition data archive (data.tar.gz) into the working directory.
!wget https://aistages-api-public-prod.s3.amazonaws.com/app/Competitions/000313/data/20240731021525/data.tar.gz

In [None]:
# Unpack the gzip-compressed archive, listing each extracted file (-v).
!tar -xzvf data.tar.gz

### Library Import

In [None]:
import os
from typing import List, Dict
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import lightgbm as lgb
from plotly.subplots import make_subplots
import plotly.graph_objects as go


### Data Load

In [None]:
# Load the raw CSV files and tag each frame with its origin in a `_type`
# column so the two can be stacked here and separated again later.
data_path: str = "./data"

train_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "train.csv"))
train_df = train_df.assign(_type="train")

test_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))
test_df = test_df.assign(_type="test")

# Second, untagged read of test.csv kept separately as the submission frame.
# NOTE(review): the original comment described this as a frame holding only
# the ID and target columns — confirm test.csv is the intended file.
submission_df: pd.DataFrame = pd.read_csv(os.path.join(data_path, "test.csv"))

# Stack train rows on top of test rows (row-wise concatenation).
df: pd.DataFrame = pd.concat([train_df, test_df], axis=0)

In [None]:
# Collect the names of all CSV files in the data directory that start with "HOURLY_".
file_names: List[str] = [
    f for f in os.listdir(data_path) if f.startswith("HOURLY_") and f.endswith(".csv")
]

# Read each file into a DataFrame, keyed by the file name without ".csv".
file_dict: Dict[str, pd.DataFrame] = {}
for _csv_name in file_names:
    file_dict[_csv_name.replace(".csv", "")] = pd.read_csv(
        os.path.join(data_path, _csv_name)
    )

for _file_name, _df in tqdm(file_dict.items()):
    # Prefix every column with the lower-cased file name so columns from
    # different files cannot collide; the "datetime" column becomes the "ID"
    # join key instead.
    _prefix = _file_name.lower()
    _rename_rule = {
        col: "ID" if col == "datetime" else f"{_prefix}_{col.lower()}"
        for col in _df.columns
    }
    _df = _df.rename(columns=_rename_rule)
    # Left join keeps every row of df and attaches this file's columns by ID.
    df = df.merge(_df, on="ID", how="left")


In [None]:
# Remove every column in which all values are missing.
df = df.dropna(axis="columns", how="all")

### Feature engineering

In [None]:
# Columns used by the model, mapped to the short names they are renamed to.
# Keys are the merged column names ("<file name>_<column>", lower-cased);
# "ID", "target" and "_type" keep their names.
cols_dict: Dict[str, str] = {
    "ID": "ID",
    "target": "target",
    "_type": "_type",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_gap": "coinbase_premium_gap",
    "hourly_market-data_funding-rates_all_exchange_funding_rates": "funding_rates",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations": "long_liquidations",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations": "short_liquidations",
    "hourly_market-data_open-interest_all_exchange_all_symbol_open_interest": "open_interest",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_ratio": "buy_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_volume": "buy_volume",
    "hourly_network-data_addresses-count_addresses_count_active": "active_count",
    "hourly_network-data_addresses-count_addresses_count_receiver": "receiver_count",
    "hourly_network-data_addresses-count_addresses_count_sender": "sender_count",
    "hourly_market-data_coinbase-premium-index_coinbase_premium_index": "coinbase_premium_index",
    "hourly_market-data_liquidations_all_exchange_all_symbol_long_liquidations_usd": "long_liquidations_usd",
    "hourly_market-data_liquidations_all_exchange_all_symbol_short_liquidations_usd": "short_liquidations_usd",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_buy_sell_ratio": "buy_sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_ratio": "sell_ratio",
    "hourly_market-data_taker-buy-sell-stats_all_exchange_taker_sell_volume": "sell_volume",
}

# Empty placeholder; it is not referenced anywhere else in the visible code.
erased_dict: Dict[str, str] = {
}

# Keep only the selected columns (in the order listed above) and apply the short names.
df = df[cols_dict.keys()].rename(cols_dict, axis=1)
# Last expression of the cell: displays (rows, columns) of the reduced frame.
df.shape

In [None]:
# Derived features identified during EDA: long/short and buy/sell differences,
# the sign of each difference, and a buy/sell volume ratio.
#
# Each difference is computed once. The sign columns are callables that read the
# difference column created just before them: DataFrame.assign evaluates its
# keyword arguments in order, so later ones can refer to earlier ones.
# The keyword order is unchanged, so the resulting column order is unchanged.
df = df.assign(
    liquidation_diff=df["long_liquidations"] - df["short_liquidations"],
    liquidation_diffg=lambda d: np.sign(d["liquidation_diff"]),
    liquidation_usd_diff=df["long_liquidations_usd"] - df["short_liquidations_usd"],
    volume_diff=df["buy_volume"] - df["sell_volume"],
    liquidation_usd_diffg=lambda d: np.sign(d["liquidation_usd_diff"]),
    volume_diffg=lambda d: np.sign(d["volume_diff"]),
    # The +1 presumably guards against division by zero when sell_volume is 0.
    buy_sell_volume_ratio=df["buy_volume"] / (df["sell_volume"] + 1),
)

# Keep the categorical (sign) and continuous column names in separate lists.
category_cols: List[str] = ["liquidation_diffg", "liquidation_usd_diffg", "volume_diffg"]
# Continuous columns: every renamed raw column except the bookkeeping ones,
# followed by the derived difference / ratio columns.
conti_cols: List[str] = [_ for _ in cols_dict.values() if _ not in ["ID", "target", "_type"]] + [
    "liquidation_diff",
    "buy_sell_volume_ratio",
    "liquidation_usd_diff",
    "volume_diff",
]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the numeric columns; correlation is undefined for the others.
numerical_df = df.select_dtypes(include=['number'])

# Pairwise Spearman (rank) correlation between all numeric columns.
corr_matrix = numerical_df.corr(method='spearman')

# Rank columns by absolute correlation with the target.
# The first entry is `target` itself (correlation of a column with itself is 1).
corr_with_target = corr_matrix['target'].abs().sort_values(ascending=False)
print("Top correlated features:\n", corr_with_target.head(10))

# Disabled heatmap of the full correlation matrix, kept for reference.
# It is written as comments rather than a bare string literal so that the cell
# does not echo the string as its output.
#
# # Get column names and extract the last three words separated by '_'
# column_names = corr_matrix.columns
# last_three_words = ['_'.join(name.split('_')[-3:]) for name in column_names]
#
# # Plotting the heatmap with a larger figure size
# plt.figure(figsize=(200, 200))
# sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", xticklabels=last_three_words, yticklabels=last_three_words)
# plt.title('Spearman Correlation Matrix')
# plt.show()

In [None]:
def shift_feature(
    df: pd.DataFrame,
    conti_cols: List[str],
    intervals: List[int],
) -> List[pd.Series]:
    """
    Build shifted (lagged) copies of continuous columns.

    Every column in ``conti_cols`` is shifted by every offset in ``intervals``.
    Results are ordered column by column, and within a column by interval, and
    each shifted series is named ``"{column}_{interval}"``.

    Args:
        df (pd.DataFrame): frame holding the columns to shift
        conti_cols (List[str]): names of the continuous columns
        intervals (List[int]): row offsets passed to ``Series.shift``
    Return:
        List[pd.Series]: one shifted series per (column, interval) pair
    """
    shifted_series: List[pd.Series] = []
    for column in conti_cols:
        for lag in intervals:
            lagged = df[column].shift(lag)
            shifted_series.append(lagged.rename(f"{column}_{lag}"))
    return shifted_series

# Build shift features for offsets of 1 through 23 rows
# (range(1, 24) stops before 24) for every continuous column.
shift_list = shift_feature(df=df, conti_cols=conti_cols, intervals=list(range(1, 24)))

In [None]:
# Join all shifted series side by side and append them to df as new columns.
df = pd.concat(
    [df, pd.concat(shift_list, axis=1)],
    axis=1,
)

# Keep a reference to the target column. The imputation step below
# (forward-fill, then -999 for anything left, with the target restored)
# is currently disabled, so `_target` is not used afterwards in this cell.
_target = df["target"]
# df = df.ffill().fillna(-999).assign(target = _target)

# Split back into train / test using the `_type` tag, then drop the tag.
train_df = df[df["_type"].eq("train")].drop("_type", axis=1)
test_df = df[df["_type"].eq("test")].drop("_type", axis=1)

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Restrict this first model to a small, hand-picked subset of features.
features = ['volume_diff', 'buy_ratio', 'liquidation_diff', 'open_interest']
# Training frame: the selected features plus the 'target' label column.
train = train_df[[*features, 'target']]
# Test frame: the selected features only.
test = test_df[list(features)]

# Fill missing values from nearby rows: first from the trailing 4-row rolling
# mean, then (for anything still missing) from the rolling mean of the window
# that ends 4 rows later.
def fill_missing_with_mean(df, features):
    """Return a copy of ``df`` with gaps in ``features`` filled from rolling means.

    For each column in ``features``:
      1. missing values are replaced by the mean of the non-missing values in
         the 4-row window ending at that row (``rolling(4, min_periods=1)``);
      2. values still missing (e.g. a run of NaNs at the start) are replaced
         by the same rolling mean taken 4 rows further down (``shift(-4)``).
    Values that neither step can fill stay NaN.

    The input frame is not modified: the work is done on a copy, which also
    avoids pandas' SettingWithCopyWarning when ``df`` is a slice of another frame.

    Args:
        df: DataFrame containing the columns to fill.
        features: iterable of column names to process.
    Returns:
        A new DataFrame with the same index and columns as ``df``.
    """
    filled = df.copy()
    for feature in features:
        trailing_mean = filled[feature].rolling(4, min_periods=1).mean()
        filled[feature] = filled[feature].fillna(trailing_mean)
        # Recomputed after the first fill so newly filled values take part.
        later_mean = filled[feature].rolling(4, min_periods=1).mean().shift(-4)
        filled[feature] = filled[feature].fillna(later_mean)
    return filled

# Impute gaps in the selected features of both frames.
train = fill_missing_with_mean(train, features)
test = fill_missing_with_mean(test, features)

# Split data into features and target
X = train[features]
y = train['target']

# Hold out 20% of the rows for validation (random split, fixed seed).
# NOTE(review): rows appear to be time-ordered; a random split mixes past and
# future rows — confirm this is the intended validation scheme.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

# Train the model
xgb_model.fit(X_train, y_train)

# Validate the model on the held-out rows
y_pred_val = xgb_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred_val)
print(f'Validation Accuracy: {accuracy}')
print('Classification Report:\n')
print(classification_report(y_val, y_pred_val))

# Disabled LightGBM multiclass experiment, kept for reference.
# It is written as comments rather than a bare string literal so that the cell
# does not echo the string as its output.
#
# train_dataset = lgb.Dataset(X_train, label=y_train)
# val_dataset = lgb.Dataset(X_val, label=y_val, reference=train_dataset)
#
# params = {
#     'objective': 'multiclass',
#     'num_class': 4,
#     'metric': 'multi_logloss',
#     'boosting': 'gbdt',
#     'num_leaves': 50,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.8,
#     'random_state': 42,
#     'verbose': 0
# }
#
# # Train LightGBM model
# lgb_model = lgb.train(params, train_dataset, valid_sets=val_dataset, num_boost_round=100)
#
# y_pred = lgb_model.predict(X_val)
# y_pred_class = [list(x).index(max(x)) for x in y_pred]
# print("Validation accuracy:", accuracy_score(y_val, y_pred_class))


In [None]:
# Train an XGBoost model on the full training set, using every column of
# train_df except the label and the ID.
xgb_model_full = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss'
)

x_train = train_df.drop(["target", "ID"], axis = 1)
y_train_new = train_df["target"].astype(int)

xgb_model_full.fit(x_train, y_train_new)

# No validation metrics are printed here. The earlier version of this cell
# re-scored `xgb_model` (the 4-feature model from the previous cell) on X_val
# and reported it as this model's validation accuracy, which was misleading.
# `xgb_model_full` is fitted on all training rows, so there is no held-out
# split to score it on.

# Predict on the test rows with the same column layout used for training.
y_pred_test = xgb_model_full.predict(test_df.drop(["target", "ID"], axis = 1))

# Create output file
output = pd.DataFrame({'ID': test_df['ID'], 'target': y_pred_test})
output.to_csv('output.csv', index=False)

print("Predictions saved to output.csv")

In [None]:
# Count how many rows of the output frame were assigned to each predicted class.
output['target'].value_counts()

In [None]:
pip install wandb

In [None]:
import wandb

# Log this session in to Weights & Biases before any run or sweep is created.
wandb.login()

In [None]:
import wandb
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Hyperparameter sweep: exhaustive grid over the values below
# (3 values for each of 5 parameters = 243 combinations), optimising the
# 'accuracy' metric logged by each trial.
#
# No run is opened at module level: each trial opens its own run inside
# train(), which is what binds the sweep's hyperparameters to the run config.
sweep_config = {
    'method': 'grid',  # alternatives: 'random', 'bayes'
    'metric': {'name': 'accuracy', 'goal': 'maximize'},
    'parameters': {
        'n_estimators': {'values': [50, 100, 150]},
        'learning_rate': {'values': [0.01, 0.05, 0.1]},
        'max_depth': {'values': [4, 6, 8]},
        'subsample': {'values': [0.6, 0.8, 1.0]},
        'colsample_bytree': {'values': [0.6, 0.8, 1.0]}
    }
}

# Register the sweep and get its id
sweep_id = wandb.sweep(sweep_config, project="btc-prediction")

# Best trial seen so far. train() updates this so the winning model is still
# available after the agent finishes.
# NOTE(review): relies on wandb.agent(function=...) calling train() inside
# this Python process — confirm for the installed wandb version.
best_trial = {'accuracy': float('-inf'), 'model': None}


def train():
    """Run one sweep trial.

    Opens a W&B run (which receives this trial's hyperparameters from the
    sweep), fits an XGBoost classifier on an 80/20 split of the full-feature
    training data (`x_train`, `y_train_new`), logs the validation accuracy,
    and records the model in `best_trial` if it is the most accurate so far.
    """
    with wandb.init(project="btc-prediction") as run:
        # Hyperparameters chosen by the sweep controller for this trial.
        config = run.config

        # Initialize the model with current hyperparameters
        xgb_model = xgb.XGBClassifier(
            n_estimators=config.n_estimators,
            learning_rate=config.learning_rate,
            max_depth=config.max_depth,
            subsample=config.subsample,
            colsample_bytree=config.colsample_bytree,
            random_state=42,
            use_label_encoder=False,
            eval_metric='mlogloss'
        )

        # Same split for every trial (fixed seed) so accuracies are comparable.
        X_train, X_val, y_train, y_val = train_test_split(
            x_train, y_train_new, test_size=0.2, random_state=42
        )

        # Train the model
        xgb_model.fit(X_train, y_train)

        # Make predictions and evaluate
        y_pred_val = xgb_model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred_val)

        # Log the results to W&B
        run.log({'accuracy': accuracy})

        # Remember the best model for the final test-set prediction.
        if accuracy > best_trial['accuracy']:
            best_trial['accuracy'] = accuracy
            best_trial['model'] = xgb_model

        print(f'Validation Accuracy: {accuracy}')
        print('Classification Report:\n')
        print(classification_report(y_val, y_pred_val))


# Run the sweep agent
wandb.agent(sweep_id, function=train)

# Predict the test set with the best model found by the sweep. The earlier
# version used the global `xgb_model`, which was fitted on only four features
# and does not match the full-width test matrix.
if best_trial['model'] is None:
    raise RuntimeError("The sweep finished without producing a trained model.")

y_pred_test = best_trial['model'].predict(test_df.drop(["target", "ID"], axis = 1))

# Create output file
output = pd.DataFrame({'ID': test_df['ID'], 'target': y_pred_test})
output.to_csv('output.csv', index=False)

print("Predictions saved to output.csv")