In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [6]:
# Input locations
# All three CSV files live in one directory, given relative to the notebook.
DATA_DIR = '../data'
X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# Read the tables
# Every file is read with its 'ID' column as the row index.
X_train, y_train, X_test = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in (X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH)
)

# Drop metadata
# Columns listed here are excluded from the feature matrix.
# errors='ignore' lets the drop succeed even when a listed column is absent.
cols_drop = ['DATE', 'STOCK', 'INDUSTRY', 'INDUSTRY_GROUP', 'SUB_INDUSTRY']
X_train = X_train.drop(cols_drop, axis=1, errors='ignore')
X_test = X_test.drop(cols_drop, axis=1, errors='ignore')

# Feature Engineering
# Volatility feature: row-wise standard deviation over every column whose
# name contains 'RET'.
X_train['VOLATILITY'] = X_train.filter(like='RET').std(axis=1)
X_test['VOLATILITY'] = X_test.filter(like='RET').std(axis=1)

# Encode Sector
# Both frames must be one-hot encoded against the SAME category list.
# get_dummies(drop_first=True) drops the first level *of the frame it is
# given*; encoding train and test independently therefore drops a different
# level whenever the test frame lacks the training baseline sector (or the
# two frames infer different dtypes for SECTOR), and after alignment those
# test rows are silently encoded as the wrong sector. Pinning SECTOR to the
# categories observed in training keeps the baseline level and the dummy
# column names identical in both frames.
X_train['SECTOR'] = X_train['SECTOR'].astype('category')
# Sectors that never occur in training become NaN here and therefore get
# all-zero dummies (the same encoding as the baseline sector).
X_test['SECTOR'] = X_test['SECTOR'].astype(X_train['SECTOR'].dtype)
X_train = pd.get_dummies(X_train, columns=['SECTOR'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['SECTOR'], drop_first=True)
# Safety net for any remaining column mismatch: keep the training columns,
# add columns missing from the test frame filled with 0, and discard
# test-only columns.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Model pipeline
# Stages, applied in this order:
#   imputer - replace missing values with the column median
#   scaler  - RobustScaler
#   pca     - PCA with n_components=0.95 (95% variance)
#   model   - HistGradientBoostingClassifier
# Alternative previously noted for the 'model' step:
#   GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
#                              max_depth=5, random_state=42)
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.95)),
    (
        'model',
        HistGradientBoostingClassifier(
            learning_rate=0.1,
            max_iter=100,
            random_state=42,
        ),
    ),
])

# Hold-out split
# 20% of the training rows are set aside for validation; the seed is fixed so
# the split is repeatable.
# NOTE(review): features and labels are paired by row position here; this
# assumes X_train and y_train list the same IDs in the same order - confirm.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train['RET'],
    test_size=0.2,
    random_state=42,
)

# Fit on the training part and score on the held-out part
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
acc = accuracy_score(y_val, pipe.fit(X_tr, y_tr).predict(X_val))

# Refit on every labelled row, then predict the test set
sub = pd.DataFrame(
    pipe.fit(X_train, y_train['RET']).predict(X_test),
    index=X_test.index,
    columns=['RET'],
)

# Write the submission file, indexed like X_test
sub.to_csv('ver2.csv')

# Report the hold-out accuracy measured before the full refit
print(f"Pipeline Accuracy (PCA + Boosting): {acc:.4f}")

Pipeline Accuracy (PCA + Boosting): 0.5208


Imputacja medianą -> braki danych uzupełniamy medianą kolumny zamiast zerami.

PCA -> `n_components=0.95`, czyli zachowujemy tyle składowych, ile potrzeba do wyjaśnienia 95% wariancji.

RobustScaler -> skalowanie odporne na wartości odstające (outliery).