In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Input locations: all three CSV files sit in the same data directory.
DATA_DIR = '../data'
X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)

# Read each table with its 'ID' column as the row index
# (same read order as before: X_train, y_train, X_test).
X_train, y_train, X_test = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in (X_TRAIN_PATH, Y_TRAIN_PATH, X_TEST_PATH)
)

# Clean: remove the columns named in cols_drop (names that are not present are
# skipped thanks to errors='ignore'), then replace every remaining NaN with 0.
cols_drop = ['DATE', 'STOCK', 'INDUSTRY', 'INDUSTRY_GROUP', 'SUB_INDUSTRY', 'SECTOR']
X_train, X_test = (
    frame.drop(columns=cols_drop, errors='ignore').fillna(0)
    for frame in (X_train, X_test)
)

# Feature engineering (FIX: volatility instead of mean).
# RET_5_STD is the row-wise standard deviation over RET_1..RET_5, used here as
# a risk/volatility measure. The same column list is applied to both frames so
# the train and test features are always computed identically.
recent_return_cols = ['RET_1', 'RET_2', 'RET_3', 'RET_4', 'RET_5']
for frame in (X_train, X_test):
    frame['RET_5_STD'] = frame[recent_return_cols].std(axis=1)

# Labels: train_test_split and fit pair features with targets by row position,
# not by ID. Both tables are indexed by 'ID', so make sure the targets follow
# the same ID order as the feature matrix before they are used together.
y_full = y_train['RET']
if not y_full.index.equals(X_train.index):
    y_full = y_full.reindex(X_train.index)
    if y_full.isna().any():
        raise ValueError(
            "after aligning y_train to X_train by ID, 'RET' has missing values "
            "(IDs absent from y_train or rows without a label)"
        )

# Split: hold out 20% of the rows for validation (fixed seed for repeatability).
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_full, test_size=0.2, random_state=42)

# Train
clf = RandomForestClassifier(n_estimators=100, max_depth=8, random_state=42, n_jobs=-1)
clf.fit(X_tr, y_tr)

# Score on the held-out rows. This is computed before the refit below, so the
# reported accuracy belongs to the model trained on the 80% split only.
acc = accuracy_score(y_val, clf.predict(X_val))

# Final train & submission: refit the same estimator on every labelled row,
# then write the test-set predictions keyed by the test IDs.
clf.fit(X_train, y_full)
sub = pd.DataFrame({'RET': clf.predict(X_test)}, index=X_test.index)
sub.to_csv('submission_volatility.csv')

print(f"Volatility Experiment Accuracy: {acc:.4f}")

Volatility Experiment Accuracy: 0.5294


-> Dodano nową cechę – średnią z ostatnich 5 zwrotów (od 'RET_1' do 'RET_5'), co pozwala modelowi wyłapać krótkoterminowy trend (momentum).
Update: accuracy lekko spadło.

-> Zamieniono średnią na odchylenie standardowe (z 5 dni).
Update: accuracy znowu spadło.