# Random Forest – Next-Day Direction Classifier

Binary classification: predict whether a stock's **next-day return is positive (1) or negative/zero (0)**.

An ensemble of 300 trees substantially reduces variance relative to a single decision tree and can capture non-linear feature interactions.

**Time-based split:** train `< 2023-01-01`, test `≥ 2023-01-01`

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Notebook-wide plotting defaults: whitegrid theme and a wide default figure.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (12, 5)})

## 1. Load Data

In [None]:
# Read the merged dataset, parsing `date` as datetimes so that the date
# summary below and later date comparisons operate on real timestamps.
df = pd.read_csv('../data/merged_dataset.csv', parse_dates=['date'])

# Quick sanity check: table size and the calendar span it covers.
first_day = df['date'].min().date()
last_day = df['date'].max().date()
print(f'Shape: {df.shape}')
print(f'Date range: {first_day} – {last_day}')

# Last expression of the cell: display the first three rows.
df.head(3)

## 2. Feature Engineering

In [None]:
# Sort by ticker then date so every per-ticker shift / rolling window below
# runs in chronological order.
df = df.sort_values(['ticker', 'date']).reset_index(drop=True)

# ── Target: next-day direction ────────────────────────────────────────────────
# shift(-1) within each ticker pulls the following row's return onto this row.
df['next_return'] = df.groupby('ticker')['daily_return'].shift(-1)
# The last row of every ticker has no next-day return. `NaN > 0` evaluates to
# False, which would silently label those rows as "Down" (0). Keep the target
# NaN there instead, so a downstream dropna() excludes the rows rather than
# training/evaluating on a fabricated label.
has_next_return = df['next_return'].notna()
df['target'] = (df['next_return'] > 0).astype(float).where(has_next_return)

# ── Lagged returns ────────────────────────────────────────────────────────────
# lag_return_k is the return k-1 rows back within the ticker; lag_return_1 is
# the current row's own return.
df['lag_return_1'] = df['daily_return']
df['lag_return_2'] = df.groupby('ticker')['daily_return'].shift(1)
df['lag_return_5'] = df.groupby('ticker')['daily_return'].shift(4)

# ── Price vs 20-day MA ────────────────────────────────────────────────────────
# Relative distance of close from rolling_mean_20; a zero mean is turned into
# NaN first so the division cannot produce +/-inf.
df['price_to_ma20'] = (
    df['close'] / df['rolling_mean_20'].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan) - 1

# ── Intraday range ────────────────────────────────────────────────────────────
# High-low spread as a fraction of the close.
df['hl_range'] = (df['high'] - df['low']) / df['close'].replace(0, np.nan)

# ── Overnight gap ─────────────────────────────────────────────────────────────
# Open relative to the previous row's close for the same ticker.
df['prev_close'] = df.groupby('ticker')['close'].shift(1)
df['oc_gap'] = (
    (df['open'] - df['prev_close']) / df['prev_close'].replace(0, np.nan)
).replace([np.inf, -np.inf], np.nan)

# ── Abnormal volume ───────────────────────────────────────────────────────────
# Volume relative to its trailing 20-row mean per ticker (min_periods=1, so the
# earliest rows use a shorter window); clipped to [0, 10] to bound outliers.
df['vol_20ma'] = df.groupby('ticker')['volume'].transform(
    lambda x: x.rolling(20, min_periods=1).mean()
)
df['vol_norm'] = (df['volume'] / df['vol_20ma'].replace(0, np.nan)).clip(0, 10)

# ── News flag ─────────────────────────────────────────────────────────────────
# 1.0 when at least one news item is recorded, 0.0 otherwise (missing -> 0).
df['has_news'] = (df['news_count'].fillna(0) > 0).astype(float)

# ── Log market cap ────────────────────────────────────────────────────────────
# log1p so that a missing market cap (filled with 0) maps to 0.
df['log_marketcap'] = np.log1p(df['Marketcap'].fillna(0))

# ── Impute sparse columns ─────────────────────────────────────────────────────
# NOTE(review): these medians are computed over the full date range, so values
# from the test period influence fills in the training period. Consider
# computing them on the training window only.
# NOTE(review): if Regime_GMM is a categorical regime label, a median can land
# between two labels — confirm that median imputation is intended here.
df['VIX']             = df['VIX'].fillna(df['VIX'].median())
df['Yield_Spread']    = df['Yield_Spread'].fillna(df['Yield_Spread'].median())
df['Regime_GMM']      = df['Regime_GMM'].fillna(df['Regime_GMM'].median())
# Remaining sparse columns are filled with 0.
df['sentiment_mean']  = df['sentiment_mean'].fillna(0)
df['sentiment_ratio'] = df['sentiment_ratio'].fillna(0)
df['Revenuegrowth']   = df['Revenuegrowth'].fillna(0)
df['Weight']          = df['Weight'].fillna(0)

# ── Sector encoding ───────────────────────────────────────────────────────────
# Integer category codes; pandas assigns -1 to a missing Sector.
df['Sector_encoded'] = df['Sector'].astype('category').cat.codes

print(f'Shape after feature engineering: {df.shape}')

## 3. Feature Selection & Train / Test Split

In [None]:
# Model inputs, grouped by theme.
FEATURE_COLS = [
    # price / return history
    'lag_return_1', 'lag_return_2', 'lag_return_5',
    'rolling_std_20', 'price_to_ma20',
    # intraday shape and volume
    'hl_range', 'oc_gap', 'vol_norm',
    # macro / regime
    'VIX', 'Yield_Spread', 'Regime_GMM',
    # news
    'sentiment_mean', 'sentiment_ratio', 'has_news',
    # fundamentals / static descriptors
    'log_marketcap', 'Revenuegrowth', 'Weight',
    'Sector_encoded',
]

# First date that belongs to the test set.
SPLIT_DATE = '2023-01-01'

# Keep only the columns the model needs and discard any row with a missing value.
needed_cols = FEATURE_COLS + ['target', 'date']
model_df = df[needed_cols].dropna()
print(f'Rows after dropna: {len(model_df):,}')

# Chronological split: everything before SPLIT_DATE trains, the rest tests.
in_train = model_df['date'] < SPLIT_DATE
in_test = model_df['date'] >= SPLIT_DATE
train = model_df[in_train]
test = model_df[in_test]

X_train = train[FEATURE_COLS].to_numpy()
y_train = train['target'].to_numpy()
X_test = test[FEATURE_COLS].to_numpy()
y_test = test['target'].to_numpy()

# Accuracy of always predicting the more frequent class in the test labels.
up_share = y_test.mean()
naive = max(up_share, 1 - up_share)
print(f'Train: {len(train):,}  |  Test: {len(test):,}')
print(f'Naive baseline accuracy: {naive:.4f}')

## 4. Train Random Forest

An ensemble of 300 trees reduces variance significantly over a single tree.

In [None]:
# Hyper-parameters: shallow trees with large leaves, a random sqrt-sized
# feature subset per split, a fixed seed, and all available cores.
rf_params = {
    'n_estimators': 300,
    'max_depth': 8,
    'min_samples_leaf': 200,
    'max_features': 'sqrt',
    'random_state': 42,
    'n_jobs': -1,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X_train, y_train)

# Score on the held-out period and compare against the naive baseline.
y_pred_rf = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print('=== Random Forest ===')
print(f'Accuracy : {rf_accuracy:.4f}  (naive: {naive:.4f})')
print()
rf_report = classification_report(y_test, y_pred_rf, target_names=['Down', 'Up'])
print(rf_report)

## 5. Feature Importance

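Mean decrease in impurity (MDI), averaged across all 300 trees, gives a quick importance ranking. Note that MDI is computed on the training data and is biased toward continuous / high-cardinality features, so treat the ranking as indicative rather than definitive; permutation importance on the test set is a more robust check.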

In [None]:
# Pair each feature with its impurity-based importance and order ascending,
# so the largest bar ends up at the top of the horizontal bar chart.
fi_df = pd.DataFrame(
    {'feature': FEATURE_COLS, 'importance': rf.feature_importances_}
).sort_values(by='importance')

fig, ax = plt.subplots(figsize=(8, 7))
ax.barh(y=fi_df['feature'], width=fi_df['importance'], color='seagreen')
ax.set(
    title='Random Forest Feature Importance',
    xlabel='Importance (mean decrease in impurity)',
)
fig.tight_layout()
plt.show()