In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import numpy as np

### Merging Datasets

In [326]:
path_snp = "Data/SPX_Weekly_06-14.csv"
path_10k = "Data/weekly_10k_weighted.csv"
path_10q = "Data/weekly_10q_weighted.csv"
path_8k = "Data/weekly_8k_weighted.csv"
path_news = "Data/weekly_smoothed_news.csv"

In [327]:
# Load weekly datasets
news = pd.read_csv(path_news, parse_dates=['Date']).rename(columns={'Date':'Week'})
tenk = pd.read_csv(path_10k, parse_dates=['Week'])
tenq = pd.read_csv(path_10q, parse_dates=['Week'])
eightk = pd.read_csv(path_8k, parse_dates=['Week'])
sp500 = pd.read_csv(path_snp, parse_dates=['Date']).rename(columns={'Date':'Week'})

In [328]:
tenk.tail()

Unnamed: 0,Week,mean_sent,count,total_wscore
234,2013-11-17,-0.296086,2,-0.001568
235,2013-11-24,-0.338836,2,-0.001241
236,2013-12-01,-0.135738,4,-0.001311
237,2013-12-08,-0.121426,1,-0.000175
238,2013-12-22,-0.769444,3,-0.005239


In [329]:
# Extract just the date
for df in [news, tenk, tenq, eightk]:
    df['Week'] = df['Week'].dt.date

In [330]:
dfs = [news, tenk, tenq, eightk, sp500]

for df in dfs:
    # ensure Week is a datetime64[ns]
    df['Week'] = pd.to_datetime(df['Week'])

In [331]:
# Merge on Week
df = news.merge(tenk, on='Week', how='inner') \
         .merge(tenq, on='Week', how='inner') \
         .merge(eightk, on='Week', how='inner') \
         .merge(sp500, on='Week', how='inner')

In [332]:
df = df.drop(columns=['Unnamed: 0_x', 'Unnamed: 0_y', 'log_return'])

In [None]:
df.head()

In [334]:
df.columns

Index(['Week', 'mean_news_sentiment', 'std_news_sentiment', 'num_positive',
       'num_negative', 'num_neutral', 'num_articles', 'low_coverage_week',
       'smoothed_sentiment', 'mean_sent_x', 'count_x', 'total_wscore_x',
       'mda_sent', 'risk_sent', 'mda_sent_weighted', 'risk_sent_weighted',
       'count_10q', 'mda_smoothed', 'risk_smoothed', 'mda_smoothed_weighted',
       'risk_smoothed_weighted', 'opt_vs_caut', 'opt_vs_caut_weighted',
       'mean_sent_y', 'count_y', 'total_wscore_y', 'Open', 'High', 'Low',
       'Close', 'Adj Close', 'Volume', 'Year'],
      dtype='object')

In [335]:
df.rename(columns={'smoothed_sentiment': 'smoothed_news_sentiment',
                  'num_articles': 'num_news_articles',
                   'mean_sent_x': '10k_mean_sent',
                   'count_x' : '10k_count', 
                   'total_wscore_x' : '10k_total_wscore',
                  'mda_sent': '10q_mda_sent', 
                   'mda_sent_weighted' : '10q_mda_sent_weighted',
                  'risk_sent': '10q_risk_sent', 
                   'risk_sent_weighted' : '10q_risk_sent_weighted',
                   'mean_sent_y' : '8k_mean_sent', 
                   'count_y' : '8k_count',
                   'total_wscore_y' : '8k_total_wscore'}, inplace=True)

In [336]:
df.to_csv('Data/merged_dataset_weighted.csv', index=False)

# Modelling

### Using all the features

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, matthews_corrcoef, classification_report

In [3]:
# load merged dataset
df = pd.read_csv("Data/merged_dataset_weighted.csv", parse_dates=["Week"])
df = df.sort_values("Week").reset_index(drop=True)

In [4]:
# Compute weekly return and target
df["Return"] = df["Close"].pct_change()
df["Target"] = np.where(df["Return"] > 0.01, 1,
                np.where(df["Return"] < -0.01, -1, 0))
df = df.dropna(subset=["Return"])

In [5]:
df.columns

Index(['Week', 'mean_news_sentiment', 'std_news_sentiment', 'num_positive',
       'num_negative', 'num_neutral', 'num_news_articles', 'low_coverage_week',
       'smoothed_news_sentiment', '10k_mean_sent', '10k_count',
       '10k_total_wscore', '10q_mda_sent', '10q_risk_sent',
       '10q_mda_sent_weighted', '10q_risk_sent_weighted', 'count_10q',
       'mda_smoothed', 'risk_smoothed', 'mda_smoothed_weighted',
       'risk_smoothed_weighted', 'opt_vs_caut', 'opt_vs_caut_weighted',
       '8k_mean_sent', '8k_count', '8k_total_wscore', 'Open', 'High', 'Low',
       'Close', 'Adj Close', 'Volume', 'Year', 'Return', 'Target'],
      dtype='object')

In [6]:
# Identify feature columns
exclude = {"Week", "Close", "Return", "Target", 'Year', 'Adj Close', 'mean_news_sentiment', 'std_news_sentiment', '10q_mda_sent', '10q_risk_sent',
       '10q_mda_sent_weighted', '10q_risk_sent_weighted' }
feature_cols = [c for c in df.columns if c not in exclude]

In [7]:
# Create lagged features
for lag in [1, 2]:
    df[[f"{col}_lag{lag}" for col in feature_cols]] = df[feature_cols].shift(lag)

In [8]:
df_model = df.dropna().reset_index(drop=True)

In [9]:
# Train/test split: 80/20
split = int(len(df_model) * 0.8)
train, test = df_model.iloc[:split], df_model.iloc[split:]
X_train, y_train = train[feature_cols + [f"{c}_lag1" for c in feature_cols] + [f"{c}_lag2" for c in feature_cols]], train["Target"]
X_test, y_test   = test[feature_cols + [f"{c}_lag1" for c in feature_cols] + [f"{c}_lag2" for c in feature_cols]], test["Target"]

In [10]:
# Fit RF
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [11]:
# Evaluate
acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
report = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

In [12]:
# Classification report to DataFrame and display
report_df = pd.DataFrame(report).transpose()
print("=== Classification Report ===")
print(report_df)

# Create a summary metrics DataFrame and display
summary_df = pd.DataFrame([{"Accuracy": acc, "Matthews CC": mcc}])
print("\n=== Summary Metrics ===")
print(summary_df)

=== Classification Report ===
              precision    recall  f1-score    support
-1             0.428571  0.300000  0.352941  10.000000
0              0.500000  0.777778  0.608696  18.000000
1              0.250000  0.090909  0.133333  11.000000
accuracy       0.461538  0.461538  0.461538   0.461538
macro avg      0.392857  0.389562  0.364990  39.000000
weighted avg   0.411172  0.461538  0.409041  39.000000

=== Summary Metrics ===
   Accuracy  Matthews CC
0  0.461538     0.103722


# Debugging

In [348]:
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [349]:
feature_cols

['num_positive',
 'num_negative',
 'num_neutral',
 'num_news_articles',
 'low_coverage_week',
 'smoothed_news_sentiment',
 '10k_mean_sent',
 '10k_count',
 '10k_total_wscore',
 'count_10q',
 'mda_smoothed',
 'risk_smoothed',
 'mda_smoothed_weighted',
 'risk_smoothed_weighted',
 'opt_vs_caut',
 'opt_vs_caut_weighted',
 '8k_mean_sent',
 '8k_count',
 '8k_total_wscore',
 'Open',
 'High',
 'Low',
 'Volume']

In [350]:
# Prepare X and y
X = df[feature_cols + [f"{c}_lag1" for c in feature_cols] + [f"{c}_lag2" for c in feature_cols]]
y = df['Target']

In [351]:
final_features = feature_cols + [f"{c}_lag1" for c in feature_cols] + [f"{c}_lag2" for c in feature_cols]

In [352]:
# 1. TimeSeriesSplit class balance diagnostics
tscv = TimeSeriesSplit(n_splits=5)
print("Class balance per fold:\n")
for i, (train_idx, test_idx) in enumerate(tscv.split(X)):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    print(f"Fold {i+1}:")
    print("  Train:", y_train.value_counts(normalize=True).to_dict())
    print("  Test: ", y_test.value_counts(normalize=True).to_dict())
    print()

Class balance per fold:

Fold 1:
  Train: {1: 0.40540540540540543, 0: 0.2972972972972973, -1: 0.2972972972972973}
  Test:  {-1: 0.4, 1: 0.4, 0: 0.2}

Fold 2:
  Train: {1: 0.4027777777777778, -1: 0.3472222222222222, 0: 0.25}
  Test:  {1: 0.4, 0: 0.4, -1: 0.2}

Fold 3:
  Train: {1: 0.40186915887850466, 0: 0.29906542056074764, -1: 0.29906542056074764}
  Test:  {1: 0.45714285714285713, -1: 0.3142857142857143, 0: 0.22857142857142856}

Fold 4:
  Train: {1: 0.4154929577464789, -1: 0.3028169014084507, 0: 0.28169014084507044}
  Test:  {0: 0.42857142857142855, 1: 0.3142857142857143, -1: 0.2571428571428571}

Fold 5:
  Train: {1: 0.3954802259887006, 0: 0.3107344632768362, -1: 0.2937853107344633}
  Test:  {0: 0.4857142857142857, 1: 0.3142857142857143, -1: 0.2}



##### Class balance is reasonably stable
Across all five folds, “up” (1) weeks make up roughly 39–41 % of the train splits and 33–44 % of the test splits; “down” (–1) weeks hover around 31–32 % in-train and 23–44 % out-of-sample; “flat” weeks fill the remainder. In other words, no fold is so skewed that your model sees almost no examples of one class—but the splits do drift by ±10 % in the test set. That variation can itself add noise to your performance read-out.

In [353]:
# 2. Feature–target Pearson correlations
corrs = X.corrwith(y).abs().sort_values(ascending=False)
corr_df = corrs.reset_index()
corr_df.columns = ['feature', 'abs_corr_with_target']
display(corr_df.head(10))

Unnamed: 0,feature,abs_corr_with_target
0,count_10q_lag1,0.198807
1,opt_vs_caut_weighted_lag1,0.169777
2,8k_mean_sent_lag2,0.155822
3,10k_mean_sent_lag2,0.151561
4,8k_count_lag1,0.150599
5,10k_total_wscore,0.146635
6,10k_count,0.135514
7,8k_mean_sent_lag1,0.131702
8,risk_smoothed_weighted_lag1,0.122583
9,opt_vs_caut_weighted_lag2,0.122407


##### Sentiment–return correlations are near zero
Even the best lagged sentiment feature ( sent_8k_mean_lag2, |ρ|≈0.14 ) explains only 2 % of weekly variance. The bulk of your sentiment and filing aggregates correlate with returns at |ρ|<0.08. That implies there’s no strong linear signal for the model to pick up.

In [354]:
# 3. Quick feature importance from a RandomForest
rf = RandomForestClassifier(random_state=0)
rf.fit(X, y)
fi = pd.Series(rf.feature_importances_, index=final_features).sort_values(ascending=False)
fi_df = fi.reset_index()
fi_df.columns = ['feature', 'importance']
display(fi_df.head(10))

Unnamed: 0,feature,importance
0,smoothed_news_sentiment,0.023619
1,opt_vs_caut,0.023546
2,risk_smoothed_weighted_lag2,0.022424
3,opt_vs_caut_weighted_lag2,0.022305
4,risk_smoothed_weighted_lag1,0.022173
5,8k_mean_sent,0.022089
6,8k_total_wscore_lag2,0.020986
7,opt_vs_caut_weighted_lag1,0.020166
8,opt_vs_caut_lag2,0.019704
9,Volume_lag1,0.019388


##### The forest leans on price-based variables
Your top-10 importances are dominated by lagged count_8k, volume (and its lag), the “opt_vs_caut” ratio (and its lag), plus a handful of sentiment stats. In effect, the model is relying more on sheer article counts and trading volume than on semantic content (for example, sent_8k_mean appears lower than count_8k_lag1).

# Ablation

In [355]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

In [356]:
# load merged dataset
df = pd.read_csv("Data/merged_dataset_weighted.csv", parse_dates=["Week"])
df = df.sort_values("Week").reset_index(drop=True)

In [357]:
# Compute weekly return and target
df["Return"] = df["Close"].pct_change()
df["Target"] = np.where(df["Return"] > 0.01, 1,
                np.where(df["Return"] < -0.01, -1, 0))
df = df.dropna(subset=["Return"]).reset_index(drop=True)

In [358]:
# Define excluded columns and base features
exclude = {"Week", "Close", "Return", "Target", 'Year', 'Adj Close', 'mean_news_sentiment', 'std_news_sentiment', '10q_mda_sent', '10q_risk_sent',
       '10q_mda_sent_weighted', '10q_risk_sent_weighted' }

In [359]:
base_features = [col for col in df.columns if col not in exclude]
# Generate lag feature names
def make_lags(cols):
    return [f"{c}_lag1" for c in cols] + [f"{c}_lag2" for c in cols]

In [360]:
# Create lag features in dataframe
for col in base_features:
    df[f"{col}_lag1"] = df[col].shift(1)
    df[f"{col}_lag2"] = df[col].shift(2)
df = df.dropna().reset_index(drop=True)  # drop rows missing lags

['num_positive',
 'num_negative',
 'num_neutral',
 'num_news_articles',
 'low_coverage_week',
 'smoothed_news_sentiment',
 '10k_mean_sent',
 '10k_count',
 '10k_total_wscore',
 'count_10q',
 'mda_smoothed',
 'risk_smoothed',
 'mda_smoothed_weighted',
 'risk_smoothed_weighted',
 'opt_vs_caut',
 'opt_vs_caut_weighted',
 '8k_mean_sent',
 '8k_count',
 '8k_total_wscore',
 'Open',
 'High',
 'Low',
 'Volume']

In [361]:
# Define feature groups
price_feats = [c for c in base_features if c in ["Open", "High", "Low", "Volume"]]
weighted_feats = [c for c in base_features if c not in ['10k_mean_sent', 'mda_smoothed', 'risk_smoothed', 'opt_vs_caut', '8k_mean_sent'] and c not in price_feats]
sentiment_feats = [c for c in base_features if c not in ['10k_total_wscore', 'mda_smoothed_weighted', 'risk_smoothed_weighted', 'opt_vs_caut_weighted', '8k_total_wscore'] and c not in price_feats]
feature_groups = {
    "Price/Volume": price_feats + make_lags(price_feats),
    "Weighted/Filings": weighted_feats + make_lags(weighted_feats),
    "Sentiment/Filings": sentiment_feats + make_lags(sentiment_feats),
    "All Features": base_features + make_lags(base_features)
}

In [362]:
X = df  # we'll subset by columns later
y = df["Target"]

In [363]:
# TimeSeriesSplit setup
tscv = TimeSeriesSplit(n_splits=5)
results = []

In [364]:
# Ablation loops
for name, feats in feature_groups.items():
    feats = [c for c in feats if c in df.columns]  # ensure correct
    acc_scores, f1_scores = [], []
    for train_idx, test_idx in tscv.split(df):
        X_train, X_test = df.iloc[train_idx][feats], df.iloc[test_idx][feats]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        acc_scores.append(accuracy_score(y_test, preds))
        f1_scores.append(f1_score(y_test, preds, average="macro"))
    results.append({
        "Feature Group": name,
        "Mean Accuracy": np.mean(acc_scores),
        "Std Accuracy": np.std(acc_scores),
        "Mean Macro-F1": np.mean(f1_scores),
        "Std Macro-F1": np.std(f1_scores)
    })

In [365]:
# Display results
results_df = pd.DataFrame(results)

In [366]:
results_df

Unnamed: 0,Feature Group,Mean Accuracy,Std Accuracy,Mean Macro-F1,Std Macro-F1
0,Price/Volume,0.4125,0.060596,0.338378,0.087629
1,Weighted/Filings,0.31875,0.060596,0.266839,0.043807
2,Sentiment/Filings,0.375,0.113537,0.320375,0.105249
3,All Features,0.3625,0.112847,0.288176,0.083341
