In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    precision_recall_curve,
    auc,
    confusion_matrix
)



In [2]:
# Load Monthly CRSP

CRSP_PATH = '../data/monthly_crsp.csv'
df_crsp = pd.read_csv(
    CRSP_PATH ,
    parse_dates=['MthCalDt'],
    usecols=['PERMNO','CUSIP','MthCalDt','MthRet']
)


# Load Compustat Fundamentals

COMP_PATH = '../data/CompFirmCharac.csv'

df_comp = pd.read_csv(
    COMP_PATH,
    parse_dates=['datadate'], dayfirst=True,
)


  df_comp = pd.read_csv(
  df_comp = pd.read_csv(


### CLEAN CRSP


In [3]:
# 2.1: Keep only rows where MthRet is available and cast to float
df_crsp = df_crsp.dropna(subset=['MthRet']).copy()
df_crsp['MthRet'] = df_crsp['MthRet'].astype(float)

# 2.2: Sort by CUSIP, date so that shift is correct
df_crsp['date'] = pd.to_datetime(df_crsp['MthCalDt'].astype(str), format='mixed')
df_crsp = df_crsp.sort_values(['CUSIP','date']).reset_index(drop=True)

# 2.3: Create next‐month return target (binary)
# df_crsp['Ret_t1'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
df_crsp['y'] = df_crsp.groupby('CUSIP')['MthRet'].shift(-1)
# df_crsp['y'] = (df_crsp['Ret_t1'] > 0).astype(int)
df_crsp = df_crsp.dropna(subset=['y']).copy()

In [4]:
import calendar

# Add technical indicators and months

def compute_close(y_series):
    close = (1 + y_series.fillna(0)).cumprod()
    close.iloc[0] = 1.0
    return close

df_crsp = df_crsp.sort_values(["PERMNO", "date"])
df_crsp["close"] = df_crsp.groupby("PERMNO")["MthRet"].apply(compute_close).reset_index(level=0, drop=True)

def calculate_rsi(series, window=14):
    delta = series.diff()
    up = delta.clip(lower=0)
    down = -delta.clip(upper=0)
    ma_up = up.rolling(window).mean()
    ma_down = down.rolling(window).mean()
    rs = ma_up / ma_down
    return 100 - (100 / (1 + rs))

# Group-wise calculations
def add_technical_indicators(group):
    group = group.copy()
    group['EMA_Close'] = group['close'].ewm(span=14, adjust=False).mean()
    group['EMA'] = (group['close'] - group['EMA_Close'])/ group['close']

    rolling_std = group['close'].rolling(window=14).std()
    group['Volatility'] = rolling_std / group['close']

    group['RSI'] = calculate_rsi(group['close'])
    
    # MACD
    ema12 = group['close'].ewm(span=12, adjust=False).mean()
    ema26 = group['close'].ewm(span=26, adjust=False).mean()
    group['MACD_diff'] = ema12 - ema26
    group['MACD_Signal'] = group['MACD_diff'].ewm(span=9, adjust=False).mean()
    group['MACD'] = (group['MACD_diff'] - group['MACD_Signal']) / group['close']


    return group

# Apply technical indicators to each stock (PERMNO)
df_crsp = df_crsp.groupby("PERMNO", group_keys=False).apply(add_technical_indicators, include_groups=False)

df_crsp.drop(columns=["close", "EMA_Close", "MACD_diff", "MACD_Signal", 'y_forward'], inplace=True, errors='ignore')

# Add months
df_crsp['month'] = df_crsp['date'].dt.month.map(lambda x: calendar.month_name[x])

# Create dummy variables with month names as column names
month_dummies = pd.get_dummies(df_crsp['month']).astype(int)

# Concatenate dummies with original dataframe
df_crsp = pd.concat([df_crsp, month_dummies], axis=1)

# drop the intermediate 'month' column 
df_crsp = df_crsp.drop(columns=['month'])

# Get the current columns
cols = list(df_crsp.columns)

# Move 'y' to the end if it exists
if 'y' in cols:
    cols.remove('y')
    cols.append('y')

# Reorder the DataFrame columns
df_crsp = df_crsp[cols]

df_crsp = df_crsp.dropna()
print(df_crsp)

            CUSIP   MthCalDt    MthRet       date       EMA  Volatility  \
2970518  68391610 1987-03-31 -0.384615 1987-03-31 -4.416586    6.008675   
2970519  68391610 1987-04-30 -0.062500 1987-04-30 -4.140666    6.416235   
1746075  39040610 1987-03-31  0.037486 1987-03-31  0.028843    0.058017   
1746076  39040610 1987-04-30 -0.039216 1987-04-30 -0.009357    0.057093   
1746077  39040610 1987-05-29 -0.071429 1987-05-29 -0.075400    0.065004   
...           ...        ...       ...        ...       ...         ...   
3917811  88160R10 2024-07-31  0.172781 2024-07-31  0.094132    0.146526   
3917812  88160R10 2024-08-30 -0.077390 2024-08-30  0.015727    0.148719   
3917813  88160R10 2024-09-30  0.221942 2024-09-30  0.168567    0.119113   
3917814  88160R10 2024-10-31 -0.045025 2024-10-31  0.112118    0.121613   
3917815  88160R10 2024-11-29  0.381469 2024-11-29  0.309653    0.131800   

               RSI      MACD  April  August  ...  February  January  July  \
2970518  31.218276 -1.

In [5]:
# 2.4: Generate price‐history features (momentum + volatility)
#
#   - 3M momentum: cumulative return over past 3 months (t-3 → t-1)
#   - 6M momentum: cumulative return over past 6 months
#   - 12M momentum: cumulative return over past 12 months
#   - 12M rolling volatility: std of monthly returns over past 12 months
#
def compute_momentum_and_vol(df):
    df = df.sort_values('date')
    # Rolling log(1+return), because cumulative product of (1 + ret) = exp(sum(log(1+ret)))
    df['log1p_ret'] = np.log1p(df['MthRet'])
    df['log1p_ret_shift1'] = df.groupby('CUSIP')['log1p_ret'].shift(1)
    df['cum12_1_log'] = df.groupby('CUSIP')['log1p_ret_shift1'].rolling(window=11).sum().reset_index(0,drop=True)
    df['mom_12_1'] = np.expm1(df['cum12_1_log'])
    df['cum3m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=3, min_periods=3).sum().reset_index(0,drop=True)
    df['cum6m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=6, min_periods=6).sum().reset_index(0,drop=True)
    df['cum12m_log'] = df.groupby('CUSIP')['log1p_ret'].rolling(window=12, min_periods=12).sum().reset_index(0,drop=True)
    df['momentum_3m'] = np.expm1(df['cum3m_log'])    # exp(sum)-1 => (1+r1)*(1+r2)*(1+r3) - 1
    df['momentum_6m'] = np.expm1(df['cum6m_log'])
    df['momentum_12m'] = np.expm1(df['cum12m_log'])
    df['volatility_12m'] = df.groupby('CUSIP')['MthRet'].rolling(window=12, min_periods=12).std().reset_index(0,drop=True)
    # Drop intermediate log columns
    return df.drop(columns=['log1p_ret','cum3m_log','cum6m_log','cum12m_log'])

df_crsp = compute_momentum_and_vol(df_crsp)

# 2.5: Trim CUSIP to 8 characters (for merging) and drop NA
df_crsp['cusip'] = df_crsp['CUSIP'].astype(str).str[:8]
df_crsp = df_crsp.dropna(subset=['cusip']).copy()

### CLEAN COMPFIRM

In [6]:
# 3.1: Keep only Industrial & Consolidated
df_comp = df_comp[
    (df_comp['consol'] == 'C')
].copy()

# 3.2: Trim & parse keys/dates
df_comp['cusip'] = df_comp['cusip'].astype(str).str[:8]
df_comp['datadate'] = pd.to_datetime(df_comp['datadate'])
df_comp = df_comp.dropna(subset=['cusip','datadate']).copy()

# 3.3: Build “effective_date” = datadate + 45 calendar days,
#      so that we only use Q data ~45 days after quarter‐end.
df_comp['effective_date'] = df_comp['datadate'] + pd.Timedelta(days=45)
df_comp = df_comp.set_index('effective_date').sort_index()

# 3.4: Select a larger fundamental set (YTD flows + per‐share metrics)
fundamental_cols = [
    'revty',    # Revenue YTD
    'saley',    # Sales YTD
    'capxy',    # CapEx YTD
    'oibdpy',   # EBITDA YTD
    'rdipay',   # R&D expense YTD
    'xsgay',    # SG&A expense YTD
    'txpdy',    # Tax provision YTD
    'epsfxy',   # Diluted EPS ex‐extra YTD
    'cshfdy',   # Diluted shares YTD (millions)
    'xoptepsy'  # Option expense per share YTD
]

df_comp_small = df_comp[['cusip'] + fundamental_cols].copy()

# 3.5: For each “cusip + quarter,” drop exact duplicates
df_comp_small = df_comp_small.reset_index().drop_duplicates(
    subset=['cusip','effective_date']
).set_index('effective_date').sort_index()

### MERGE

In [7]:
# 4.1: Set df_crsp index to “date”
df_crsp = df_crsp.set_index('date').sort_index()

# 4.2: Merge (for every month, get the most recent Compustat row ≤ that month’s date)
df_merged = pd.merge_asof(
    left = df_crsp.reset_index(),
    right = df_comp_small.reset_index(),
    left_on = 'date',
    right_on = 'effective_date',
    by = 'cusip',
    direction = 'backward',
    allow_exact_matches=True
).set_index('date')

# 4.3: Drop rows where any of our fundamentals are NA, since we can’t compute ratios otherwise
df_merged = df_merged.dropna(subset=fundamental_cols + ['y']).copy()


In [8]:
df = df_merged.copy()

# 5.1: Engineer simple ratios (safe‐guard divisions by zero)
df['EBIT_margin']      = df['oibdpy'] / df['saley'].replace({0: np.nan})
df['R&D_intensity']    = df['rdipay'] / df['saley'].replace({0: np.nan})
df['SGA_intensity']    = df['xsgay'] / df['saley'].replace({0: np.nan})
df['Tax_rate']         = df['txpdy']  / df['oibdpy'].replace({0: np.nan})
df['Capex_to_Revenue'] = df['capxy']  / df['revty'].replace({0: np.nan})

# 5.2: Compute QoQ growth rates on YTD fundamentals
for col in ['revty','oibdpy','rdipay','xsgay']:
    df[col + '_QoQ_growth'] = df.groupby('cusip')[col].pct_change()  # (This QY / Last QY) - 1

# 5.3: Optionally drop some raw dollar‐amount YTD columns if you want just ratios
#      (otherwise let the model pick scale vs. ratio)
# df = df.drop(columns=['revty','saley','capxy','oibdpy','rdipay','xsgay','txpdy','epsfxy','cshfdy','xoptepsy'])

### TRAINING

In [9]:
feature_columns = [
    # Price/technical:
    'momentum_3m', 'momentum_6m', 'momentum_12m', 'volatility_12m',

    # Basic YTD fundamentals (optional—tree can split on scale):
    'revty', 'saley', 'capxy', 'oibdpy', 'rdipay', 'xsgay', 'txpdy', 'epsfxy', 'cshfdy', 'xoptepsy',

    # Engineered ratios:
    'EBIT_margin', 'R&D_intensity', 'SGA_intensity', 'Tax_rate', 'Capex_to_Revenue',

    # QoQ growth rates:
    'revty_QoQ_growth', 'oibdpy_QoQ_growth', 'rdipay_QoQ_growth', 'xsgay_QoQ_growth'
]

# Drop any rows where engineered features are NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=feature_columns + ['y']).copy()


X = df[feature_columns]
y = df['y']

In [10]:
# Instead of a fixed 80/20 cutoff, build an expanding‐window cross‐validation
# but keep a final out‐of‐sample test set (last 20% of months).
n_obs = len(df)
cutpoint = int(n_obs * 0.8)

X_train = X.iloc[:cutpoint]
y_train = y.iloc[:cutpoint]

X_test  = X.iloc[cutpoint:]
y_test  = y.iloc[cutpoint:]

In [13]:

# pipe = Pipeline([
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler',   StandardScaler()),   # scale ratio features so splits are easier
#     ('clf',      RandomForestClassifier(random_state=42, class_weight='balanced'))
# ])

# param_grid = {
#     'clf__n_estimators': [100, 200],
#     'clf__max_depth': [5, 7, 9],
#     'clf__max_features': ['sqrt', 'log2']
# }

from sklearn.ensemble import RandomForestRegressor

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('reg', RandomForestRegressor(random_state=42))
])

param_grid = {
    'reg__n_estimators': [100, 200],
    'reg__max_depth': [5, 7, 9],
    'reg__max_features': ['sqrt', 'log2']
}


tscv = TimeSeriesSplit(n_splits=5)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
best_model = grid.best_estimator_


Fitting 5 folds for each of 12 candidates, totalling 60 fits




Best parameters: {'reg__max_depth': 5, 'reg__max_features': 'sqrt', 'reg__n_estimators': 100}


In [15]:
# # 9.1: Prediction & Classification Report
# y_pred = best_model.predict(X_test)
# print("\nClassification Report on Test Set:")
# print(classification_report(y_test, y_pred))

# # 9.2: ROC AUC + Precision‐Recall AUC
# y_proba = best_model.predict_proba(X_test)[:, 1]
# roc_auc = roc_auc_score(y_test, y_proba)

# precision, recall, _ = precision_recall_curve(y_test, y_proba)
# pr_auc = auc(recall, precision)

# print(f"Test ROC AUC:  {roc_auc:.4f}")
# print(f"Test PR AUC:   {pr_auc:.4f}")

# # 9.3: Display a confusion matrix if you like
# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix (low values = better balance):\n", cm)

# # ───────────
# # 10. FEATURE IMPORTANCE AND NEXT STEPS
# # ───────────

# importances = best_model.named_steps['clf'].feature_importances_
# feat_imp = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
# print("\nTop 10 Feature Importances:")
# print(feat_imp.head(10))

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_pred = best_model.predict(X_test)

print("\nRegression Metrics on Test Set:")
print(f"MAE:  {mean_absolute_error(y_test, y_pred):.4f}")
print(f"MSE:  {mean_squared_error(y_test, y_pred):.4f}")
print(f"R²:   {r2_score(y_test, y_pred):.4f}")



Regression Metrics on Test Set:
MAE:  0.0769
MSE:  0.0125
R²:   0.0057


TRAINING 2

In [16]:
feature_columns = [
    # Price/technical:
    'momentum_3m', 'momentum_6m', 'mom_12_1', 'volatility_12m',

    # Basic YTD fundamentals (optional—tree can split on scale):
    'revty', 'saley', 'capxy', 'oibdpy', 'rdipay', 'xsgay', 'txpdy', 'epsfxy', 'cshfdy', 'xoptepsy',

    # Engineered ratios:
    'EBIT_margin', 'R&D_intensity', 'SGA_intensity', 'Tax_rate', 'Capex_to_Revenue',

    # QoQ growth rates:
    'revty_QoQ_growth', 'oibdpy_QoQ_growth', 'rdipay_QoQ_growth', 'xsgay_QoQ_growth'
]

# Drop any rows where engineered features are NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df = df.dropna(subset=feature_columns + ['y']).copy()


X = df[feature_columns]
y = df['y']

In [17]:
# Instead of a fixed 80/20 cutoff, build an expanding‐window cross‐validation
# but keep a final out‐of‐sample test set (last 20% of months).
n_obs = len(df)
cutpoint = int(n_obs * 0.8)

X_train = X.iloc[:cutpoint]
y_train = y.iloc[:cutpoint]

X_test  = X.iloc[cutpoint:]
y_test  = y.iloc[cutpoint:]

In [19]:
from xgboost import XGBClassifier

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',   StandardScaler()),  
    ('clf',      XGBClassifier(
         objective='binary:logistic',
         eval_metric='auc',
         use_label_encoder=False,
         n_estimators=200,
         max_depth=5,
         learning_rate=0.05,
         random_state=42
    ))
])

param_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [3, 5, 7],
    'clf__learning_rate': [0.01, 0.05]
}

grid = GridSearchCV(pipe, param_grid, cv=TimeSeriesSplit(n_splits=5), scoring='roc_auc', n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)
print("XGBoost Best params:", grid.best_params_)
best_model = grid.best_estimator_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


ValueError: 
All the 60 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251
 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305
 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341
 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359
 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395
 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413
 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449
 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467
 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485
 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521
 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539
 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557
 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
 576 577 578 579 580 581 582], got [-8.718750e-01 -7.752410e-01 -5.690570e-01 -5.535520e-01 -4.922640e-01
 -4.513890e-01 -4.375000e-01 -4.294190e-01 -4.237290e-01 -4.164190e-01
 -4.127760e-01 -4.035990e-01 -3.955070e-01 -3.769720e-01 -3.738900e-01
 -3.657270e-01 -3.527940e-01 -3.483130e-01 -3.463770e-01 -3.462540e-01
 -3.434310e-01 -3.385230e-01 -3.350170e-01 -3.266440e-01 -3.230510e-01
 -3.222220e-01 -3.126760e-01 -3.073390e-01 -3.004570e-01 -2.989250e-01
 -2.963330e-01 -2.943770e-01 -2.936960e-01 -2.867940e-01 -2.854720e-01
 -2.834300e-01 -2.796460e-01 -2.769230e-01 -2.752070e-01 -2.750460e-01
 -2.702410e-01 -2.662550e-01 -2.635510e-01 -2.565400e-01 -2.554740e-01
 -2.543100e-01 -2.532830e-01 -2.518990e-01 -2.502280e-01 -2.493720e-01
 -2.473170e-01 -2.441180e-01 -2.428570e-01 -2.416790e-01 -2.380950e-01
 -2.361280e-01 -2.357770e-01 -2.334880e-01 -2.311320e-01 -2.307210e-01
 -2.283080e-01 -2.261900e-01 -2.234310e-01 -2.222880e-01 -2.222220e-01
 -2.217570e-01 -2.182280e-01 -2.181910e-01 -2.176360e-01 -2.154100e-01
 -2.115100e-01 -2.105260e-01 -2.101530e-01 -2.069430e-01 -2.045170e-01
 -2.035290e-01 -2.033580e-01 -1.996500e-01 -1.985710e-01 -1.956520e-01
 -1.951920e-01 -1.940300e-01 -1.935010e-01 -1.896550e-01 -1.894520e-01
 -1.828250e-01 -1.803830e-01 -1.801800e-01 -1.785710e-01 -1.782820e-01
 -1.782230e-01 -1.777240e-01 -1.758980e-01 -1.705070e-01 -1.703010e-01
 -1.677500e-01 -1.653430e-01 -1.648090e-01 -1.645410e-01 -1.633330e-01
 -1.630320e-01 -1.617890e-01 -1.611020e-01 -1.601420e-01 -1.592980e-01
 -1.592880e-01 -1.561030e-01 -1.529650e-01 -1.511940e-01 -1.504570e-01
 -1.489900e-01 -1.470990e-01 -1.450780e-01 -1.424710e-01 -1.422320e-01
 -1.398030e-01 -1.392860e-01 -1.391110e-01 -1.370800e-01 -1.365740e-01
 -1.358340e-01 -1.354790e-01 -1.334060e-01 -1.332020e-01 -1.320410e-01
 -1.301450e-01 -1.249440e-01 -1.246040e-01 -1.246030e-01 -1.243390e-01
 -1.230000e-01 -1.221680e-01 -1.219840e-01 -1.215220e-01 -1.207010e-01
 -1.181020e-01 -1.159700e-01 -1.147540e-01 -1.139400e-01 -1.128400e-01
 -1.127270e-01 -1.120510e-01 -1.119360e-01 -1.103900e-01 -1.056730e-01
 -1.052630e-01 -1.050740e-01 -1.037410e-01 -1.033150e-01 -1.031790e-01
 -1.016840e-01 -1.015540e-01 -1.015470e-01 -9.612600e-02 -9.600000e-02
 -9.580100e-02 -9.560200e-02 -9.420600e-02 -9.412800e-02 -9.313300e-02
 -9.090900e-02 -9.075300e-02 -8.940400e-02 -8.799300e-02 -8.688700e-02
 -8.651800e-02 -8.625000e-02 -8.546600e-02 -8.544800e-02 -8.520200e-02
 -8.326300e-02 -8.296800e-02 -8.203700e-02 -8.135100e-02 -8.054700e-02
 -8.006700e-02 -7.925400e-02 -7.842000e-02 -7.692300e-02 -7.618500e-02
 -7.580600e-02 -7.462700e-02 -7.362000e-02 -7.317100e-02 -7.211500e-02
 -7.129000e-02 -6.976700e-02 -6.919400e-02 -6.822800e-02 -6.746600e-02
 -6.644400e-02 -6.622500e-02 -6.486000e-02 -6.442600e-02 -6.432100e-02
 -6.293700e-02 -6.251400e-02 -6.168000e-02 -6.075900e-02 -6.043400e-02
 -6.012700e-02 -5.957000e-02 -5.919100e-02 -5.828000e-02 -5.804200e-02
 -5.796100e-02 -5.754100e-02 -5.680400e-02 -5.677900e-02 -5.677300e-02
 -5.660400e-02 -5.623900e-02 -5.571300e-02 -5.554700e-02 -5.536300e-02
 -5.524900e-02 -5.448000e-02 -5.144300e-02 -4.926100e-02 -4.914400e-02
 -4.837400e-02 -4.836400e-02 -4.812700e-02 -4.545500e-02 -4.401900e-02
 -4.309700e-02 -4.166700e-02 -4.081600e-02 -4.027200e-02 -3.899700e-02
 -3.829500e-02 -3.703700e-02 -3.610100e-02 -3.568000e-02 -3.519100e-02
 -3.433500e-02 -3.387600e-02 -3.379700e-02 -3.338400e-02 -3.191500e-02
 -3.183500e-02 -3.173900e-02 -3.153400e-02 -3.091800e-02 -3.081700e-02
 -3.077400e-02 -3.046400e-02 -3.009500e-02 -2.941200e-02 -2.649000e-02
 -2.609300e-02 -2.577900e-02 -2.551300e-02 -2.507800e-02 -2.379700e-02
 -2.141800e-02 -2.122600e-02 -2.010100e-02 -1.967200e-02 -1.861300e-02
 -1.787100e-02 -1.761100e-02 -1.678200e-02 -1.662000e-02 -1.573700e-02
 -1.503300e-02 -1.480900e-02 -1.423600e-02 -1.396200e-02 -1.195700e-02
 -1.126600e-02 -1.101100e-02 -1.022200e-02 -9.898000e-03 -9.733000e-03
 -8.081000e-03 -6.623000e-03 -5.254000e-03 -5.189000e-03 -4.124000e-03
 -3.933000e-03 -3.125000e-03 -2.646000e-03 -2.026000e-03 -1.128000e-03
  0.000000e+00  7.110000e-04  2.053000e-03  2.094000e-03  2.349000e-03
  2.514000e-03  2.857000e-03  3.442000e-03  4.800000e-03  5.069000e-03
  7.625000e-03  7.921000e-03  8.349000e-03  1.008400e-02  1.043100e-02
  1.117300e-02  1.142400e-02  1.188900e-02  1.290300e-02  1.375000e-02
  1.399900e-02  1.440800e-02  1.447900e-02  1.487700e-02  1.603300e-02
  1.801800e-02  1.833800e-02  2.024300e-02  2.032200e-02  2.054300e-02
  2.061100e-02  2.077300e-02  2.147500e-02  2.287900e-02  2.459000e-02
  2.513700e-02  2.663700e-02  2.765800e-02  2.786900e-02  2.939700e-02
  3.005500e-02  3.292600e-02  3.328300e-02  3.333300e-02  3.359400e-02
  3.377400e-02  3.427500e-02  3.666400e-02  3.691900e-02  3.747100e-02
  3.809500e-02  3.889900e-02  3.890600e-02  3.898800e-02  4.054100e-02
  4.087800e-02  4.092700e-02  4.143600e-02  4.391900e-02  4.516100e-02
  4.526500e-02  4.575200e-02  4.625900e-02  4.776300e-02  4.830900e-02
  4.875300e-02  4.963500e-02  5.080800e-02  5.283800e-02  5.456400e-02
  5.699900e-02  5.755400e-02  5.956300e-02  5.994400e-02  6.015800e-02
  6.122400e-02  6.166200e-02  6.185600e-02  6.217600e-02  6.222200e-02
  6.383000e-02  6.413300e-02  6.519400e-02  6.574700e-02  6.679400e-02
  6.722100e-02  6.834400e-02  6.953200e-02  7.112500e-02  7.152700e-02
  7.299300e-02  7.644300e-02  7.692300e-02  7.713300e-02  7.770500e-02
  7.980500e-02  8.000000e-02  8.055600e-02  8.112100e-02  8.249400e-02
  8.372400e-02  8.437800e-02  8.447000e-02  8.508300e-02  8.541500e-02
  8.625300e-02  8.706300e-02  8.708300e-02  8.842400e-02  8.907600e-02
  9.062500e-02  9.063900e-02  9.078300e-02  9.149700e-02  9.458300e-02
  9.480500e-02  9.498700e-02  9.536800e-02  9.648000e-02  9.734500e-02
  9.892500e-02  9.917800e-02  1.007740e-01  1.033370e-01  1.034480e-01
  1.036870e-01  1.049760e-01  1.115170e-01  1.122150e-01  1.122280e-01
  1.124090e-01  1.128000e-01  1.137490e-01  1.154020e-01  1.167100e-01
  1.168710e-01  1.180590e-01  1.188990e-01  1.198910e-01  1.200130e-01
  1.204820e-01  1.215020e-01  1.221530e-01  1.223870e-01  1.224490e-01
  1.230730e-01  1.253370e-01  1.264770e-01  1.267020e-01  1.273010e-01
  1.277780e-01  1.283780e-01  1.287670e-01  1.295780e-01  1.296830e-01
  1.299770e-01  1.300170e-01  1.303790e-01  1.306120e-01  1.306670e-01
  1.307830e-01  1.331840e-01  1.343980e-01  1.347990e-01  1.367290e-01
  1.389430e-01  1.397670e-01  1.403470e-01  1.430000e-01  1.446720e-01
  1.476090e-01  1.483340e-01  1.485410e-01  1.488100e-01  1.493600e-01
  1.512030e-01  1.518990e-01  1.522570e-01  1.549480e-01  1.578950e-01
  1.583430e-01  1.621620e-01  1.633110e-01  1.634920e-01  1.639680e-01
  1.651050e-01  1.688520e-01  1.690190e-01  1.725290e-01  1.730280e-01
  1.738100e-01  1.777590e-01  1.807690e-01  1.810440e-01  1.815600e-01
  1.858410e-01  1.863840e-01  1.864840e-01  1.889460e-01  1.900000e-01
  1.939590e-01  1.980850e-01  1.991530e-01  2.000000e-01  2.009940e-01
  2.010660e-01  2.023810e-01  2.065220e-01  2.076020e-01  2.082520e-01
  2.083330e-01  2.104270e-01  2.110730e-01  2.141090e-01  2.160980e-01
  2.241130e-01  2.257140e-01  2.274630e-01  2.293200e-01  2.330290e-01
  2.338710e-01  2.339060e-01  2.362760e-01  2.368890e-01  2.376960e-01
  2.411020e-01  2.411350e-01  2.455000e-01  2.484470e-01  2.491540e-01
  2.500000e-01  2.517880e-01  2.566000e-01  2.571430e-01  2.577960e-01
  2.598430e-01  2.637680e-01  2.670810e-01  2.672020e-01  2.672810e-01
  2.690360e-01  2.690660e-01  2.705150e-01  2.716220e-01  2.763820e-01
  2.790700e-01  2.794610e-01  2.865550e-01  2.875820e-01  2.938570e-01
  2.966360e-01  3.012990e-01  3.019110e-01  3.189130e-01  3.228570e-01
  3.229770e-01  3.270270e-01  3.321690e-01  3.326210e-01  3.336360e-01
  3.345260e-01  3.435780e-01  3.436960e-01  3.440000e-01  3.541490e-01
  3.767710e-01  3.771110e-01  3.776380e-01  4.016670e-01  4.016770e-01
  4.018690e-01  4.066670e-01  4.072990e-01  4.080740e-01  4.234950e-01
  4.501140e-01  4.615380e-01  4.632170e-01  4.644070e-01  4.655170e-01
  4.671120e-01  4.671530e-01  4.691360e-01  4.771050e-01  5.000000e-01
  5.209420e-01  5.286140e-01  5.771810e-01  6.333330e-01  6.475640e-01
  6.917510e-01  8.869780e-01  9.085110e-01  9.452870e-01  9.827590e-01
  1.041667e+00  1.490011e+00  1.562701e+00]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1157 1158 1159], got [-0.871875 -0.775241 -0.569057 ...  1.041667  1.490011  1.562701]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 1733 1734 1735], got [-0.871875 -0.775241 -0.569057 ...  1.041667  1.490011  1.562701]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 2309 2310 2311], got [-0.871875 -0.775241 -0.569057 ...  1.041667  1.490011  1.562701]

--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "C:\Users\sinna\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [   0    1    2 ... 2880 2881 2882], got [-0.871875 -0.775241 -0.569057 ...  1.041667  1.490011  1.562701]


In [None]:
# 9.1: Prediction & Classification Report
y_pred = best_model.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred))

# 9.2: ROC AUC + Precision‐Recall AUC
y_proba = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)

print(f"Test ROC AUC:  {roc_auc:.4f}")
print(f"Test PR AUC:   {pr_auc:.4f}")

# 9.3: Display a confusion matrix if you like
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix (low values = better balance):\n", cm)

# ───────────
# 10. FEATURE IMPORTANCE AND NEXT STEPS
# ───────────

importances = best_model.named_steps['clf'].feature_importances_
feat_imp = pd.Series(importances, index=feature_columns).sort_values(ascending=False)
print("\nTop 10 Feature Importances:")
print(feat_imp.head(10))


Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.44      0.37      0.40       355
           1       0.63      0.70      0.66       541

    accuracy                           0.57       896
   macro avg       0.53      0.53      0.53       896
weighted avg       0.55      0.57      0.56       896

Test ROC AUC:  0.5590
Test PR AUC:   0.6516
Confusion Matrix (low values = better balance):
 [[131 224]
 [165 376]]

Top 10 Feature Importances:
oibdpy_QoQ_growth    0.053566
xsgay_QoQ_growth     0.051362
xoptepsy             0.050947
oibdpy               0.050897
Tax_rate             0.050676
Capex_to_Revenue     0.049257
cshfdy               0.049228
mom_12_1             0.048768
xsgay                0.048410
rdipay               0.046477
dtype: float32
