In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB


In [2]:
csv_path= "C:/Users/ywexl/OneDrive/Desktop/AIM_5005/QQQ_1min.csv" # Path to your CSV file

# Calculate RSI Feature
def compute_rsi(series, period=14):
    delta = series.diff()  
    gain = delta.clip(lower=0)      
    loss = -1 * delta.clip(upper=0) 

    avg_gain = gain.rolling(window=period).mean()
    avg_loss = loss.rolling(window=period).mean()

    rs = avg_gain / (avg_loss + 1e-10)

    rsi = 100 - (100 / (1.0 + rs))
    return rsi


def load_and_engineer(csv_path,short_window=5, long_window=15, rsi_period=14):
   # Load Data
   df= pd.read_csv(csv_path, parse_dates=['Datetime'], index_col='Datetime')
   df = df.sort_index()

   for col in ['Open', 'High', 'Low', 'Close']:
      df[col] = df[col].astype(str).str.replace(r'[^0-9.]', '', regex=True).pipe(pd.to_numeric, errors='coerce')
   df['Volume']=pd.to_numeric(df['Volume'], errors='coerce').fillna(0).astype(int)
   df['Close_next']=df['Close'].shift(-1)
   df.dropna(subset=['Close', 'Close_next'], inplace=True)
   df['Target'] = (df['Close_next'] > df['Close']).astype(int)
#features:
   df['sma_short'] = df['Close'].rolling(window=short_window).mean()
   df['sma_long'] = df['Close'].rolling(window=long_window).mean()
   df['rolling_std'] = df['Close'].rolling(window=short_window).std()
   df['ema_short'] = df['Close'].ewm(span=short_window, adjust=False).mean()
   df['ema_long'] = df['Close'].ewm(span=long_window, adjust=False).mean()
   df['Volume_Ratio_short'] = df['Volume'].rolling(window=5).mean() / df['Volume'].rolling(window=15).mean()
   df['Volume_Ratio_long'] = df['Volume'].rolling(window=15).mean() / df['Volume'].rolling(window=30).mean()

   df['RSI'] = compute_rsi(df['Close'], rsi_period)

   df.dropna(inplace=True)


   feature_cols=[
      'sma_short',
      'sma_long',
      'rolling_std',
      'ema_short',
      'ema_long',
      'Volume_Ratio_short',
      'Volume_Ratio_long',
      'RSI'
      ]
   x=df[feature_cols].copy()
   y=df['Target'].copy()
   return x, y, df


feature_sets = {
    'Base (SMA & RSI)': [
        'sma_short', 'sma_long', 'rolling_std', 'RSI'
    ],
    'With ema': [
        'sma_short', 'sma_long', 'rolling_std', 'RSI',
        'ema_short', 'ema_long'
    ],
    'With ema + Volume': [
        'sma_short', 'sma_long', 'rolling_std', 'RSI',
        'ema_short', 'ema_long',
        'Volume_Ratio_short', 'Volume_Ratio_long'
    ],
}

In [3]:
X_all, y, df = load_and_engineer(csv_path)
for name, cols in feature_sets.items():
    print(name, "→", X_all[cols].shape)

Base (SMA & RSI) → (1920, 4)
With ema → (1920, 6)
With ema + Volume → (1920, 8)


In [4]:
def run_model(model, X, y, df, test_frac=0.2):
    split= int(len(X)*(1-test_frac))
    X_train=X.iloc[:split]
    X_test=X.iloc[split:]
    y_train=y.iloc[:split]
    y_test=y.iloc[split:]
    model.fit(X_train, y_train)
    y_pred=model.predict(X_test)
    print(f"\n=== {model.__class__.__name__} ===")
    print(classification_report(y_test, y_pred))   

    backtest=df.iloc[split:].copy()
    backtest["BarRet"]= backtest["Close"].pct_change().fillna(0)
    backtest["StrategyRet"]= backtest["BarRet"]*y_pred
    strategy_ret= (1+backtest["StrategyRet"]).cumprod().iloc[-1]-1
    bh_return= (1+backtest["BarRet"]).cumprod().iloc[-1]-1
    return strategy_ret, bh_return

In [5]:
X_all, y, df = load_and_engineer(csv_path)

models =[
    ("Random Forest", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("XGBoost", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ("Naive Bayes", GaussianNB())
]
results = []
for model_name,model in models:
    print(f"\n=== {model_name} ===")
    for fs_name, cols in feature_sets.items():
        X_fs= X_all[cols]
        strategy_ret, bh_ret= run_model(model, X_fs, y, df)
        results.append({
            "model": model_name,
            "feature_set": fs_name,
            "strategy_return": strategy_ret,
            "buy_hold_return": bh_ret
        })


df_results = pd.DataFrame(results)
df_results


=== Random Forest ===

=== RandomForestClassifier ===
              precision    recall  f1-score   support

           0       0.44      0.55      0.49       181
           1       0.48      0.37      0.42       203

    accuracy                           0.46       384
   macro avg       0.46      0.46      0.45       384
weighted avg       0.46      0.46      0.45       384


=== RandomForestClassifier ===
              precision    recall  f1-score   support

           0       0.43      0.53      0.48       181
           1       0.47      0.37      0.42       203

    accuracy                           0.45       384
   macro avg       0.45      0.45      0.45       384
weighted avg       0.45      0.45      0.44       384


=== RandomForestClassifier ===
              precision    recall  f1-score   support

           0       0.45      0.55      0.49       181
           1       0.50      0.40      0.45       203

    accuracy                           0.47       384
   macro 

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== XGBClassifier ===
              precision    recall  f1-score   support

           0       0.46      0.55      0.50       181
           1       0.51      0.42      0.46       203

    accuracy                           0.48       384
   macro avg       0.49      0.49      0.48       384
weighted avg       0.49      0.48      0.48       384


=== XGBClassifier ===
              precision    recall  f1-score   support

           0       0.46      0.55      0.50       181
           1       0.52      0.44      0.48       203

    accuracy                           0.49       384
   macro avg       0.49      0.49      0.49       384
weighted avg       0.49      0.49      0.49       384


=== XGBClassifier ===
              precision    recall  f1-score   support

           0       0.46      0.50      0.48       181
           1       0.51      0.47      0.49       203

    accuracy                           0.48       384
   macro avg       0.49      0.49      0.48       384
weigh

Unnamed: 0,model,feature_set,strategy_return,buy_hold_return
0,Random Forest,Base (SMA & RSI),-0.000104,0.01663
1,Random Forest,With ema,0.000572,0.01663
2,Random Forest,With ema + Volume,0.000198,0.01663
3,Logistic Regression,Base (SMA & RSI),0.000707,0.01663
4,Logistic Regression,With ema,-0.004966,0.01663
5,Logistic Regression,With ema + Volume,-0.00432,0.01663
6,XGBoost,Base (SMA & RSI),0.005011,0.01663
7,XGBoost,With ema,0.003849,0.01663
8,XGBoost,With ema + Volume,0.003088,0.01663
9,Naive Bayes,Base (SMA & RSI),0.00793,0.01663


In [6]:
def run_trade_simulation(df, feature_sets, model_grids, test_size=0.2, n_splits=5, tcost=0.0):
    summary_results = []
    tscv = TimeSeriesSplit(n_splits=n_splits)
    df=df.sort_index()
    df["Return"] = df["Close"].pct_change().fillna(0)
    for fs_name, cols in feature_sets.items():
        X= df[cols]
        y= df["Target"]

        split= int(len(X)*(1-test_size))
        X_train, X_test= X.iloc[:split], X.iloc[split:]
        y_train, y_test= y.iloc[:split], y.iloc[split:]
        df_test= df.loc[X_test.index].copy()

        for model_name,estimator, param_grid in model_grids:
            gs=GridSearchCV(estimator, param_grid, cv=tscv, scoring="accuracy", n_jobs=-1, verbose=0)
            gs.fit(X_train, y_train)
            df_test["signal"]= gs.predict(X_test)
            df_test["signal_prev"]= df_test["signal"].shift(1).fillna(0).astype(int)

            entries=df_test[
                (df_test["signal_prev"]==0) & (df_test["signal"]==1)].index
            exits=df_test[
                (df_test["signal_prev"]==1) & (df_test["signal"]==0)].index

            if len(entries) and len(exits) and exits[0]<entries[0]:
                exits=exits[1:]
            if len(entries) and len(exits) and exits[-1]<entries[-1]:
                exits=exits.append(pd.Index([df_test.index[-1]]))
            pairs=list(zip(entries, exits))

            pnls=[]
            for entry, exit in pairs:
                entry_price= df_test.loc[entry, "Close"]*(1+tcost)
                exit_price= df_test.loc[exit, "Close"]*(1-tcost)
                pnls.append((exit_price/entry_price)-1)
        
            total_return= np.prod([1+pnl for pnl in pnls])-1 if pnls else 0
            num_trades= len(pnls)
            avg_trade_return= np.mean(pnls) if pnls else 0
            win_rate= np.mean(np.array(pnls)>0) if pnls else 0
            average_pnl= np.mean(pnls) if pnls else 0
            accuracy= accuracy_score(y_test, df_test["signal"])

            summary_results.append({
                "model": model_name,
                "feature_set": fs_name,
                "total_return": total_return,
                "num_trades": num_trades,
                "avg_trade_return": avg_trade_return,
                "win_rate": win_rate,
                "average_pnl": average_pnl,
                "accuracy": accuracy
            })
    return pd.DataFrame(summary_results)

In [7]:
rf_param_grid  = {'n_estimators': [50,100,200]}
lr_param_grid  = {'C': [0.01,0.1,1]}
xgb_param_grid = {'max_depth': [3,5], 'learning_rate':[0.01,0.1]}
nb_param_grid  = {}  # no hyperparams

In [8]:

model_grids = [
    ("Random Forest",       RandomForestClassifier(random_state=42), rf_param_grid),
    ("Logistic Regression", LogisticRegression(solver="liblinear", random_state=42), lr_param_grid),
    ("XGBoost",             XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42), xgb_param_grid),
    ("Naive Bayes",         GaussianNB(), nb_param_grid)
]

In [10]:
X_all, _, df = load_and_engineer(csv_path)
trade_df = run_trade_simulation(df, feature_sets, model_grids)
display(trade_df)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,model,feature_set,total_return,num_trades,avg_trade_return,win_rate,average_pnl,accuracy
0,Random Forest,Base (SMA & RSI),0.003726,56,6.7e-05,0.571429,6.7e-05,0.46875
1,Logistic Regression,Base (SMA & RSI),0.008074,21,0.000383,0.666667,0.000383,0.486979
2,XGBoost,Base (SMA & RSI),0.008425,62,0.000136,0.5,0.000136,0.481771
3,Naive Bayes,Base (SMA & RSI),0.010005,7,0.001426,0.571429,0.001426,0.520833
4,Random Forest,With ema,-0.000378,47,-8e-06,0.510638,-8e-06,0.4375
5,Logistic Regression,With ema,0.007829,21,0.000372,0.666667,0.000372,0.486979
6,XGBoost,With ema,0.011161,58,0.000192,0.586207,0.000192,0.489583
7,Naive Bayes,With ema,0.010165,4,0.002538,1.0,0.002538,0.515625
8,Random Forest,With ema + Volume,0.003901,39,0.0001,0.512821,0.0001,0.466146
9,Logistic Regression,With ema + Volume,0.003011,22,0.000137,0.636364,0.000137,0.481771
