# Setup  
- Import Data  
- Create features

In [308]:
import os 
import yfinance as yf
import pandas as pd
import datetime
from datetime import datetime, timedelta
import numpy as np

In [309]:
# Switch the working directory so the relative CSV name used below resolves.
# NOTE(review): absolute, machine-specific path - the notebook will not run on
# another machine without editing this line.
project_dir = 'C:/Users/ywexl/OneDrive/Desktop/AIM_5005'
os.chdir(project_dir)

In [310]:
# Load the CSV; the file name is relative to the current working directory.
csv_name = 'Month_1m_QQQ.csv'
df = pd.read_csv(csv_name)

In [311]:
# Give every column an explicit dtype before any feature engineering.
df['Datetime'] = pd.to_datetime(df['Datetime'])

# Price columns become floats (same order as before: Close, Open, High, Low).
for price_col in ('Close', 'Open', 'High', 'Low'):
    df[price_col] = df[price_col].astype(float)

# Volume is cast to int; astype(int) raises if the column contains NaN.
df['Volume'] = df['Volume'].astype(int)

In [312]:
# Build the prediction target: 1 when the next row's Close is above this row's Close.

# Strip every character that is not a digit or a decimal point, then coerce to
# numeric; anything that still cannot be parsed becomes NaN and is dropped below.
df['Close'] = df['Close'].astype(str).str.replace(r'[^0-9.]', '', regex=True)
df['Close'] = pd.to_numeric(df['Close'], errors='coerce')

# 5-row simple moving average of Close (computed before any rows are dropped).
df['SMA'] = df['Close'].rolling(window=5).mean()

# Close of the following row; the last row has no successor and gets NaN.
# NOTE(review): if the file spans several trading sessions, the last bar of one
# session is paired with the first bar of the next - confirm this is intended.
df['Close_next'] = df['Close'].shift(-1)

# Keep only rows where both prices are known. The explicit .copy() makes the
# filtered frame independent, so the column assignment below never writes into
# a slice of the unfiltered frame (avoids pandas chained-assignment warnings).
# A second dropna on 'Close_next' is unnecessary: those rows are already gone.
df = df.dropna(subset=['Close', 'Close_next']).copy()
df['Target'] = (df['Close_next'] > df['Close']).astype(int)


In [313]:
# Calculate SMA and STD Features
# Window lengths (in rows) for the short and long rolling statistics.
short_window, long_window = 5, 15

close = df['Close']
short_roll = close.rolling(window=short_window)

df['SMA_short'] = short_roll.mean()
df['SMA_long'] = close.rolling(window=long_window).mean()
df['RollingStd'] = short_roll.std()

# Drop every row that has a NaN in any column, which includes the warm-up rows
# of the rolling windows above.
df.dropna(inplace=True)


In [314]:
# Calculate RSI Feature
def compute_rsi(series, period=14):
    """Return a Relative Strength Index computed with simple rolling means.

    Parameters
    ----------
    series : pandas.Series
        Values to analyse (the notebook passes the ``Close`` column).
    period : int, default 14
        Length of the rolling window applied to the gains and to the losses.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + RS)`` where ``RS`` is the rolling mean gain divided
        by the rolling mean loss. At least the first ``period`` entries are
        NaN, because the first difference is NaN and a full window is needed.
    """
    change = series.diff()

    # Split every move into its upward and its downward part (both >= 0).
    up_moves = change.clip(lower=0)
    down_moves = change.clip(upper=0).mul(-1)

    mean_up = up_moves.rolling(period).mean()
    mean_down = down_moves.rolling(period).mean()

    # The small constant keeps the ratio finite when a window has no losses.
    relative_strength = mean_up / (mean_down + 1e-10)

    return 100 - (100 / (1.0 + relative_strength))

# Attach the 14-period RSI of Close, then keep only the rows where it is defined.
df['RSI'] = compute_rsi(df['Close'], period=14)
df = df.loc[df['RSI'].notna()]

In [315]:
# Calculate EMA Feature
# Exponentially weighted means of Close with spans 5 and 15; adjust=False
# selects the recursive form of the average.
df['EMA_Short'] = df['Close'].ewm(span=5, adjust=False).mean()
df['EMA_Long'] = df['Close'].ewm(span=15, adjust=False).mean()

# One all-column dropna is sufficient: it already removes any row with a NaN
# in EMA_Short or EMA_Long, so a second dropna on that subset is redundant.
df = df.dropna()

In [316]:
# Calculate Volume Feature
df['Volume_SMA_5'] = df['Volume'].rolling(window=5).mean()
df['Volume_SMA_15'] = df['Volume'].rolling(window=15).mean()
df['Volume_Ratio_5'] = df['Volume'] / df['Volume_SMA_5']
df['Volume_Ratio_15'] = df['Volume'] / df['Volume_SMA_15']
df.dropna(inplace=True)

# Model 1 - Random Forest

In [317]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


In [318]:
# Baseline feature set: the two Close SMAs, the rolling standard deviation and RSI.
feature_cols = ['SMA_short', 'SMA_long', 'RollingStd', 'RSI']
X, y = df[feature_cols], df['Target']

# 80/20 split; shuffle=False keeps the rows in order, so the last 20% form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)


In [319]:
# Run and Evaluate Model
# 100-tree random forest with a fixed seed, fitted on the training rows.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report hold-out accuracy and the per-class precision/recall table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))



Accuracy: 0.5096649484536082
              precision    recall  f1-score   support

           0       0.51      0.54      0.52       770
           1       0.51      0.48      0.49       782

    accuracy                           0.51      1552
   macro avg       0.51      0.51      0.51      1552
weighted avg       0.51      0.51      0.51      1552



In [320]:


# Back-test on the hold-out rows: take the next-row return whenever the model
# predicts 1, otherwise record a return of 0 for that row.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Cumulative return of the strategy
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()

# Buy and Hold Return: every Close relative to the first hold-out Close.
# NOTE(review): this benchmark ends at the last ActualClose, while the strategy
# also includes the final row's move to Close_next - the horizons differ by one row.
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")


Final Strategy Return: 3.51%
Final Buy and Hold Return: 0.96%


# Parameter 2 - EMA

In [321]:

# Second feature set: the baseline columns plus the two EMAs of Close.
feature_cols = ['SMA_short', 'SMA_long', 'RollingStd', 'RSI', 'EMA_Short', 'EMA_Long']
X, y = df[feature_cols], df['Target']

# Same ordered 80/20 split as before (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [322]:
# Refit the same random-forest configuration on the current feature set and score it.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.49677835051546393
              precision    recall  f1-score   support

           0       0.49      0.53      0.51       770
           1       0.50      0.47      0.48       782

    accuracy                           0.50      1552
   macro avg       0.50      0.50      0.50      1552
weighted avg       0.50      0.50      0.50      1552



In [323]:


# Back-test the current predictions on the hold-out rows: next-row return when
# the model predicts 1, otherwise 0.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Compound the per-row strategy returns; benchmark is Close relative to the first hold-out Close.
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")

Final Strategy Return: 1.98%
Final Buy and Hold Return: 0.96%


# Parameter 3 - Volume

In [324]:
# Third feature set: baseline columns, the two EMAs and the two volume ratios.
feature_cols = [
    'SMA_short', 'SMA_long', 'RollingStd', 'RSI',
    'EMA_Short', 'EMA_Long',
    'Volume_Ratio_5', 'Volume_Ratio_15',
]
X, y = df[feature_cols], df['Target']

# Ordered 80/20 split (no shuffling), identical to the earlier runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [325]:
# Same random-forest configuration, now trained on the feature set with volume ratios.
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.5180412371134021
              precision    recall  f1-score   support

           0       0.51      0.53      0.52       770
           1       0.52      0.51      0.51       782

    accuracy                           0.52      1552
   macro avg       0.52      0.52      0.52      1552
weighted avg       0.52      0.52      0.52      1552



In [326]:

# Back-test the current predictions on the hold-out rows: next-row return when
# the model predicts 1, otherwise 0.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Compound the per-row strategy returns; benchmark is Close relative to the first hold-out Close.
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")

Final Strategy Return: 3.30%
Final Buy and Hold Return: 0.96%


# Model 2 - Logistic Regression

In [327]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [328]:
# Logistic regression starts from the four baseline features.
feature_cols = ['SMA_short', 'SMA_long', 'RollingStd', 'RSI']
X, y = df[feature_cols], df['Target']

# Ordered 80/20 split by row position: first 80% of rows train, the rest test.
train_size = int(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

In [329]:
# Hyper-parameter search over the regularisation strength C and the penalty
# type, using the liblinear solver for every candidate.
logreg = LogisticRegression()
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Five time-ordered folds, scored by accuracy, evaluated on all available cores.
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(logreg, param_grid, scoring='accuracy', cv=tscv, n_jobs=-1)

In [330]:
# Find best parameters and run + evaluate model
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Predict the hold-out rows with the best estimator found by the search.
best_logreg = grid_search.best_estimator_
y_pred = best_logreg.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV Score: 0.5067698259187621
Test Accuracy: 0.4954896907216495
              precision    recall  f1-score   support

           0       0.49      0.55      0.52       770
           1       0.50      0.44      0.47       782

    accuracy                           0.50      1552
   macro avg       0.50      0.50      0.49      1552
weighted avg       0.50      0.50      0.49      1552



In [331]:
# Back-test the logistic-regression predictions on the hold-out rows:
# next-row return when the model predicts 1, otherwise 0.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Compound the per-row strategy returns; benchmark is Close relative to the first hold-out Close.
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")

Final Strategy Return: 1.18%
Final Buy and Hold Return: 0.96%


# Parameter 2 - EMA

In [332]:
# Feature set for this run: baseline columns plus the two EMAs of Close.
feature_cols = ['SMA_short', 'SMA_long', 'RollingStd', 'RSI', 'EMA_Short', 'EMA_Long']
X, y = df[feature_cols], df['Target']

In [333]:
# Ordered 80/20 split by row position: first 80% of rows train, the rest test.
train_size = int(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

# Search C and penalty type for a liblinear logistic regression.
logreg = LogisticRegression()
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Five time-ordered folds, scored by accuracy, evaluated on all available cores.
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(logreg, param_grid, scoring='accuracy', cv=tscv, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Predict the hold-out rows with the best estimator found by the search.
best_logreg = grid_search.best_estimator_
y_pred = best_logreg.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV Score: 0.5061895551257253
Test Accuracy: 0.48195876288659795
              precision    recall  f1-score   support

           0       0.48      0.55      0.51       770
           1       0.48      0.41      0.45       782

    accuracy                           0.48      1552
   macro avg       0.48      0.48      0.48      1552
weighted avg       0.48      0.48      0.48      1552



In [334]:
# Back-test the logistic-regression predictions on the hold-out rows:
# next-row return when the model predicts 1, otherwise 0.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Compound the per-row strategy returns; benchmark is Close relative to the first hold-out Close.
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")

Final Strategy Return: 0.59%
Final Buy and Hold Return: 0.96%


# Parameter 3 - Volume

In [335]:
# Feature set for this run: baseline columns, the two EMAs and the two volume ratios.
feature_cols = [
    'SMA_short', 'SMA_long', 'RollingStd', 'RSI',
    'EMA_Short', 'EMA_Long',
    'Volume_Ratio_5', 'Volume_Ratio_15',
]
X, y = df[feature_cols], df['Target']

In [336]:
# Ordered 80/20 split by row position: first 80% of rows train, the rest test.
train_size = int(len(X) * 0.8)
X_train = X.iloc[:train_size]
X_test = X.iloc[train_size:]
y_train = y.iloc[:train_size]
y_test = y.iloc[train_size:]

# Search C and penalty type for a liblinear logistic regression.
logreg = LogisticRegression()
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)

# Five time-ordered folds, scored by accuracy, evaluated on all available cores.
tscv = TimeSeriesSplit(n_splits=5)
grid_search = GridSearchCV(logreg, param_grid, scoring='accuracy', cv=tscv, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Predict the hold-out rows with the best estimator found by the search.
best_logreg = grid_search.best_estimator_
y_pred = best_logreg.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))

Best Parameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Best CV Score: 0.5073500967117989
Test Accuracy: 0.5070876288659794
              precision    recall  f1-score   support

           0       0.50      0.38      0.43       770
           1       0.51      0.63      0.56       782

    accuracy                           0.51      1552
   macro avg       0.51      0.51      0.50      1552
weighted avg       0.51      0.51      0.50      1552



In [337]:
# Back-test the logistic-regression predictions on the hold-out rows:
# next-row return when the model predicts 1, otherwise 0.
test_idx = X_test.index
results = X_test.copy()
results['ActualClose'] = df.loc[test_idx, 'Close']
results['Prediction'] = y_pred
results['Close_next'] = df.loc[test_idx, 'Close_next']

# Simple return from this row's Close to the next row's Close.
next_bar_return = (results['Close_next'] - results['ActualClose']) / results['ActualClose']
results['StrategyReturn'] = np.where(results['Prediction'] == 1, next_bar_return, 0)

# Compound the per-row strategy returns; benchmark is Close relative to the first hold-out Close.
results['CumulativeStrategy'] = (1 + results['StrategyReturn']).cumprod()
first_close = results['ActualClose'].iloc[0]
results['BuyHoldReturn'] = results['ActualClose'] / first_close

final_strategy_return = results['CumulativeStrategy'].iloc[-1] - 1
final_buy_hold_return = results['BuyHoldReturn'].iloc[-1] - 1

print(f"Final Strategy Return: {final_strategy_return * 100:.2f}%")
print(f"Final Buy and Hold Return: {final_buy_hold_return * 100:.2f}%")

Final Strategy Return: 0.98%
Final Buy and Hold Return: 0.96%
