# Implimeting ML model into trade strategy

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from ta import momentum, volatility, trend
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import resample

In [11]:
import asyncio
import os
import pandas as pd
import numpy as np
from dotenv import load_dotenv
from alpaca.data.live import StockDataStream
from alpaca.trading.client import TradingClient
from alpaca.trading.requests import MarketOrderRequest
from alpaca.trading.enums import OrderSide, TimeInForce
from alpaca.data.enums import DataFeed
from alpaca.data.timeframe import TimeFrame
import matplotlib.pyplot as plt
from datetime import datetime
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest

# Load environment variables
load_dotenv()  # Load .env file

API_KEY = os.getenv("ALPACA_API_KEY")
API_SECRET = os.getenv("ALPACA_SECRET_KEY")

# Initialize TradingClient
Trading_Client = TradingClient(API_KEY, API_SECRET, paper=True)  # paper=True for paper trading
data_client = StockHistoricalDataClient(API_KEY, API_SECRET)
SYMBOL = "MSFT"

In [13]:
LOOKAHEAD_BARS = 30
RETURN_THRESHOLD = 0.01  # 1%

# === Step 1: Fetch historical data ===
print(f"Fetching historical data for {SYMBOL}...")
request = StockBarsRequest(
    symbol_or_symbols=[SYMBOL],
    timeframe=TimeFrame.Minute,
    start=datetime(2024, 1, 1),
    end=datetime(2024, 12, 30),
    adjustment='all'
)
bars = data_client.get_stock_bars(request).df

# === Step 2: Clean DataFrame ===
if isinstance(bars.index, pd.MultiIndex):
    df = bars.xs(SYMBOL, level='symbol').reset_index()
else:
    df = bars.reset_index()

df['timestamp'] = pd.to_datetime(df['timestamp'])
df['date'] = df['timestamp'].dt.date

# === Step 3: Feature Engineering ===
print("Generating features...")

# Basic indicators
df['rsi'] = momentum.RSIIndicator(df['close'], window=14).rsi()
df['atr'] = volatility.AverageTrueRange(df['high'], df['low'], df['close'], window=14).average_true_range()
df['macd_diff'] = trend.MACD(df['close']).macd_diff()
df['vol_rolling'] = df['volume'].rolling(30).mean()
df['atr_ratio'] = df['atr'] / df['close']

# Trend info
df['daily_close'] = df.groupby('date')['close'].transform('last')
df['sma_trend'] = df['daily_close'].rolling(20).mean()
df['daily_trend'] = np.where(df['daily_close'] > df['sma_trend'], 'up', 'down')
df['trend_num'] = df['daily_trend'].map({'up': 1, 'down': -1})

# Additional features
df['returns'] = df['close'].pct_change()
df['volatility'] = df['returns'].rolling(20).std()
df['volume_spike'] = df['volume'] / df['volume'].rolling(20).mean()
df['sma_slope'] = df['sma_trend'].diff()

# Lagged momentum features
for lag in range(1, 4):
    df[f'rsi_lag{lag}'] = df['rsi'].shift(lag)
    df[f'macd_diff_lag{lag}'] = df['macd_diff'].shift(lag)

# === Step 4: Return-Based Labeling ===
def generate_labels_by_future_return(df, lookahead=30, threshold=0.01):
    labels = []
    for i in range(len(df) - lookahead):
        future_return = (df['close'].iloc[i + lookahead] - df['close'].iloc[i]) / df['close'].iloc[i]
        labels.append(1 if future_return >= threshold else 0)
    labels += [np.nan] * lookahead
    return pd.Series(labels, index=df.index)

print("Labeling data based on future returns...")
df['label'] = generate_labels_by_future_return(df, lookahead=LOOKAHEAD_BARS, threshold=RETURN_THRESHOLD)

# === Step 5: Prepare Dataset ===
features = [
    'rsi', 'macd_diff', 'vol_rolling', 'atr', 'atr_ratio', 'trend_num',
    'volatility', 'volume_spike', 'sma_slope',
    'rsi_lag1', 'rsi_lag2', 'rsi_lag3',
    'macd_diff_lag1', 'macd_diff_lag2', 'macd_diff_lag3'
]

df = df.dropna(subset=features + ['label'])

X = df[features]
y = df['label']

print("\nLabel distribution after labeling:")
print(y.value_counts())

# === Step 6: Train/Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# === Step 7: Train Model with Class Weighting ===
print("Training model with class_weight='balanced'...")
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    class_weight='balanced',
    random_state=42
)
model.fit(X_train, y_train)

# === Step 8: Evaluation ===
y_pred = model.predict(X_test)
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# === Step 9: Output Predictions ===
df.loc[X_test.index, 'predicted_label'] = y_pred
df.loc[X_test.index, 'predicted_prob'] = model.predict_proba(X_test)[:, 1]

df[['timestamp', 'close', 'label', 'predicted_label', 'predicted_prob']].to_csv("ml_predictions.csv", index=False)
print("\nSaved predictions to ml_predictions.csv")

Fetching historical data for MSFT...
Generating features...
Labeling data based on future returns...

Label distribution after labeling:
label
0.0    151937
1.0       837
Name: count, dtype: int64
Training model with class_weight='balanced'...

=== Classification Report ===
              precision    recall  f1-score   support

         0.0       1.00      0.93      0.96     45628
         1.0       0.01      0.21      0.03       205

    accuracy                           0.93     45833
   macro avg       0.50      0.57      0.49     45833
weighted avg       0.99      0.93      0.96     45833


Confusion Matrix:
[[42465  3163]
 [  162    43]]

Saved predictions to ml_predictions.csv
