In [2]:
API_KEY = "YOUR_API_KEY"  # Set up personal Alpaca API key before running the script
SECRET_KEY = "YOUR_SECRET_KEY"
BASE_URL = "https://paper-api.alpaca.markets"
from alpaca.data.historical import StockHistoricalDataClient
from alpaca.data.requests import StockBarsRequest
from alpaca.data.timeframe import TimeFrame
from datetime import datetime, timedelta
import pandas as pd
import importlib
import functions
importlib.reload(functions)

<module 'functions' from '/Users/lirunhe/Desktop/Stock Prediction/functions.py'>

In [3]:
# 1. Authenticate
client = StockHistoricalDataClient(API_KEY, SECRET_KEY)

# 2. Set up timeframe
end_date = datetime.now()- timedelta(days=1)  # Make end date yesterday to predict and evaluate yesterday's market
start_date = end_date - timedelta(days=500) # Request 500 days to ensure 300 days of trading day data
symbol = ["LMT"] # Stock ticker

# 3. Set up the request
request_params = StockBarsRequest(
    symbol_or_symbols = symbol,      
    timeframe = TimeFrame.Day,          # TimeFrame = Day for daily return data
    start = start_date,               
    end = end_date                      
)

# 4. Fetch the data
bars = client.get_stock_bars(request_params)

# 5. Convert to DataFrame
df = bars.df.reset_index()
df = df.sort_values("timestamp")
df = (
    df.groupby("symbol", group_keys=False)
      .tail(321)                         # 321 rows, 300 after taking out 21 N/As from shifting and calculation
      .reset_index(drop=True)           
)

In [4]:
df

Unnamed: 0,symbol,timestamp,open,high,low,close,volume,trade_count,vwap
0,LMT,2024-04-03 04:00:00+00:00,452.92,454.1469,447.590,447.90,758767.0,26864.0,448.913734
1,LMT,2024-04-04 04:00:00+00:00,450.26,455.9800,447.145,454.04,1398347.0,38908.0,452.571627
2,LMT,2024-04-05 04:00:00+00:00,452.41,455.5732,449.850,455.38,892120.0,29698.0,453.590634
3,LMT,2024-04-08 04:00:00+00:00,456.00,457.0000,452.270,452.38,751180.0,28135.0,453.510247
4,LMT,2024-04-09 04:00:00+00:00,448.69,452.0600,446.180,447.57,842962.0,31257.0,447.998298
...,...,...,...,...,...,...,...,...,...
316,LMT,2025-07-09 04:00:00+00:00,465.38,465.9855,460.205,463.06,892696.0,37768.0,462.473059
317,LMT,2025-07-10 04:00:00+00:00,458.68,466.4800,456.000,464.31,1255293.0,43750.0,463.371261
318,LMT,2025-07-11 04:00:00+00:00,464.75,468.2100,461.820,467.51,1209186.0,43407.0,466.199567
319,LMT,2025-07-14 04:00:00+00:00,470.00,474.7600,468.000,473.57,1488005.0,53256.0,472.593694


In [5]:
def add_lagged_features(df):
    # All features should have shift(1), except for target
    # Lagged close, open and volume
    for col in ['close', 'open', 'volume']:
        for lag in range(1, 4):
            df[f'{col}_lag{lag}'] = df[col].shift(lag)

    # Returns and gaps
    df['return_1d'] = df['close'].pct_change(1).shift(1)
    df['return_5d'] = df['close'].pct_change(5).shift(1)
    df['gap'] = df['open'] - df['close'].shift(1)
    df['gap_pct'] = df['gap'] / df['close'].shift(1)

    # Rolling statistics
    df['sma_5'] = df['close'].rolling(window=5).mean().shift(1)
    df['sma_10'] = df['close'].rolling(window=10).mean().shift(1)
    df['std_5'] = df['close'].rolling(window=5).std().shift(1)
    df['std_10'] = df['close'].rolling(window=10).std().shift(1)
    df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean().shift(1)

    # Price ratios
    df['price_vs_sma5'] = df['close'].shift(1) / df['sma_5']
    df['price_vs_sma10'] = df['close'].shift(1) / df['sma_10']
    df['price_vs_vwap'] = df['close'].shift(1) / df['vwap'].shift(1)

    # Volume features
    df['volume_avg_10'] = df['volume'].rolling(window=10).mean().shift(1)
    df['volume_spike'] = df['volume'].shift(1) / df['volume_avg_10']
    df['volume_change_pct'] = df['volume'].pct_change(1).shift(1)

    # Volatility and range
    df['range'] = df['high'] - df['low']
    df['range_lag1'] = df['range'].shift(1)

    # MACD and signal
    ema_12_raw = df['close'].ewm(span=12, adjust=False).mean()
    ema_26_raw = df['close'].ewm(span=26, adjust=False).mean()
    macd_raw = ema_12_raw - ema_26_raw
    df['MACD'] = macd_raw.shift(1)
    df['MACD_signal'] = macd_raw.shift(1).ewm(span=9, adjust=False).mean()

    # RSI 14
    delta = df['close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    df['RSI_14'] = (100 - (100 / (1 + rs))).shift(1)

    # Longer return
    df['return_10d'] = df['close'].pct_change(10).shift(1)

    # Target: next day's return
    df['target'] = df['close'].pct_change().shift(-1)

    sma_20 = df['close'].rolling(window=20).mean()
    std_20 = df['close'].rolling(window=20).std()
    upper_band = sma_20 + (2 * std_20)
    lower_band = sma_20 - (2 * std_20)

    # How far price is from the bands (normalized distance)
    df['price_vs_upper_band'] = df['close'].shift(1) / upper_band.shift(1)
    df['price_vs_lower_band'] = df['close'].shift(1) / lower_band.shift(1)

    # --- 2. Rolling Skewness of 1-day returns ---
    df['return_skew_5d'] = df['return_1d'].rolling(window=5).skew().shift(1)

    # --- 3. Stochastic Oscillator %K ---
    low_14 = df['low'].rolling(window=14).min()
    high_14 = df['high'].rolling(window=14).max()
    df['stoch_k'] = ((df['close'] - low_14) / (high_14 - low_14 + 1e-6)).shift(1)

    # --- 4. Price Acceleration (second derivative of close price) ---
    df['price_accel'] = df['close'].shift(1) - 2 * df['close'].shift(2) + df['close'].shift(3)

    df.drop(columns=[
        'open',  # current-day open (used in gap)
        'high',  # current-day high
        'low',  # current-day low
        'close',  # current-day close (used in target, return, etc.)
        'volume',  # current-day volume
        'trade_count',  # not used in any feature
        'vwap',  # raw current-day vwap (you use vwap.shift(1) instead)
        'range',  # current-day high-low range (already lagged as range_lag1)
    ], errors='ignore', inplace=True)

    # Drop all rows with any NaN values except the last row (for prediction)
    last_row = df.iloc[[-1]].copy()
    df.dropna(inplace=True)
    df.reset_index(drop=True, inplace=True)

    # Add one last row with latest available features (but NaN target)
    df.loc[len(df)] = last_row.iloc[0]

    return df

In [6]:
add_lagged_features(df)

Unnamed: 0,symbol,timestamp,close_lag1,close_lag2,close_lag3,open_lag1,open_lag2,open_lag3,volume_lag1,volume_lag2,...,MACD,MACD_signal,RSI_14,return_10d,target,price_vs_upper_band,price_vs_lower_band,return_skew_5d,stoch_k,price_accel
0,LMT,2024-05-01 04:00:00+00:00,464.93,467.55,461.29,467.08,461.99,463.90,711757.0,951555.0,...,4.060010,3.141022,67.062468,0.023376,0.003184,0.992505,1.044836,0.388204,0.649929,-8.88
1,LMT,2024-05-02 04:00:00+00:00,461.73,464.93,467.55,462.61,467.08,461.99,995050.0,711757.0,...,3.848427,3.282503,61.383983,0.012455,-0.002785,0.985418,1.034711,0.452046,0.495945,-0.58
2,LMT,2024-05-03 04:00:00+00:00,463.20,461.73,464.93,461.10,462.61,467.08,1011394.0,995050.0,...,3.756066,3.377216,65.655577,0.015589,0.001883,0.987233,1.037335,0.598376,0.528069,4.67
3,LMT,2024-05-06 04:00:00+00:00,461.91,463.20,461.73,462.93,461.10,462.61,910781.0,1011394.0,...,3.537992,3.409371,61.180046,-0.004225,0.008427,0.983641,1.033865,1.262922,0.466756,-2.76
4,LMT,2024-05-07 04:00:00+00:00,462.78,461.91,463.20,463.16,462.93,461.10,796391.0,910781.0,...,3.396220,3.406741,60.822898,0.003143,-0.001114,0.984754,1.034221,1.248304,0.506646,2.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,LMT,2025-07-09 04:00:00+00:00,463.01,469.06,462.52,464.20,464.50,463.71,1242958.0,1224120.0,...,-2.309064,-2.096704,46.907925,-0.020002,0.002699,0.956311,1.027144,-0.497174,0.373665,-12.59
297,LMT,2025-07-10 04:00:00+00:00,463.06,463.01,469.06,465.38,464.20,464.50,892696.0,1242958.0,...,-2.301016,-2.137566,34.716485,0.006215,0.006892,0.960442,1.026678,0.182130,0.375283,6.10
298,LMT,2025-07-11 04:00:00+00:00,464.31,463.06,463.01,458.68,465.38,464.20,1255293.0,892696.0,...,-2.168772,-2.143808,45.099383,0.012915,0.012962,0.965814,1.029156,0.711694,0.415723,1.20
299,LMT,2025-07-14 04:00:00+00:00,467.51,464.31,463.06,464.75,458.68,465.38,1209186.0,1255293.0,...,-1.785177,-2.072081,46.611864,0.020987,-0.007285,0.972706,1.033483,-0.087461,0.519249,1.95


In [7]:
df

Unnamed: 0,symbol,timestamp,close_lag1,close_lag2,close_lag3,open_lag1,open_lag2,open_lag3,volume_lag1,volume_lag2,...,MACD,MACD_signal,RSI_14,return_10d,target,price_vs_upper_band,price_vs_lower_band,return_skew_5d,stoch_k,price_accel
0,LMT,2024-05-01 04:00:00+00:00,464.93,467.55,461.29,467.08,461.99,463.90,711757.0,951555.0,...,4.060010,3.141022,67.062468,0.023376,0.003184,0.992505,1.044836,0.388204,0.649929,-8.88
1,LMT,2024-05-02 04:00:00+00:00,461.73,464.93,467.55,462.61,467.08,461.99,995050.0,711757.0,...,3.848427,3.282503,61.383983,0.012455,-0.002785,0.985418,1.034711,0.452046,0.495945,-0.58
2,LMT,2024-05-03 04:00:00+00:00,463.20,461.73,464.93,461.10,462.61,467.08,1011394.0,995050.0,...,3.756066,3.377216,65.655577,0.015589,0.001883,0.987233,1.037335,0.598376,0.528069,4.67
3,LMT,2024-05-06 04:00:00+00:00,461.91,463.20,461.73,462.93,461.10,462.61,910781.0,1011394.0,...,3.537992,3.409371,61.180046,-0.004225,0.008427,0.983641,1.033865,1.262922,0.466756,-2.76
4,LMT,2024-05-07 04:00:00+00:00,462.78,461.91,463.20,463.16,462.93,461.10,796391.0,910781.0,...,3.396220,3.406741,60.822898,0.003143,-0.001114,0.984754,1.034221,1.248304,0.506646,2.16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,LMT,2025-07-09 04:00:00+00:00,463.01,469.06,462.52,464.20,464.50,463.71,1242958.0,1224120.0,...,-2.309064,-2.096704,46.907925,-0.020002,0.002699,0.956311,1.027144,-0.497174,0.373665,-12.59
297,LMT,2025-07-10 04:00:00+00:00,463.06,463.01,469.06,465.38,464.20,464.50,892696.0,1242958.0,...,-2.301016,-2.137566,34.716485,0.006215,0.006892,0.960442,1.026678,0.182130,0.375283,6.10
298,LMT,2025-07-11 04:00:00+00:00,464.31,463.06,463.01,458.68,465.38,464.20,1255293.0,892696.0,...,-2.168772,-2.143808,45.099383,0.012915,0.012962,0.965814,1.029156,0.711694,0.415723,1.20
299,LMT,2025-07-14 04:00:00+00:00,467.51,464.31,463.06,464.75,458.68,465.38,1209186.0,1255293.0,...,-1.785177,-2.072081,46.611864,0.020987,-0.007285,0.972706,1.033483,-0.087461,0.519249,1.95


## Predict using random forest regressor

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Break up training and testing data
features = [col for col in df.columns if col not in ['symbol','timestamp', 'target']]
X = df[features].values
y = df['target'].values
X_train = X[:300]
X_test = X[300:301]

In [10]:
X_train = X[:300]
y_train = y[:300]
X_test = X[300:301]

In [11]:
scaler = StandardScaler()
scaler

In [12]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

In [14]:
model.predict(X_test)

array([-0.03629211])