In [2]:
import json
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import alpaca
from alpaca.trading.client import TradingClient
from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
from alpaca.data.historical.stock import StockHistoricalDataClient
from alpaca.trading.stream import TradingStream
from alpaca.data.live.stock import StockDataStream

from alpaca.data.requests import (
    StockBarsRequest,
    StockTradesRequest,
    StockQuotesRequest
)
from alpaca.trading.requests import (
    GetAssetsRequest, 
    MarketOrderRequest, 
    LimitOrderRequest, 
    StopOrderRequest, 
    StopLimitOrderRequest, 
    TakeProfitRequest, 
    StopLossRequest, 
    TrailingStopOrderRequest, 
    GetOrdersRequest, 
    ClosePositionRequest
)
from alpaca.trading.enums import ( 
    AssetStatus, 
    AssetExchange, 
    OrderSide, 
    OrderType, 
    TimeInForce, 
    OrderClass, 
    QueryOrderStatus
)
from alpaca.common.exceptions import APIError

from sklearn.preprocessing import StandardScaler
import pandas as pd
from xgboost import XGBClassifier

In [3]:
api_key = ""
secret_key = ""

paper = True 

# Below are the variables for development this documents
# Please do not change these variables
trade_api_url = None
trade_api_wss = None
data_api_url = None
stream_data_wss = None

# Fetching Market Data

In [4]:
# setup stock historical data client
stock_historical_data_client = StockHistoricalDataClient(api_key, secret_key, url_override = data_api_url)

In [19]:
symbols = ['META', 'AAPL', 'MSFT', 'AMZN', 'GOOG', 'TSLA']
now = datetime.now(ZoneInfo("America/New_York"))
start_date = datetime(2022, 9, 1)  # Starting from September 1, 2022
end_date = datetime(2024, 3, 1)    # Ending on September 1, 2023
train_percent = 0.66

In [36]:
train_dict = {}
val_dict = {}
for symbol in symbols:
    print(f"Fetching {symbol} data...")
    req = StockBarsRequest(
        symbol_or_symbols = [symbol],
        timeframe=TimeFrame(amount = 5, unit = TimeFrameUnit.Minute), 
        start = start_date,     
        end=end_date,                                                        
    )
    df = stock_historical_data_client.get_stock_bars(req).df.loc[:,["close","volume"]]
    train_cutoff = int(len(df) * 0.66)
    train_dict[symbol] = df.iloc[:train_cutoff]
    val_dict[symbol] = df.iloc[train_cutoff:]

Fetching META data...
Fetching AAPL data...
Fetching MSFT data...
Fetching AMZN data...
Fetching GOOG data...
Fetching TSLA data...


# Technical Indicator Functions

In [7]:
def calculate_bollinger_bands(data, window=20, num_of_std=2):
    """Calculate Bollinger Bands ratio wrt current price"""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    bb_ratio = (data - rolling_mean) / (rolling_std * num_of_std)
    return bb_ratio


In [8]:
def calculate_rsi(data, window=20):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

In [9]:
def calculate_sma(data, window=20):
    """Calculate SMA ratio of current price."""
    rolling_mean = data.rolling(window=window).mean()
    sma = (data / rolling_mean) - 1
    return sma

In [10]:
def calculate_obv(data):
    # Initialize OBV series with the same index as the DataFrame
    obv = [0]
    
    # Loop through each row in the DataFrame
    for i in range(1, len(data)):
        if data['close'].iloc[i] > data['close'].iloc[i - 1]:
            # Price went up, add the volume
            obv.append(obv[-1] + data['volume'].iloc[i])
        elif data['close'].iloc[i] < data['close'].iloc[i - 1]:
            # Price went down, subtract the volume
            obv.append(obv[-1] - data['volume'].iloc[i])
        else:
            # Price stayed the same, OBV remains unchanged
            obv.append(obv[-1]) 
    return obv


In [11]:
def calculate_ema(data, window = 20):
    """Calculate EMA ratio of current price."""
    rolling_mean = data.ewm(span=window, adjust=False).mean()
    ema = (data / rolling_mean) - 1
    return ema

In [12]:
def calculate_macd(prices, short_window=12, long_window=26, signal_window=9):
    """
    Calculate the MACD line, Signal line, and MACD Histogram.
    """
    # Calculate the short and long EMAs
    short_ema = prices.ewm(span=short_window, adjust=False).mean()
    long_ema = prices.ewm(span=long_window, adjust=False).mean()
    
    # Calculate the MACD line
    macd_line = short_ema - long_ema
    
    # Calculate the Signal line
    signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
    
    # Calculate the MACD Histogram
    macd_histogram = macd_line - signal_line
    
    # Combine the results in a DataFrame
    macd_df = pd.DataFrame({
        'MACD': macd_line,
        'Signal': signal_line,
        'Histogram': macd_histogram
    })
    
    return macd_df
    

# Label Generation Functions

In [13]:
def bollinger_detection(bb, thresh = 1):
    """
    Assign labels based on bollinger band crossings.
    """
    # Initialize the result list with zeros
    crossings = [0] * len(bb)
    
    # Loop through each element, starting from the second element (index 1)
    for i in range(1, len(bb)):
        # Check for -1 crossing: previous value <= -1 and current value > -1
        if bb.iloc[i-1] <= -thresh and bb.iloc[i] > -thresh:
            crossings[i] = 1
        # Check for 1 crossing: previous value >= 1 and current value < 1
        elif bb.iloc[i-1] >= thresh and bb.iloc[i] < thresh:
            crossings[i] = 2
    
    return pd.Series(crossings, index=bb.index)

In [14]:
def rsi_detection(rsi, low_thresh = 30, high_thresh = 70):
    """
    Assign labels based on RSI crossings.
    """
    # Initialize the result list with zeros
    crossings = [0] * len(rsi)
    
    # Loop through each element, starting from the second element (index 1)
    for i in range(1, len(rsi)):
        # Check for low crossing: previous value <= low_thresh and current value > low_thresh
        if rsi.iloc[i-1] <= low_thresh and rsi.iloc[i] > low_thresh:
            crossings[i] = 1
        # Check for high crossing: previous value >= 1 and current value < 1
        elif rsi.iloc[i-1] >= high_thresh and rsi.iloc[i] < high_thresh:
            crossings[i] = 2
    
    return pd.Series(crossings, index=rsi.index)

In [15]:
def generate_labels(df):
    """
    Generate trading labels for algorithm. 0 = hold, 1 = buy, 2 = sell.
    """
    labels_bb = bollinger_detection(df["bb"])
    labels_rsi = rsi_detection(df["rsi"])
    
    labels_final = pd.Series(
        [val if val == labels_bb.iloc[i] else 0 for i, val in enumerate(labels_rsi)],
        index=labels_rsi.index
    )
    
    return labels_final
    

# Generating Feature Tables

In [37]:
for data in (train_dict, val_dict):
    for symbol in symbols:
        # Calc indicators / features
        data[symbol].loc[:,'bb'] = calculate_bollinger_bands(data[symbol]['close'])
        data[symbol].loc[:,'sma'] = calculate_sma(data[symbol]['close'])
        data[symbol].loc[:,'rsi'] = calculate_rsi(data[symbol]['close'])
        data[symbol].loc[:,'obv'] = calculate_obv(data[symbol])
        data[symbol].loc[:,'ema'] = calculate_ema(data[symbol]['close'])
        macd_df = calculate_macd(data[symbol]['close'])
        data[symbol] = pd.concat([data[symbol], macd_df], axis=1)
        # Generate Labels
        labels_df = data[symbol].shift(periods=-2)
        data[symbol].loc[:,'label'] = generate_labels(labels_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[symbol].loc[:,'bb'] = calculate_bollinger_bands(data[symbol]['close'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[symbol].loc[:,'sma'] = calculate_sma(data[symbol]['close'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[symbol].loc[:,'rsi'] = calculate_rsi(data[symbol]['close'])

In [39]:
# Stack all dataframes
train_df = pd.concat(train_dict.values(), ignore_index=True)
val_df = pd.concat(val_dict.values(), ignore_index=True)
# Drop NaN
train_df.dropna(inplace=True)
val_df.dropna(inplace=True)

In [None]:
# Optional Normalization (Depending on Model)

In [44]:
n_buy = (train_df['label'] == 1).sum()
n_sell = (train_df['label'] == 2).sum()
print(f"{n_buy} total train buy labels") 
print(f"{n_sell} total train sell labels")

n_buy = (val_df['label'] == 1).sum()
n_sell = (val_df['label'] == 2).sum()
print(f"{n_buy} total val buy labels") 
print(f"{n_sell} total val sell labels")

897 total train buy labels
967 total train sell labels
444 total val buy labels
493 total val sell labels


In [46]:
train_df.describe()

Unnamed: 0,close,volume,bb,sma,rsi,obv,ema,MACD,Signal,Histogram,label
count,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0,258743.0
mean,175.679824,355580.7,0.016718,4.7e-05,50.52086,60362950.0,4e-05,0.006164,0.006153,1.1e-05,0.010941
std,70.671107,1008820.0,0.654127,0.004999,14.483776,160186200.0,0.004279,0.407344,0.384456,0.120374,0.135264
min,81.56,100.0,-2.12412,-0.169462,1.810865,-312984300.0,-0.161479,-5.899816,-4.91741,-2.541669,0.0
25%,117.41,3723.0,-0.4863,-0.001661,40.55666,-44844980.0,-0.001447,-0.136234,-0.131425,-0.042206,0.0
50%,155.26,53886.0,0.025437,5.9e-05,50.495809,53193450.0,6.8e-05,0.008358,0.008741,-1.7e-05,0.0
75%,233.81995,372671.0,0.519996,0.001863,60.548812,177728700.0,0.001609,0.153661,0.147616,0.043125,0.0
max,365.95,114007600.0,2.123587,0.155642,97.254902,440228300.0,0.137252,8.038458,7.229975,3.19888,2.0


In [48]:
train_df.corr()

Unnamed: 0,close,volume,bb,sma,rsi,obv,ema,MACD,Signal,Histogram,label
close,1.0,-0.027383,0.005078,0.012057,0.012137,0.568778,0.014333,0.025475,0.027024,-0.000106,-0.000926
volume,-0.027383,1.0,0.008063,0.008065,0.004705,0.036319,0.007541,0.003961,0.003049,0.003666,0.013308
bb,0.005078,0.008063,1.0,0.706737,0.752389,0.000119,0.705343,0.448774,0.300021,0.560425,0.049038
sma,0.012057,0.008065,0.706737,1.0,0.69178,0.00164,0.982406,0.694428,0.495042,0.768846,0.029626
rsi,0.012137,0.004705,0.752389,0.69178,1.0,0.003434,0.701601,0.677906,0.579408,0.443486,0.03686
obv,0.568778,0.036319,0.000119,0.00164,0.003434,1.0,0.00231,0.008409,0.009125,-0.000689,-0.001359
ema,0.014333,0.007541,0.705343,0.982406,0.701601,0.00231,1.0,0.735012,0.565838,0.68007,0.031343
MACD,0.025475,0.003961,0.448774,0.694428,0.677906,0.008409,0.735012,1.0,0.95541,0.332553,0.014891
Signal,0.027024,0.003049,0.300021,0.495042,0.579408,0.009125,0.565838,0.95541,1.0,0.039248,0.009185
Histogram,-0.000106,0.003666,0.560425,0.768846,0.443486,-0.000689,0.68007,0.332553,0.039248,1.0,0.021055


# Model Development

In [52]:
from xgboost import XGBClassifier

In [109]:
X_train = train_df.drop(columns=['label']).reset_index(drop=True)
y_train = train_df['label'].reset_index(drop=True)
X_test = val_df.drop(columns=['label']).reset_index(drop=True)
y_test = val_df['label'].reset_index(drop=True)

In [110]:
X_train

Unnamed: 0,close,volume,bb,sma,rsi,obv,ema,MACD,Signal,Histogram
0,161.70,868.0,0.035078,0.000118,45.508982,2272.0,0.000011,-0.064312,-0.092727,0.028416
1,161.94,500.0,0.489873,0.001621,49.162011,2772.0,0.001353,-0.037538,-0.081689,0.044152
2,161.72,881.0,0.109039,0.000346,46.315789,1891.0,-0.000007,-0.033683,-0.072088,0.038405
3,161.61,1001.0,-0.069768,-0.000210,44.871795,890.0,-0.000622,-0.039054,-0.065481,0.026427
4,162.50,4066.0,1.420646,0.005171,54.255319,4956.0,0.004415,0.028180,-0.046749,0.074929
...,...,...,...,...,...,...,...,...,...,...
258738,253.98,35628.0,-1.531590,-0.008694,15.151515,368735474.0,-0.007915,-0.364344,-0.122231,-0.242113
258739,254.16,26510.0,-1.129329,-0.007538,19.210526,368761984.0,-0.006529,-0.463429,-0.190471,-0.272958
258740,253.84,21177.0,-1.075031,-0.008279,17.848411,368740807.0,-0.007044,-0.561306,-0.264638,-0.296668
258741,253.00,36705.0,-1.181922,-0.010913,15.176715,368704102.0,-0.009356,-0.698602,-0.351431,-0.347171


In [111]:
class_weights = y_train.value_counts(normalize=True)  # Get class distribution
total_samples = len(y_train)
scale_pos_weight = total_samples / (len(class_weights) * class_weights)
print(scale_pos_weight)

label
0    8.687351e+04
2    2.307754e+07
1    2.487846e+07
Name: proportion, dtype: float64


In [112]:
model = XGBClassifier(eval_metric='mlogloss', n_estimators = 100)

model.fit(X_train, y_train, sample_weight=y_train.map(scale_pos_weight))


In [113]:
y_preds = model.predict(X_test)

In [114]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_preds)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision, recall, and F1 score (averaging by 'macro', 'micro', or 'weighted')
precision = precision_score(y_test, y_preds, average='weighted')
recall = recall_score(y_test, y_preds, average='weighted')
f1 = f1_score(y_test, y_preds, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_preds)
print("\nConfusion Matrix:")
print(conf_matrix)

# Classification Report
class_report = classification_report(y_test, y_preds)
print("\nClassification Report:")
print(class_report)

Accuracy: 0.91
Precision: 0.99
Recall: 0.91
F1 Score: 0.95

Confusion Matrix:
[[121071   5541   5693]
 [   244    200      0]
 [   258      0    235]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.92      0.95    132305
           1       0.03      0.45      0.06       444
           2       0.04      0.48      0.07       493

    accuracy                           0.91    133242
   macro avg       0.36      0.61      0.36    133242
weighted avg       0.99      0.91      0.95    133242



In [70]:
model.objective

'multi:softprob'

In [82]:
model.n_estimators

100

# Market Simulation

In [121]:
# Initialize portfolio values
cash = 10000  # Starting cash
shares_owned = 0
initial_value = cash  # Keep track of the initial portfolio value
df = val_dict['AAPL']
df.dropna(inplace=True)
#df.drop(columns=['label'], inplace = True)
df.reset_index(inplace=True, drop=True)
# Simulate trading strategy
for index, row in df.iterrows():
    current_price = row['close']

    # Example trading strategy: Buy if the price is lower than a threshold, sell if higher
    action = model.predict(pd.DataFrame([row]))

    if action == 1 and cash >= current_price:
        # Buy 1 share
        shares_owned += 1
        cash -= current_price
        print(f"Buying 1 share at ${current_price:.2f} on {index}")

    elif action == 2 and shares_owned > 0:
        # Sell 1 share
        shares_owned -= 1
        cash += current_price
        print(f"Selling 1 share at ${current_price:.2f} on {index}")

# Calculate the final portfolio value
final_value = cash + shares_owned * current_price
print(f"\nInitial Portfolio Value: ${initial_value:.2f}")
print(f"Final Portfolio Value: ${final_value:.2f}")
portfolio_return = (final_value - initial_value) / initial_value * 100
print(f"Portfolio Return: {portfolio_return:.2f}%")

Buying 1 share at $180.03 on 47
Buying 1 share at $180.09 on 56
Selling 1 share at $180.36 on 68
Selling 1 share at $180.48 on 69
Buying 1 share at $180.10 on 89
Buying 1 share at $180.19 on 93
Buying 1 share at $180.18 on 94
Buying 1 share at $180.00 on 95
Buying 1 share at $179.99 on 96
Selling 1 share at $180.94 on 113
Selling 1 share at $181.59 on 115
Selling 1 share at $181.59 on 116
Selling 1 share at $183.65 on 147
Selling 1 share at $184.14 on 166
Buying 1 share at $184.21 on 235
Buying 1 share at $184.22 on 236
Buying 1 share at $184.16 on 237
Buying 1 share at $184.28 on 238
Buying 1 share at $184.09 on 240
Buying 1 share at $184.02 on 241
Buying 1 share at $184.02 on 248
Buying 1 share at $183.86 on 249
Buying 1 share at $183.75 on 250
Buying 1 share at $183.74 on 251
Buying 1 share at $183.57 on 255
Buying 1 share at $183.68 on 257
Selling 1 share at $184.04 on 273
Selling 1 share at $184.11 on 274
Selling 1 share at $184.14 on 276
Selling 1 share at $184.23 on 277
Selling 

In [103]:
val_dict['AAPL']

Unnamed: 0_level_0,Unnamed: 1_level_0,close,volume,bb,sma,rsi,obv,ema,MACD,Signal,Histogram
symbol,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAPL,2023-08-28 20:15:00+00:00,180.0300,13138.0,0.147227,0.000261,60.969912,-2897316.0,0.000154,0.093237,0.092459,0.000778
AAPL,2023-08-28 20:20:00+00:00,180.0100,8701.0,0.042636,0.000070,60.118674,-2906017.0,0.000039,0.082351,0.090437,-0.008086
AAPL,2023-08-28 20:25:00+00:00,180.0700,186080.0,0.207484,0.000309,61.406472,-2719937.0,0.000337,0.077670,0.087884,-0.010214
AAPL,2023-08-28 20:30:00+00:00,180.0700,4859.0,0.170604,0.000240,58.976016,-2719937.0,0.000305,0.073118,0.084931,-0.011813
AAPL,2023-08-28 20:35:00+00:00,180.0300,3338.0,-0.031303,-0.000041,57.329331,-2723275.0,0.000075,0.065527,0.081050,-0.015523
AAPL,...,...,...,...,...,...,...,...,...,...,...
AAPL,2024-02-29 23:40:00+00:00,180.3000,1368.0,-0.684132,-0.001419,26.563763,64183535.0,-0.001072,-0.085367,-0.037552,-0.047814
AAPL,2024-02-29 23:45:00+00:00,180.2800,560.0,-0.667855,-0.001383,27.682331,64182975.0,-0.001070,-0.090856,-0.048213,-0.042643
AAPL,2024-02-29 23:50:00+00:00,180.2700,4603.0,-0.638446,-0.001284,26.218787,64178372.0,-0.001018,-0.094920,-0.057554,-0.037365
AAPL,2024-02-29 23:55:00+00:00,180.3059,765.0,-0.488663,-0.000976,31.810210,64179137.0,-0.000742,-0.094158,-0.064875,-0.029283


In [105]:
df

Unnamed: 0,close,volume,bb,sma,rsi,obv,ema,MACD,Signal,Histogram
0,180.0300,13138.0,0.147227,0.000261,60.969912,-2897316.0,0.000154,0.093237,0.092459,0.000778
1,180.0100,8701.0,0.042636,0.000070,60.118674,-2906017.0,0.000039,0.082351,0.090437,-0.008086
2,180.0700,186080.0,0.207484,0.000309,61.406472,-2719937.0,0.000337,0.077670,0.087884,-0.010214
3,180.0700,4859.0,0.170604,0.000240,58.976016,-2719937.0,0.000305,0.073118,0.084931,-0.011813
4,180.0300,3338.0,-0.031303,-0.000041,57.329331,-2723275.0,0.000075,0.065527,0.081050,-0.015523
...,...,...,...,...,...,...,...,...,...,...
23327,180.3000,1368.0,-0.684132,-0.001419,26.563763,64183535.0,-0.001072,-0.085367,-0.037552,-0.047814
23328,180.2800,560.0,-0.667855,-0.001383,27.682331,64182975.0,-0.001070,-0.090856,-0.048213,-0.042643
23329,180.2700,4603.0,-0.638446,-0.001284,26.218787,64178372.0,-0.001018,-0.094920,-0.057554,-0.037365
23330,180.3059,765.0,-0.488663,-0.000976,31.810210,64179137.0,-0.000742,-0.094158,-0.064875,-0.029283
