In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgbm

### Utility functions

In [2]:
# Module-level state shared between successive getMyPosition calls.
nInst = 50  # number of instruments the state containers below are sized for
currentPos = np.zeros((nInst,))  # position carried over from the previous call
models = [None for _ in range(nInst)]  # one fitted model per instrument, None until trained
last_train_row_count = [0 for _ in range(nInst)]  # history length (nt) at each instrument's last training

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Return the MACD line and its signal line for a price series.

    The MACD line is the fast EMA minus the slow EMA of ``prices``; the signal
    line is an EMA of the MACD line. All EMAs use ``adjust=False``.

    Parameters
    ----------
    prices : pandas.Series
    fast, slow, signal : int
        EMA spans for the fast average, the slow average and the signal line.

    Returns
    -------
    tuple of pandas.Series
        ``(macd, signal_line)``.
    """
    def _ema(series, span):
        # Recursive (non-adjusted) exponential moving average.
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(prices, fast) - _ema(prices, slow)
    return macd_line, _ema(macd_line, signal)

def calculate_bollinger_bands(prices, window=20, num_std=2):
    """Return the upper and lower Bollinger bands of a price series.

    Each band is the rolling mean over ``window`` observations shifted up or
    down by ``num_std`` rolling standard deviations. Rows without a full
    window are NaN.

    Returns
    -------
    tuple of pandas.Series
        ``(upper_band, lower_band)``.
    """
    roller = prices.rolling(window=window)
    centre = roller.mean()
    offset = roller.std() * num_std
    return centre + offset, centre - offset

def calculate_stochastic_oscillator(prices, window=14):
    """Return the stochastic oscillator %K and %D lines of a price series.

    %K places the current price within the rolling min/max range of the last
    ``window`` observations, scaled to 0-100. %D is the 3-period rolling mean
    of %K. A window whose min equals its max yields NaN (0/0).

    Returns
    -------
    tuple of pandas.Series
        ``(k, d)``.
    """
    roller = prices.rolling(window=window)
    lowest = roller.min()
    highest = roller.max()
    percent_k = 100 * (prices - lowest) / (highest - lowest)
    percent_d = percent_k.rolling(window=3).mean()
    return percent_k, percent_d

### Predicting the direction of the next price change

In [3]:
#features with binary target
def prepare_features(prcSoFar, inst):
    """Build the feature frame and binary next-day direction target for one instrument.

    Parameters
    ----------
    prcSoFar : 2-D array indexed as [instrument, day]
        Price history of every instrument up to the current day.
    inst : int
        Row of ``prcSoFar`` to build features for.

    Returns
    -------
    pandas.DataFrame
        One row per day with, in this order: ``price``, simple moving
        averages, RSI, MACD, Bollinger bands, stochastic oscillator,
        ``prev_return``, the same-day returns of every other instrument
        (plus interaction terms for a fixed list of instruments) and
        ``target``. ``target`` is 1.0 when the next day's return is positive,
        0.0 when it is not, and NaN on the last row, where the next day's
        price is not known yet.

    Notes
    -----
    The number of instruments is taken from ``prcSoFar`` itself rather than
    from the module-level ``nInst``, which another cell reassigns.
    A later cell defines a second ``prepare_features`` (numerical target);
    executing that cell replaces this function.
    """
    n_instruments = prcSoFar.shape[0]
    df = pd.DataFrame({'price': prcSoFar[inst, :]})
    price = df['price']

    # Simple moving averages over several horizons
    for ma in (5, 10, 20, 50, 100, 200):
        df[f'MA_{ma}'] = price.rolling(window=ma).mean()

    # RSI from 14-day average gains and losses
    delta = price.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    df['RSI'] = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD line, signal line and histogram
    macd, signal_line = calculate_macd(price)
    df['MACD'] = macd
    df['MACD_signal'] = signal_line
    df['MACD_hist'] = macd - signal_line

    # Bollinger bands and their width relative to price
    upper_band, lower_band = calculate_bollinger_bands(price)
    df['BB_upper'] = upper_band
    df['BB_lower'] = lower_band
    df['BB_width'] = (upper_band - lower_band) / price

    # Stochastic oscillator
    k, d = calculate_stochastic_oscillator(price)
    df['Stoch_K'] = k
    df['Stoch_D'] = d

    # Daily returns of every instrument, computed once for the whole matrix
    returns = pd.DataFrame(prcSoFar.T).pct_change()
    df['prev_return'] = returns[inst]

    # Returns of the other instruments, plus interaction terms for a fixed subset.
    # Collected in a dict and attached in one concat to avoid inserting ~50 columns one by one.
    correlated_stocks = {22, 30, 11, 27, 38}
    cross_columns = {}
    for other_inst in range(n_instruments):
        if other_inst == inst:
            continue
        other_return = returns[other_inst]
        cross_columns[f'prev_return_{other_inst}'] = other_return
        if other_inst in correlated_stocks:
            cross_columns[f'corr_return_{other_inst}'] = df['prev_return'] * other_return
    if cross_columns:
        df = pd.concat([df, pd.DataFrame(cross_columns)], axis=1)

    # Target: direction of the next day's return.
    # A bare `(x > 0).astype(float)` would turn the unknown last row into 0.0,
    # so the NaN is restored explicitly.
    next_return = df['prev_return'].shift(-1)
    df['target'] = (next_return > 0).astype(float).where(next_return.notna())

    return df

In [4]:
#classifier model
def train_model(features, target):
    """Fit and return a LightGBM binary classifier on the given features and target.

    The hyper-parameters are fixed: binary objective, at most 100 histogram
    bins with at least 10 samples per bin, no class re-balancing, and silent
    logging.
    """
    classifier = lgbm.LGBMClassifier(
        objective="binary",
        max_bin=100,
        min_data_in_bin=10,
        is_unbalance=False,
        verbosity=-1,
    )
    classifier.fit(features, target)
    return classifier

def getMyPosition(prcSoFar):
    """Return the desired position for every instrument given the price history so far.

    For each instrument a classifier is (re)trained when no model exists yet
    or when at least 10 days have passed since its last training. The model's
    prediction for the latest day selects a target of +10000 or -10000; the
    stored position then moves toward that target by at most 5000 per call.

    Parameters
    ----------
    prcSoFar : 2-D array indexed as [instrument, day]

    Returns
    -------
    numpy.ndarray
        A copy of the updated positions, so callers cannot mutate the
        module-level state. All zeros while fewer than 250 days are available.

    Notes
    -----
    Uses and updates the module-level ``currentPos``, ``models`` and
    ``last_train_row_count``; these are sized by the module-level ``nInst``,
    so ``prcSoFar`` is assumed to have that many rows.
    NOTE(review): the +/-10000 targets and the 5000 step are applied before the
    backtest clips positions to its own per-instrument limit, so ``currentPos``
    can differ from the position actually held there - confirm this is intended.
    """
    global currentPos, models, last_train_row_count
    (nins, nt) = prcSoFar.shape

    # Not enough history to build the 200-day features and a training set
    if nt < 250:
        return np.zeros(nins)

    new_pos = np.zeros(nins)

    for inst in range(nins):
        df = prepare_features(prcSoFar, inst)
        feature_frame = df.drop(['price', 'target'], axis=1)

        # nt >= 250 is guaranteed by the guard above, so only these two conditions matter
        needs_training = models[inst] is None or (nt - last_train_row_count[inst] >= 10)
        if needs_training:
            # Start from day 200 so every rolling feature is available; the last
            # row is excluded because its next-day target is not known yet.
            train_features = feature_frame.iloc[200:-1].dropna()
            train_target = df['target'].iloc[200:-1].loc[train_features.index]

            if len(train_features) > 0:
                models[inst] = train_model(train_features, train_target)
                last_train_row_count[inst] = nt

        model = models[inst]
        if model is not None:
            latest_features = feature_frame.iloc[-1:].dropna()
            if not latest_features.empty:
                # predict returns a one-element array; read the scalar explicitly
                predicted_up = bool(model.predict(latest_features)[0])
                new_pos[inst] = 10000 if predicted_up else -10000

    # Apply position changes gradually
    max_change = 5000  # Maximum allowed position change per call
    position_changes = np.clip(new_pos - currentPos, -max_change, max_change)
    currentPos += position_changes.astype(int)

    return currentPos.copy()

### Evaluation

In [5]:
# Backtest configuration. nInst and nt are placeholders that loadPrices overwrites.
nInst = nt = 0
commRate = 0.001  # commission charged per dollar traded
dlrPosLimit = 10_000  # per-instrument position limit in dollars

def loadPrices(fn):
    """Read a whitespace-separated, header-less price file.

    The file is expected to hold one row per day and one column per
    instrument. As a side effect the module-level ``nt`` (rows) and ``nInst``
    (columns) are set from the file's shape.

    Parameters
    ----------
    fn : str or path-like
        Path of the price file.

    Returns
    -------
    numpy.ndarray
        Prices transposed to shape ``(nInst, nt)``, i.e. indexed as
        [instrument, day].
    """
    global nt, nInst
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    df = pd.read_csv(fn, sep=r'\s+', header=None, index_col=None)
    (nt, nInst) = df.shape
    return (df.values).T

# Load the full price history; loadPrices also sets the module-level nt and nInst
# that the message below reports.
pricesFile = "./prices.txt"
prcAll = loadPrices(pricesFile)
print("Loaded %d instruments for %d days" % (nInst, nt))

def calcPL(prcHist, startDay=250, endDay=500):
    """Backtest ``getMyPosition`` day by day and summarise the daily P&L.

    For each day ``t`` in ``startDay..min(endDay, nt)`` the strategy sees the
    first ``t`` columns of ``prcHist``, its requested positions are clipped to
    ``int(dlrPosLimit / price)`` units per instrument, and the change in
    position is traded at the latest price with a commission of ``commRate``
    per dollar traded. One progress line is printed per day.

    Parameters
    ----------
    prcHist : 2-D array indexed as [instrument, day]
    startDay : int, default 250
        First history length passed to the strategy; must be at least 1.
    endDay : int, default 500
        Last history length to evaluate. It is capped at the number of days
        available, because slicing past the end would silently replay the
        final day.

    Returns
    -------
    tuple
        ``(mean daily P&L, final value / total dollars traded,
        std of daily P&L, annualised Sharpe, total dollars traded)``.
        All statistics are zero when no day is evaluated.
    """
    (nins, nt) = prcHist.shape
    lastDay = min(endDay, nt)

    cash = 0
    curPos = np.zeros(nins)  # sized from the data, not from the global nInst
    totDVolume = 0
    value = 0
    ret = 0.0  # defined up front so it exists even if the loop never runs
    todayPLL = []

    for t in range(startDay, lastDay + 1):
        prcHistSoFar = prcHist[:, :t]
        newPosOrig = getMyPosition(prcHistSoFar)
        curPrices = prcHistSoFar[:, -1]
        # Whole-unit position limit implied by the dollar limit at today's price
        posLimits = np.array([int(x) for x in dlrPosLimit / curPrices])
        newPos = np.clip(newPosOrig, -posLimits, posLimits)
        deltaPos = newPos - curPos
        dvolume = np.sum(curPrices * np.abs(deltaPos))
        totDVolume += dvolume
        comm = dvolume * commRate
        cash -= curPrices.dot(deltaPos) + comm
        curPos = np.array(newPos)
        posValue = curPos.dot(curPrices)
        todayPL = cash + posValue - value
        todayPLL.append(todayPL)
        value = cash + posValue
        ret = value / totDVolume if totDVolume > 0 else 0.0
        print("Day %d value: %.2lf todayPL: $%.2lf $-traded: %.0lf return: %.5lf" %
              (t, value, todayPL, totDVolume, ret))

    pll = np.array(todayPLL)
    if pll.size == 0:
        # Nothing was evaluated; avoid the NaN and warning from mean/std of an empty array
        return (0.0, ret, 0.0, 0.0, totDVolume)

    (plmu, plstd) = (np.mean(pll), np.std(pll))
    annSharpe = 0.0
    if plstd > 0:
        annSharpe = np.sqrt(250) * plmu / plstd
    return (plmu, ret, plstd, annSharpe, totDVolume)

# Run the backtest over the loaded prices and report its summary statistics.
(meanpl, ret, plstd, sharpe, dvol) = calcPL(prcAll)
# Score: mean daily P&L minus 0.1 times the standard deviation of daily P&L.
score = meanpl - 0.1*plstd
print("=====")
print("mean(PL): %.1lf" % meanpl)
print("return: %.5lf" % ret)
print("StdDev(PL): %.2lf" % plstd)
print("annSharpe(PL): %.2lf " % sharpe)
print("totDvolume: %.0lf " % dvol)
print("Score: %.2lf" % score)

Loaded 50 instruments for 500 days
Day 250 value: -498.92 todayPL: $-498.92 $-traded: 498924 return: -0.00100
Day 251 value: -867.26 todayPL: $-368.33 $-traded: 739797 return: -0.00117
Day 252 value: -1015.56 todayPL: $-148.30 $-traded: 980846 return: -0.00104
Day 253 value: -765.67 todayPL: $249.89 $-traded: 1122127 return: -0.00068
Day 254 value: -1206.05 todayPL: $-440.39 $-traded: 1273224 return: -0.00095
Day 255 value: -1362.78 todayPL: $-156.72 $-traded: 1384425 return: -0.00098
Day 256 value: -1743.73 todayPL: $-380.96 $-traded: 1535361 return: -0.00114
Day 257 value: -1463.30 todayPL: $280.43 $-traded: 1716234 return: -0.00085
Day 258 value: -1724.83 todayPL: $-261.52 $-traded: 1907208 return: -0.00090
Day 259 value: -1077.92 todayPL: $646.91 $-traded: 2038003 return: -0.00053
Day 260 value: -1752.35 todayPL: $-674.42 $-traded: 2179346 return: -0.00080
Day 261 value: -2330.42 todayPL: $-578.08 $-traded: 2329735 return: -0.00100
Day 262 value: -2330.80 todayPL: $-0.38 $-traded: 

KeyboardInterrupt: 

In [None]:
#features with numerical target
def prepare_features(prcSoFar, inst):
    """Build the feature frame and numerical next-day return target for one instrument.

    Parameters
    ----------
    prcSoFar : 2-D array indexed as [instrument, day]
        Price history of every instrument up to the current day.
    inst : int
        Row of ``prcSoFar`` to build features for.

    Returns
    -------
    pandas.DataFrame
        One row per day with, in this order: ``price``, the percentage
        distance of the price from each moving average, RSI, MACD, Bollinger
        bands, stochastic oscillator, ``prev_return``, the same-day returns of
        every other instrument (plus interaction terms for a fixed list of
        instruments) and ``target``. ``target`` is the next day's return and
        is NaN on the last row.

    Notes
    -----
    The number of instruments is taken from ``prcSoFar`` itself rather than
    from the module-level ``nInst``, which another cell reassigns.
    NOTE(review): this shares its name with the binary-target version defined
    earlier in the notebook, so executing this cell replaces it. The existing
    ``train_model`` fits a binary classifier, which would then receive a
    continuous target - confirm before running both.
    """
    n_instruments = prcSoFar.shape[0]
    df = pd.DataFrame({'price': prcSoFar[inst, :]})
    price = df['price']

    # Percentage distance of the price from each moving average
    # (the raw moving average itself is not kept as a column)
    for ma in (5, 10, 20, 50, 100, 200):
        moving_avg = price.rolling(window=ma).mean()
        df[f'MA_{ma}_pct_diff'] = (price - moving_avg) / moving_avg * 100

    # RSI from 14-day average gains and losses
    delta = price.diff()
    avg_gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    avg_loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    df['RSI'] = 100 - (100 / (1 + avg_gain / avg_loss))

    # MACD line, signal line and histogram
    macd, signal_line = calculate_macd(price)
    df['MACD'] = macd
    df['MACD_signal'] = signal_line
    df['MACD_hist'] = macd - signal_line

    # Bollinger bands and their width relative to price
    upper_band, lower_band = calculate_bollinger_bands(price)
    df['BB_upper'] = upper_band
    df['BB_lower'] = lower_band
    df['BB_width'] = (upper_band - lower_band) / price

    # Stochastic oscillator
    k, d = calculate_stochastic_oscillator(price)
    df['Stoch_K'] = k
    df['Stoch_D'] = d

    # Daily returns of every instrument, computed once for the whole matrix
    returns = pd.DataFrame(prcSoFar.T).pct_change()
    df['prev_return'] = returns[inst]

    # Returns of the other instruments, plus interaction terms for a fixed subset.
    # Collected in a dict and attached in one concat to avoid inserting ~50 columns one by one.
    correlated_stocks = {22, 30, 11, 27, 38}
    cross_columns = {}
    for other_inst in range(n_instruments):
        if other_inst == inst:
            continue
        other_return = returns[other_inst]
        cross_columns[f'prev_return_{other_inst}'] = other_return
        if other_inst in correlated_stocks:
            cross_columns[f'corr_return_{other_inst}'] = df['prev_return'] * other_return
    if cross_columns:
        df = pd.concat([df, pd.DataFrame(cross_columns)], axis=1)

    # Target: next day's return (NaN on the last row)
    df['target'] = df['prev_return'].shift(-1)

    return df