In [2003]:
import pyodbc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import precision_score

def getData(symbol):
    conn = pyodbc.connect(
        "DRIVER={SQL Server};SERVER=lenovo-desktop;DATABASE=Qihuo;UID=samtsql;PWD=F(W}q:TsyK,7^+>`P28e79s#Uc5n")
    query = f"""select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', m1.[Close] as 'Market', m2.[close] as 'Stock'  from MinuteQuote m1
    inner join MinuteQuote m2 on m1.Date = m2.date
    where m1.Date > '2002-04-08 01:20:00' and m1.Contract = 'ym' and m2.Contract = '{symbol}'
    and DATEPART(minute, m1.Date) % 30 = 0
    order by m1.date"""

    df = pd.read_sql(query, conn)
    df.set_index('Date', inplace=True)

    return df




In [2004]:
def transformData(df):
    df["StockReturn"] = df.rolling(2).apply(lambda x: x.iloc[1] / x.iloc[0] - 1)["Stock"]
    df["MarketReturn"] = df.rolling(2).apply(lambda x: x.iloc[1] / x.iloc[0] - 1)["Market"]
    df["OutPerform"] = df["StockReturn"] - df["MarketReturn"]
    df["Target"] = (df.apply(lambda x: x > 0)["OutPerform"]).astype(int)



In [2005]:
def getPredictors(df):
    predictors = ['Hour']
    for i in range(12):
        df[f'OutPerform{pow(2,i)*5}'] = df['OutPerform'].rolling(pow(2,i)).sum()
        predictors.append(f'OutPerform{pow(2,i)*5}')
        df[f'StockReturn{pow(2,i)*5}'] = df['StockReturn'].rolling(pow(2,i)).sum()
        predictors.append(f'StockReturn{pow(2,i)*5}')

    return predictors    


In [2006]:
def getMLdata(df, predictors):
    prev = df.copy()
    prev = prev.shift(1)
    data = df[["OutPerform","Target"]]
    data = data.join(prev[predictors])
    data = data.copy().dropna()

    return data


In [2009]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict(test[predictors])#[:,1]
    # preds[preds > .6] = 1
    # preds[preds<=.6] = 0
    print(preds)
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [2010]:
def backtest(data, model, predictors, start=10000, step=1000):
    all_predictions = []
    loop = 0
    for i in range(start, data.shape[0], step):
        train = data.iloc[loop * step:i].copy()
        test = data.iloc[i:(i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
        loop = 0 # loop + 1

    return pd.concat(all_predictions)


In [2012]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = ['msft', 'intc']
for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    
    predictions = backtest(data, model, predictors)

[0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1 1
 1 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 0 0 0 1 0 0
 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 0 1 0 1 0
 0 1 0 1 1 0 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 0 0
 1 1 0 1 1 0 1 0 1 1 1 1 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1
 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 0 1 0 0 1 1 1
 1 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 1 1 0 0
 1 0 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 0 1
 1 1 1 0 1 0 0 1 1 0 0 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 1 0 1 0 1 0 1 

In [2014]:
predictions["Predictions"].value_counts()

0    34370
1    24173
Name: Predictions, dtype: int64

In [2015]:
precision_score(predictions["Target"], predictions["Predictions"])

0.5257518719232201

In [2019]:
tempData = data[['OutPerform']].copy()
# tempPredictions = predictions.copy()


In [2020]:
result1 = pd.merge(tempData, predictions, left_index=True, right_index=True)

In [2021]:

result1['year'] = result1.index.strftime('%Y')

In [2022]:
result2 = result1.loc[result1['Predictions'] == 1]

In [2023]:
count = 0
newtrade = True
for ind in result1.index:
    if result1['Predictions'][ind] == 1:
        if newtrade == True:
            count = count + 1
            newtrade = False
    else:
        newtrade = True 

print(result2['OutPerform'].sum() / count)

0.0005398087811936728


In [2024]:
resultsum = result2.groupby('year').sum()
resultmean = result2.groupby('year').mean()
print(resultsum)


      OutPerform  Target  Predictions
year                                 
2005    0.353476     250          439
2006    0.573924     896         1750
2007    0.608164     865         1594
2008    0.482567    1037         1995
2009    0.932871     970         1796
2010    0.481291     792         1466
2011    0.099420     740         1400
2012    0.292291     706         1361
2013    0.055152     603         1190
2014    0.322674     621         1145
2015    0.146377     629         1230
2016    0.108291     532         1033
2017    0.139423     574         1107
2018    0.094249     709         1352
2019    0.466167     782         1459
2020    0.506936     946         1805
2021    0.268140     738         1422
2022    0.032933     319          629


In [2025]:
print(resultmean)

      OutPerform    Target  Predictions
year                                   
2005    0.000805  0.569476          1.0
2006    0.000328  0.512000          1.0
2007    0.000382  0.542660          1.0
2008    0.000242  0.519799          1.0
2009    0.000519  0.540089          1.0
2010    0.000328  0.540246          1.0
2011    0.000071  0.528571          1.0
2012    0.000215  0.518736          1.0
2013    0.000046  0.506723          1.0
2014    0.000282  0.542358          1.0
2015    0.000119  0.511382          1.0
2016    0.000105  0.515005          1.0
2017    0.000126  0.518519          1.0
2018    0.000070  0.524408          1.0
2019    0.000320  0.535984          1.0
2020    0.000281  0.524100          1.0
2021    0.000189  0.518987          1.0
2022    0.000052  0.507154          1.0
