In [219]:
import pyodbc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import precision_score
import pickle
from datetime import datetime
import warnings

def getData(symbol):
    """Load minute closes for the 'ym' contract and for `symbol`, joined on timestamp.

    Parameters
    ----------
    symbol : str
        Value matched against ``MinuteQuote.Contract`` for the stock side of the join.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` with columns ``Hour`` (hour of the timestamp),
        ``Market`` (close of contract 'ym') and ``Stock`` (close of `symbol`),
        ordered by date.
    """
    import os  # local import so this cell does not depend on another cell's imports

    # SECURITY: credentials should not live in the notebook. Set QIHUO_ODBC_CONNSTR
    # to override; the literal below is only kept so existing runs keep working and
    # should be rotated and removed.
    conn_str = os.environ.get(
        "QIHUO_ODBC_CONNSTR",
        "DRIVER={SQL Server};SERVER=lenovo-desktop;DATABASE=Qihuo;UID=samtsql;PWD=F(W}q:TsyK,7^+>`P28e79s#Uc5n",
    )

    # The symbol is passed as a bound parameter (pyodbc uses '?' placeholders)
    # instead of being formatted into the SQL text.
    query = """select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', m1.[Close] as 'Market', m2.[close] as 'Stock'  from MinuteQuote m1
    inner join MinuteQuote m2 on m1.Date = m2.date
    where m1.Date > '2002-04-08 01:20:00' and m1.Contract = 'ym' and m2.Contract = ?

    order by m1.date"""

    conn = pyodbc.connect(conn_str)
    try:
        with warnings.catch_warnings():
            # pandas emits a UserWarning for raw DBAPI (non-SQLAlchemy) connections.
            warnings.simplefilter('ignore', UserWarning)
            df = pd.read_sql(query, conn, params=[symbol])
    finally:
        # Close explicitly so each call does not leave a connection open.
        conn.close()

    return df.set_index('Date')

 


In [220]:
def transformData(df):
    """Add one-step return columns and the binary target to `df` in place.

    Reads the ``Stock`` and ``Market`` columns and writes:

    * ``StockReturn``  - ``Stock[t] / Stock[t-1] - 1``
    * ``MarketReturn`` - ``Market[t] / Market[t-1] - 1``
    * ``OutPerform``   - ``StockReturn - MarketReturn``
    * ``Target``       - 1 where ``OutPerform > 0``, else 0 (NaN compares False, so 0)

    Returns None; the frame passed in is modified.
    """
    # Vectorized row-over-previous-row ratio; the first row has no predecessor
    # and therefore stays NaN.
    df["StockReturn"] = df["Stock"] / df["Stock"].shift(1) - 1
    df["MarketReturn"] = df["Market"] / df["Market"].shift(1) - 1
    df["OutPerform"] = df["StockReturn"] - df["MarketReturn"]
    df["Target"] = (df["OutPerform"] > 0).astype(int)


In [221]:
def getPredictors(df, n_lags=12):
    """Add multi-horizon return features to `df` in place and return their names.

    For each ``i`` in ``range(n_lags)`` the look-back is ``2**i`` rows and two
    columns are written to `df`:

    * ``StockReturn{i}`` - ``Stock / Stock.shift(2**i) - 1``
    * ``OutPerform{i}``  - ``StockReturn{i}`` minus the market return over the same look-back

    The market return is only an intermediate value and is not stored on `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Market`` and ``Stock`` columns.
    n_lags : int, default 12
        Number of look-back horizons to generate.

    Returns
    -------
    list of str
        ``['Hour', 'OutPerform0', 'StockReturn0', 'OutPerform1', ...]``.
    """
    predictors = ['Hour']
    for i in range(n_lags):
        lookback = 2 ** i
        # Kept as a local Series instead of a temporary column on df.
        market_return = df["Market"] / df["Market"].shift(lookback) - 1
        df[f'StockReturn{i}'] = df["Stock"] / df["Stock"].shift(lookback) - 1
        df[f'OutPerform{i}'] = df[f'StockReturn{i}'] - market_return
        predictors.append(f'OutPerform{i}')
        predictors.append(f'StockReturn{i}')
    return predictors


In [222]:
def getMLdata(df, predictors):
    """Build the modelling table from rows whose index minute is a multiple of 30.

    The predictor columns are taken from the previous retained row (shift of 1),
    joined to the current row's ``OutPerform``, ``Target``, ``Stock`` and
    ``Market``. ``transformData`` is then applied to the joined frame, and rows
    containing any NaN are dropped.
    """
    half_hourly = df[df.index.minute % 30 == 0]

    # Predictors lag by one retained row relative to the target columns.
    lagged = half_hourly.copy().shift(1)

    data = half_hourly[["OutPerform", "Target", "Stock", "Market"]].join(lagged[predictors])
    transformData(data)
    return data.copy().dropna()


In [223]:
def predict(train, test, predictors, model):
    """Fit `model` on `train` and score `test`.

    The model is fitted on ``train[predictors]`` against ``train["Target"]``.
    The returned frame is indexed like `test` and has two columns: ``Target``
    (copied from `test`) and ``Predictions`` (the second column of
    ``model.predict_proba(test[predictors])``).
    """
    model.fit(train[predictors], train["Target"])
    second_column = model.predict_proba(test[predictors])[:, 1]
    return pd.concat(
        [
            test["Target"],
            pd.Series(second_column, index=test.index, name="Predictions"),
        ],
        axis=1,
    )

In [224]:
def backtest(data, model, predictors, start=20000, step=5000):
    """Walk forward through `data`, refitting and scoring one window at a time.

    Test windows begin at row ``start`` and advance by ``step`` rows. The k-th
    window trains on rows ``[k*step, start + k*step)`` and is scored on the
    following ``step`` rows (fewer for the last window). The per-window frames
    returned by ``predict`` are concatenated in order.
    """
    window_results = []
    for fold_no, test_begin in enumerate(range(start, data.shape[0], step)):
        train = data.iloc[fold_no * step:test_begin].copy()
        test = data.iloc[test_begin:test_begin + step].copy()
        window_results.append(predict(train, test, predictors, model))

    return pd.concat(window_results)


In [None]:
# Train/backtest one symbol at a time and write the model and the scored rows to disk.
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
    'CVX',
    'HON',
    'CRM',
    'UNH',
    'CSCO',
    'WMT',
    'AXP',
    'JPM',
    'MCD',
    'HD',
    'AMGN',
    'V',
    'INTC',
    'WBA',
    'GS',
    'JNJ',
    'PG',
    'AAPL',
    'DIS',
    'MMM',
    'MRK',
    'MSFT',
    'TRV',
    'VZ',
    'IBM',
    'CAT',
    'NKE',
]
for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)

    # backtest() refits `model` for every window, so what is pickled here is the
    # state left by the last window for this symbol.
    # The context manager makes sure the file handle is flushed and closed.
    with open(f"Model/{s}", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Keep the realised out-performance next to each prediction, matched on index.
    tempData = data[['OutPerform']].copy()
    result = pd.merge(tempData, predictions, left_index=True, right_index=True)
    result.to_csv(f'./Result/{s}.csv')

    # precision_score(predictions["Target"], predictions["Predictions"])
    

In [228]:
import glob
import os

# A trade opens when the predicted probability falls below ENTRY_THRESHOLD and
# stays open while it remains at or below HOLD_THRESHOLD.
ENTRY_THRESHOLD = .43
HOLD_THRESHOLD = .49

# Every CSV in Result/ is picked up, so stray files in that folder are counted too.
files = [f for f in glob.glob("Result/*.csv")]
rows = []
for file in files:
    # File name without directory or extension; works with either path separator.
    s = os.path.splitext(os.path.basename(file))[0]
    df = pd.read_csv(file)
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    df['year'] = df.index.strftime('%Y')
    groupbyyear = df.groupby('year')

    for group_name, df_group in groupbyyear:
        count = 0        # number of trades opened in this year
        newtrade = True  # True while no trade is open
        gsum = 0         # summed OutPerform over rows spent in a trade
        # Iterate values positionally so duplicate timestamps cannot break the lookup.
        for prob, outperform in zip(df_group['Predictions'], df_group['OutPerform']):
            if prob < ENTRY_THRESHOLD:
                gsum = gsum + outperform
                if newtrade:
                    count = count + 1
                    newtrade = False
            elif prob <= HOLD_THRESHOLD and not newtrade:
                gsum = gsum + outperform
            else:
                newtrade = True

        if count > 0:
            rows.append([s, group_name, gsum, count])

# Build the frame once instead of growing it one row at a time.
result = pd.DataFrame(rows, columns=['symbol', 'year', 'gain', 'count'])

dfgain = result.groupby('year')['gain'].sum()
dfcount = result.groupby('year')['count'].sum()
dfresult = dfgain.to_frame().merge(dfcount.to_frame(), left_index=True, right_index=True)
dfresult['avg'] = dfresult['gain']/dfresult['count']
print(dfresult)
print(dfresult['count'].sum())
print(dfresult['gain'].sum()/dfresult['count'].sum())

          gain  count       avg
year                           
2008 -3.902670   1942 -0.002010
2009 -2.360551   3175 -0.000743
2010 -1.922523   1879 -0.001023
2011 -1.383975   1689 -0.000819
2012 -1.130273   1397 -0.000809
2013 -0.646089   1001 -0.000645
2014 -0.138743    844 -0.000164
2015 -0.618047    995 -0.000621
2016 -0.621164   1036 -0.000600
2017 -0.280868    517 -0.000543
2018 -0.576939    896 -0.000644
2019 -0.430292    816 -0.000527
2020 -1.271753    972 -0.001308
2021 -0.297147    486 -0.000611
2022 -0.100933    246 -0.000410
17891
-0.0008765282842521972


In [None]:
# Per-symbol totals: summed gain, number of trades, and average gain per trade.
by_symbol = result.groupby('symbol')
dfgain = by_symbol['gain'].sum()
dfcount = by_symbol['count'].sum()
dfresult = dfgain.to_frame().join(dfcount.to_frame())
dfresult['avg'] = dfresult['gain'] / dfresult['count']

total_count = dfresult['count'].sum()
print(dfresult)
print(total_count)
print(dfresult['gain'].sum() / total_count)

            gain  count       avg
symbol                           
AAPL    2.908318   1764  0.001649
AMGN    0.957115    693  0.001381
AXP     0.666471    287  0.002322
CAT     0.623844    462  0.001350
CRM     0.823747    255  0.003230
CSCO    1.860007   1280  0.001453
CVX     0.362620    247  0.001468
CVX1    0.362620    247  0.001468
DIS     2.121101   1004  0.002113
GS      0.522703    321  0.001628
HD      0.555255    417  0.001332
HON     1.266060    717  0.001766
HON1    1.266531    716  0.001769
IBM     0.672213    649  0.001036
INTC    1.679318   1527  0.001100
JNJ     0.429056    483  0.000888
JPM     1.331236    610  0.002182
MCD     0.176229    292  0.000604
MMM     1.311314   1313  0.000999
MRK     0.570556    411  0.001388
MSFT    0.857099    558  0.001536
NKE     0.940154    596  0.001577
PG      0.351785    249  0.001413
TRV     1.988972    803  0.002477
UNH     0.915211    387  0.002365
V       0.724504    396  0.001830
VZ      0.466690    184  0.002536
WBA     0.5045