In [3]:
import os
import pickle
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import pyodbc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

def getData(symbol):
    """Load the aligned 'ym' and `symbol` closes from the MinuteQuote table.

    Rows of contract 'ym' are inner-joined with rows of contract `symbol`
    on their timestamp, keeping only timestamps after 2002-04-08 01:20:00.

    Parameters
    ----------
    symbol : str
        Value matched against ``MinuteQuote.Contract`` for the stock side.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (ascending) with columns ``Hour`` (hour of the
        timestamp), ``Market`` (close of 'ym') and ``Stock`` (close of
        `symbol`).

    Notes
    -----
    The ODBC connection string is read from the ``QIHUO_CONN_STR``
    environment variable so that no credentials live in the notebook.
    NOTE(review): a password used to be hard-coded here; it should be
    rotated. The fallback below assumes the server accepts Windows
    integrated authentication -- confirm for your environment.
    """
    conn_str = os.environ.get(
        "QIHUO_CONN_STR",
        "DRIVER={SQL Server};SERVER=lenovo-desktop;DATABASE=Qihuo;Trusted_Connection=yes",
    )
    # `symbol` is bound as a query parameter (pyodbc '?' placeholder) rather
    # than formatted into the SQL text, which avoids SQL injection and
    # quoting problems.
    query = """select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', m1.[Close] as 'Market', m2.[close] as 'Stock'  from MinuteQuote m1
    inner join MinuteQuote m2 on m1.Date = m2.date
    where m1.Date > '2002-04-08 01:20:00' and m1.Contract = 'ym' and m2.Contract = ?

    order by m1.date"""

    conn = pyodbc.connect(conn_str)
    try:
        with warnings.catch_warnings():
            # pandas emits a UserWarning for raw DBAPI (non-SQLAlchemy) connections.
            warnings.simplefilter('ignore', UserWarning)
            df = pd.read_sql(query, conn, params=[symbol])
    finally:
        # Always release the connection, even if the query fails.
        conn.close()
    return df.set_index('Date')


Unnamed: 0_level_0,Hour,Market,Stock
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04 09:05:00,9,7590.0,7695.0
2010-01-04 09:10:00,9,7590.0,7710.0
2010-01-04 09:15:00,9,7591.0,7710.0
2010-01-04 09:20:00,9,7590.0,7700.0
2010-01-04 09:25:00,9,7593.0,7695.0
...,...,...,...
2010-01-07 14:05:00,14,7605.0,7545.0
2010-01-07 14:15:00,14,7605.0,7555.0
2010-01-07 14:20:00,14,7605.0,7550.0
2010-01-07 14:25:00,14,7603.0,7545.0


In [286]:
def transformData(df):
    """Add return, out-performance and target columns to `df` in place.

    Reads the ``Stock`` and ``Market`` columns and writes:

    * ``StockReturn``  - row-over-row simple return of ``Stock``
    * ``MarketReturn`` - row-over-row simple return of ``Market``
    * ``OutPerform``   - ``StockReturn - MarketReturn``
    * ``Target``       - 1 where ``OutPerform > 0``, else 0

    The first row has no previous row, so its returns are NaN and its
    ``Target`` is 0 (NaN > 0 is False).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with numeric ``Stock`` and ``Market`` columns; modified in place.

    Returns
    -------
    None
    """
    df['StockReturn'] = df["Stock"] / df["Stock"].shift(1) - 1
    df['MarketReturn'] = df["Market"] / df["Market"].shift(1) - 1
    df["OutPerform"] = df['StockReturn'] - df['MarketReturn']
    # Compare only the column that is needed. Applying `> 0` to the whole
    # frame did the work for every column and raised on non-numeric ones.
    df["Target"] = (df["OutPerform"] > 0).astype(int)


In [287]:
def getPredictors(df, num_windows=12, bar_minutes=5):
    """Add rolling-sum feature columns to `df` in place and return their names.

    For each exponent ``i`` in ``range(num_windows)`` a rolling window of
    ``2**i`` rows is summed over ``OutPerform`` and over ``StockReturn``.
    The new columns are named ``OutPerform<N>`` / ``StockReturn<N>`` where
    ``N = 2**i * bar_minutes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``OutPerform`` and ``StockReturn``; modified in place.
    num_windows : int, default 12
        Number of doubling window sizes to generate (1, 2, 4, ... rows).
    bar_minutes : int, default 5
        Multiplier used only for the column-name suffix.
        NOTE(review): presumably the bar length in minutes -- confirm.

    Returns
    -------
    list of str
        ``['Hour', 'OutPerform<N0>', 'StockReturn<N0>', 'OutPerform<N1>', ...]``.
        ``'Hour'`` is listed but not created here; it must already exist in
        `df` for the list to be usable as a column selection.
    """
    predictors = ['Hour']
    for exponent in range(num_windows):
        window = 2 ** exponent
        suffix = window * bar_minutes
        # Keep the OutPerform-then-StockReturn order for every window size.
        for base in ('OutPerform', 'StockReturn'):
            column = f'{base}{suffix}'
            df[column] = df[base].rolling(window).sum()
            predictors.append(column)
    return predictors


In [288]:
def getMLdata(df, predictors):
    """Build the modelling table from rows that fall on :00 or :30.

    Steps:
      1. keep only rows whose index minute is a multiple of 30;
      2. take the `predictors` columns from the previous kept row
         (a one-row shift), so features never come from the row being
         predicted;
      3. recompute the return / OutPerform / Target columns on the kept
         rows via ``transformData``;
      4. drop every row that contains a NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Datetime-indexed frame holding ``OutPerform``, ``Target``,
        ``Stock``, ``Market`` and all `predictors` columns.
    predictors : list of str
        Feature column names to lag by one kept row.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    on_half_hour = df[df.index.minute % 30 == 0]
    lagged_features = on_half_hour.shift(1)[predictors]
    data = on_half_hour[["OutPerform", "Target", "Stock", "Market"]].join(lagged_features)
    # Overwrites OutPerform/Target so they are measured between kept rows.
    transformData(data)
    return data.dropna()


In [289]:
def predict(train, test, predictors, model):
    """Fit `model` on `train` and score `test`.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing the `predictors` columns and a ``Target`` column.
    predictors : list of str
        Feature column names.
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict_proba(X)``; it is
        refitted in place.

    Returns
    -------
    pandas.DataFrame
        Indexed like `test`, with the true ``Target`` and ``Predictions``
        (column 1 of ``predict_proba``).
    """
    features, labels = train[predictors], train["Target"]
    model.fit(features, labels)

    probabilities = pd.Series(
        model.predict_proba(test[predictors])[:, 1],
        index=test.index,
        name="Predictions",
    )
    return pd.concat([test["Target"], probabilities], axis=1)

In [290]:
def backtest(data, model, predictors, start=20000, step=5000):
    """Walk-forward evaluation with a sliding training window.

    Fold ``k`` (0-based) trains on rows ``[k*step, start + k*step)`` and
    predicts rows ``[start + k*step, start + (k+1)*step)``, so every fold
    trains on exactly `start` rows and the window moves forward by `step`.

    Parameters
    ----------
    data : pandas.DataFrame
        Modelling table with the `predictors` columns and ``Target``.
    model : object
        Estimator passed to ``predict``; it is refitted on every fold.
    predictors : list of str
        Feature column names.
    start : int, default 20000
        Number of leading rows used for the first training window.
    step : int, default 5000
        Number of rows predicted per fold.

    Returns
    -------
    pandas.DataFrame
        Concatenated per-fold outputs of ``predict`` (columns ``Target``
        and ``Predictions``). When `data` has no more than `start` rows no
        fold can run and an empty frame with those columns is returned
        instead of letting ``pd.concat`` fail on an empty list.
    """
    all_predictions = []
    for fold, test_start in enumerate(range(start, data.shape[0], step)):
        train = data.iloc[fold * step:test_start].copy()
        test = data.iloc[test_start:test_start + step].copy()
        all_predictions.append(predict(train, test, predictors, model))

    if not all_predictions:
        # Not enough history for even one fold: return a well-formed empty
        # result that keeps the index type of `data`.
        return pd.DataFrame(columns=["Target", "Predictions"], index=data.index[:0])
    return pd.concat(all_predictions)


In [291]:
# Train and backtest one model per symbol, saving the fitted model and the
# per-row predictions for the later evaluation cells.
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
    'CVX', 'HON', 'CRM', 'UNH', 'CSCO', 'WMT', 'AXP', 'JPM', 'MCD',
    'HD', 'AMGN', 'V', 'INTC', 'WBA', 'GS', 'JNJ', 'PG', 'AAPL',
    'DIS', 'MMM', 'MRK', 'MSFT', 'TRV', 'VZ', 'IBM', 'CAT', 'NKE',
]

# Create the output folders up front so a fresh checkout does not fail on
# the first write.
os.makedirs("Model", exist_ok=True)
os.makedirs("Result", exist_ok=True)

for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)

    # `model` is a single shared estimator refitted on every fold, so what
    # gets pickled is whatever state backtest left it in for this symbol.
    # The context manager closes the file handle (it used to be leaked).
    with open(f"Model/{s}", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Inner-join realised out-performance with the predictions on the
    # timestamp index.
    tempData = data[['OutPerform']].copy()
    result = pd.merge(tempData, predictions, left_index=True, right_index=True)
    result.to_csv(f'./Result/{s}.csv')

    # precision_score(predictions["Target"], predictions["Predictions"])
    

In [296]:
import glob

# Probability cut-offs of the trading rule evaluated below.
ENTRY_THRESHOLD = .43   # open (or stay in) a trade when the prediction is below this
HOLD_THRESHOLD = .50    # once in a trade, stay in while the prediction is <= this


def simulateTrades(predictions, outperform, entry=ENTRY_THRESHOLD, hold=HOLD_THRESHOLD):
    """Accumulate `outperform` over the rows selected by the trade rule.

    Rows are walked in order. A row with ``prediction < entry`` is always
    taken and opens a new trade if none is open. A row with
    ``entry <= prediction <= hold`` is taken only while a trade is open.
    Any other row (including NaN predictions) closes the open trade.

    Parameters
    ----------
    predictions, outperform : iterable of float
        Equal-length sequences, aligned row by row.
    entry, hold : float
        Thresholds described above.

    Returns
    -------
    (gain, trades) : tuple
        Sum of `outperform` over taken rows and the number of trades opened.
    """
    gain = 0
    trades = 0
    in_trade = False
    for prediction, realised in zip(predictions, outperform):
        if prediction < entry:
            gain += realised
            if not in_trade:
                trades += 1
                in_trade = True
        elif prediction <= hold and in_trade:
            gain += realised
        else:
            in_trade = False
    return gain, trades


# Collect rows in a list and build the frame once instead of growing it
# row by row with .loc.
records = []
for file in glob.glob("Result/*.csv"):
    # basename/splitext works with both '/' and '\\' separators; splitting
    # on '\\' failed on non-Windows paths.
    s = os.path.splitext(os.path.basename(file))[0]
    df = pd.read_csv(file)
    df['Date'] = pd.to_datetime(df['Date'])
    df = df.set_index('Date')

    # One (gain, count) record per symbol and calendar year; the trade
    # state is reset at every year boundary.
    for group_name, df_group in df.groupby(df.index.strftime('%Y')):
        gsum, count = simulateTrades(df_group['Predictions'], df_group['OutPerform'])
        if count > 0:
            records.append([s, group_name, gsum, count])

result = pd.DataFrame(records, columns=['symbol', 'year', 'gain', 'count']).astype(
    {'gain': 'float64', 'count': 'int64'}
)

# Yearly totals across all symbols, plus average gain per trade.
dfresult = result.groupby('year')[['gain', 'count']].sum()
dfresult['avg'] = dfresult['gain'] / dfresult['count']
print(dfresult)

total_count = int(dfresult['count'].sum())
print(total_count)
# Guard the overall average so an empty Result folder does not divide by zero.
print(dfresult['gain'].sum() / total_count if total_count else float('nan'))

          gain  count       avg
year                           
2008 -5.997381   3168 -0.001893
2009 -2.181672   3037 -0.000718
2010 -1.516277   1912 -0.000793
2011 -1.561079   1933 -0.000808
2012 -1.279071   1500 -0.000853
2013 -0.825113   1163 -0.000709
2014 -0.094238    766 -0.000123
2015 -0.828632   1040 -0.000797
2016 -0.574336    949 -0.000605
2017 -0.570301    688 -0.000829
2018 -1.061762    911 -0.001165
2019 -0.046561    743 -0.000063
2020 -1.294121   1060 -0.001221
2021 -0.385023    416 -0.000926
2022 -0.341630    250 -0.001367
19536
-0.0009498975173197401


In [293]:
# Per-symbol totals over all years: summed gain, number of trades, and the
# average gain per trade.
by_symbol = result.groupby('symbol')
dfgain = by_symbol['gain'].sum()
dfcount = by_symbol['count'].sum()
# Both series share the same group index, so a column-wise concat lines
# them up exactly like an index-on-index merge.
dfresult = pd.concat([dfgain, dfcount], axis=1)
dfresult['avg'] = dfresult['gain'] / dfresult['count']
print(dfresult)

# Overall trade count and overall average gain per trade.
total_trades = dfresult['count'].sum()
print(total_trades)
print(dfresult['gain'].sum() / total_trades)

            gain  count       avg
symbol                           
AAPL    3.459059   2155  0.001605
AMGN    1.540286    739  0.002084
AXP     0.794140    408  0.001946
CAT     0.767943    520  0.001477
CRM     0.572991    298  0.001923
CSCO    1.924361   1305  0.001475
CVX     0.195587    255  0.000767
DIS     2.405222   1088  0.002211
GS      0.847261    477  0.001776
HD      0.456537    343  0.001331
HON     1.297321    916  0.001416
IBM     0.690721    762  0.000906
INTC    1.869201   1715  0.001090
JNJ     0.424261    544  0.000780
JPM     1.736822    751  0.002313
MCD     0.252776    284  0.000890
MMM     1.489093   1602  0.000930
MRK     0.159021    398  0.000400
MSFT    0.863488    601  0.001437
NKE     1.066383    686  0.001554
PG      0.383549    243  0.001578
TRV     1.866953    843  0.002215
UNH     0.810741    444  0.001826
V       0.969065    445  0.002178
VZ      0.527578    297  0.001776
WBA     0.727802    537  0.001355
WMT     0.299353    483  0.000620
19139
0.001483

In [294]:

# Display the `result` frame: one row per symbol and year holding the summed
# gain and the number of trades opened in that year.
result

Unnamed: 0,symbol,year,gain,count
0,AAPL,2008,0.609871,421
1,AAPL,2009,0.752397,322
2,AAPL,2010,0.275000,226
3,AAPL,2011,0.201593,204
4,AAPL,2012,0.269048,251
...,...,...,...,...
377,WMT,2017,0.034476,5
378,WMT,2018,0.000607,3
379,WMT,2019,-0.002245,3
380,WMT,2020,0.000875,12
