In [69]:
import os
import pickle
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import pyodbc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

def getData(symbol):
    """Load aligned minute closes for contract 'ym' and for `symbol`.

    The MinuteQuote table is self-joined on its Date column so that every
    returned row carries both closes for the same timestamp.

    Connection settings come from the environment so that no secret lives in
    the notebook:
        QIHUO_DB_PASSWORD  (required)
        QIHUO_DB_SERVER    (default 'lenovo-desktop')
        QIHUO_DB_NAME      (default 'Qihuo')
        QIHUO_DB_USER      (default 'samtsql')

    Parameters
    ----------
    symbol : str
        Value matched against MinuteQuote.Contract for the 'Stock' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' with columns 'Hour', 'Market' (close of 'ym') and
        'Stock' (close of `symbol`), ordered by date.

    Raises
    ------
    RuntimeError
        If QIHUO_DB_PASSWORD is not set.
    """
    password = os.environ.get("QIHUO_DB_PASSWORD")
    if not password:
        raise RuntimeError(
            "Set the QIHUO_DB_PASSWORD environment variable before calling getData()")
    server = os.environ.get("QIHUO_DB_SERVER", "lenovo-desktop")
    database = os.environ.get("QIHUO_DB_NAME", "Qihuo")
    user = os.environ.get("QIHUO_DB_USER", "samtsql")

    conn = pyodbc.connect(
        "DRIVER={SQL Server};"
        f"SERVER={server};DATABASE={database};UID={user};PWD={password}")

    # `?` is a bound parameter: the symbol is sent to the driver separately
    # instead of being spliced into the SQL text.
    query = """select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', m1.[Close] as 'Market', m2.[close] as 'Stock'  from MinuteQuote m1
    inner join MinuteQuote m2 on m1.Date = m2.date
    where m1.Date > '2002-04-08 01:20:00' and m1.Contract = 'ym' and m2.Contract = ?

    order by m1.date"""

    try:
        with warnings.catch_warnings():
            # Silence UserWarning raised while read_sql runs on the raw
            # pyodbc connection.
            warnings.simplefilter('ignore', UserWarning)
            df = pd.read_sql(query, conn, params=[symbol])
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    return df.set_index('Date')

 


In [70]:
def transformData(df):
    """Add bar-over-bar return columns and the binary target to `df` in place.

    Columns written:
        StockReturn  - Stock[t] / Stock[t-1] - 1
        MarketReturn - Market[t] / Market[t-1] - 1
        OutPerform   - StockReturn - MarketReturn
        Target       - 1 where OutPerform > 0, else 0

    The first row has no predecessor, so its three return columns are NaN and
    its Target is 0 (NaN > 0 is False).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'Stock' and 'Market' columns. Mutated in place.

    Returns
    -------
    None
    """
    # Vectorized equivalent of rolling(2).apply(last / first - 1), computed
    # only on the two columns that are needed instead of on every column.
    df["StockReturn"] = df["Stock"] / df["Stock"].shift(1) - 1
    df["MarketReturn"] = df["Market"] / df["Market"].shift(1) - 1
    df["OutPerform"] = df["StockReturn"] - df["MarketReturn"]
    df["Target"] = (df["OutPerform"] > 0).astype(int)


In [71]:
def getPredictors(df):
    """Add rolling-sum feature columns to `df` and return the predictor names.

    For each window of 1, 2, 4, ... 2048 rows, two columns are written in
    place: the rolling sum of 'OutPerform' and the rolling sum of
    'StockReturn'. Each column is named after its source column followed by
    window * 5 (presumably minutes, assuming 5-minute bars - TODO confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'OutPerform' and 'StockReturn'. Mutated in place.

    Returns
    -------
    list of str
        'Hour' followed by the 24 generated column names, in creation order.
    """
    names = ['Hour']
    for exponent in range(12):
        window = 2 ** exponent
        label = window * 5
        for base in ('OutPerform', 'StockReturn'):
            column = f'{base}{label}'
            df[column] = df[base].rolling(window).sum()
            names.append(column)

    return names


In [72]:
def getMLdata(df, predictors):
    """Build the modelling frame from rows whose timestamp falls on :00 or :30.

    Each kept row holds its own 'OutPerform', 'Target', 'Stock' and 'Market'
    values joined with the predictor values of the previous kept row, so the
    features never include the bar being predicted. Rows with any missing
    value are dropped, and transformData() then recomputes the return and
    target columns on the reduced frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an index exposing `.minute`, the four columns named above
        and every column listed in `predictors`.
    predictors : list of str
        Feature column names to carry over from the previous kept row.

    Returns
    -------
    pandas.DataFrame
        New frame; the input `df` is not modified.
    """
    on_half_hour = (df.index.minute % 30) == 0
    bars = df[on_half_hour]

    # Features are taken from the preceding kept row.
    lagged_features = bars.shift(1)[predictors]

    current = bars[["OutPerform", "Target", "Stock", "Market"]]
    data = current.join(lagged_features).dropna()

    transformData(data)
    return data


In [73]:
def predict(train, test, predictors, model):
    """Fit `model` on `train` and score `test`.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Both must contain the `predictors` columns and a 'Target' column.
    predictors : list of str
        Feature column names.
    model : object
        Anything exposing fit(X, y) and predict_proba(X). It is refitted in
        place on every call.

    Returns
    -------
    pandas.DataFrame
        Indexed like `test`, with 'Target' (copied from `test`) and
        'Predictions' (column 1 of predict_proba - presumably the probability
        of class 1 for a binary scikit-learn classifier; confirm for other
        models).
    """
    features_train = train[predictors]
    labels_train = train["Target"]
    model.fit(features_train, labels_train)

    second_column = model.predict_proba(test[predictors])[:, 1]
    scored = pd.Series(second_column, index=test.index, name="Predictions")
    return pd.concat([test["Target"], scored], axis=1)

In [74]:
def backtest(data, model, predictors, start=20000, step=5000):
    """Walk-forward evaluation with an expanding training window.

    For each fold boundary i = start, start + step, ... the model is refitted
    on rows [0, i) and scored on rows [i, i + step). The per-fold results
    from predict() are concatenated in order.

    Parameters
    ----------
    data : pandas.DataFrame
        Modelling frame containing 'Target' and the `predictors` columns.
    model : object
        Estimator handed to predict(); refitted on every fold.
    predictors : list of str
        Feature column names.
    start : int
        Number of leading rows used to train the first fold.
    step : int
        Number of rows scored per fold.

    Returns
    -------
    pandas.DataFrame
        Columns 'Target' and 'Predictions'. Empty (zero rows, same index
        type as `data`) when `data` has no rows beyond `start`.
    """
    all_predictions = []
    for i in range(start, data.shape[0], step):
        # Training always starts at row 0, so the window grows with each fold.
        train = data.iloc[:i].copy()
        test = data.iloc[i:(i + step)].copy()
        all_predictions.append(predict(train, test, predictors, model))

    if not all_predictions:
        # pd.concat([]) raises "No objects to concatenate"; hand back an
        # empty frame of the expected shape so callers can proceed.
        return pd.DataFrame(columns=["Target", "Predictions"], index=data.index[:0])

    return pd.concat(all_predictions)


In [75]:
def _tradeStats(probabilities, returns, isEntry, isHold):
    """Scan bars in order and return (summed return, number of trades opened).

    A bar whose probability satisfies `isEntry` is always added to the sum and
    opens a trade if none is open. While a trade is open, a bar satisfying
    `isHold` is added as well. Any other bar closes the trade.
    """
    total = 0
    opened = 0
    inTrade = False
    for prob, ret in zip(probabilities, returns):
        if isEntry(prob):
            total = total + ret
            if not inTrade:
                opened = opened + 1
                inTrade = True
        elif inTrade and isHold(prob):
            total = total + ret
        else:
            inTrade = False
    return total, opened


def processResult(data, predictions, symbol=None):
    """Print per-year average OutPerform per trade for long and short signals.

    `predictions` is inner-joined to data['OutPerform'] on the index, and the
    rows are grouped by calendar year. For each year two scans are made:

    long  - enter when Predictions > 0.56, stay in while Predictions >= 0.49
    short - enter when Predictions < 0.44, stay in while Predictions <= 0.51

    A line `<side> <symbol> <year> <sum / trades> <trades>` is printed for
    every side that opened at least one trade in that year.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'OutPerform'; the index must support `.strftime`.
    predictions : pandas.DataFrame
        Must contain 'Predictions'; indexed like `data`.
    symbol : str, optional
        Label printed on each line. When omitted, the module-level variable
        `s` is used if it exists (this keeps older two-argument calls
        working); otherwise an empty label is printed.

    Returns
    -------
    None
    """
    if symbol is None:
        # Backward compatibility: earlier versions read the global `s`.
        symbol = globals().get('s', '')

    result1 = pd.merge(data[['OutPerform']], predictions, left_index=True, right_index=True)
    result1['year'] = result1.index.strftime('%Y')

    for group_name, df_group in result1.groupby('year'):
        # Positional arrays avoid per-row label lookups, which are slow and
        # return a Series (not a scalar) when a timestamp is duplicated.
        probabilities = df_group['Predictions'].to_numpy()
        returns = df_group['OutPerform'].to_numpy()

        gsum, count = _tradeStats(
            probabilities, returns,
            lambda p: p > .56,
            lambda p: p >= .49)
        if count > 0:
            print('long', symbol, group_name, gsum/count, count)

        gsum, count = _tradeStats(
            probabilities, returns,
            lambda p: p < .44,
            lambda p: p <= .51)
        if count > 0:
            print('short', symbol, group_name, gsum/count, count)

In [76]:
# One estimator instance is shared by all symbols; backtest() refits it on
# every fold, so nothing carries over between symbols except hyperparameters.
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
    'CVX', 'HON', 'CRM', 'UNH', 'CSCO', 'WMT', 'AXP', 'JPM', 'MCD',
    'HD', 'AMGN', 'V', 'INTC', 'WBA', 'GS', 'JNJ', 'PG', 'AAPL',
    'DIS', 'MMM', 'MRK', 'MSFT', 'TRV', 'VZ', 'IBM', 'CAT', 'NKE',
]
# The loop variable must stay named `s`: processResult() labels its printed
# lines with the module-level `s` when no symbol is passed to it.
for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)
    # Persist the model as left by the last fit in backtest(); the file is
    # named after the symbol and written to the working directory. The
    # context manager guarantees the handle is flushed and closed.
    with open(f"{s}", 'wb') as model_file:
        pickle.dump(model, model_file)
    processResult(data, predictions)
    

long CVX 2008 -0.0012280095405127886 47
short CVX 2008 -0.00028379363893045 84
long CVX 2009 0.0013455010406739354 44
short CVX 2009 -0.006575036741064215 38
long CVX 2010 0.00010027449037814122 19
short CVX 2010 -0.007103610448440008 6
long CVX 2011 8.493779926435398e-05 48
short CVX 2011 0.0013175170820488873 19
long CVX 2012 -0.0006310912387443827 38
short CVX 2012 -0.0002916385378428088 10
long CVX 2013 -0.0005856550036982162 32
short CVX 2013 0.011798003825360504 2
long CVX 2014 -0.0008548957613020223 47
short CVX 2014 -0.0011952843975405747 13
long CVX 2015 0.00011908341142378175 53
short CVX 2015 0.0009789538448859714 44
long CVX 2016 3.336303676629798e-05 44
short CVX 2016 -0.00035976750894824833 35
long CVX 2017 -0.0007600126410250846 8
short CVX 2017 -0.0001853756826258778 11
long CVX 2018 0.000985413231035473 24
short CVX 2018 0.0031469910392292384 23
long CVX 2019 0.0039767761698904685 12
short CVX 2019 0.00032111831450122645 23
long CVX 2020 0.005844969140650714 42
short C