In [81]:
import os
import pickle
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import pyodbc
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score

def getData(symbol):
    """Load minute closes for the 'ym' contract joined with those of `symbol`.

    Rows of MinuteQuote for contract 'ym' are inner-joined on timestamp with
    the rows for `symbol`, restricted to dates after 2002-04-08 01:20:00 and
    ordered by date.

    Parameters
    ----------
    symbol : str
        Value matched against MinuteQuote.Contract for the 'Stock' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Date' with columns 'Hour' (hour part of the timestamp),
        'Market' (close of 'ym') and 'Stock' (close of `symbol`).
    """
    # FIXME(security): the fallback below is a credential committed to source.
    # Set QIHUO_CONN_STR in the environment, then rotate the password and
    # delete the literal.
    conn_str = os.environ.get(
        "QIHUO_CONN_STR",
        "DRIVER={SQL Server};SERVER=lenovo-desktop;DATABASE=Qihuo;UID=samtsql;PWD=F(W}q:TsyK,7^+>`P28e79s#Uc5n",
    )
    # The symbol is bound as a query parameter ('?') rather than interpolated
    # into the SQL text, so quoting and injection are handled by the driver.
    query = """select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', m1.[Close] as 'Market', m2.[close] as 'Stock'  from MinuteQuote m1
    inner join MinuteQuote m2 on m1.Date = m2.date
    where m1.Date > '2002-04-08 01:20:00' and m1.Contract = 'ym' and m2.Contract = ?

    order by m1.date"""

    conn = pyodbc.connect(conn_str)
    try:
        with warnings.catch_warnings():
            # Silences UserWarning raised while reading (presumably pandas'
            # notice about non-SQLAlchemy connections -- confirm).
            warnings.simplefilter('ignore', UserWarning)
            df = pd.read_sql(query, conn, params=[symbol])
    finally:
        # Always release the connection, even if the query fails.
        conn.close()

    df.set_index('Date', inplace=True)
    return df

 


In [82]:
def transformData(df):
    """Add return, out-performance and target columns to `df` in place.

    Columns written:
      * StockReturn  - Stock divided by the previous row's Stock, minus 1
      * MarketReturn - Market divided by the previous row's Market, minus 1
      * OutPerform   - StockReturn - MarketReturn
      * Target       - 1 where OutPerform > 0, otherwise 0 (NaN counts as 0)

    The first row of each return column is NaN because it has no predecessor.
    Only the 'Stock' and 'Market' columns are read, so any other columns in
    `df` (including non-numeric ones) are left alone.
    """
    # Vectorised row-over-row return; a NaN in either row yields NaN, which
    # matches a 2-row rolling window.
    df["StockReturn"] = df["Stock"] / df["Stock"].shift(1) - 1
    df["MarketReturn"] = df["Market"] / df["Market"].shift(1) - 1
    df["OutPerform"] = df["StockReturn"] - df["MarketReturn"]
    df["Target"] = (df["OutPerform"] > 0).astype(int)


In [83]:
def getPredictors(df):
    """Add rolling-sum feature columns to `df` and return the predictor names.

    For each window of 1, 2, 4, ... 2048 rows, two columns are written to
    `df` in place: the rolling sum of 'OutPerform' and the rolling sum of
    'StockReturn'. Each column name carries the window length multiplied
    by 5 as a suffix (e.g. 'OutPerform5', 'StockReturn10240').

    Returns the list of predictor column names, starting with 'Hour'
    (which is not created here and must already exist for later use).
    """
    names = ['Hour']
    for exponent in range(12):
        window = 2 ** exponent
        suffix = window * 5
        # Keep the OutPerform/StockReturn order so column layout is stable.
        for base in ('OutPerform', 'StockReturn'):
            column = f'{base}{suffix}'
            df[column] = df[base].rolling(window).sum()
            names.append(column)
    return names


In [84]:
def getMLdata(df, predictors):
    """Build the modelling frame from rows whose index minute is 0 or 30.

    The predictor columns are taken from the previous retained row (shifted
    by one), so each row's features come from the preceding half-hour mark.
    transformData is then re-run on the joined frame, which recomputes
    StockReturn / MarketReturn / OutPerform / Target between consecutive
    retained rows. Rows containing any NaN are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a datetime-like index plus 'OutPerform', 'Target', 'Stock',
        'Market' and every column named in `predictors`.
    predictors : list of str
        Feature column names to lag by one retained row.
    """
    half_hourly = df[df.index.minute % 30 == 0]
    # Shift features down one row so they describe the prior bar only.
    lagged_features = half_hourly[predictors].shift(1)
    frame = half_hourly[["OutPerform", "Target", "Stock", "Market"]].join(lagged_features)
    transformData(frame)
    return frame.dropna()


In [85]:
def predict(train, test, predictors, model):
    """Fit `model` on `train` and score `test`.

    The model is fitted on train[predictors] against train["Target"]; the
    second column of predict_proba on test[predictors] is kept as the
    prediction.

    Returns a DataFrame indexed like `test` with the columns 'Target'
    (copied from `test`) and 'Predictions'.
    """
    model.fit(train[predictors], train["Target"])
    probabilities = model.predict_proba(test[predictors])
    scored = pd.Series(probabilities[:, 1], index=test.index, name="Predictions")
    return pd.concat([test["Target"], scored], axis=1)

In [86]:
def backtest(data, model, predictors, start=20000, step=5000):
    """Walk forward through `data`, refitting `model` for each test window.

    Test windows are data.iloc[i:i+step] for i = start, start+step, ...
    Each is scored by a model fitted on the `start` rows immediately before
    it (a sliding window, not an expanding one).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'Target' and every column in `predictors`.
    model : estimator with fit / predict_proba
    predictors : list of str
    start : int
        Position of the first test row; also the training-window length.
    step : int
        Test-window length and stride.

    Returns
    -------
    pandas.DataFrame
        Concatenated 'Target' / 'Predictions' rows of every test window.
        If `data` has no rows at or beyond `start`, an empty frame with those
        two columns is returned instead of raising from pd.concat([]).
    """
    window_results = []
    for window_no, split in enumerate(range(start, data.shape[0], step)):
        train = data.iloc[window_no * step:split].copy()
        test = data.iloc[split:(split + step)].copy()
        window_results.append(predict(train, test, predictors, model))

    if not window_results:
        # Not enough history for even one window: hand back an empty result
        # that keeps the index type, so downstream merges still work.
        empty_index = data.index[:0]
        return pd.DataFrame({
            "Target": pd.Series(dtype="int64", index=empty_index),
            "Predictions": pd.Series(dtype="float64", index=empty_index),
        })

    return pd.concat(window_results)


In [87]:
def processResult(data, predictions):
    tempData = data[['OutPerform']].copy()
    result1 = pd.merge(tempData, predictions, left_index=True, right_index=True)
    # print(result1)
    result1['year'] = result1.index.strftime('%Y')
    groupbyyear = result1.groupby('year')
    for group_name, df_group in groupbyyear:
        count = 0
        newtrade = True
        gsum = 0
        for ind in df_group.index:
            if df_group['Predictions'][ind] > .58:
                gsum = gsum + df_group['OutPerform'][ind]
                if newtrade == True:
                    count = count + 1
                    newtrade = False
            elif df_group['Predictions'][ind] >= .50 and newtrade == False:
                gsum = gsum + df_group['OutPerform'][ind]
            else:
                newtrade = True 
        # gsum = df_group[df_group['Predictions'] == 1]['OutPerform'].sum()  
        if count > 0:       
            print('long','58-50', s, group_name, gsum/count, count)    
        count = 0
        newtrade = True
        gsum = 0
        for ind in df_group.index:
            if df_group['Predictions'][ind] > .56:
                gsum = gsum + df_group['OutPerform'][ind]
                if newtrade == True:
                    count = count + 1
                    newtrade = False
            elif df_group['Predictions'][ind] >= .49 and newtrade == False:
                gsum = gsum + df_group['OutPerform'][ind]
            else:
                newtrade = True 
        # gsum = df_group[df_group['Predictions'] == 1]['OutPerform'].sum()  
        if count > 0:       
            print('long','56-49', s, group_name, gsum/count, count)                
        count = 0
        newtrade = True
        gsum = 0
        for ind in df_group.index:
            if df_group['Predictions'][ind] < .44:
                gsum = gsum + df_group['OutPerform'][ind]
                if newtrade == True:
                    count = count + 1
                    newtrade = False
            elif df_group['Predictions'][ind] <= .50 and newtrade == False:
                gsum = gsum + df_group['OutPerform'][ind]
            else:
                newtrade = True 
        # gsum = df_group[df_group['Predictions'] == 1]['OutPerform'].sum()   
        if count > 0:       
            print('short','44-50', s, group_name, gsum/count, count)
        count = 0
        newtrade = True
        gsum = 0
        for ind in df_group.index:
            if df_group['Predictions'][ind] < .44:
                gsum = gsum + df_group['OutPerform'][ind]
                if newtrade == True:
                    count = count + 1
                    newtrade = False
            elif df_group['Predictions'][ind] <= .51 and newtrade == False:
                gsum = gsum + df_group['OutPerform'][ind]
            else:
                newtrade = True 
        # gsum = df_group[df_group['Predictions'] == 1]['OutPerform'].sum()   
        if count > 0:       
            print('short','44-51', s, group_name, gsum/count, count)     

In [88]:
# One classifier instance is reused for every symbol; backtest() refits it
# for each window, so after a symbol finishes it holds that symbol's last fit.
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
    'CVX',
    'HON',
    'CRM',
    'UNH',
    'CSCO',
    'WMT',
    'AXP',
    'JPM',
    'MCD',
    'HD',
    'AMGN',
    'V',
    'INTC',
    'WBA',
    'GS',
    'JNJ',
    'PG',
    'AAPL',
    'DIS',
    'MMM',
    'MRK',
    'MSFT',
    'TRV',
    'VZ',
    'IBM',
    'CAT',
    'NKE',
]
# NOTE: the loop variable must stay named `s`; processResult reads it as a
# module-level global when printing its summary lines.
for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)
    # Write the fitted model to a file named after the symbol; the context
    # manager guarantees the handle is flushed and closed.
    with open(f"{s}", 'wb') as model_file:
        pickle.dump(model, model_file)
    processResult(data, predictions)
    # precision_score(predictions["Target"], predictions["Predictions"])
    

long 58-50 CVX 2008 0.004722018462122731 14
long 56-49 CVX 2008 -0.0012280095405127886 47
short 42 CVX 2008 -0.0008410992437884295 96
short 44-51 CVX 2008 -0.00028379363893045 84
long 58-50 CVX 2009 -0.0029506949678593797 7
long 56-49 CVX 2009 0.0010715382133370717 42
short 42 CVX 2009 -0.003057437619907513 43
short 44-51 CVX 2009 -0.006368386946630771 39
long 56-49 CVX 2010 0.0016837753819497084 14
short 42 CVX 2010 -0.006983866963412744 4
short 44-51 CVX 2010 -0.0089872921521148 4
long 58-50 CVX 2011 0.0015390157891511152 11
long 56-49 CVX 2011 4.452765715858259e-05 61
short 42 CVX 2011 -0.00025163324545868405 31
short 44-51 CVX 2011 -7.420999668741783e-05 29
long 58-50 CVX 2012 -0.00011148439729047332 7
long 56-49 CVX 2012 0.00011886241900520505 71
short 42 CVX 2012 -0.002845699821296499 10
short 44-51 CVX 2012 -0.0032470759718797758 10
long 58-50 CVX 2013 -0.0012070613665101648 14
long 56-49 CVX 2013 2.9310862183476634e-05 75
short 42 CVX 2013 -0.0011820310141121837 3
short 44-51 C