In [119]:
import pyodbc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import precision_score
import pickle
from datetime import datetime
import warnings

def getData(symbol):
    # conn = pyodbc.connect("DRIVER={SQL Server};SERVER=lenovo-desktop;DATABASE=Qihuo;UID=samtsql;PWD=F(W}q:TsyK,7^+>`P28e79s#Uc5n")
    conn = pyodbc.connect("DRIVER={SQL Server};SERVER=.;Database=Qihuo;Trusted_Connection=True")    
    query = f"""select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour', DATEPART(Month, m1.Date) AS 'Month',
     m1.[close]/m1.[LastClose] - 1 as 'StockReturn',
     m1.[Close] as 'Stock'
     from MinuteQuoteLag m1
    where m1.Symbol = '{symbol}' 
    order by m1.date"""

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', UserWarning)
        df = pd.read_sql(query, conn)
        df.set_index('Date', inplace=True)
        return df


In [120]:
def transformData(df):
    df[f"OutPerform"] = df['StockReturn'] 
    df["Target"] = (df.apply(lambda x: x > 0)["OutPerform"]).astype(int)


In [121]:
def getPredictors(df):
    predictors = ['Hour','Month']
    for i in range(14):
        # df[f'OutPerform{pow(2,i)*5}'] = df['OutPerform'].rolling(pow(2,i)).sum()
        # predictors.append(f'OutPerform{pow(2,i)*5}')
        df[f'StockReturn{pow(2,i)*5}'] = df['StockReturn'].rolling(pow(2,i)).sum()
        predictors.append(f'StockReturn{pow(2,i)*5}')

    return predictors


In [122]:
def getMLdata(df, predictors):
    # df['OutPerform'] = df['OutPerform'].rolling(6).sum()
    df = df[df.index.minute % 30 == 0]
    prev = df.copy()
    prev = prev.shift(1)
    data = df[["OutPerform","Target","Stock", "StockReturn"]]
    data = data.join(prev[predictors])
    transformData(data)
    data = data.copy().dropna()
    # print(data)
    return data


In [123]:
def predict(train, test, predictors, model):
    model.fit(train[predictors], train["Target"])
    preds = model.predict_proba(test[predictors])[:,1]
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [124]:
def backtest(data, model, predictors, start=10000, step=1000):
    all_predictions = []
    loop = 0
    for i in range(start, data.shape[0], step):
        train = data.iloc[loop * step:i].copy()
        test = data.iloc[i:(i+step)].copy()
        predictions = predict(train, test, predictors, model)
        all_predictions.append(predictions)
        loop = loop + 1

    print(all_predictions)   
    if (len(all_predictions) > 0):
        all_predictions = pd.concat(all_predictions) 
    return all_predictions


In [125]:
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
'j',
'rm',
'ru',
'p',
'cf',
'sr',
'c',
'l',
'al',
'au',
'rb',
'ta',
'oi',
'fg',
'jd',
'm',
'pp',
'ag',
'jm',
'i',
'zn',
'a',
'fu',
'bu',
'y',
'hc',
'vv'
]
for s in symbols:
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)
    if (len(predictions)):
        pickle.dump(model, open(f"ModelCommodity/{s}", 'wb'))
        tempData = data[['OutPerform']].copy()
        result = pd.merge(tempData, predictions, left_index=True, right_index=True)
        result.to_csv(f'./ResultCommodity/{s}.csv')

    # precision_score(predictions["Target"], predictions["Predictions"])
    

[                     Target  Predictions
Date                                    
2016-01-22 15:00:00       0     0.416913
2016-01-22 21:30:00       0     0.341492
2016-01-22 22:00:00       0     0.344059
2016-01-22 22:30:00       0     0.308289
2016-01-22 23:00:00       0     0.388734
...                     ...          ...
2016-05-27 14:30:00       1     0.406529
2016-05-27 15:00:00       0     0.406994
2016-05-27 21:30:00       1     0.407109
2016-05-27 22:00:00       1     0.377902
2016-05-27 22:30:00       1     0.343825

[1000 rows x 2 columns],                      Target  Predictions
Date                                    
2016-05-27 23:00:00       0     0.427167
2016-05-27 23:30:00       1     0.392363
2016-05-30 09:30:00       0     0.413054
2016-05-30 10:00:00       1     0.389834
2016-05-30 11:00:00       1     0.423052
...                     ...          ...
2016-09-27 22:30:00       1     0.370917
2016-09-27 23:00:00       0     0.350265
2016-09-27 23:30:00       0   

In [126]:
def printDataFrame(df):
    for i in range(len(df.index)):
        row = list(df.iloc[i])
        print(df.index[i] +"\t" + "\t".join(map(str, row)))

In [127]:
import glob
files = [f for f in glob.glob("ResultCommodity/*.csv")]
result = pd.DataFrame(columns = ['symbol', 'year', 'gain','count'])
for file in files:

    s = file.split('\\')[1].replace('.csv','')
    df = pd.read_csv(f'ResultCommodity/{s}.csv')
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    df['year'] = df.index.strftime('%Y')
    groupbyyear = df.groupby('year')

    for group_name, df_group in groupbyyear:
        count = 0
        newtrade = True
        gsum = 0
        for ind in df_group.index:
            if df_group['Predictions'][ind] > .52:            
            # if df_group['Predictions'][ind] < .43:
                gsum = gsum + df_group['OutPerform'][ind]
                if newtrade == True:
                    count = count + 1
                    newtrade = False
            elif df_group['Predictions'][ind] >= .50 and newtrade == False:
            # elif df_group['Predictions'][ind] <= .50 and newtrade == False:                
                gsum = gsum + df_group['OutPerform'][ind]
            else:
                newtrade = True 

        if count > 0:       
            result.loc[len(result.index)] = [s, group_name, gsum, count]

dfgain = result.groupby('year')['gain'].sum()
dfcount = result.groupby('year')['count'].sum()
dfresult = dfgain.to_frame().merge(dfcount.to_frame(), left_index=True, right_index=True)
dfresult['avg'] = dfresult['gain']/dfresult['count']
printDataFrame(dfresult)
print(dfresult['count'].sum())
print(dfresult['gain'].sum()/dfresult['count'].sum())            

2015	0.0097562711349661	20.0	0.000487813556748305
2016	0.19733992018434238	216.0	0.0009136107415941777
2017	0.39942839459888513	439.0	0.0009098596687901711
2018	0.09588212049363427	417.0	0.00022993314267058578
2019	0.3014521662309693	523.0	0.0005763903752026182
2020	0.2590172913682931	825.0	0.0003139603531736886
2021	0.9713108914379	1161.0	0.0008366157548991386
2022	0.3854477060641823	635.0	0.0006070042615183974
4236
0.0006184218039455081


In [128]:
dfgain = result.groupby('symbol')['gain'].sum()
dfcount = result.groupby('symbol')['count'].sum()
dfresult = dfgain.to_frame().merge(dfcount.to_frame(), left_index=True, right_index=True)
dfresult['avg'] = dfresult['gain']/dfresult['count']
print(dfresult)
print(dfresult['count'].sum())
print(dfresult['gain'].sum()/dfresult['count'].sum())  

            gain  count       avg
symbol                           
a      -0.016706     83 -0.000201
ag     -0.001666     25 -0.000067
al      0.001369      6  0.000228
au      0.015500    154  0.000101
bu      0.012448     13  0.000958
c       0.001647      2  0.000823
cf      0.003287     20  0.000164
hc      0.272797    632  0.000432
i       0.005690      1  0.005690
j       0.494184    568  0.000870
jd      0.074331     83  0.000896
jm      0.244514    143  0.001710
l       0.051646     65  0.000795
m       0.005605      2  0.002803
oi     -0.000787    110 -0.000007
p       0.010366     27  0.000384
pp      0.312782    480  0.000652
rb      0.867465   1059  0.000819
rm     -0.002630      3 -0.000877
ru      0.047898    140  0.000342
sr      0.085031    306  0.000278
ta      0.107703    276  0.000390
vv      0.008478      3  0.002826
y      -0.007337     12 -0.000611
zn      0.026023     23  0.001131
4236
0.0006184218039455081
