In [307]:
import pyodbc
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.metrics import precision_score
import pickle
from datetime import datetime
import warnings

def getData(symbol):
    """Load the minute-bar history of one symbol from the Qihuo database.

    Connects to the local SQL Server instance (``SERVER=.``) using a trusted
    connection and reads every ``MinuteQuoteLag`` row for ``symbol``, ordered
    by date.

    Parameters
    ----------
    symbol : str
        Value matched against ``MinuteQuoteLag.Symbol``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` with columns:
        ``Hour`` and ``Month`` (DATEPART of the bar date),
        ``StockReturn`` (``close / LastClose - 1``) and
        ``Stock`` (the bar's ``Close``).
    """
    # Credentials must never be kept in the notebook (not even commented out);
    # the trusted connection below needs none.
    conn = pyodbc.connect("DRIVER={SQL Server};SERVER=.;Database=Qihuo;Trusted_Connection=True")

    # `?` is the positional parameter marker understood by pyodbc; binding the
    # symbol instead of formatting it into the text avoids SQL injection and
    # quoting problems.
    query = """select m1.date as 'Date',DATEPART(hour, m1.Date) AS 'Hour',DATEPART(Month, m1.Date) AS 'Month',
     m1.[close]/m1.[LastClose] - 1 as 'StockReturn',
     m1.[Close] as 'Stock'
     from MinuteQuoteLag m1
    where m1.Symbol = ?
    order by m1.date"""

    try:
        with warnings.catch_warnings():
            # Silences the UserWarning raised while reading through a raw
            # DBAPI connection (presumably pandas' "use SQLAlchemy" notice).
            warnings.simplefilter('ignore', UserWarning)
            df = pd.read_sql(query, conn, params=[symbol])
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    df.set_index('Date', inplace=True)
    return df


In [308]:
def transformData(df):
    """Add the label columns to ``df`` in place.

    ``OutPerform`` is a copy of ``StockReturn``; ``Target`` is 1 where
    ``OutPerform`` is strictly positive and 0 otherwise (NaN compares as
    not-greater, so it yields 0).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``StockReturn`` column. Modified in place.

    Returns
    -------
    None
    """
    df["OutPerform"] = df["StockReturn"]
    # Compare only the column that is needed: comparing the whole frame with 0
    # is wasted work and fails on any non-numeric column.
    df["Target"] = (df["OutPerform"] > 0).astype(int)


In [309]:
def getPredictors(df):
    """Add rolling-return feature columns to ``df`` and list the model inputs.

    For each window of 1, 2, 4, ..., 8192 rows (14 powers of two) a column
    ``StockReturn<window * 5>`` holding the rolling sum of ``StockReturn`` over
    that window is written to ``df`` in place.  The ``* 5`` in the name is only
    a label (presumably minutes per bar -- TODO confirm).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``StockReturn``. Modified in place.

    Returns
    -------
    list of str
        ``['Hour', 'Month']`` followed by the 14 new column names, in the
        order they were created.
    """
    feature_names = ['Hour', 'Month']
    returns = df['StockReturn']
    for exponent in range(14):
        window = 2 ** exponent
        column = f'StockReturn{window * 5}'
        df[column] = returns.rolling(window).sum()
        feature_names.append(column)
    return feature_names


In [310]:
def getMLdata(df, predictors):
    """Build the modelling table: current-row labels plus previous-row features.

    The predictor columns are taken from ``df`` shifted down by one row, so
    each row's features come from the preceding row, while ``OutPerform``,
    ``Target``, ``Stock`` and ``StockReturn`` stay aligned with the current
    row.  Rows containing any NaN are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the four label columns and every name in ``predictors``.
        Not modified.
    predictors : list of str
        Feature column names to lag by one row.

    Returns
    -------
    pandas.DataFrame
        New frame with the four label columns followed by the lagged
        predictors.
    """
    lagged_features = df.shift(1)[predictors]
    labels = df[["OutPerform", "Target", "Stock", "StockReturn"]]
    return labels.join(lagged_features).dropna()


In [311]:
def predict(train, test, predictors, model):
    """Fit ``model`` on ``train`` and score ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Both must contain ``Target`` and every column in ``predictors``.
    predictors : list of str
        Feature column names.
    model : object
        Anything exposing ``fit(X, y)`` and ``predict_proba(X)``; it is
        (re)fitted in place.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``test`` with two columns: the true ``Target`` and
        ``Predictions``, the second column of ``predict_proba``.
    """
    features_train, labels_train = train[predictors], train["Target"]
    model.fit(features_train, labels_train)

    # Column index 1 of predict_proba is kept as the score.
    second_class_proba = model.predict_proba(test[predictors])[:, 1]
    scored = pd.Series(second_class_proba, index=test.index, name="Predictions")
    return pd.concat([test["Target"], scored], axis=1)

In [312]:
def backtest(data, model, predictors, start=20000, step=5000):
    """Walk-forward evaluation with a sliding training window.

    Fold ``k`` (k = 0, 1, ...) trains on rows ``[k * step, start + k * step)``
    and predicts rows ``[start + k * step, start + (k + 1) * step)``; the
    training window therefore always spans ``start`` rows and advances by
    ``step`` each fold.  ``model`` is refitted in place on every fold, so
    after the call it holds the fit from the last fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``Target`` and every column in ``predictors``.
    model : object
        Estimator passed through to ``predict``.
    predictors : list of str
        Feature column names.
    start : int, default 20000
        Number of rows in each training window / first test row.
    step : int, default 5000
        Number of rows per test fold.

    Returns
    -------
    pandas.DataFrame
        Concatenated ``Target`` / ``Predictions`` frames of all folds.  When
        ``data`` has no more than ``start`` rows no fold can run and an empty
        frame with those two columns (and ``data``'s index type) is returned
        instead of letting ``pd.concat`` fail on an empty list.
    """
    fold_predictions = []
    for fold, test_start in enumerate(range(start, data.shape[0], step)):
        train = data.iloc[fold * step:test_start].copy()
        test = data.iloc[test_start:test_start + step].copy()
        fold_predictions.append(predict(train, test, predictors, model))

    if not fold_predictions:
        # Too little history for even one fold: report "no predictions"
        # rather than aborting the caller's per-symbol loop.
        return pd.DataFrame(columns=["Target", "Predictions"], index=data.index[:0])

    return pd.concat(fold_predictions)


In [313]:
# A single classifier instance is shared by every symbol; backtest() refits it
# on each fold, so what gets pickled per symbol is the fit from that symbol's
# last fold.
model = RandomForestClassifier(n_estimators=100, min_samples_split=1200, random_state=1)
symbols = [
    'j', 'rm', 'ru', 'p', 'cf', 'sr', 'c', 'l', 'al',
    'au', 'rb', 'ta', 'oi', 'fg', 'jd', 'm', 'pp', 'ag',
    'jm', 'i', 'zn', 'a', 'fu', 'bu', 'y', 'hc', 'vv',
]
for s in symbols:
    # Load -> label -> build features -> lag features -> walk-forward backtest.
    df = getData(s)
    transformData(df)
    predictors = getPredictors(df)
    data = getMLdata(df, predictors)
    predictions = backtest(data, model, predictors)

    # Use a context manager so the file handle is flushed and closed
    # deterministically instead of being left to garbage collection.
    with open(f"ModelCommodity/{s}", 'wb') as model_file:
        pickle.dump(model, model_file)

    # Store the realised return next to Target/Predictions for each scored row.
    tempData = data[['OutPerform']].copy()
    result = pd.merge(tempData, predictions, left_index=True, right_index=True)
    result.to_csv(f'./ResultCommodity/{s}.csv')

    # precision_score(predictions["Target"], predictions["Predictions"])
    

In [358]:
import glob
import os


def summarizeTrades(predictions, returns, entry=.25, hold=.35):
    """Accumulate the return and number of trades for one run of bars.

    A trade is opened on a bar whose prediction is below ``entry`` and stays
    open while predictions remain at or below ``hold``.  The bar's return is
    added to the total on every bar that opens or continues a trade; any other
    bar closes the trade.

    Parameters
    ----------
    predictions, returns : iterables of numbers, walked in parallel.
    entry : float, default .25
        A prediction strictly below this opens (or continues) a trade.
    hold : float, default .35
        A prediction at or below this keeps an already open trade alive.

    Returns
    -------
    (gain, count) : total accumulated return and number of trades opened.
    """
    gain = 0
    count = 0
    in_trade = False
    for prediction, bar_return in zip(predictions, returns):
        if prediction < entry:
            gain = gain + bar_return
            if not in_trade:
                count = count + 1
                in_trade = True
        elif prediction <= hold and in_trade:
            gain = gain + bar_return
        else:
            in_trade = False
    return gain, count


files = sorted(glob.glob("ResultCommodity/*.csv"))
records = []
for file in files:
    # Derive the symbol from the file name in a separator-independent way;
    # splitting on a backslash only works for Windows-style paths.
    s = os.path.splitext(os.path.basename(file))[0]
    df = pd.read_csv(file)
    df['Date'] = pd.to_datetime(df['Date'])
    df.set_index('Date', inplace=True)
    df['year'] = df.index.strftime('%Y')

    # Trade state is reset at the start of every calendar year.
    for group_name, df_group in df.groupby('year'):
        # Positional arrays avoid label lookups, which return a Series (not a
        # scalar) when a timestamp is duplicated.
        gsum, count = summarizeTrades(
            df_group['Predictions'].to_numpy(),
            df_group['OutPerform'].to_numpy(),
        )
        if count > 0:
            records.append([s, group_name, gsum, count])

# Build the frame once instead of appending to it row by row.
result = pd.DataFrame(records, columns=['symbol', 'year', 'gain', 'count'])

dfgain = result.groupby('year')['gain'].sum()
dfcount = result.groupby('year')['count'].sum()
dfresult = dfgain.to_frame().merge(dfcount.to_frame(), left_index=True, right_index=True)
dfresult['avg'] = dfresult['gain']/dfresult['count']
print(dfresult)
total_count = dfresult['count'].sum()
print(total_count)
# Guard the overall average against 0/0 when no trades were found.
print(dfresult['gain'].sum()/total_count if total_count else float('nan'))

          gain  count       avg
year                           
2012 -0.284726    875 -0.000325
2013 -0.982732   3179 -0.000309
2014 -1.977320   5737 -0.000345
2015 -3.560148   8145 -0.000437
2016 -0.814367   3497 -0.000233
2017 -0.120399    398 -0.000303
2018 -0.450189   1521 -0.000296
2019 -1.101711   3811 -0.000289
2020 -0.741201   1983 -0.000374
2021 -0.007002     77 -0.000091
29223
-0.0003435579759670132


In [359]:
# Per-symbol roll-up of the yearly trade results held in `result`:
# total gain, number of trades, and average gain per trade.
by_symbol = result.groupby('symbol')
dfgain = by_symbol['gain'].sum()
dfcount = by_symbol['count'].sum()

# Both series share the same group keys, so an index join lines them up.
dfresult = dfgain.to_frame().join(dfcount.to_frame())
dfresult['avg'] = dfresult['gain'] / dfresult['count']
print(dfresult)

# Overall trade count and overall average gain per trade.
total_count = dfresult['count'].sum()
print(total_count)
print(dfresult['gain'].sum() / total_count)

            gain  count       avg
symbol                           
a      -0.042197     30 -0.001407
ag     -0.335002   2025 -0.000165
al     -0.785593   3921 -0.000200
au      0.017663    670  0.000026
bu     -0.141579    324 -0.000437
c      -1.585053   6137 -0.000258
cf      0.007078     73  0.000097
fg     -1.723660   3884 -0.000444
fu     -0.020271     78 -0.000260
hc     -0.032414    288 -0.000113
i      -0.582729    901 -0.000647
j      -1.654241   3515 -0.000471
jm     -0.557771   1614 -0.000346
l      -0.028069     18 -0.001559
m      -0.017130      3 -0.005710
oi     -0.020486     27 -0.000759
p      -0.022030    134 -0.000164
rb     -0.127814    395 -0.000324
rm      0.026978     23  0.001173
ru     -0.035411     12 -0.002951
sr      0.008242     18  0.000458
ta     -0.046809     28 -0.001672
vv     -2.254723   4647 -0.000485
y      -0.041238     29 -0.001422
zn     -0.045533    429 -0.000106
29223
-0.00034355797596701324
