In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd 
import numpy as np
from t_nachine.optimization import Analyzer
from t_nachine.constants import TRADES_ATTRIBUTES
from t_nachine.optimization.ml.utils import save, load
from tqdm import tqdm

In [2]:
# Read the backtest log and make sure it carries a plain 0..n-1 positional index.
path1 = "logs/bouncing.csv"
df = pd.read_csv(path1).reset_index(drop=True)

In [3]:
# Wrap the raw log in the project's Analyzer and keep a handle on its
# backtest_results table (displayed further down as one row per trade).
analyzer = Analyzer(df)
results = analyzer.backtest_results

In [4]:
len(results)

58841

In [5]:
analyzer.win_rate

0.38

In [6]:
analyzer.stats

Unnamed: 0_level_0,Duration,Duration,Duration,Duration,Duration,RiskToReward,RiskToReward,RiskToReward,RiskToReward,RiskToReward
Unnamed: 0_level_1,mean,median,min,max,std,mean,median,min,max,std
WinningTrade,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
False,13.752,7.0,0.0,1854.0,24.333,-1.044,-1.0,-14.106,-0.0,0.291
True,23.218,15.0,0.0,1068.0,31.285,2.039,2.0,0.01,17.378,0.366


## Processing

In [7]:
# Order the trades by their EntryTime column.
results = results.sort_values(by="EntryTime")

In [8]:
# Overwrite SlPrice with EntryPrice minus OneR, i.e. one risk unit below the
# entry (the column name suggests this is the stop-loss level).
# Bracket assignment is deliberate: attribute-style assignment only updates a
# column that already exists and otherwise silently creates a plain Python
# attribute instead of a column.
results["SlPrice"] = results.EntryPrice - results.OneR

In [9]:
# Drop trades that have no Symbol. The explicit copy makes `results` an
# independent frame, so columns added to it afterwards are not written onto a
# filtered slice (which pandas reports as SettingWithCopyWarning).
results = results[results.Symbol.notna()].copy()

In [10]:
## Adding cohorts as a feature

def compute_cohots(price: float) -> int:
    """Map a price to its cohort number (1 to 7).

    Cohort ``k`` is the first bucket whose inclusive upper bound is at least
    ``price``; the bounds are 5, 10, 50, 100, 500 and 1000. A value that is
    not ``<=`` any bound (prices above 1000, and NaN because every comparison
    with it is false) falls into cohort 7.
    """
    upper_bounds = (5, 10, 50, 100, 500, 1000)

    for cohort, bound in enumerate(upper_bounds, start=1):
        if price <= bound:
            return cohort

    # Nothing matched: last, open-ended cohort.
    return len(upper_bounds) + 1

# Tag every trade with the cohort of its entry price.
results['cohorts'] = results.EntryPrice.apply(compute_cohots)

In [11]:
len(results)

58832

In [12]:
results

Unnamed: 0,Size,EntryBar,ExitBar,OneR,SlPrice,TpPrice,EntryPrice,ExitPrice,MaxPnL,MaxNegativePnl,...,ATR,RSI,ADX,WILLIAMS,Volume,Symbol,Duration,WinningTrade,RiskToReward,cohorts
13459,188.000,598.000,601.000,0.107,6.443,6.763,6.550,6.763,0.172,-0.061,...,0.063,0.583,0.144,-0.125,668254.000,ibm,3.000,True,2.000,2
13460,155.000,617.000,645.000,0.132,6.443,6.839,6.575,6.443,0.239,-0.078,...,0.084,0.545,0.146,-0.375,337719.000,ibm,28.000,False,-1.000,2
19576,555.000,803.000,865.000,0.036,0.811,0.919,0.847,0.811,0.038,-0.034,...,0.008,0.690,0.265,-0.000,2980989.000,ge,62.000,False,-1.000,1
13461,58.000,1107.000,1112.000,0.349,7.403,8.450,7.752,7.403,0.022,-0.279,...,0.145,0.622,0.236,-0.040,953278.000,ibm,5.000,False,-1.000,2
13462,93.000,1283.000,1290.000,0.215,8.096,8.742,8.312,8.742,0.415,-0.062,...,0.122,0.597,0.190,-0.315,349698.000,ibm,7.000,True,2.000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13549,19.000,3200.000,3200.000,1.060,20.220,23.400,21.280,21.212,0.270,-0.530,...,0.976,0.453,0.351,-0.631,44201.000,vicr,0.000,False,-0.064,3
40113,8.000,6951.000,6951.000,2.750,15.370,23.620,18.120,18.050,0.405,-0.120,...,0.764,0.440,0.394,-0.390,101117.000,kbal,0.000,False,-0.025,3
21414,16.000,2896.000,2896.000,1.253,5.337,9.097,6.590,6.400,0.300,-0.279,...,0.365,0.748,0.341,-0.107,279893.000,trt,0.000,False,-0.152,2
17021,4.000,3200.000,3200.000,5.400,120.810,137.010,126.210,125.020,1.410,-1.190,...,2.492,0.534,0.172,-0.332,1009386.000,pvh,0.000,False,-0.220,5


## Create the dataset

In [13]:
import os

stocks_path = "../archive/Stocks/"
symbols = results.Symbol.unique()

# For every traded symbol, pick the price bar located one row before each
# trade's entry bar and attach the trade's label and price levels to it.
# The per-symbol frames are collected in a list and concatenated once:
# DataFrame.append copied the whole accumulator on every iteration and was
# removed in pandas 2.0.
frames = []
for symbol in tqdm(symbols):
    stock = pd.read_csv(os.path.join(stocks_path, symbol + '.us.txt'))
    trades = results[results.Symbol == symbol]

    # Row positions of the bars just before entry. Cast to int because .iloc
    # needs integer positions (EntryBar may have been parsed as float).
    # NOTE(review): an EntryBar of 0 would give -1 and select the last row.
    entry_bars = (trades.EntryBar - 1).astype(int)

    # .copy() so the columns below are added to an independent frame rather
    # than to a slice of `stock`.
    trades_data = stock.iloc[entry_bars].copy()
    trades_data['WinningTrade'] = trades.WinningTrade.values
    trades_data['OneR'] = trades.OneR.values
    trades_data['EntryPrice'] = trades.EntryPrice.values
    trades_data['TpPrice'] = trades.TpPrice.values
    trades_data['SlPrice'] = trades.SlPrice.values
    trades_data['cohorts'] = trades.cohorts.values

    # A scalar is broadcast to every row.
    trades_data['Symbol'] = symbol
    frames.append(trades_data)

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(frames) if frames else pd.DataFrame()

100%|███████████████████████████████████████| 5420/5420 [04:01<00:00, 22.42it/s]


In [14]:
data

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt,VWAP,ATR,RSI,...,r2,r3,r4,WinningTrade,OneR,EntryPrice,TpPrice,SlPrice,cohorts,Symbol
597,1964-05-15,6.480,6.530,6.471,6.513,278438,0,0.719,0.062,0.545,...,0.997,0.999,1.002,True,0.107,6.550,6.763,6.443,2,ibm
616,1964-06-12,6.513,6.555,6.513,6.546,323350,0,0.722,0.089,0.513,...,0.997,1.001,1.005,False,0.132,6.575,6.839,6.443,2,ibm
1106,1966-05-23,7.624,7.732,7.624,7.649,1034713,0,0.670,0.146,0.591,...,0.973,0.976,0.980,False,0.349,7.752,8.450,7.403,2,ibm
1282,1967-02-02,8.208,8.292,8.208,8.242,464664,0,0.647,0.139,0.578,...,0.990,0.993,0.996,True,0.215,8.312,8.742,8.096,2,ibm
1870,1969-07-16,13.529,13.652,13.529,13.652,747292,0,0.512,0.350,0.507,...,0.992,1.000,1.008,False,0.526,13.682,14.734,13.156,3,ibm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29,2017-11-01,16.020,18.880,15.390,18.430,180478,0,0.991,4.023,0.471,...,0.932,1.007,1.082,True,5.940,18.910,30.790,12.970,3,petz
314,2017-11-02,0.700,0.725,0.700,0.720,1233100,0,0.800,0.043,0.591,...,0.973,0.984,0.994,True,0.065,0.735,0.865,0.670,1,eaglw
518,2017-11-03,25.490,26.060,25.490,25.880,11830,0,0.940,0.584,0.486,...,0.983,0.991,0.999,False,1.180,26.090,28.450,24.910,3,cncr
729,2017-11-03,22.450,22.950,22.350,22.900,128037,0,0.670,0.445,0.709,...,0.992,1.000,1.008,False,0.860,22.980,24.700,22.120,3,wk


## Split data

In [15]:
date = "2017-01-31"
column_data = "Date"

# Parse the date column once and reuse it for both sides of the split.
bar_dates = pd.to_datetime(data[column_data])
cutoff = pd.to_datetime(date)

# .copy() gives train/test their own data, so columns added to them later
# (e.g. model predictions) are not written onto a slice of `data`.
train = data[bar_dates <= cutoff].copy()  # up to and including the cutoff date
test = data[bar_dates > cutoff].copy()    # strictly after the cutoff date

In [16]:
train.shape, test.shape, len(test)/len(data)

((53441, 33), (5391, 33), 0.09163380473211857)

In [17]:
train.WinningTrade.mean(), test.WinningTrade.mean()

(0.38079377257162106, 0.40827304767204603)

In [18]:
# Trade count and share of winning trades per cohort on the training split.
# The aggregations are passed as a list: a set has no defined iteration order,
# so the order of the output columns would not be guaranteed.
train.groupby('cohorts').WinningTrade.agg(['size', 'mean'])

Unnamed: 0_level_0,size,mean
cohorts,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3267,0.442
2,7010,0.408
3,34768,0.375
4,6339,0.366
5,1851,0.335
6,128,0.391
7,78,0.359


In [178]:
# Trade count and share of winning trades per cohort on the test split.
# The aggregations are passed as a list: a set has no defined iteration order,
# so the order of the output columns would not be guaranteed.
test.groupby('cohorts').WinningTrade.agg(['size', 'mean'])

Unnamed: 0_level_0,size,mean
cohorts,Unnamed: 1_level_1,Unnamed: 2_level_1
1,158,0.367
2,335,0.424
3,3312,0.396
4,1079,0.427
5,490,0.449
6,12,0.417
7,5,0.4


## Evaluate Function

In [24]:
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [25]:
def evaluate(clf, train, test, features = TRADES_ATTRIBUTES, thresh=0.5):
    """Print recall and precision of ``clf`` on both sets and return the predictions.

    Args:
        clf: fitted classifier exposing ``predict_proba``.
        train: frame holding the ``features`` columns and a ``WinningTrade`` column.
        test: frame with the same layout as ``train``.
        features: names of the columns passed to the classifier.
        thresh: a row is predicted positive when the value in column 1 of
            ``predict_proba`` is strictly greater than this threshold.

    Returns:
        Tuple ``(preds_train, preds_test)`` of thresholded (boolean) predictions.
    """
    preds_train = clf.predict_proba(train[features])[:, 1] > thresh
    preds_test = clf.predict_proba(test[features])[:, 1] > thresh

    # zero_division=0 is passed to all four metrics so that a threshold which
    # yields no positive prediction reports 0 uniformly instead of emitting an
    # UndefinedMetricWarning for only some of them.
    recall_train = recall_score(train.WinningTrade, preds_train, zero_division=0)
    recall_test = recall_score(test.WinningTrade, preds_test, zero_division=0)

    precision_train = precision_score(train.WinningTrade, preds_train, zero_division=0)
    precision_test = precision_score(test.WinningTrade, preds_test, zero_division=0)

    print('recall train test: , ', round(recall_train, 2), round(recall_test, 2))
    print('precision train test: , ', round(precision_train, 2), round(precision_test, 2))

    return preds_train, preds_test

## Training

In [123]:
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [164]:
# LightGBM classifier with a fixed seed, balanced class weights and
# gain-based feature importances.
clf = LGBMClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight="balanced",
    importance_type="gain",
    random_state=2021,
)

In [20]:
# Columns fed to the model, grouped by origin; the order is unchanged.
features = [
    'Open', 'High', 'Low', 'Close',              # price bar
    'SlPrice', 'TpPrice', 'EntryPrice', 'OneR',  # trade levels
    'ATR', 'Volume', 'RSI',                      # indicator / volume columns
    'cohorts',                                   # price cohort
]

In [172]:
# Fit on the training split only, with WinningTrade as the target.
clf.fit(train[features], train.WinningTrade)

LGBMClassifier(class_weight='balanced', importance_type='gain', max_depth=10,
               n_estimators=200, random_state=2021)

In [173]:
# Score the fitted model on both splits at the default 0.5 probability cut-off.
preds_train, preds_test = evaluate(clf, train, test, features=features, thresh=0.5)

recall train test: ,  0.51 0.44
precision train test: ,  0.43 0.41


In [None]:
# NOTE(review): exploratory auto-sklearn run. The output recorded for this cell
# is a KeyboardInterrupt traceback, so the run was interrupted rather than
# completed, and `predictions` is not used by any later cell shown here.
# `cls` is a different name from `clf`, so the evaluate(...) calls that follow
# do not use this model.
import autosklearn.classification
cls = autosklearn.classification.AutoSklearnClassifier()
cls.fit(train[features], train.WinningTrade)
predictions = cls.predict(test[features])



Process ForkProcess-1:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/abdelelyds/.local/lib/python3.8/site-packages/autosklearn/util/logging_.py", line 320, in start_log_server
    receiver.serve_until_stopped()
  File "/home/abdelelyds/.local/lib/python3.8/site-packages/autosklearn/util/logging_.py", line 350, in serve_until_stopped
    rd, wr, ex = select.select([self.socket.fileno()],
KeyboardInterrupt


In [151]:
# Absolute pairwise correlations between the candidate features (training split only).
corr_matrix = train[TRADES_ATTRIBUTES].corr().abs()

# Keep only the strict upper triangle so each pair is inspected once and the
# diagonal (self-correlation) is ignored.
# `bool` replaces `np.bool`, an alias deprecated in NumPy 1.20 and removed in 1.24.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# A column is dropped when it correlates above 0.95 with any column before it.
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]

# Filter the original sequence instead of using set arithmetic so the surviving
# features keep a stable, reproducible order (dict.fromkeys removes duplicates
# while preserving first-seen order, matching the de-duplication a set gave).
dropped = set(to_drop)
new_features = [name for name in dict.fromkeys(TRADES_ATTRIBUTES) if name not in dropped]

In [154]:
# NOTE(review): `clf` is rebound four times in a row, so only the final
# LogisticRegression is fitted (the recorded cell output shows
# `LogisticRegression()`); the three classifiers built before it are discarded
# without being used.
clf = LGBMClassifier(random_state=2021, class_weight="balanced", n_estimators=600, max_depth=10, importance_type="gain")
clf = DecisionTreeClassifier(random_state=0, max_depth=10)
clf = RandomForestClassifier(random_state=0, n_estimators=100, max_depth=10)
clf = LogisticRegression()
clf.fit(train[new_features], train.WinningTrade)

LogisticRegression()

In [155]:
# Score with a stricter 0.6 probability cut-off on the decorrelated feature set.
# The output recorded for this cell is 0.0 for every metric, i.e. no true
# positives at this threshold.
preds_train, preds_test = evaluate(clf, train, test, features=new_features, thresh=0.6)

recall train test: ,  0.0 0.0
precision train test: ,  0.0 0.0


In [415]:
test['preds'] = preds_test

In [416]:
trades_to_take_with_ml = test[test['preds'] == True]

In [417]:
trades_to_take_with_ml.WinningTrade.mean()

0.625

In [418]:
trades_to_take_with_ml.shape[0] / len(test)

0.0014839547393804488

In [419]:
len(trades_to_take_with_ml)

8

## Test On 2018-2021

In [420]:
# Load a second backtest log from logs/yahoo_extreme_rsi.csv.
yahoo_path = "logs/yahoo_extreme_rsi.csv"
res = pd.read_csv(yahoo_path)

In [421]:
# Same Analyzer wrapper as used for the first log, applied to the second one.
ana = Analyzer(res)

In [422]:
# Results table of the second backtest.
res_ana = ana.backtest_results

In [423]:
res_ana.shape

(37134, 36)

In [424]:
ana.win_rate

0.39

In [425]:
# NOTE(review): the variable name and the section heading say 2018-2021, but
# the cutoff below keeps trades entered on or after 2019-01-01 — confirm which
# one is intended.
year = "2019-01-01"
entry_times = pd.to_datetime(res_ana.EntryTime)

# .copy() so the cohort column is added to an independent frame rather than to
# a filtered slice of `res_ana`.
trade_2018_2021 = res_ana[entry_times >= pd.to_datetime(year)].copy()
trade_2018_2021['cohorts'] = trade_2018_2021.EntryPrice.apply(compute_cohots)

In [426]:
trade_2018_2021.shape

(22963, 37)

In [427]:
# The same frame is passed as both `train` and `test`, so both returned arrays
# come from the same inputs. Keep the second one explicitly instead of
# unpacking both into the same name.
# NOTE(review): `clf` has to be fitted on exactly these `features` columns;
# the fit that directly precedes this section in the notebook uses
# `new_features` — confirm which model / feature list is intended here.
preds = evaluate(clf, trade_2018_2021, trade_2018_2021, features=features, thresh=0.75)[1]

recall train test: ,  0.0 0.0
precision train test: ,  0.34 0.34


In [369]:
trade_2018_2021['preds'] = preds

In [370]:
trades_to_take_with_ml = trade_2018_2021[trade_2018_2021['preds'] == True]

In [371]:
len(trades_to_take_with_ml)

117

In [372]:
trades_to_take_with_ml.WinningTrade.mean()

0.4188034188034188