# Artik 47

In [28]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import neighbors

from tqdm import tqdm
from datetime import *
# import catboost as ctb
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17785751376919860730
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2255906407
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17428994458375981652
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [101]:
PATH_RESULTS = f'~\Documents\Artik_47\Artik-47\results\\'

## Data preparation

In [41]:
gbpusd = pd.read_csv(f'~\Documents\Artik_47\Artik-47\data\GBPUSD_M30.csv', sep='\t', parse_dates=['Time'])

#Just for close values as objetive
gbpusd = gbpusd[['Time', 'Close', 'Volume']]
gbpusd

Unnamed: 0,Time,Close,Volume
0,2007-01-01 06:00:00,1.95817,1728
1,2007-01-01 06:30:00,1.95815,1651
2,2007-01-01 07:00:00,1.95824,1352
3,2007-01-01 07:30:00,1.95822,1722
4,2007-01-01 08:00:00,1.95852,1729
...,...,...,...
193895,2022-07-13 03:30:00,1.18863,596
193896,2022-07-13 04:00:00,1.18813,1522
193897,2022-07-13 04:30:00,1.18794,1852
193898,2022-07-13 05:00:00,1.18797,1404


In [42]:
gbpusd.describe()

Unnamed: 0,Close,Volume
count,193900.0,193900.0
mean,1.51325,5233.197968
std,0.21537,11369.920694
min,1.14293,1.0
25%,1.322348,1562.0
50%,1.517765,2935.0
75%,1.61315,5508.0
max,2.11425,556867.0


In [62]:
def shift_col(df, col_name, n):
    """
    Shift column in df, n times
    :param df: dataframe
    :param col_name: str
    :param n: int
    :return: dateframe, list columns with shift
    """
    list_shift = []
    for i in range(1,n):
        new_name_col = col_name + '_' + str(i)
        df[new_name_col] = df[col_name].shift(i)
        list_shift.append(new_name_col)

    return df, list_shift

In [104]:
df = gbpusd.copy()
df = df.rename(columns={'Time':'ds'})

#Shift last x values
last_x = 6

df, list_shift = shift_col(df, 'Close', last_x)

#Variance of shift values
df['var_last_val'] = df[list_shift].var(axis=1)

#Shift volume because is impossible to have on time
df, _ = shift_col(df, 'Volume', last_x)
df = df.drop(columns=['Volume'])

df = df.dropna()
df#.iloc[:,2:].values

Unnamed: 0,ds,Close,Close_1,Close_2,Close_3,Close_4,Close_5,var_last_val,Volume_1,Volume_2,Volume_3,Volume_4,Volume_5
5,2007-01-01 08:30:00,1.95877,1.95852,1.95822,1.95824,1.95815,1.95817,2.245000e-08,1729.0,1722.0,1352.0,1651.0,1728.0
6,2007-01-01 09:00:00,1.95882,1.95877,1.95852,1.95822,1.95824,1.95815,6.745000e-08,1524.0,1729.0,1722.0,1352.0,1651.0
7,2007-01-01 09:30:00,1.95883,1.95882,1.95877,1.95852,1.95822,1.95824,8.018000e-08,1565.0,1524.0,1729.0,1722.0,1352.0
8,2007-01-01 10:00:00,1.95840,1.95883,1.95882,1.95877,1.95852,1.95822,6.897000e-08,1704.0,1565.0,1524.0,1729.0,1722.0
9,2007-01-01 10:30:00,1.95823,1.95840,1.95883,1.95882,1.95877,1.95852,3.837000e-08,1643.0,1704.0,1565.0,1524.0,1729.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
193895,2022-07-13 03:30:00,1.18863,1.18885,1.18834,1.18883,1.18885,1.18866,4.823000e-08,448.0,796.0,979.0,2881.0,2673.0
193896,2022-07-13 04:00:00,1.18813,1.18863,1.18885,1.18834,1.18883,1.18885,4.910000e-08,596.0,448.0,796.0,979.0,2881.0
193897,2022-07-13 04:30:00,1.18794,1.18813,1.18863,1.18885,1.18834,1.18883,9.878000e-08,1522.0,596.0,448.0,796.0,979.0
193898,2022-07-13 05:00:00,1.18797,1.18794,1.18813,1.18863,1.18885,1.18834,1.352700e-07,1852.0,1522.0,596.0,448.0,796.0


In [100]:
#Find the support and resistance

#Top most frequent values with only 3 decimals
df['Close'].astype(str).str[:4].value_counts()[:10]

1.30    7108
1.56    6741
1.29    6642
1.31    6331
1.61    5869
1.60    5844
1.55    5692
1.59    5085
1.54    4996
1.32    4687
Name: Close, dtype: int64

In [77]:
#Spit
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,2:].values, df.iloc[:,1].values, test_size=0.33, shuffle=False)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train:{y_train.shape}, y_test:{y_test.shape}")

X_train: (129909, 11), X_test: (63986, 11), y_train:(129909,), y_test:(63986,)


## ML  version

In [112]:
def backtest_regression(model_name, fecha_inicio, fecha_fin, data, cutoff, retrain_days, t_scaler, n_pca, params=None):
    print("from: ", fecha_inicio, " to: ", fecha_fin)

    #Primer entreno
    mask = (data.ds < (fecha_inicio - timedelta(days=cutoff)))
    train = data.loc[mask]

    train = train.drop(columns=['ds'])

    val_reales = np.array([])
    val_pred = np.array([])
    val_fechas = np.array([])

    # Divide en variables de entrada y salida
    X, y = train.values[:, 1:], train.values[:, 0].astype('float64')

    #Scalers
    if(t_scaler=='MinMaxScaler'):
        scaler = MinMaxScaler(feature_range=(0, 1))
        X = scaler.fit_transform(X)
    elif(t_scaler=='StandardScaler'):
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    elif(t_scaler=='RobustScaler'):
        scaler = RobustScaler(quantile_range=(0, 95.0))
        X = scaler.fit_transform(X)

    #PCA
    if (type(n_pca) == int):
        pca = PCA(n_components=n_pca)
        X = pca.fit_transform(X)

    #Crea el modelo
    model = eval(model_name + "()")

    #Agrega parametros
    if(params!=None):
        model.set_params(**params)

    model.fit(X, y)

    pbar = tqdm(total=(fecha_fin-fecha_inicio).days+1)
    fecha_index = fecha_inicio
    while(fecha_index <= fecha_fin):
        print(fecha_index)
        # if(fecha_index.weekday() in retrain_days):
        #
        #     #Re entreno
        #     mask = (data.ds < (fecha_index - timedelta(days=cutoff)))
        #     train = data.loc[mask]
        #     #print("Train: \n", train.iloc[:,0:6], "\n")
        #     train = train.drop(columns=['ds'])
        #
        #     # Divide en variables de entrada y salida
        #     X, y = train.values[:, 1:], train.values[:, 0].astype('float64')
        #
        #     #Scalers
        #     if(t_scaler=='MinMaxScaler'):
        #         scaler = MinMaxScaler(feature_range=(0, 1))
        #         X = scaler.fit_transform(X)
        #     elif(t_scaler=='StandardScaler'):
        #         scaler = StandardScaler()
        #         X = scaler.fit_transform(X)
        #     elif(t_scaler=='RobustScaler'):
        #         scaler = RobustScaler(quantile_range=(0, 95.0))
        #         X = scaler.fit_transform(X)
        #
        #     #PCA
        #     if (type(n_pca) == int):
        #         pca = PCA(n_components=n_pca)
        #         X = pca.fit_transform(X)
        #
        #     model = eval(model_name + "()")
        #
        #     if(params!=None):
        #         model.set_params(**params)
        #
        #     model.fit(X, y)

        mask = (data.ds >= fecha_index) & (data.ds <= fecha_index + timedelta(hours=23))
        to_pred = data.loc[mask]
        if not(to_pred.empty):

            val_fechas = np.append(val_fechas, to_pred.iloc[:,0].tolist())
            #print("to_pred: ", to_pred.values)
            to_pred = to_pred.drop(columns=['ds'])
            # Divide en variables de entrada y salida
            X, y = to_pred.values[:, 1:], to_pred.values[:, 0].astype('float64')

            #Scalers
            if((t_scaler=='MinMaxScaler') or (t_scaler=='StandardScaler') or (t_scaler=='RobustScaler')):
                X = scaler.transform(X)

            #PCA
            if (type(n_pca) == int):
                X = pca.transform(X)

            y_hat = model.predict(X)

            val_reales = np.append(val_reales, y)
            val_pred = np.append(val_pred, y_hat)

        fecha_index = fecha_index + timedelta(minutes=30)
        pbar.update(1)

    resultados = pd.DataFrame({'fechas':val_fechas, 'val_reales':val_reales, 'val_pred':val_pred})
    pbar.close()
    return resultados

In [113]:
#Fecha inicial
inicial_date = '01/01/2008 00:00:00'
inicial_date = datetime.strptime(inicial_date, '%d/%m/%Y %H:%M:%S')
#Fecha final
end_date = '02/01/2008 23:00:00'
end_date = datetime.strptime(end_date, '%d/%m/%Y %H:%M:%S')

#Nombre experimiento
NOM_EXP = 'artik47_ML'

#Variables generales de los modelos
retrain_days = [3, 6]    #Monday is 0 and Sunday is 6
cutoff = 0         #Dias atras del index que se toman para entrenar el modelo

In [114]:
models_dict = {
    # 'XGBoost0':{'xgb.XGBRegressor':{'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    # 'XGBoost1':{'xgb.XGBRegressor':{'Objective':'reg:squaredlogerror',
    #                                'n_estimator':100,
    #                                # 'eval_metric':'mape',
    #                                'booster':'gbtree',
    #                                'verbosity':0,
    #                                # 'disable_default_eval_metric':'false',
    #                                'learning_rate':0.2,
    #                                'max_depth':7,
    #                                'min_child_weight':2,
    #                                'sampling_method':'gradient_based',
    #                                'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    #'ElasticNet':{'ElasticNet':{'alpha':1.0, 'l1_ratio':0.5}},
    #'ElasticNet_2':{'ElasticNet':None},
    #'Lasso':{'Lasso':None},
    'RandomForest':{'RandomForestRegressor':{'n_jobs': 4}, 'Scaler':False, 'PCA':False},
    # 'RandomForest_2':{'RandomForestRegressor':{'n_estimators': 300, 'min_samples_split': 10,
    #                                         'min_samples_leaf': 1, 'max_features': 'sqrt','max_depth': 40,
    #                                       'bootstrap': False, 'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    #                'AdaBoost':{'AdaBoostRegressor':None},
    # 'SVR0':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':5},
    # 'SVR1':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'SVR2':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':10},
    # 'HuberRegressor':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':0.0001}, 'Scaler':False, 'PCA':False},
    # 'HuberRegressor1':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'HuberRegressor2':{'HuberRegressor':None, 'Scaler':False, 'PCA':9},
    # 'HuberRegressor2':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':1e-10}, 'Scaler':False, 'PCA':9},
    # 'HuberRegressor3':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'HuberRegresso4':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':1e-10}, 'Scaler':'StandardScaler', 'PCA':9},
    # 'HuberRegressor5':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'SVR2':{'SVR':None, 'Scaler':'RobustScaler', 'PCA':6},
    #'KNR':{'neighbors.KNeighborsRegressor':None},
    #'BaggingRegressor':{'BaggingRegressor':None},
    #'GradientBoosting':{'GradientBoostingRegressor':None},
}

df_resultados = pd.DataFrame()

for modelo in models_dict.items():
    #print(modelo, type(modelo))
    try:
        output_name_model = modelo[0]
        name_model = list(modelo[1])[0]
        params = modelo[1].get(name_model)
        scaler = modelo[1]['Scaler']
        pca = modelo[1]['PCA']

        start_time = datetime.now()
        print("\n\n--->", output_name_model, '<--- hora inicio:', start_time)
        print("Parametros: ", params, " scaler: ", scaler, " pca: ", pca)
        nombre = output_name_model + '_' + NOM_EXP + '_' + start_time.strftime('%Y_%m_%d-%H_%M') + '.csv'
        print(nombre)
        resultados = backtest_regression(name_model, inicial_date, end_date, df, cutoff, retrain_days,
                                         scaler, pca, params)
        # print("mean absolute porcentage error:", mean_absolute_porcentage_error(resultados))
        # print("mean absolute error:", mean_absolute_error(resultados))
        # print("root mean square error:", root_mean_square_error(resultados))
        # print("tiempo de ejecucion: ", datetime.now() - start_time)
        # resultados.to_csv(PATH + nombre, index = False, header=True)

    except Exception as ex:
        print(":::::::::::  Error en " + modelo[0], " ::: ", ex)



---> RandomForest <--- hora inicio: 2022-10-01 22:16:45.791675
Parametros:  {'n_jobs': 4}  scaler:  False  pca:  False
RandomForest_artik47_ML_2022_10_01-22_16.csv
from:  2008-01-01 00:00:00  to:  2008-01-02 23:00:00




  0%|          | 0/2 [00:00<?, ?it/s][A[A

3it [00:00, 27.21it/s]               [A[A

2008-01-01 00:00:00
2008-01-01 00:30:00
2008-01-01 01:00:00
2008-01-01 01:30:00
2008-01-01 02:00:00
2008-01-01 02:30:00




6it [00:00, 27.20it/s][A[A

10it [00:00, 27.80it/s][A[A

2008-01-01 03:00:00
2008-01-01 03:30:00
2008-01-01 04:00:00
2008-01-01 04:30:00
2008-01-01 05:00:00
2008-01-01 05:30:00




14it [00:00, 27.97it/s][A[A

17it [00:00, 27.76it/s][A[A

2008-01-01 06:00:00
2008-01-01 06:30:00
2008-01-01 07:00:00
2008-01-01 07:30:00
2008-01-01 08:00:00
2008-01-01 08:30:00




20it [00:00, 27.63it/s][A[A



2008-01-01 09:00:00
2008-01-01 09:30:00
2008-01-01 10:00:00
2008-01-01 10:30:00
2008-01-01 11:00:00


23it [00:00, 27.53it/s][A[A

26it [00:00, 26.32it/s][A[A

2008-01-01 11:30:00
2008-01-01 12:00:00
2008-01-01 12:30:00
2008-01-01 13:00:00
2008-01-01 13:30:00
2008-01-01 14:00:00




29it [00:01, 26.58it/s][A[A

32it [00:01, 26.77it/s][A[A

2008-01-01 14:30:00
2008-01-01 15:00:00
2008-01-01 15:30:00
2008-01-01 16:00:00
2008-01-01 16:30:00




35it [00:01, 25.81it/s][A[A

38it [00:01, 24.25it/s][A[A

2008-01-01 17:00:00
2008-01-01 17:30:00
2008-01-01 18:00:00
2008-01-01 18:30:00
2008-01-01 19:00:00




41it [00:01, 25.10it/s][A[A

2008-01-01 19:30:00
2008-01-01 20:00:00
2008-01-01 20:30:00
2008-01-01 21:00:00
2008-01-01 21:30:00




44it [00:01, 24.73it/s][A[A

2008-01-01 22:00:00


98it [04:15,  2.61s/it]


47it [00:02, 14.80it/s][A[A



2008-01-01 22:30:00
2008-01-01 23:00:00
2008-01-01 23:30:00
2008-01-02 00:00:00
2008-01-02 00:30:00


50it [00:02, 17.15it/s][A[A

53it [00:02, 19.33it/s][A[A

2008-01-02 01:00:00
2008-01-02 01:30:00
2008-01-02 02:00:00
2008-01-02 02:30:00
2008-01-02 03:00:00
2008-01-02 03:30:00




56it [00:02, 21.17it/s][A[A

59it [00:02, 22.70it/s][A[A

2008-01-02 04:00:00
2008-01-02 04:30:00
2008-01-02 05:00:00
2008-01-02 05:30:00
2008-01-02 06:00:00
2008-01-02 06:30:00




62it [00:02, 23.91it/s][A[A

65it [00:02, 24.84it/s][A[A

2008-01-02 07:00:00
2008-01-02 07:30:00
2008-01-02 08:00:00
2008-01-02 08:30:00
2008-01-02 09:00:00
2008-01-02 09:30:00




68it [00:02, 25.54it/s][A[A

71it [00:02, 26.05it/s][A[A

2008-01-02 10:00:00
2008-01-02 10:30:00
2008-01-02 11:00:00
2008-01-02 11:30:00
2008-01-02 12:00:00
2008-01-02 12:30:00




75it [00:03, 27.90it/s][A[A

78it [00:03, 27.70it/s][A[A

2008-01-02 13:00:00
2008-01-02 13:30:00
2008-01-02 14:00:00
2008-01-02 14:30:00
2008-01-02 15:00:00
2008-01-02 15:30:00




81it [00:03, 27.30it/s][A[A

84it [00:03, 26.47it/s][A[A

2008-01-02 16:00:00
2008-01-02 16:30:00
2008-01-02 17:00:00
2008-01-02 17:30:00
2008-01-02 18:00:00




87it [00:03, 25.66it/s][A[A



2008-01-02 18:30:00
2008-01-02 19:00:00
2008-01-02 19:30:00
2008-01-02 20:00:00
2008-01-02 20:30:00


90it [00:03, 26.13it/s][A[A

95it [00:03, 24.69it/s][A[A

2008-01-02 21:00:00
2008-01-02 21:30:00
2008-01-02 22:00:00
2008-01-02 22:30:00
2008-01-02 23:00:00



