# Artik 47

In [28]:
import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNet, Lasso, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn import neighbors

from tqdm import tqdm
from datetime import *
# import catboost as ctb
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

# Print every compute device TensorFlow can see (CPU/GPU) so the notebook
# records which hardware it ran on.
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 17785751376919860730
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 2255906407
locality {
  bus_id: 1
  links {
  }
}
incarnation: 17428994458375981652
physical_device_desc: "device: 0, name: NVIDIA GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


## Data preparation

In [41]:
# Load the tab-separated GBP/USD 30-minute bars. The path is a raw string so the
# Windows backslashes stay literal instead of being parsed as (invalid) escape
# sequences such as "\D" or "\G".
gbpusd = pd.read_csv(r'~\Documents\Artik_47\Artik-47\data\GBPUSD_M30.csv', sep='\t', parse_dates=['Time'])

# Keep only the timestamp, the close price (the objective) and the volume.
gbpusd = gbpusd[['Time', 'Close', 'Volume']]
gbpusd

Unnamed: 0,Time,Close,Volume
0,2007-01-01 06:00:00,1.95817,1728
1,2007-01-01 06:30:00,1.95815,1651
2,2007-01-01 07:00:00,1.95824,1352
3,2007-01-01 07:30:00,1.95822,1722
4,2007-01-01 08:00:00,1.95852,1729
...,...,...,...
193895,2022-07-13 03:30:00,1.18863,596
193896,2022-07-13 04:00:00,1.18813,1522
193897,2022-07-13 04:30:00,1.18794,1852
193898,2022-07-13 05:00:00,1.18797,1404


In [42]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
gbpusd.describe()

Unnamed: 0,Close,Volume
count,193900.0,193900.0
mean,1.51325,5233.197968
std,0.21537,11369.920694
min,1.14293,1.0
25%,1.322348,1562.0
50%,1.517765,2935.0
75%,1.61315,5508.0
max,2.11425,556867.0


In [58]:
# Work on a copy so the loaded frame `gbpusd` is never modified by this cell.
df = gbpusd.copy()

# Exclusive upper bound of the lag range: range(1, last_x) creates last_x - 1
# lagged columns, i.e. Close_1 ... Close_5 for last_x = 6.
last_x = 6

list_shift = []
for i in range(1, last_x):
    name_col = 'Close_' + str(i)
    # Close_i holds the close price observed i rows (bars) earlier.
    df[name_col] = df['Close'].shift(i)
    list_shift.append(name_col)

# Drop every row with a missing value (at least the first last_x - 1 rows, whose
# lags are undefined). The explicit copy makes the result an independent frame,
# so adding a column below cannot trigger pandas' SettingWithCopyWarning.
df = df.dropna().copy()

# Row-wise sample variance (pandas default ddof=1) of the lagged close values.
df['var_last_val'] = df[list_shift].var(axis=1)


# Split
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
df

Unnamed: 0,Time,Close,Volume,Close_1,Close_2,Close_3,Close_4,Close_5,var_last_val
5,2007-01-01 08:30:00,1.95877,1524,1.95852,1.95822,1.95824,1.95815,1.95817,2.245000e-08
6,2007-01-01 09:00:00,1.95882,1565,1.95877,1.95852,1.95822,1.95824,1.95815,6.745000e-08
7,2007-01-01 09:30:00,1.95883,1704,1.95882,1.95877,1.95852,1.95822,1.95824,8.018000e-08
8,2007-01-01 10:00:00,1.95840,1643,1.95883,1.95882,1.95877,1.95852,1.95822,6.897000e-08
9,2007-01-01 10:30:00,1.95823,1358,1.95840,1.95883,1.95882,1.95877,1.95852,3.837000e-08
...,...,...,...,...,...,...,...,...,...
193895,2022-07-13 03:30:00,1.18863,596,1.18885,1.18834,1.18883,1.18885,1.18866,4.823000e-08
193896,2022-07-13 04:00:00,1.18813,1522,1.18863,1.18885,1.18834,1.18883,1.18885,4.910000e-08
193897,2022-07-13 04:30:00,1.18794,1852,1.18813,1.18863,1.18885,1.18834,1.18883,9.878000e-08
193898,2022-07-13 05:00:00,1.18797,1404,1.18794,1.18813,1.18863,1.18885,1.18834,1.352700e-07


## ML version

In [None]:
def _fit_backtest_pipeline(train, model_name, t_scaler, n_pca, params=None):
    """Fit the optional scaler, the optional PCA and the model on one training frame.

    `train` must already have the 'ds' column removed: its first column is the
    target and every remaining column is a feature.

    Returns a `(scaler, pca, model)` tuple; `scaler` and `pca` are None when the
    corresponding step is disabled.
    """
    # Split into input features and output target.
    X, y = train.values[:, 1:], train.values[:, 0].astype('float64')

    # Optional feature scaling; any other value of t_scaler (e.g. False) disables it.
    scaler = None
    if t_scaler == 'MinMaxScaler':
        scaler = MinMaxScaler(feature_range=(0, 1))
    elif t_scaler == 'StandardScaler':
        scaler = StandardScaler()
    elif t_scaler == 'RobustScaler':
        scaler = RobustScaler(quantile_range=(0, 95.0))
    if scaler is not None:
        X = scaler.fit_transform(X)

    # Optional PCA. The exact type check is deliberate: False is a bool, and bool
    # is a subclass of int, so isinstance() would wrongly enable PCA for False.
    pca = None
    if type(n_pca) == int:
        pca = PCA(n_components=n_pca)
        X = pca.fit_transform(X)

    # NOTE(review): eval() executes model_name as Python code so dotted names such
    # as 'neighbors.KNeighborsRegressor' resolve against the notebook namespace.
    # Only ever pass trusted, hard-coded names here.
    model = eval(model_name + "()")

    # Apply the optional hyper-parameters.
    if params is not None:
        model.set_params(**params)

    model.fit(X, y)
    return scaler, pca, model


def backtest_regression(model_name, fecha_inicio, fecha_fin, data, cutoff, retrain_days, t_scaler, n_pca, params=None):
    """Walk-forward backtest of a regression model, one day at a time.

    The model is first fitted on every row whose 'ds' is earlier than
    `fecha_inicio - cutoff` days. Then, for each day from `fecha_inicio` to
    `fecha_fin` (inclusive), it is re-fitted when that day's weekday is in
    `retrain_days` (using rows earlier than `day - cutoff` days) and used to
    predict every row of that day.

    Parameters
    ----------
    model_name : str
        Expression naming the estimator class; it is instantiated with
        ``eval(model_name + "()")``.
    fecha_inicio, fecha_fin : datetime-like
        First and last day of the backtest. They must support ``.weekday()``,
        adding/subtracting ``timedelta`` and comparison against ``data.ds``.
    data : pandas.DataFrame
        Must contain a 'ds' timestamp column. Once 'ds' is dropped, the first
        remaining column is the target and the others are the features.
    cutoff : int
        Gap, in days, kept between the end of the training data and the day
        being predicted.
    retrain_days : container of int
        Weekday numbers (as returned by ``.weekday()``) on which to re-fit.
    t_scaler : str or other
        'MinMaxScaler', 'StandardScaler' or 'RobustScaler'; any other value
        disables scaling.
    n_pca : int or other
        Number of PCA components; any non-int value (e.g. False) disables PCA.
    params : dict, optional
        Keyword parameters forwarded to ``model.set_params``.

    Returns
    -------
    pandas.DataFrame
        Columns 'fechas' (timestamps), 'val_reales' (actual values) and
        'val_pred' (predictions), one row per predicted observation.
    """
    one_day = timedelta(days=1)

    # Initial fit.
    mask = (data.ds < (fecha_inicio - timedelta(days=cutoff)))
    train = data.loc[mask].drop(columns=['ds'])
    scaler, pca, model = _fit_backtest_pipeline(train, model_name, t_scaler, n_pca, params)

    val_reales = np.array([])
    val_pred = np.array([])
    val_fechas = np.array([])

    pbar = tqdm(total=(fecha_fin - fecha_inicio).days + 1)
    try:
        fecha_index = fecha_inicio
        while fecha_index <= fecha_fin:
            if fecha_index.weekday() in retrain_days:
                # Re-fit on everything available up to the cutoff before this day.
                mask = (data.ds < (fecha_index - timedelta(days=cutoff)))
                train = data.loc[mask].drop(columns=['ds'])
                scaler, pca, model = _fit_backtest_pipeline(train, model_name, t_scaler, n_pca, params)

            # Half-open one-day window [fecha_index, fecha_index + 1 day). The
            # previous closed window ending at +23h skipped every sub-hourly row
            # after 23:00 (e.g. the 23:30 bar of 30-minute data).
            next_day = fecha_index + one_day
            mask = (data.ds >= fecha_index) & (data.ds < next_day)
            to_pred = data.loc[mask]
            if not to_pred.empty:
                # Read the timestamps by name instead of relying on 'ds' being
                # the first column of the frame.
                val_fechas = np.append(val_fechas, to_pred['ds'].tolist())
                to_pred = to_pred.drop(columns=['ds'])

                # Split into input features and output target.
                X, y = to_pred.values[:, 1:], to_pred.values[:, 0].astype('float64')

                # Re-use the transformers fitted on the training data.
                if scaler is not None:
                    X = scaler.transform(X)
                if pca is not None:
                    X = pca.transform(X)

                y_hat = model.predict(X)

                val_reales = np.append(val_reales, y)
                val_pred = np.append(val_pred, y_hat)

            fecha_index = next_day
            pbar.update(1)
    finally:
        # Always release the progress bar, even when fitting/predicting fails.
        pbar.close()

    resultados = pd.DataFrame({'fechas': val_fechas, 'val_reales': val_reales, 'val_pred': val_pred})
    return resultados

In [None]:
# Experiments to run. Each entry maps an output label to a config dict:
#   * the FIRST key is the estimator expression handed to backtest_regression
#     (which instantiates it), and its value is the dict of parameters passed to
#     set_params, or None to keep the estimator's defaults;
#   * 'Scaler' is the scaler name ('MinMaxScaler', 'StandardScaler',
#     'RobustScaler') or False for no scaling;
#   * 'PCA' is the number of PCA components (int) or False for no PCA.
# The run loop reads the estimator from the first key, so it must stay first.
models_dict = {
    # 'XGBoost0':{'xgb.XGBRegressor':{'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    # 'XGBoost1':{'xgb.XGBRegressor':{'Objective':'reg:squaredlogerror',
    #                                'n_estimator':100,
    #                                # 'eval_metric':'mape',
    #                                'booster':'gbtree',
    #                                'verbosity':0,
    #                                # 'disable_default_eval_metric':'false',
    #                                'learning_rate':0.2,
    #                                'max_depth':7,
    #                                'min_child_weight':2,
    #                                'sampling_method':'gradient_based',
    #                                'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    #'ElasticNet':{'ElasticNet':{'alpha':1.0, 'l1_ratio':0.5}},
    #'ElasticNet_2':{'ElasticNet':None},
    #'Lasso':{'Lasso':None},
    # 'RandomForest':{'RandomForestRegressor':{'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    # 'RandomForest_2':{'RandomForestRegressor':{'n_estimators': 300, 'min_samples_split': 10,
    #                                         'min_samples_leaf': 1, 'max_features': 'sqrt','max_depth': 40,
    #                                       'bootstrap': False, 'n_jobs': 10}, 'Scaler':False, 'PCA':False},
    #                'AdaBoost':{'AdaBoostRegressor':None},
    # 'SVR0':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':5},
    'SVR1':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'SVR2':{'SVR':None, 'Scaler':'StandardScaler', 'PCA':10},
    'HuberRegressor':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':0.0001}, 'Scaler':False, 'PCA':False},
    'HuberRegressor1':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    'HuberRegressor2':{'HuberRegressor':None, 'Scaler':False, 'PCA':9},
    # 'HuberRegressor2':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':1e-10}, 'Scaler':False, 'PCA':9},
    # 'HuberRegressor3':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'HuberRegresso4':{'HuberRegressor':{'epsilon':1.0, 'max_iter':200*20, 'alpha':1e-10}, 'Scaler':'StandardScaler', 'PCA':9},
    # 'HuberRegressor5':{'HuberRegressor':None, 'Scaler':'StandardScaler', 'PCA':9},
    # 'SVR2':{'SVR':None, 'Scaler':'RobustScaler', 'PCA':6},
    #'KNR':{'neighbors.KNeighborsRegressor':None},
    #'BaggingRegressor':{'BaggingRegressor':None},
    #'GradientBoosting':{'GradientBoostingRegressor':None},
}

# Run one backtest per entry of `models_dict`, print its error metrics and save
# the predictions to a CSV file.
# NOTE(review): this cell relies on names that are not defined in the visible
# part of the notebook (NOM_EXP, PATH, inicial_date, end_date, data_c, cutoff,
# retrain_days, mean_absolute_porcentage_error, root_mean_square_error) - make
# sure they exist before running it on a fresh kernel.
df_resultados = pd.DataFrame()

for output_name_model, config in models_dict.items():
    try:
        # The first key of the config is the estimator expression; its value
        # holds the estimator parameters (or None).
        name_model = list(config)[0]
        params = config.get(name_model)
        # Entries without explicit 'Scaler' / 'PCA' keys run without those steps
        # instead of failing with a KeyError.
        scaler = config.get('Scaler', False)
        pca = config.get('PCA', False)

        start_time = datetime.now()
        print("\n\n--->", output_name_model, '<--- hora inicio:', start_time)
        print("Parametros: ", params, " scaler: ", scaler, " pca: ", pca)
        nombre = output_name_model + '_' + NOM_EXP + '_' + start_time.strftime('%Y_%m_%d-%H_%M') + '.csv'
        print(nombre)
        resultados = backtest_regression(name_model, inicial_date, end_date, data_c, cutoff, retrain_days,
                                         scaler, pca, params)

        # Save first, so a failing metric below cannot discard a finished backtest.
        resultados.to_csv(PATH + nombre, index=False, header=True)

        print("mean absolute porcentage error:", mean_absolute_porcentage_error(resultados))
        # NOTE(review): the `mean_absolute_error` imported from sklearn.metrics
        # expects (y_true, y_pred); this one-argument call only works if a helper
        # with the same name is defined elsewhere in the notebook - confirm.
        print("mean absolute error:", mean_absolute_error(resultados))
        print("root mean square error:", root_mean_square_error(resultados))
        print("tiempo de ejecucion: ", datetime.now() - start_time)

    except Exception as ex:
        # Best effort: report the failure and continue with the next model.
        print(":::::::::::  Error en " + output_name_model, " ::: ", ex)