## Cargar librerías

In [None]:
import requests
import talib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import read_csv, set_option
from pandas.plotting import scatter_matrix
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC 
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

## Descargar los datos

In [None]:
def fetch_bitstamp_data(symbol, start, end, timeframe, limit=1000, timeout=30):
    """Download OHLC candles from the Bitstamp public API into a DataFrame.

    The range from ``start`` to ``end`` is walked in windows of
    ``timeframe * limit`` seconds, one HTTP request per window. Downloading
    is best-effort: on the first failed request the error is printed and
    whatever was collected so far is returned.

    Args:
        symbol: Market pair as it appears in the Bitstamp URL, e.g. ``'btcusd'``.
        start: Start of the range, Unix timestamp in seconds.
        end: End of the range, Unix timestamp in seconds.
        timeframe: Candle size in seconds (sent as the ``step`` parameter).
        limit: Maximum number of candles requested per call.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A DataFrame indexed by ``date`` (ascending, without duplicates) whose
        ``open``, ``high``, ``low``, ``close`` and ``volume`` columns are
        floats, or an empty DataFrame when nothing was downloaded.

    Raises:
        ValueError: If ``timeframe`` or ``limit`` is not positive; the window
            would never advance and the loop would not terminate.
    """
    if timeframe <= 0 or limit <= 0:
        raise ValueError("timeframe and limit must be positive")

    url = f"https://www.bitstamp.net/api/v2/ohlc/{symbol}/"
    data_frames = []

    while start < end:
        # Cap the window so a single request never asks for more than `limit` candles
        current_end = min(start + (timeframe * limit), end)
        params = {
            'start': int(start),
            'end': int(current_end),
            'step': timeframe,
            'limit': limit,
            'exclude_current_candle': False
        }
        try:
            # Without a timeout a stalled connection would block forever
            response = requests.get(url, params=params, timeout=timeout)
            if response.status_code != 200:
                raise RuntimeError(f"Failed to fetch data: {response.status_code}, {response.text}")
            chunk = pd.DataFrame(response.json()['data']['ohlc'])
        except (requests.RequestException, RuntimeError, KeyError, TypeError, ValueError) as e:
            # Best-effort download: report the problem and keep what we have
            print(e)
            break

        if not chunk.empty:
            data_frames.append(chunk)
        start = current_end

    if not data_frames:
        return pd.DataFrame()

    # Combine all windows into a single frame
    df = pd.concat(data_frames, ignore_index=True)
    # Convert to numbers first: parsing *strings* together with `unit` is
    # deprecated in pandas
    df['timestamp'] = pd.to_datetime(pd.to_numeric(df['timestamp']), unit='s')
    df = df.set_index('timestamp').sort_index()
    # Consecutive windows share their boundary timestamp, so the same candle
    # can be returned twice; keep a single copy
    df = df[~df.index.duplicated(keep='last')]
    df.index.name = 'date'
    return df.astype({
        'open': float,
        'high': float,
        'low': float,
        'close': float,
        'volume': float
    })

# Example usage: download BTC/USD candles with a 3600-second step,
# from 2011-01-01 up to the current moment
end_date = pd.Timestamp.now(tz='UTC').timestamp()
start_date = pd.Timestamp('2011-01-01').timestamp()

df = fetch_bitstamp_data(
    symbol='btcusd',
    start=start_date,
    end=end_date,
    timeframe=3600,
)

## Análisis exploratorio de datos

In [None]:
# Display the dimensions of the downloaded data as (rows, columns)
df.shape

In [None]:
# Display summary statistics of the downloaded data
df.describe()

## Ingeniería de características

In [None]:
# Build the technical indicators
close_prices = df['close']

# Simple moving averages (10 and 60 periods)
df['short_mavg'] = talib.SMA(real=close_prices, timeperiod=10)
df['long_mavg'] = talib.SMA(real=close_prices, timeperiod=60)

# Indicators computed from the close only; the column name is the prefix
# followed by the look-back period (e.g. 'ema10')
for prefix, indicator, periods in (
    ('ema', talib.EMA, (10, 30, 200)),
    ('roc', talib.ROC, (10, 30)),
    ('mom', talib.MOM, (10, 30)),
    ('rsi', talib.RSI, (10, 30, 200)),
):
    for period in periods:
        df[f'{prefix}{period}'] = indicator(real=close_prices, timeperiod=period)

# Stochastic oscillator: the two outputs of talib.STOCH are stored as
# 'k<period>' and 'd<period>', using the same period for all three windows
for period in (10, 30, 200):
    df[f'k{period}'], df[f'd{period}'] = talib.STOCH(
        high=df['high'],
        low=df['low'],
        close=close_prices,
        fastk_period=period,
        slowk_period=period,
        slowd_period=period,
    )

In [None]:
# Remove rows containing null values, if any, then report whether any remain
contains_nulls = df.isnull().values.any()
df = df.dropna() if contains_nulls else df
print(f'Null values = {df.isnull().values.any()}')

In [None]:
# Target signal: 1.0 while the short moving average is above the long one,
# 0.0 otherwise
short_above_long = df['short_mavg'] > df['long_mavg']
df['signal'] = short_above_long.astype(float)

In [None]:
# Drop the columns that are not used as prediction features
unused_columns = ['high', 'low', 'open', 'short_mavg', 'long_mavg']
df = df.drop(columns=unused_columns)

In [None]:
# Display the resulting feature table
df

## Visualización de datos

In [None]:
# Line chart of the closing price
close_frame = df[['close']]
close_frame.plot(grid=True)
plt.show()

In [None]:
# One histogram per column, each with its own axes
df.hist(
    figsize=(12, 12),
    sharex=False,
    sharey=False,
    xlabelsize=1,
    ylabelsize=1,
)
plt.show()

In [None]:
# Number of rows per signal value, as a horizontal bar chart
rows_per_signal = df.groupby(['signal']).size()
rows_per_signal.plot(kind='barh')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between columns
correlations = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(
    correlations,
    vmax=1,
    square=True,
    annot=True,
    cmap='cubehelix',
)
plt.show()

## Evaluar algoritmos y modelos

In [None]:
# Chronological train / validation split.
# The rows are consecutive candles of a time series, so the split must not
# shuffle them: a shuffled split mixes past and future rows between both sets
# (and differs on every run, since no random_state is given), and it leaves a
# validation set full of gaps, on which row-to-row returns are meaningless.
# With shuffle=False the final 20% of the rows form a contiguous hold-out set.
Y = df['signal']
X = df.loc[:, df.columns != 'signal']
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=0.2, shuffle=False
)

In [None]:
# Candidate models as (short name, estimator) pairs, all with default
# hyper-parameters. The order matters: the tuning step picks models[-1].
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier()),
    # ('SVM', SVC()) is currently disabled
    ('XGB', XGBClassifier()),
    ('LGBM', LGBMClassifier()),
    ('CAT', CatBoostClassifier()),
    ('SGD', SGDClassifier()),
    ('BAG', BaggingClassifier()),
    ('ET', ExtraTreesClassifier()),
]

In [None]:
# Cross-validation options
scoring = 'accuracy'  # metric passed to cross_val_score / GridSearchCV
num_folds = 10        # number of K-fold splits

In [None]:
# Cross-validate every candidate model on the training set and print the
# mean and standard deviation of its fold scores
results, names = [], []
kfold = KFold(n_splits=num_folds)
for label, estimator in models:
    fold_scores = cross_val_score(estimator, X_train, Y_train, cv=kfold, scoring=scoring)
    names.append(label)
    results.append(fold_scores)
    print(f'{label}: {fold_scores.mean()} ({fold_scores.std()})')


In [None]:
# Compare the algorithms: one box per model showing its fold scores
fig = plt.figure()
fig.set_size_inches(15, 8)
fig.suptitle('Comparación de algoritmos')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

## Tuning del modelo

In [None]:
# Standardise the features, fitting the scaler on the training set only
scaler = StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
# Hyper-parameters explored by the grid search
param_grid = {
    'n_estimators': [50, 100, 200],
    # 'auto' is no longer accepted by scikit-learn (deprecated in 1.1, removed
    # in 1.3) and for classifiers it was just an alias of 'sqrt'; every fit
    # using it fails. None means "consider all features".
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}
# Look the estimator up by name rather than by its position in the list, so
# adding or reordering candidate models cannot silently change what is tuned
model = dict(models)['ET']
kfold = KFold(n_splits=num_folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X_scaled, Y_train)

In [None]:
# Report the best configuration, then every evaluated combination with its rank
print(f'Best: {grid_result.best_score_} using {grid_result.best_params_}')
cv_summary = grid_result.cv_results_
means = cv_summary['mean_test_score']
stds = cv_summary['std_test_score']
params = cv_summary['params']
ranks = cv_summary['rank_test_score']
for position, mean_score, std_score, candidate in zip(ranks, means, stds, params):
    print(f'#{position} {mean_score} ({std_score}) with: {candidate}')

## Resultados del modelo tuneado

In [None]:
# Train an ExtraTrees model with the best hyper-parameters found by the grid
# search. `grid_result.best_params_` is the dict of the winning combination;
# `params` (cv_results_['params']) is the *list* of every candidate, so
# indexing it with a parameter name raises TypeError.
best_params = grid_result.best_params_
model = ExtraTreesClassifier(
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    criterion=best_params['criterion']
    )
model.fit(X_train, Y_train)

In [None]:
# Evaluate the model on the validation set: accuracy, confusion matrix and
# per-class report, printed in that order
y_pred = model.predict(X_validation)
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(Y_validation, y_pred))

In [None]:
# Confusion matrix as an annotated heatmap (rows: actual, columns: predicted)
class_labels = np.unique(Y_validation)
df_cm = pd.DataFrame(
    confusion_matrix(Y_validation, y_pred),
    index=class_labels,
    columns=class_labels,
).rename_axis(index='Actual', columns='Predicted')
sns.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={'size': 16})
plt.show()

## Intuición de variables / Importancia de características

In [None]:
# Feature importances of the fitted model, scaled by 100 and plotted from
# least to most important
importance_pct = model.feature_importances_ * 100
features_importance = pd.DataFrame({'Importance': importance_pct}, index=X.columns)
features_importance.sort_values(by='Importance', ascending=True).plot.barh()
plt.show()

## Backtesting results

In [None]:
# Strategy returns: the signal held at the close of the previous candle
# multiplied by the return of the following candle.
backtestdata = pd.DataFrame(index=X_validation.index)
backtestdata['signal_pred'] = y_pred  # positional: same row order as X_validation
backtestdata['signal_actual'] = Y_validation
backtestdata['close'] = X_validation['close']
# pct_change() and shift(1) work on row order, so the rows must be in
# chronological order before they are applied; the validation sample is not
# guaranteed to arrive sorted by date.
# NOTE: the returns are only true candle-to-candle returns when the
# validation rows are consecutive candles.
backtestdata = backtestdata.sort_index()
backtestdata['Market Returns'] = backtestdata.pop('close').pct_change()
backtestdata['Actual Returns'] = backtestdata['Market Returns'] * backtestdata['signal_actual'].shift(1)
backtestdata['Strategy Returns'] = backtestdata['Market Returns'] * backtestdata['signal_pred'].shift(1)
backtestdata

In [None]:
# Cumulative strategy vs. actual returns: first as a histogram, then as lines
cumulative_returns = backtestdata[['Strategy Returns', 'Actual Returns']].cumsum()
cumulative_returns.plot(kind='hist')
cumulative_returns.plot()
plt.show()