In [9]:
import pandas as pd
import numpy as np
import sqlalchemy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


from data import db
from ml.processing import var_to_str
from ml.labels import apply_triple_barrier
from ml.analytics import exponential_decay, return_attribution

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:

SYMBOL = "ETH"
start = "2020"
end = "2024"

DB = db.connect_db("database", interval="1h")
data = DB.get_data(SYMBOL)
data = data.loc[start:end]


def var_to_str(data):
    data.columns = [str(col) if isinstance(col, sqlalchemy.sql.elements.quoted_name) else col for col in data.columns]

var_to_str(data)

### Triple Barrier

In [11]:
# Calcul de la volatilité quotidienne avec une fenêtre glissante de 24 heures
daily_vol = data['close'].pct_change().rolling(window=24).std()

events = pd.DataFrame(index=data.index)
events['t1'] = events.index + pd.Timedelta(days=24)  # Expiration après 24 heure
events['trgt'] = daily_vol
events



Unnamed: 0_level_0,t1,trgt
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,2020-01-25 00:00:00,
2020-01-01 01:00:00,2020-01-25 01:00:00,
2020-01-01 02:00:00,2020-01-25 02:00:00,
2020-01-01 03:00:00,2020-01-25 03:00:00,
2020-01-01 04:00:00,2020-01-25 04:00:00,
...,...,...
2023-12-31 19:00:00,2024-01-24 19:00:00,0.002875
2023-12-31 20:00:00,2024-01-24 20:00:00,0.003074
2023-12-31 21:00:00,2024-01-24 21:00:00,0.003059
2023-12-31 22:00:00,2024-01-24 22:00:00,0.002911


In [12]:
labels = apply_triple_barrier(data['close'], events, profit_mult=2, loss_mult=2, time_barrier=10)
labels['label'].value_counts()

  out.at[ix, 'touch_time'] = first_touch


label
 1    18105
-1    16883
 0       44
Name: count, dtype: int64

## ML features

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


data['Y'] = labels['label']


class ReturnFeature(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['return'] = X['close'].pct_change()
        X.columns = X.columns.astype(str)
        return X

class SMACalculator(BaseEstimator, TransformerMixin):
    def __init__(self, window):
        self.window = window

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X[f'SMA_{self.window}'] = X['close'].rolling(window=self.window).mean()
        X.columns = X.columns.astype(str)
        return X

class RSICalculator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        delta = X['close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        X['RSI'] = 100 - (100 / (1 + rs))
        X.columns = X.columns.astype(str)
        return X

class MACDCalculator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        exp1 = X['close'].ewm(span=12, adjust=False).mean()
        exp2 = X['close'].ewm(span=26, adjust=False).mean()
        X['MACD'] = exp1 - exp2
        X['Signal_line'] = X['MACD'].ewm(span=9, adjust=False).mean()
        X.columns = X.columns.astype(str)
        return X

class HighLowRangeCalculator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['High_Low_Range'] = X['high'] - X['low']
        X.columns = X.columns.astype(str)
        return X

class OBVCalculator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        obv = (np.sign(X['close'].diff()) * X['volume']).fillna(0).cumsum()
        X['OBV'] = obv
        X.columns = X.columns.astype(str)
        return X

class DropNaTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X.dropna(inplace=True)
        X.columns = X.columns.astype(str)
        return X


In [22]:

# Définition du pipeline de caractéristiques sans le modèle
feature_pipeline = Pipeline([
    ('return_feature', ReturnFeature()),
    ('sma_10', SMACalculator(window=10)),
    ('sma_20', SMACalculator(window=20)),
    ('rsi', RSICalculator()),
    ('macd', MACDCalculator()),
    ('high_low_range', HighLowRangeCalculator()),
    ('obv', OBVCalculator()),
    ('dropna', DropNaTransformer())  # Pour supprimer les valeurs manquantes
])

feature_pipeline


In [23]:
feature_pipeline.fit_transform(data)

Unnamed: 0_level_0,open,high,low,close,volume,Y,return,SMA_10,SMA_20,RSI,MACD,Signal_line,High_Low_Range,OBV
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-02 14:00:00,129.34,129.86,129.21,129.37,5538.92471,-1,0.000155,129.643,130.1240,37.403101,-0.520919,-0.523181,0.65,-5.774460e+04
2020-01-02 15:00:00,129.37,129.80,129.36,129.59,5352.62034,-1,0.001701,129.639,130.0105,44.238683,-0.497638,-0.518072,0.44,-5.239198e+04
2020-01-02 16:00:00,129.58,129.78,126.94,127.60,32996.89479,-1,-0.015356,129.457,129.7815,33.489097,-0.632473,-0.540952,2.84,-8.538888e+04
2020-01-02 17:00:00,127.60,127.68,126.38,127.40,40429.91193,-1,-0.001567,129.271,129.5625,35.833333,-0.746859,-0.582134,1.30,-1.258188e+05
2020-01-02 18:00:00,127.42,127.75,127.13,127.49,7360.65888,-1,0.000706,128.999,129.4235,31.737589,-0.820788,-0.629864,0.62,-1.184581e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-31 19:00:00,2303.20,2304.34,2291.14,2294.83,6990.57320,-1,-0.003630,2302.427,2296.6340,60.815576,0.199039,-0.238609,13.20,2.245593e+06
2023-12-31 20:00:00,2294.83,2294.83,2280.28,2282.40,12876.96600,0,-0.005417,2299.670,2295.7025,48.523035,-1.187807,-0.428449,14.55,2.232716e+06
2023-12-31 21:00:00,2282.40,2292.75,2280.11,2283.20,7392.86670,0,0.000351,2297.130,2295.3725,40.981824,-2.197014,-0.782162,12.64,2.240109e+06
2023-12-31 22:00:00,2283.21,2293.99,2258.88,2274.77,18374.14730,0,-0.003692,2294.327,2294.8295,36.174893,-3.635144,-1.352758,35.11,2.221735e+06


In [15]:
feature_pipeline

## Fonction de Traitement en Temps Reel

In [25]:
def train_model(data, pipeline):
    X = data.drop(columns=['Y'])
    y = data['Y']
    
    X_transformed = pipeline.fit_transform(X)
    y = y.loc[X_transformed.index]
    
    model = RandomForestClassifier()
    model.fit(X_transformed, y)
    
    return model, pipeline


model, pipeline = train_model(data, feature_pipeline)

In [None]:
import joblib

# Exporter le pipeline et le modèle
joblib.dump(pipeline, 'feature_pipeline.pkl')
joblib.dump(model, 'random_forest_model.pkl')

## Model

In [None]:
data.dropna(inplace=True)

# Supposons que vous avez déjà préparé un DataFrame `labels` avec vos étiquettes
features = data[['return', 'SMA_10', 'SMA_20', 'RSI', 'MACD', 'Signal_line', 'High_Low_Range', 'OBV']]
labels = data['Y']

# Séparation en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Evaluation

In [None]:

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

# Prédiction et évaluation
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


In [None]:
# Doit etre interpreter : IMPORTANT
return_attribution(data, model, list(features.columns))

# Backtest

In [None]:
X_test

In [None]:
backtest = X_test.copy()
backtest['pred'] = y_test

backtest['cum_ret'] = (1 + backtest['return']).cumprod()
backtest['strategy'] = backtest['return'] * backtest['pred']
backtest['strategy_cum_ret'] = (1 + backtest['strategy']).cumprod()


In [None]:
backtest['pred'].plot()

In [None]:
backtest[['cum_ret', 'strategy_cum_ret']].plot(figsize = (15, 10))

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def simulate_bet_sizes(num_bets, true_prob):
    np.random.seed(42)
    outcomes = np.random.rand(num_bets) < true_prob
    bet_sizes = np.random.rand(num_bets)
    returns = bet_sizes * outcomes - bet_sizes * (1 - outcomes)
    return returns

# Paramètres de la simulation
num_bets = 1000
true_prob = 0.55  # Probabilité que le pari soit favorable

# Exécuter la simulation
returns = simulate_bet_sizes(num_bets, true_prob)

# Calculer le rendement total et afficher les résultats
total_return = np.sum(returns)
print(f"Total Return: {total_return}")

# Afficher l'histogramme des retours
plt.hist(returns, bins=30, alpha=0.75)
plt.title('Histogram of Returns from Bets')
plt.xlabel('Return')
plt.ylabel('Frequency')
plt.show()


In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture
import matplotlib.pyplot as plt

# Exemple de données
data = np.random.randn(1000)

# Ajustement du mélange de deux Gaussiennes
gmm = GaussianMixture(n_components=2)
gmm.fit(data.reshape(-1, 1))

# Estimation des responsabilités
responsibilities = gmm.predict_proba(data.reshape(-1, 1))

# Calcul de la taille du pari
bet_size = responsibilities[:, 1]  # Utiliser la deuxième composante comme exemple

# Tracer les données et la taille des paris
plt.hist(data, bins=30, density=True, alpha=0.5, color='gray')
x = np.linspace(min(data), max(data), 1000)
logprob = gmm.score_samples(x.reshape(-1, 1))
pdf = np.exp(logprob)
plt.plot(x, pdf, '-r')
plt.show()
