In [1]:
import pandas as pd
import numpy as np
import sqlalchemy

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from data import db


SYMBOL = "ETH"
start = "2020"
end = "2024"

DB = db.connect_db("database", interval="1h")
data = DB.get_data(SYMBOL)
data = data.loc[start:end]

# Convert all column names to plain Python strings
data.columns = [str(col) if isinstance(col, sqlalchemy.sql.elements.quoted_name) else col for col in data.columns]
print("Column names and types after conversion:")
for col in data.columns:
    print(f"Column: {col}, Type: {type(col)}")

Column names and types after conversion:
Column: open, Type: <class 'str'>
Column: high, Type: <class 'str'>
Column: low, Type: <class 'str'>
Column: close, Type: <class 'str'>
Column: volume, Type: <class 'str'>


In [2]:
def apply_triple_barrier(prices, events, profit_mult, loss_mult, time_barrier):
    """
    prices: Series des prix
    events: DataFrame avec les colonnes 't1' pour les horizons temporels et 'trgt' pour les seuils
    profit_mult, loss_mult: Multiplicateurs pour définir les seuils de profit et de perte
    time_barrier: Nombre de jours pour la barrière verticale
    """
    # Stocker les temps de toucher pour chaque barrière
    out = events[['t1']].copy(deep=True)
    out['touch_time'] = np.nan
    out['label'] = 0  # Par défaut, la barrière de temps est atteinte sans toucher les barrières de profit/perte
    
    for ix, event in events.iterrows():
        start_price = prices.loc[ix]
        end_date = event['t1']
        target = event['trgt']
        
        if pd.isna(start_price) or pd.isna(target):
            continue
        
        # Définir les seuils
        upper_barrier = start_price * (1 + profit_mult * target)
        lower_barrier = start_price * (1 - loss_mult * target)
        
        # Filtrer les prix dans la fenêtre de temps
        price_sub = prices.loc[ix:end_date]
        
        # Vérifier le toucher des barrières supérieure et inférieure
        upper_touch = price_sub[price_sub >= upper_barrier].index.min()
        lower_touch = price_sub[price_sub <= lower_barrier].index.min()
        
        # Déterminer le premier toucher
        first_touch = min(filter(pd.notna, [upper_touch, lower_touch, end_date]))
        if pd.notna(first_touch):
            out.at[ix, 'touch_time'] = first_touch
            if first_touch == upper_touch:
                out.at[ix, 'label'] = 1
            elif first_touch == lower_touch:
                out.at[ix, 'label'] = -1

    return out


In [3]:
# Calcul de la volatilité quotidienne avec une fenêtre glissante de 10 jours
daily_vol = data['close'].pct_change().rolling(window=24).std()


events = pd.DataFrame(index=data.index)
events['t1'] = events.index + pd.Timedelta(days=10)  # Expiration après 10 jours
events['trgt'] = daily_vol
events

Unnamed: 0_level_0,t1,trgt
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,2020-01-11 00:00:00,
2020-01-01 01:00:00,2020-01-11 01:00:00,
2020-01-01 02:00:00,2020-01-11 02:00:00,
2020-01-01 03:00:00,2020-01-11 03:00:00,
2020-01-01 04:00:00,2020-01-11 04:00:00,
...,...,...
2023-12-31 19:00:00,2024-01-10 19:00:00,0.002875
2023-12-31 20:00:00,2024-01-10 20:00:00,0.003074
2023-12-31 21:00:00,2024-01-10 21:00:00,0.003059
2023-12-31 22:00:00,2024-01-10 22:00:00,0.002911


In [4]:
labels = apply_triple_barrier(data['close'], events, profit_mult=2, loss_mult=2, time_barrier=10)
labels['label'].value_counts()

  out.at[ix, 'touch_time'] = first_touch


label
 1    18105
-1    16883
 0       44
Name: count, dtype: int64

In [5]:
data['Y'] = labels['label']

# Calcul des Moyennes Mobiles
data['SMA_10'] = data['close'].rolling(window=10).mean()
data['SMA_20'] = data['close'].rolling(window=20).mean()

# Calcul du RSI
delta = data['close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
data['RSI'] = 100 - (100 / (1 + rs))

# Calcul du MACD
exp1 = data['close'].ewm(span=12, adjust=False).mean()
exp2 = data['close'].ewm(span=26, adjust=False).mean()
data['MACD'] = exp1 - exp2
data['Signal_line'] = data['MACD'].ewm(span=9, adjust=False).mean()

# Calcul du High-Low Range
data['High_Low_Range'] = data['high'] - data['low']

# Calcul du Volume On-Balance (OBV)
obv = (np.sign(data['close'].diff()) * data['volume']).fillna(0).cumsum()
data['OBV'] = obv

In [6]:
import plotly.graph_objects as go

# Créer un graphique de lignes pour les prix de clôture
fig = go.Figure()
fig.add_trace(go.Scatter(x=data.index, y=data['close'], mode='lines', name='Prix de Clôture'))

# Ajouter des points pour les touches de barrière de profit et de perte
fig.add_trace(go.Scatter(
    x=labels[labels['label'] == 1].index,
    y=data['close'][labels[labels['label'] == 1].index],
    mode='markers', marker=dict(color='green', size=4),
    name='Profit (Label 1)'
))

fig.add_trace(go.Scatter(
    x=labels[labels['label'] == -1].index,
    y=data['close'][labels[labels['label'] == -1].index],
    mode='markers', marker=dict(color='red', size=4),
    name='Perte (Label -1)'
))

# Ajouter des options de mise en forme
fig.update_layout(
    title='Application de la Méthode à Trois Barrières sur ETH',
    xaxis_title='Date',
    yaxis_title='Prix de Clôture',
    legend_title='Légende'
)

# Afficher le graphique
fig.show()


In [7]:
data.dropna(inplace=True)

# Supposons que vous avez déjà préparé un DataFrame `labels` avec vos étiquettes
features = data[['SMA_10', 'SMA_20', 'RSI', 'MACD', 'Signal_line', 'High_Low_Range', 'OBV']]
labels = data['Y']
# Séparation en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Normalisation des données
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
data.shape

(35013, 13)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = RandomForestClassifier()
model.fit(X_train_scaled, y_train)

# Prédiction et évaluation
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.8400685420534056
Classification Report:
              precision    recall  f1-score   support

          -1       0.84      0.84      0.84      3437
           0       0.00      0.00      0.00         5
           1       0.84      0.85      0.84      3561

    accuracy                           0.84      7003
   macro avg       0.56      0.56      0.56      7003
weighted avg       0.84      0.84      0.84      7003



In [11]:
y_pred_proba

array([[0.2 , 0.  , 0.8 ],
       [0.22, 0.  , 0.78],
       [0.08, 0.  , 0.92],
       ...,
       [0.82, 0.  , 0.18],
       [0.1 , 0.  , 0.9 ],
       [0.18, 0.  , 0.82]])