In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

start_date = '2010-01-01'
end_date = '2019-12-31'

## Chargement des Données


In [2]:
# Charger les données
ebay_df = pd.read_csv('../data/merged_data/EBAY.csv')
ebay_df = ebay_df[(ebay_df['Date'] >= start_date) & (ebay_df['Date'] <= end_date)]

# Convertir la colonne 'Date' en datetime et la définir comme index
ebay_df['Date'] = pd.to_datetime(ebay_df['Date'])
ebay_df.set_index('Date', inplace=True)

print(ebay_df.shape)

(2516, 58)


In [3]:
# Afficher les premières lignes
ebay_df.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,stock_reddit_neg,stock_reddit_neu,stock_reddit_pos,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,0.983682,0.972329,0.969038,0.973833,0.973351,0.843496,0.985551,0.983504,0.981922,0.978125,...,1.604342,2.054419,20.040001,0.055,,,,,,
2010-01-05,0.998224,0.985285,0.979239,0.983856,0.984194,0.854856,0.996702,0.994839,0.993329,0.989319,...,0.311568,0.03579,19.35,0.06,,,,0.019667,0.938121,0.042211
2010-01-06,1.007277,0.993958,0.986298,0.990085,0.991047,0.862534,1.002507,1.001006,0.999713,0.995957,...,0.054552,1.591991,19.16,0.045,,,,0.016661,0.867867,0.115472
2010-01-07,1.020319,1.007777,0.999161,1.001507,1.00325,0.874843,1.011585,1.010696,1.009822,1.006975,...,0.40012,-0.246505,19.059999,0.045,,,,0.024607,0.879734,0.095659
2010-01-08,1.00621,0.997533,0.988941,0.989885,0.992437,0.866723,0.999621,0.998865,0.998089,0.995354,...,0.288173,0.450091,18.129999,0.04,,,,0.059265,0.906462,0.034273


In [4]:
INDICATORS = [
    'stock_SMA_10', 'stock_SMA_15', 'stock_SMA_20', 'stock_SMA_50',
    'stock_SMA_100', 'stock_SMA_200', 'stock_EMA_10', 'stock_EMA_12',
    'stock_EMA_14', 'stock_EMA_26', 'stock_EMA_30', 'stock_EMA_50',
    'stock_EMA_100', 'stock_ADX_14', 'stock_ADX_14_neg', 'stock_ADX_14_pos',
    'stock_ADX_20', 'stock_ADX_20_neg', 'stock_ADX_20_pos', 'stock_ADX_25',
    'stock_ADX_25_neg', 'stock_ADX_25_pos', 'stock_ADX_30',
    'stock_ADX_30_neg', 'stock_ADX_30_pos', 'stock_ATR_14', 'stock_ATR_20',
    'stock_ATR_28', 'stock_RSI_7', 'stock_RSI_14', 'stock_RSI_21',
    'stock_Stoch_14', 'stock_Stoch_14_signal', 'stock_Stoch_21',
    'stock_Stoch_21_signal', 'stock_Stoch_28', 'stock_Stoch_28_signal',
    'stock_CMF_14', 'stock_CMF_20', 'stock_CMF_28', 'stock_VROC_7',
    'stock_VROC_14', 'stock_VROC_21', 'stock_VROC_28'
]

print(len(INDICATORS))

44


In [5]:
# Afficher les statistiques descriptives des indicateurs
ebay_df[INDICATORS].describe()

Unnamed: 0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,stock_Stoch_21_signal,stock_Stoch_28,stock_Stoch_28_signal,stock_CMF_14,stock_CMF_20,stock_CMF_28,stock_VROC_7,stock_VROC_14,stock_VROC_21,stock_VROC_28
count,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,...,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0
mean,0.99833,0.997355,0.996371,0.990446,0.979683,0.954701,0.998311,0.997924,0.997535,0.995164,...,54.455065,55.069771,55.063123,0.016114,0.015653,0.015484,0.169889,0.176499,0.170427,0.162093
std,0.028107,0.034721,0.03993,0.060691,0.079075,0.101644,0.023539,0.026043,0.028279,0.038447,...,27.815368,29.149687,27.936042,0.168749,0.142715,0.122835,0.759882,0.810442,0.82362,0.803365
min,0.875543,0.848016,0.825517,0.789064,0.758487,0.735855,0.886369,0.877253,0.869803,0.841344,...,1.604939,0.0,1.432497,-0.414617,-0.358564,-0.301133,-0.884136,-0.892026,-0.905211,-0.929906
25%,0.982204,0.976379,0.971752,0.950291,0.926987,0.881352,0.984685,0.982559,0.980652,0.970637,...,28.601885,29.580704,29.953366,-0.109803,-0.086347,-0.071768,-0.28693,-0.293252,-0.3014,-0.31059
50%,0.997701,0.995857,0.994971,0.986995,0.974474,0.941972,0.997422,0.996957,0.996345,0.99461,...,56.681239,56.527895,57.350407,0.013233,0.016852,0.014964,-0.006633,-0.022291,-0.012426,-0.031853
75%,1.013807,1.016704,1.019375,1.027825,1.028656,1.017283,1.011033,1.011946,1.012946,1.017731,...,80.062991,81.814072,80.990279,0.128431,0.111244,0.095635,0.397623,0.400925,0.402171,0.388747
max,1.176957,1.199406,1.200742,1.220164,1.284075,1.397539,1.146124,1.156096,1.163084,1.176782,...,99.008787,100.0,99.095908,0.531284,0.512178,0.390804,7.488302,9.550685,14.012035,9.284693


In [6]:
# Afficher les statistiques descriptives des autres colonnes
ebay_df.drop(INDICATORS, axis=1).describe()

Unnamed: 0,stock_target,news_neg,news_neu,news_pos,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,stock_reddit_neg,stock_reddit_neu,stock_reddit_pos,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
count,2516.0,582.0,582.0,582.0,2512.0,2512.0,2512.0,2512.0,2215.0,2215.0,2215.0,2240.0,2240.0,2240.0
mean,0.066899,0.078552,0.795839,0.125608,0.046836,0.017741,16.861692,0.554658,0.09372,0.850306,0.055884,0.048496,0.879064,0.071646
std,1.785427,0.138744,0.202417,0.1658,0.931054,0.995375,5.634105,0.776166,0.05916,0.059049,0.024726,0.0457,0.060771,0.04548
min,-12.452691,0.010012,0.022354,0.008707,-6.663446,-9.353766,9.14,0.003,0.009169,0.07693,0.01877,0.008277,0.034308,0.015812
25%,-0.817519,0.021461,0.759642,0.044543,-0.326374,-0.456111,13.04,0.035,0.053709,0.822874,0.042122,0.027601,0.864915,0.049709
50%,0.054721,0.031933,0.88339,0.063467,0.060024,0.018708,15.475,0.1175,0.079832,0.862577,0.049753,0.034882,0.896217,0.060088
75%,0.972312,0.059712,0.914864,0.108502,0.50572,0.535977,18.9,0.985,0.120906,0.890271,0.061095,0.052194,0.911803,0.0772
max,13.919872,0.963946,0.94585,0.905597,4.959374,4.710198,48.0,2.408,0.89948,0.939969,0.428514,0.849241,0.948691,0.946339


## Gestion des Valeurs Manquantes


In [7]:
# print le nombre de valeurs manquantes
missing_values = ebay_df.isnull().sum()
print("Valeurs manquantes avant imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes avant imputation:
 news_neg             1934
news_neu             1934
news_pos             1934
sp500_return_pct        4
gold_return_pct         4
vix_close               4
bond_yields_close       4
stock_reddit_neg      301
stock_reddit_neu      301
stock_reddit_pos      301
sector_reddit_neg     276
sector_reddit_neu     276
sector_reddit_pos     276
dtype: int64


In [8]:
# Imputation avec la médiane
ebay_df_imputed_median = ebay_df.fillna(ebay_df.median())

missing_values = ebay_df_imputed_median.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [9]:
# Imputation avec la moyenne
ebay_df_imputed_mean = ebay_df.fillna(ebay_df.mean())

missing_values = ebay_df_imputed_mean.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [10]:
# Imputation avec une interpolation linéaire
ebay_df_imputed_interpolate = ebay_df.interpolate(method='linear')

missing_values = ebay_df_imputed_interpolate.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 news_neg             11
news_neu             11
news_pos             11
stock_reddit_neg      8
stock_reddit_neu      8
stock_reddit_pos      8
sector_reddit_neg     1
sector_reddit_neu     1
sector_reddit_pos     1
dtype: int64


In [11]:
# Imputation avec forward fill
ebay_df_imputed_ffill = ebay_df.ffill()

missing_values = ebay_df_imputed_ffill.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 news_neg             11
news_neu             11
news_pos             11
stock_reddit_neg      8
stock_reddit_neu      8
stock_reddit_pos      8
sector_reddit_neg     1
sector_reddit_neu     1
sector_reddit_pos     1
dtype: int64


In [12]:
def impute_missing_values(df, method='ffill'):
    if method == 'median':
        return df.fillna(df.median())
    elif method == 'mean':
        return df.fillna(df.mean())
    elif method == 'interpolate':
        return df.interpolate(method='linear')
    elif method == 'ffill':
        return df.ffill()
    
    return df

## Analyse de la Variable Cible (stock_target)


In [13]:
target = ebay_df['stock_target']
target = target.apply(lambda x: 1 if x > 0 else 0)

target_counts = target.value_counts()
print(target_counts)


stock_target
1    1296
0    1220
Name: count, dtype: int64


## Préparation des Données pour le Modèle


In [14]:
# créer des variables lags pour les indicateurs
def create_lag_variables(data, features, lags=[1, 2, 3, 4, 5, 6, 7]):
    df = data.copy()
    lagged_columns = {}

    for feature in features:
        for lag in lags:
            lagged_columns[f'{feature}_lag_{lag}'] = df[feature].shift(lag)
    
    lagged_df = pd.DataFrame(lagged_columns, index=df.index)
    df = pd.concat([df, lagged_df], axis=1)
    
    return df

ebay_df_lagged = create_lag_variables(ebay_df, ebay_df.columns)
ebay_df_lagged = impute_missing_values(ebay_df_lagged, method='ffill')

In [15]:
# Calculer la corrélation entre les features et la cible
correlations = ebay_df_lagged.corr()['stock_target'].sort_values(ascending=False)
print(correlations)

stock_target              1.000000
gold_return_pct_lag_4     0.057844
stock_EMA_10              0.054415
stock_EMA_12              0.054230
stock_EMA_14              0.054046
                            ...   
stock_Stoch_21           -0.048092
stock_Stoch_28           -0.049257
stock_VROC_28_lag_6      -0.051308
stock_Stoch_14           -0.051409
stock_reddit_neu_lag_2   -0.060317
Name: stock_target, Length: 464, dtype: float64


In [16]:
ebay_df_lagged.shape

(2516, 464)

## Feature Selection avec Lasso


In [17]:
data = ebay_df_lagged.copy()
data = data.dropna()

In [18]:
X = data.drop(['stock_target'], axis=1)
y = data['stock_target']

In [19]:
# Standardiser les features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [20]:
X_scaled.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,sector_reddit_neu_lag_5,sector_reddit_neu_lag_6,sector_reddit_neu_lag_7,sector_reddit_pos_lag_1,sector_reddit_pos_lag_2,sector_reddit_pos_lag_3,sector_reddit_pos_lag_4,sector_reddit_pos_lag_5,sector_reddit_pos_lag_6,sector_reddit_pos_lag_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-29,0.607947,0.499494,0.513539,0.378527,0.54461,-0.400524,0.84909,0.770851,0.712428,0.548583,...,0.830969,0.830465,-0.170947,0.763595,-1.017985,-1.017656,-1.017081,-1.016576,-1.015856,0.811924
2010-02-01,0.481704,0.282506,0.329635,0.26393,0.468413,-0.440121,0.478958,0.450758,0.427517,0.3592,...,0.830969,0.830465,0.830837,0.763595,0.763715,-1.017656,-1.017081,-1.016576,-1.015856,-1.016302
2010-02-02,0.594203,0.364692,0.383613,0.326843,0.52059,-0.386739,0.541656,0.521448,0.502622,0.436457,...,0.830969,0.830465,0.830837,-0.921081,0.763715,0.763787,-1.017081,-1.016576,-1.015856,-1.016302
2010-02-03,0.514272,0.180494,0.197801,0.220234,0.435953,-0.425454,0.229643,0.241646,0.247385,0.256463,...,0.830969,0.830465,0.830837,-0.921081,-0.920659,0.763787,0.763998,-1.016576,-1.015856,-1.016302
2010-02-04,1.498746,1.084795,1.011186,0.788233,0.874009,-0.096273,1.411625,1.34867,1.293843,1.085759,...,-0.052461,0.830465,0.830837,-0.921081,-0.920659,-0.920344,0.763998,0.764156,-1.015856,-1.016302


In [21]:
# Appliquer LASO pour la sélection des features
from sklearn.linear_model import Lasso

In [22]:
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X, y)

# nombre de features sélectionnées
selected_features = X.columns[lasso.coef_ != 0]
print(f'Nombre de features sélectionnées: {len(selected_features)}')

Nombre de features sélectionnées: 96


In [23]:
indic_coef = dict(zip(X.columns, lasso.coef_))
non_zero_coef = {k: v for k, v in indic_coef.items() if v != 0}

In [24]:
# non_zero_coef

In [25]:
# Sélectionner les features sélectionnées
ebay_df_selected = data[selected_features.append(pd.Index(['stock_target']))]

In [26]:
ebay_df_selected.head()

Unnamed: 0_level_0,stock_ADX_14,stock_ADX_14_neg,stock_ADX_30_pos,stock_RSI_7,stock_Stoch_14,stock_Stoch_21,stock_Stoch_28,stock_VROC_14,sp500_return_pct,gold_return_pct,...,sp500_return_pct_lag_3,sp500_return_pct_lag_4,gold_return_pct_lag_2,gold_return_pct_lag_4,gold_return_pct_lag_7,vix_close_lag_4,vix_close_lag_5,vix_close_lag_6,vix_close_lag_7,stock_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-29,11.290092,19.879898,20.405439,42.944722,36.885239,36.885239,36.885239,0.759836,-0.982917,-0.055369,...,-0.42032,0.45981,-1.22962,0.550863,-2.404133,25.41,27.309999,22.27,18.68,0.651617
2010-02-01,10.697937,18.986014,19.931036,45.737934,43.032872,43.032872,43.032872,0.436222,1.426611,1.966764,...,0.488015,-0.42032,-0.073778,0.246537,-0.863085,24.549999,25.41,27.309999,22.27,-0.388435
2010-02-02,10.14808,18.231918,19.52983,44.222473,39.344329,39.344329,39.344329,0.104128,1.297295,1.18627,...,-1.181774,0.488015,-0.055369,-1.22962,-1.224268,23.139999,24.549999,25.41,27.309999,0.649904
2010-02-03,9.718219,17.848539,19.112422,47.598448,45.491776,45.491776,45.491776,0.388296,-0.547431,-0.536961,...,-0.982917,-1.181774,1.966764,-0.073778,0.550863,23.73,23.139999,24.549999,25.41,-3.357722
2010-02-04,10.215153,21.000173,18.276067,34.814693,13.524625,13.524625,13.524625,0.107708,-3.114068,-4.408854,...,1.426611,-0.982917,1.18627,-0.055369,0.246537,24.620001,23.73,23.139999,24.549999,1.158128


## Model 1: XGBOOST


In [27]:
data = ebay_df_selected.copy()

In [28]:
# !pip install xgboost
# !pip uninstall -y scikit-learn
!pip install "scikit-learn==1.5.2"



In [29]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Grid Search pour trouver les meilleurs hyperparamètres qui maximisent la métrique F1
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier

# Pour les métriques financières
import scipy.stats as stats

In [30]:
def get_rolling_train_test_data(data= data, start_year = '2010', train_window=5, test_window=1):
    df = data.copy()
    df.reset_index(inplace=True)
    df['Date'] = pd.to_datetime(df['Date'])
    
    # split data into train and test
    train = df[(df['Date'].dt.year >= int(start_year)) & (df['Date'].dt.year < int(start_year) + train_window)]
    test = df[(df['Date'].dt.year >= int(start_year) + train_window) & (df['Date'].dt.year < int(start_year) + train_window + test_window)]

    X_train = train.drop(columns=['Date', 'stock_target']).values
    y_train_return = train['stock_target'].values
    # y_train = np.where(y_train > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    X_test = test.drop(columns=['Date', 'stock_target']).values
    y_test_return = test['stock_target'].values
    # y_test = np.where(y_test > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    print(f'X_train from {train["Date"].dt.date.values[0]} to {train["Date"].dt.date.values[-1]}')
    print(f'X_test from {test["Date"].dt.date.values[0]} to {test["Date"].dt.date.values[-1]}')
    
    return X_train, y_train_return, X_test, y_test_return

In [31]:
def lasso_feature_selection(df, alpha=0.01):
    data = df.copy()
    X = data.drop('stock_target', axis=1)
    y = data['stock_target']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X, y)
    selected_features = X.columns[lasso.coef_ != 0]
    data_selected = data[selected_features.append(pd.Index(['stock_target']))]
    print(f'Nombre de features sélectionnées: {len(selected_features)}')
    return data_selected

def xgboost_grid_search(X_train, y_train, params, num_boost_round=300):
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    f1_scorer = make_scorer(f1_score, average='binary')
    recall_scorer = make_scorer(recall_score)
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=params,
        scoring=f1_scorer,
        cv=3,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_, grid_search.best_score_


def grid_search_best_params(df, params, model_grid_search):
    data = df.copy()
    grid_search_params = params.copy()
    grid_search_params.pop('nan_strategy')
    grid_search_params.pop('lasso_alpha')

    best_params = {}
    score = 0
    for nan_strategy in params['nan_strategy']:
        current_params = {'nan_strategy': nan_strategy}
        data_lagged = create_lag_variables(data, data.columns)
        data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
        data_lagged = data_lagged.dropna()
        for lasso_alpha in params['lasso_alpha']:
            current_params['lasso_alpha'] = lasso_alpha

            # Lasso feature selection
            data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

            X = data_selected.drop('stock_target', axis=1)
            y = data_selected['stock_target']

            # get rolling train test data
            X_train, y_train_return, _, _ = get_rolling_train_test_data(
                data_selected,
                start_year='2010',
                train_window=5,
                test_window=1)
            y_train = np.where(y_train_return > 0, 1, 0)

            # grid search
            best_params_, best_score_ = model_grid_search(X_train, y_train, grid_search_params)
            if best_score_ > score:
                best_params = current_params
                best_params.update(best_params_)
                score = best_score_
            
    return best_params, score

In [32]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'max_depth': [3, 4],           # Profondeur maximale de l'arbre
    'learning_rate': [0.02, 0.01],  # Taux d'apprentissage (eta)
    'n_estimators': [100],  # Nombre d'arbres (boost rounds)
    'subsample': [0.5, 0.6],     # Fraction des données pour chaque arbre
    'colsample_bytree': [0.5]  # Fraction des colonnes pour chaque arbre
}

best_params, best_score = grid_search_best_params(data, param_grid, xgboost_grid_search)

  model = cd_fast.enet_coordinate_descent(
Parameters: { "use_label_encoder" } are not used.



Nombre de features sélectionnées: 273
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 37
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 264
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 33
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 383
X_train from 2010-02-09 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 88
X_train from 2010-02-09 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 383
X_train from 2010-02-09 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 88
X_train from 2010-02-09 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [33]:
best_params, best_score

({'nan_strategy': 'median',
  'lasso_alpha': 0.1,
  'colsample_bytree': 0.5,
  'learning_rate': 0.01,
  'max_depth': 3,
  'n_estimators': 100,
  'subsample': 0.5},
 0.5958752526926344)

In [34]:
# Entraîner le modèle avec les meilleurs hyperparamètres
data = ebay_df.copy()

# Créer des variables lags
data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=best_params['nan_strategy'])
data_lagged = data_lagged.dropna()

# Feature selection avec Lasso
data_selected = lasso_feature_selection(data_lagged, alpha=best_params['lasso_alpha'])

# Séparer les features et la cible
X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

Nombre de features sélectionnées: 24


In [39]:
# 2010
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = xgb_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.48


Parameters: { "use_label_encoder" } are not used.



## Model 2: Random Forest


In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def random_forest_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    rf_model = RandomForestClassifier(random_state=42)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Vous pouvez également définir d'autres métriques, par exemple :
    # recall_scorer = make_scorer(recall_score, average='binary')

    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        scoring=f1_scorer,
        cv=3,         # Vous pouvez augmenter le nombre de folds (k-fold cross validation)
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_

In [41]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'n_estimators': [100, 200], # Nombre d'arbres
    'max_depth': [None, 5, 10], # Profondeur maximale de l'arbre
    'min_samples_split': [2, 5], # Nombre minimum d'échantillons pour diviser un nœud
    'min_samples_leaf': [1, 2] # Nombre minimum d'échantillons requis à chaque feuille
}

best_params, best_score = grid_search_best_params(data, param_grid, random_forest_grid_search)
best_params, best_score

Nombre de features sélectionnées: 103
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 23
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 105
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 96
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for e

({'nan_strategy': 'median',
  'lasso_alpha': 0.1,
  'max_depth': None,
  'min_samples_leaf': 1,
  'min_samples_split': 5,
  'n_estimators': 100},
 0.591045234457281)

In [46]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
rf_model = RandomForestClassifier(random_state=42, n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], min_samples_leaf=best_params['min_samples_leaf'])

rf_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = rf_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.536


## Model 3: Regression Logistique


In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def logistic_regression_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    # Note : pour utiliser la pénalisation L1, vous devez choisir un solver qui la supporte
    # comme 'liblinear' ou 'saga'.
    log_reg_model = LogisticRegression(random_state=42, max_iter=10000)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Exemples d'autres métriques possibles :
    # recall_scorer = make_scorer(recall_score, average='binary')
    # accuracy_scorer = 'accuracy'
    
    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=log_reg_model,
        param_grid=param_grid,
        scoring=f1_scorer,  # Vous pouvez mettre 'accuracy', recall_scorer, etc.
        cv=3,               # Nombre de folds pour la cross-validation
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_


In [48]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'] 
}

best_params, best_score = grid_search_best_params(data, param_grid, logistic_regression_grid_search)
best_params, best_score

Nombre de features sélectionnées: 103
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 23
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 105
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 96
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-29 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for e

({'nan_strategy': 'median',
  'lasso_alpha': 0.1,
  'C': 0.1,
  'penalty': 'l2',
  'solver': 'saga'},
 0.5814077941779171)

In [53]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
log_reg_model = LogisticRegression(random_state=42, penalty=best_params['penalty'], C=best_params['C'], solver=best_params['solver'], max_iter=10000)

log_reg_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = log_reg_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.483


## Model 4: DNN


In [54]:
# !pip install --upgrade pip
# !pip install tensorflow-macos
# !pip install tensorflow-metal
# !pip install scikit-learn

In [55]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [110]:
nan_strategy = 'ffill'
lasso_alpha = 0.01

data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
data_lagged = data_lagged.dropna()

data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year='2014', train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

Nombre de features sélectionnées: 96
X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31


In [111]:
# Créer le modèle
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')  # pour classification binaire
])

# Compiler le modèle
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # metrics accuracy, recall, precision, f1-score
    metrics=[keras.metrics.Recall(name='accuracy'), keras.metrics.Recall(name='recall'), keras.metrics.BinaryAccuracy(name='f1_score')]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [112]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
    verbose=1
)

Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.5346 - f1_score: 0.5060 - loss: 29.6663 - recall: 0.5346 - val_accuracy: 0.8321 - val_f1_score: 0.5397 - val_loss: 3.7695 - val_recall: 0.8321
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5684 - f1_score: 0.5375 - loss: 42.9519 - recall: 0.5684 - val_accuracy: 0.1679 - val_f1_score: 0.5040 - val_loss: 8.5853 - val_recall: 0.1679
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5239 - f1_score: 0.4924 - loss: 45.4549 - recall: 0.5239 - val_accuracy: 0.0876 - val_f1_score: 0.4841 - val_loss: 9.3548 - val_recall: 0.0876
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4938 - f1_score: 0.5054 - loss: 35.7810 - recall: 0.4938 - val_accuracy: 0.1241 - val_f1_score: 0.5000 - val_loss: 4.7122 - val_recall: 0.1241
Epoch 5/10
[1m40/40[0m [32m━━

In [113]:
# Prédire les valeurs sur l'ensemble de test
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
F1 Score: 0.366
