In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

start_date = '2010-01-01'
end_date = '2019-12-31'

## Chargement des Données


In [2]:
# Charger les données
azn_df = pd.read_csv('../data/merged_data/AZN.csv')
azn_df = azn_df[(azn_df['Date'] >= start_date) & (azn_df['Date'] <= end_date)]

# Convertir la colonne 'Date' en datetime et la définir comme index
azn_df['Date'] = pd.to_datetime(azn_df['Date'])
azn_df.set_index('Date', inplace=True)

print(azn_df.shape)

(2516, 55)


In [3]:
# Afficher les premières lignes
azn_df.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,news_neg,news_neu,news_pos,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,0.978907,0.97504,0.972611,0.96386,0.959357,0.905915,0.982968,0.980982,0.979311,0.972745,...,,,,1.604342,2.054419,20.040001,0.055,0.239686,0.706169,0.054145
2010-01-05,1.001098,0.996297,0.992874,0.983974,0.979251,0.926098,1.002687,1.001063,0.999611,0.993379,...,,,,0.311568,0.03579,19.35,0.06,0.142433,0.80461,0.052957
2010-01-06,1.01059,1.004731,1.000792,0.992266,0.986986,0.934902,1.008786,1.007701,1.006619,1.001255,...,,,,0.054552,1.591991,19.16,0.045,0.199394,0.730939,0.069668
2010-01-07,1.000837,0.994633,0.990994,0.982151,0.97652,0.926291,0.998328,0.997363,0.996372,0.99121,...,0.026727,0.919697,0.053576,0.40012,-0.246505,19.059999,0.045,0.123578,0.822368,0.054053
2010-01-08,0.99829,0.991278,0.988286,0.978204,0.972674,0.923738,0.99514,0.994161,0.993164,0.987938,...,,,,0.288173,0.450091,18.129999,0.04,0.112057,0.804381,0.083562


In [4]:
INDICATORS = [
    'stock_SMA_10', 'stock_SMA_15', 'stock_SMA_20', 'stock_SMA_50',
    'stock_SMA_100', 'stock_SMA_200', 'stock_EMA_10', 'stock_EMA_12',
    'stock_EMA_14', 'stock_EMA_26', 'stock_EMA_30', 'stock_EMA_50',
    'stock_EMA_100', 'stock_ADX_14', 'stock_ADX_14_neg', 'stock_ADX_14_pos',
    'stock_ADX_20', 'stock_ADX_20_neg', 'stock_ADX_20_pos', 'stock_ADX_25',
    'stock_ADX_25_neg', 'stock_ADX_25_pos', 'stock_ADX_30',
    'stock_ADX_30_neg', 'stock_ADX_30_pos', 'stock_ATR_14', 'stock_ATR_20',
    'stock_ATR_28', 'stock_RSI_7', 'stock_RSI_14', 'stock_RSI_21',
    'stock_Stoch_14', 'stock_Stoch_14_signal', 'stock_Stoch_21',
    'stock_Stoch_21_signal', 'stock_Stoch_28', 'stock_Stoch_28_signal',
    'stock_CMF_14', 'stock_CMF_20', 'stock_CMF_28', 'stock_VROC_7',
    'stock_VROC_14', 'stock_VROC_21', 'stock_VROC_28'
]

print(len(INDICATORS))

44


In [5]:
# Afficher les statistiques descriptives des indicateurs
azn_df[INDICATORS].describe()

Unnamed: 0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,stock_Stoch_21_signal,stock_Stoch_28,stock_Stoch_28_signal,stock_CMF_14,stock_CMF_20,stock_CMF_28,stock_VROC_7,stock_VROC_14,stock_VROC_21,stock_VROC_28
count,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,...,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0
mean,0.999055,0.998515,0.997974,0.994596,0.988776,0.977424,0.99905,0.998834,0.998616,0.997285,...,54.382151,54.715641,54.722912,0.067789,0.068105,0.06771,0.215397,0.239952,0.270762,0.281004
std,0.023031,0.028567,0.032912,0.049146,0.060806,0.07251,0.019234,0.021305,0.023145,0.031321,...,29.439334,30.725798,29.53971,0.202878,0.17606,0.151061,0.934657,0.972575,1.251976,1.134685
min,0.870549,0.850002,0.83572,0.824018,0.790804,0.714095,0.889232,0.882422,0.877292,0.851702,...,1.563984,0.0,1.33542,-0.599236,-0.46259,-0.360398,-0.882247,-0.928983,-0.924494,-0.960206
25%,0.985978,0.981741,0.977984,0.962577,0.946722,0.926772,0.987935,0.986333,0.985285,0.977854,...,27.285021,27.550227,27.450265,-0.073103,-0.050336,-0.035473,-0.321027,-0.340903,-0.326953,-0.344423
50%,0.998442,0.997662,0.997573,0.99225,0.985899,0.978686,0.998412,0.998294,0.998036,0.996223,...,56.928034,57.190275,57.440262,0.070946,0.070132,0.070644,-0.016184,-0.008046,-0.003828,-0.00686
75%,1.011809,1.015567,1.018049,1.025153,1.028889,1.027705,1.009969,1.011352,1.01237,1.016586,...,82.334888,83.071351,82.566648,0.206306,0.185469,0.165856,0.469359,0.508191,0.488911,0.527818
max,1.152181,1.151154,1.159795,1.198589,1.212743,1.189087,1.138325,1.143009,1.146705,1.158279,...,99.098891,100.0,99.25885,0.79161,0.703882,0.590226,12.975607,11.16566,26.790792,13.013314


In [6]:
# Afficher les statistiques descriptives des autres colonnes
azn_df.drop(INDICATORS, axis=1).describe()

Unnamed: 0,stock_target,news_neg,news_neu,news_pos,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
count,2516.0,739.0,739.0,739.0,2512.0,2512.0,2512.0,2512.0,2516.0,2516.0,2516.0
mean,0.040025,0.069771,0.868806,0.061423,0.046836,0.017741,16.861692,0.554658,0.131712,0.782792,0.085258
std,1.416213,0.107181,0.124693,0.076547,0.931054,0.995375,5.634105,0.776166,0.063496,0.068973,0.038096
min,-14.908661,0.010037,0.029431,0.012495,-6.663446,-9.353766,9.14,0.003,0.025058,0.462875,0.03092
25%,-0.660797,0.02785,0.882078,0.034401,-0.326374,-0.456111,13.04,0.035,0.083071,0.739104,0.057807
50%,0.040877,0.042738,0.908122,0.042641,0.060024,0.018708,15.475,0.1175,0.122378,0.791866,0.073022
75%,0.752142,0.065433,0.919767,0.056999,0.50572,0.535977,18.9,0.985,0.170053,0.833663,0.103731
max,12.161372,0.950888,0.945476,0.882347,4.959374,4.710198,48.0,2.408,0.403484,0.921713,0.27529


## Gestion des Valeurs Manquantes


In [7]:
# print le nombre de valeurs manquantes
missing_values = azn_df.isnull().sum()
print("Valeurs manquantes avant imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes avant imputation:
 news_neg             1777
news_neu             1777
news_pos             1777
sp500_return_pct        4
gold_return_pct         4
vix_close               4
bond_yields_close       4
dtype: int64


In [8]:
# Imputation avec la médiane
azn_df_imputed_median = azn_df.fillna(azn_df.median())

missing_values = azn_df_imputed_median.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [9]:
# Imputation avec la moyenne
azn_df_imputed_mean = azn_df.fillna(azn_df.mean())

missing_values = azn_df_imputed_mean.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [10]:
# Imputation avec une interpolation linéaire
azn_df_imputed_interpolate = azn_df.interpolate(method='linear')

missing_values = azn_df_imputed_interpolate.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 news_neg    3
news_neu    3
news_pos    3
dtype: int64


In [11]:
# Imputation avec forward fill
azn_df_imputed_ffill = azn_df.ffill()

missing_values = azn_df_imputed_ffill.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 news_neg    3
news_neu    3
news_pos    3
dtype: int64


In [12]:
def impute_missing_values(df, method='ffill'):
    if method == 'median':
        return df.fillna(df.median())
    elif method == 'mean':
        return df.fillna(df.mean())
    elif method == 'interpolate':
        return df.interpolate(method='linear')
    elif method == 'ffill':
        return df.ffill()
    
    return df

## Analyse de la Variable Cible (stock_target)


In [13]:
target = azn_df['stock_target']
target = target.apply(lambda x: 1 if x > 0 else 0)

target_counts = target.value_counts()
print(target_counts)


stock_target
1    1294
0    1222
Name: count, dtype: int64


## Préparation des Données pour le Modèle


In [14]:
# créer des variables lags pour les indicateurs
def create_lag_variables(data, features, lags=[1, 2, 3, 4, 5, 6, 7]):
    df = data.copy()
    lagged_columns = {}

    for feature in features:
        for lag in lags:
            lagged_columns[f'{feature}_lag_{lag}'] = df[feature].shift(lag)
    
    lagged_df = pd.DataFrame(lagged_columns, index=df.index)
    df = pd.concat([df, lagged_df], axis=1)
    
    return df

azn_df_lagged = create_lag_variables(azn_df, azn_df.columns)
azn_df_lagged = impute_missing_values(azn_df_lagged, method='ffill')

In [15]:
# Calculer la corrélation entre les features et la cible
correlations = azn_df_lagged.corr()['stock_target'].sort_values(ascending=False)
print(correlations)

stock_target              1.000000
stock_EMA_100             0.056944
stock_SMA_100             0.056583
stock_SMA_200             0.056017
stock_EMA_100_lag_6       0.053926
                            ...   
stock_ADX_20_pos_lag_7   -0.051905
stock_ADX_30_pos_lag_6   -0.052081
stock_ADX_25_pos_lag_7   -0.052661
stock_ADX_30_pos_lag_7   -0.053018
stock_VROC_21_lag_5      -0.056531
Name: stock_target, Length: 440, dtype: float64


In [16]:
azn_df_lagged.shape

(2516, 440)

## Feature Selection avec Lasso


In [17]:
data = azn_df_lagged.copy()
data = data.dropna()

In [18]:
X = data.drop(['stock_target'], axis=1)
y = data['stock_target']

In [19]:
# Standardiser les features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [20]:
X_scaled.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,sector_reddit_neu_lag_5,sector_reddit_neu_lag_6,sector_reddit_neu_lag_7,sector_reddit_pos_lag_1,sector_reddit_pos_lag_2,sector_reddit_pos_lag_3,sector_reddit_pos_lag_4,sector_reddit_pos_lag_5,sector_reddit_pos_lag_6,sector_reddit_pos_lag_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-19,-2.195365,-1.950403,-1.92487,-1.580449,-1.36263,-1.542106,-2.145687,-2.126951,-2.103316,-1.956262,...,0.511895,0.311903,0.572709,3.241149,0.165537,1.424094,0.092658,-0.299126,-0.044918,-0.819254
2010-01-20,-1.699615,-1.652951,-1.6654,-1.459824,-1.287527,-1.471189,-1.576556,-1.633247,-1.666429,-1.689386,...,0.996398,0.511418,0.31201,-0.920531,3.241023,0.164723,1.426032,0.092479,-0.298729,-0.044763
2010-01-21,-0.841983,-1.072335,-1.157356,-1.172429,-1.079119,-1.29106,-0.732269,-0.863773,-0.957632,-1.186117,...,-1.236832,0.996069,0.511537,0.640417,-0.920826,3.239096,0.16563,1.425988,0.09298,-0.298549
2010-01-22,-0.04029,-0.505973,-0.659335,-0.879126,-0.856308,-1.098208,0.001338,-0.172758,-0.305817,-0.691147,...,-1.225332,-1.237843,0.996217,1.829658,0.640185,-0.921247,3.242521,0.165458,1.426839,0.093122
2010-01-25,-0.348232,-0.818773,-0.923893,-1.092437,-1.039818,-1.232606,-0.52311,-0.633565,-0.722777,-0.995175,...,-1.917005,-1.226339,-1.237829,2.73459,1.829474,0.639199,-0.921229,3.24266,0.165978,1.426854


In [21]:
# Appliquer LASO pour la sélection des features
from sklearn.linear_model import Lasso

In [22]:
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X, y)

# nombre de features sélectionnées
selected_features = X.columns[lasso.coef_ != 0]
print(f'Nombre de features sélectionnées: {len(selected_features)}')

Nombre de features sélectionnées: 98


In [23]:
indic_coef = dict(zip(X.columns, lasso.coef_))
non_zero_coef = {k: v for k, v in indic_coef.items() if v != 0}

In [24]:
# non_zero_coef

In [25]:
# Sélectionner les features sélectionnées
azn_df_selected = data[selected_features.append(pd.Index(['stock_target']))]

In [26]:
azn_df_selected.head()

Unnamed: 0_level_0,stock_RSI_7,stock_RSI_14,stock_Stoch_14,stock_Stoch_21,stock_Stoch_28,stock_VROC_14,stock_VROC_28,vix_close,stock_ADX_14_lag_1,stock_ADX_14_lag_3,...,gold_return_pct_lag_5,gold_return_pct_lag_6,gold_return_pct_lag_7,vix_close_lag_2,vix_close_lag_3,vix_close_lag_4,vix_close_lag_5,vix_close_lag_6,bond_yields_close_lag_7,stock_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-19,78.908876,72.616719,97.773344,97.920664,97.966787,3.446149,1.673185,17.58,15.237649,12.329139,...,1.098225,0.450091,-0.246505,17.629999,17.85,18.25,17.549999,18.129999,0.045,-0.416672
2010-01-20,75.159694,70.562408,93.522272,93.950853,94.085033,3.284679,0.266107,18.68,17.052429,14.247458,...,-1.894493,1.098225,0.450091,17.91,17.629999,17.85,18.25,17.549999,0.04,-1.315003
2010-01-21,64.008668,64.396527,77.192949,78.00748,78.649597,2.555955,0.601946,22.27,18.696738,15.237649,...,0.664364,-1.894493,1.098225,17.58,17.91,17.629999,17.85,18.25,0.025,-1.393092
2010-01-22,54.200569,58.628592,63.742686,63.742686,66.05838,1.148925,-0.283228,27.309999,20.487379,17.052429,...,0.545578,0.664364,-1.894493,18.68,17.58,17.91,17.629999,17.85,0.04,1.269449
2010-01-25,60.539275,61.928384,75.828432,75.828432,77.37223,0.030773,0.121576,25.41,21.17679,18.696738,...,-1.093996,0.545578,0.664364,22.27,18.68,17.58,17.91,17.629999,0.05,1.233321


## Model 1: XGBOOST


In [27]:
data = azn_df_selected.copy()

In [28]:
# !pip install xgboost
# !pip uninstall -y scikit-learn
!pip install "scikit-learn==1.5.2"



In [29]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Grid Search pour trouver les meilleurs hyperparamètres qui maximisent la métrique F1
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier

# Pour les métriques financières
import scipy.stats as stats

In [30]:
def get_rolling_train_test_data(data= data, start_year = '2010', train_window=5, test_window=1):
    df = data.copy()
    df.reset_index(inplace=True)
    df['Date'] = pd.to_datetime(df['Date'])
    
    # split data into train and test
    train = df[(df['Date'].dt.year >= int(start_year)) & (df['Date'].dt.year < int(start_year) + train_window)]
    test = df[(df['Date'].dt.year >= int(start_year) + train_window) & (df['Date'].dt.year < int(start_year) + train_window + test_window)]

    X_train = train.drop(columns=['Date', 'stock_target']).values
    y_train_return = train['stock_target'].values
    # y_train = np.where(y_train > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    X_test = test.drop(columns=['Date', 'stock_target']).values
    y_test_return = test['stock_target'].values
    # y_test = np.where(y_test > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    print(f'X_train from {train["Date"].dt.date.values[0]} to {train["Date"].dt.date.values[-1]}')
    print(f'X_test from {test["Date"].dt.date.values[0]} to {test["Date"].dt.date.values[-1]}')
    
    return X_train, y_train_return, X_test, y_test_return

In [31]:
def lasso_feature_selection(df, alpha=0.01):
    data = df.copy()
    X = data.drop('stock_target', axis=1)
    y = data['stock_target']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X, y)
    selected_features = X.columns[lasso.coef_ != 0]
    data_selected = data[selected_features.append(pd.Index(['stock_target']))]
    print(f'Nombre de features sélectionnées: {len(selected_features)}')
    return data_selected

def xgboost_grid_search(X_train, y_train, params, num_boost_round=300):
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    f1_scorer = make_scorer(f1_score, average='binary')
    recall_scorer = make_scorer(recall_score)
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=params,
        scoring=f1_scorer,
        cv=3,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_, grid_search.best_score_


def grid_search_best_params(df, params, model_grid_search):
    data = df.copy()
    grid_search_params = params.copy()
    grid_search_params.pop('nan_strategy')
    grid_search_params.pop('lasso_alpha')

    best_params = {}
    score = 0
    for nan_strategy in params['nan_strategy']:
        current_params = {'nan_strategy': nan_strategy}
        data_lagged = create_lag_variables(data, data.columns)
        data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
        data_lagged = data_lagged.dropna()
        for lasso_alpha in params['lasso_alpha']:
            current_params['lasso_alpha'] = lasso_alpha

            # Lasso feature selection
            data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

            X = data_selected.drop('stock_target', axis=1)
            y = data_selected['stock_target']

            # get rolling train test data
            X_train, y_train_return, _, _ = get_rolling_train_test_data(
                data_selected,
                start_year='2010',
                train_window=5,
                test_window=1)
            y_train = np.where(y_train_return > 0, 1, 0)

            # grid search
            best_params_, best_score_ = model_grid_search(X_train, y_train, grid_search_params)
            if best_score_ > score:
                best_params = current_params
                best_params.update(best_params_)
                score = best_score_
            
    return best_params, score

In [32]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'max_depth': [3, 4],           # Profondeur maximale de l'arbre
    'learning_rate': [0.02, 0.01],  # Taux d'apprentissage (eta)
    'n_estimators': [100],  # Nombre d'arbres (boost rounds)
    'subsample': [0.5, 0.6],     # Fraction des données pour chaque arbre
    'colsample_bytree': [0.5]  # Fraction des colonnes pour chaque arbre
}

best_params, best_score = grid_search_best_params(data, param_grid, xgboost_grid_search)

  model = cd_fast.enet_coordinate_descent(
Parameters: { "use_label_encoder" } are not used.



Nombre de features sélectionnées: 221
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 45
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 222
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 45
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 473
X_train from 2010-01-28 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 100
X_train from 2010-01-28 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 473
X_train from 2010-01-28 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 100
X_train from 2010-01-28 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [33]:
best_params, best_score

({'nan_strategy': 'interpolate',
  'lasso_alpha': 0.1,
  'colsample_bytree': 0.5,
  'learning_rate': 0.01,
  'max_depth': 3,
  'n_estimators': 100,
  'subsample': 0.5},
 0.6060182658055525)

In [34]:
# Entraîner le modèle avec les meilleurs hyperparamètres
data = azn_df.copy()

# Créer des variables lags
data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=best_params['nan_strategy'])
data_lagged = data_lagged.dropna()

# Feature selection avec Lasso
data_selected = lasso_feature_selection(data_lagged, alpha=best_params['lasso_alpha'])

# Séparer les features et la cible
X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

Nombre de features sélectionnées: 21


In [40]:
# 2010
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = xgb_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.548


Parameters: { "use_label_encoder" } are not used.



## Model 2: Random Forest


In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def random_forest_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    rf_model = RandomForestClassifier(random_state=42)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Vous pouvez également définir d'autres métriques, par exemple :
    # recall_scorer = make_scorer(recall_score, average='binary')

    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        scoring=f1_scorer,
        cv=3,         # Vous pouvez augmenter le nombre de folds (k-fold cross validation)
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_

In [42]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'n_estimators': [100, 200], # Nombre d'arbres
    'max_depth': [None, 5, 10], # Profondeur maximale de l'arbre
    'min_samples_split': [2, 5], # Nombre minimum d'échantillons pour diviser un nœud
    'min_samples_leaf': [1, 2] # Nombre minimum d'échantillons requis à chaque feuille
}

best_params, best_score = grid_search_best_params(data, param_grid, random_forest_grid_search)
best_params, best_score

  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 103
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 22
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits


  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 100
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 22
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 98
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 98
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for ea

({'nan_strategy': 'mean',
  'lasso_alpha': 0.1,
  'max_depth': 5,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 200},
 0.5817023273352168)

In [47]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
rf_model = RandomForestClassifier(random_state=42, n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], min_samples_leaf=best_params['min_samples_leaf'])

rf_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = rf_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.5


## Model 3: Regression Logistique


In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def logistic_regression_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    # Note : pour utiliser la pénalisation L1, vous devez choisir un solver qui la supporte
    # comme 'liblinear' ou 'saga'.
    log_reg_model = LogisticRegression(random_state=42, max_iter=10000)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Exemples d'autres métriques possibles :
    # recall_scorer = make_scorer(recall_score, average='binary')
    # accuracy_scorer = 'accuracy'
    
    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=log_reg_model,
        param_grid=param_grid,
        scoring=f1_scorer,  # Vous pouvez mettre 'accuracy', recall_scorer, etc.
        cv=3,               # Nombre de folds pour la cross-validation
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_


In [49]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'] 
}

best_params, best_score = grid_search_best_params(data, param_grid, logistic_regression_grid_search)
best_params, best_score

  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 103
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 22
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits


  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 100
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 22
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 98
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 98
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 21
X_train from 2010-01-19 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for ea

({'nan_strategy': 'mean',
  'lasso_alpha': 0.1,
  'C': 0.01,
  'penalty': 'l1',
  'solver': 'liblinear'},
 0.5977179187503338)

In [54]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
log_reg_model = LogisticRegression(random_state=42, penalty=best_params['penalty'], C=best_params['C'], solver=best_params['solver'], max_iter=10000)

log_reg_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = log_reg_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.534


## Model 4: DNN


In [55]:
# !pip install --upgrade pip
# !pip install tensorflow-macos
# !pip install tensorflow-metal
# !pip install scikit-learn

In [56]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [138]:
nan_strategy = 'ffill'
lasso_alpha = 0.01

data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
data_lagged = data_lagged.dropna()

data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year='2014', train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

Nombre de features sélectionnées: 98
X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31


In [139]:
# Créer le modèle
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')  # pour classification binaire
])

# Compiler le modèle
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # metrics accuracy, recall, precision, f1-score
    metrics=[keras.metrics.Recall(name='accuracy'), keras.metrics.BinaryAccuracy(name='f1_score')]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [140]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
    verbose=1
)

Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.5599 - f1_score: 0.4938 - loss: 35.7928 - val_accuracy: 1.0000 - val_f1_score: 0.5476 - val_loss: 21.4896
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6200 - f1_score: 0.5240 - loss: 50.5306 - val_accuracy: 0.0000e+00 - val_f1_score: 0.4524 - val_loss: 12.5517
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4991 - f1_score: 0.4952 - loss: 53.4954 - val_accuracy: 0.8406 - val_f1_score: 0.5516 - val_loss: 3.1648
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5165 - f1_score: 0.4795 - loss: 54.5681 - val_accuracy: 0.0000e+00 - val_f1_score: 0.4524 - val_loss: 9.0730
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4840 - f1_score: 0.4896 - loss: 53.3517 - val_accuracy: 0.6377 - val_f1_sco

In [142]:
# Prédire les valeurs sur l'ensemble de test
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 
F1 Score: 0.384
