In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

start_date = '2010-01-01'
end_date = '2019-12-31'

## Chargement des Données


In [3]:
# Charger les données
aapl_df = pd.read_csv('../data/merged_data/AAPL.csv')
aapl_df = aapl_df[(aapl_df['Date'] >= start_date) & (aapl_df['Date'] <= end_date)]

# Convertir la colonne 'Date' en datetime et la définir comme index
aapl_df['Date'] = pd.to_datetime(aapl_df['Date'])
aapl_df.set_index('Date', inplace=True)

print(aapl_df.shape)

(2516, 58)


In [4]:
# Afficher les premières lignes
aapl_df.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,stock_reddit_neg,stock_reddit_neu,stock_reddit_pos,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-04,0.963623,0.945426,0.934846,0.933008,0.883653,0.755846,0.968436,0.96354,0.959512,0.945193,...,1.604342,2.054419,20.040001,0.055,0.037126,0.81974,0.143134,0.155312,0.796095,0.048593
2010-01-05,0.970799,0.949924,0.938145,0.932254,0.884416,0.757172,0.972808,0.967742,0.963475,0.947742,...,0.311568,0.03579,19.35,0.06,0.031367,0.923151,0.045481,0.033803,0.913047,0.05315
2010-01-06,0.99253,0.969699,0.958527,0.947989,0.900728,0.771859,0.990617,0.98594,0.981842,0.965797,...,0.054552,1.591991,19.16,0.045,0.035153,0.92126,0.043587,0.082909,0.865224,0.051867
2010-01-07,0.999221,0.97669,0.96522,0.950514,0.904477,0.77576,0.993824,0.989648,0.985839,0.969987,...,0.40012,-0.246505,19.059999,0.045,0.05241,0.899681,0.047909,0.046438,0.90869,0.044872
2010-01-08,0.997283,0.97557,0.96219,0.945615,0.900975,0.773125,0.989577,0.98571,0.982084,0.966279,...,0.288173,0.450091,18.129999,0.04,0.019935,0.904986,0.075079,0.084882,0.858233,0.056885


In [5]:
INDICATORS = [
    'stock_SMA_10', 'stock_SMA_15', 'stock_SMA_20', 'stock_SMA_50',
    'stock_SMA_100', 'stock_SMA_200', 'stock_EMA_10', 'stock_EMA_12',
    'stock_EMA_14', 'stock_EMA_26', 'stock_EMA_30', 'stock_EMA_50',
    'stock_EMA_100', 'stock_ADX_14', 'stock_ADX_14_neg', 'stock_ADX_14_pos',
    'stock_ADX_20', 'stock_ADX_20_neg', 'stock_ADX_20_pos', 'stock_ADX_25',
    'stock_ADX_25_neg', 'stock_ADX_25_pos', 'stock_ADX_30',
    'stock_ADX_30_neg', 'stock_ADX_30_pos', 'stock_ATR_14', 'stock_ATR_20',
    'stock_ATR_28', 'stock_RSI_7', 'stock_RSI_14', 'stock_RSI_21',
    'stock_Stoch_14', 'stock_Stoch_14_signal', 'stock_Stoch_21',
    'stock_Stoch_21_signal', 'stock_Stoch_28', 'stock_Stoch_28_signal',
    'stock_CMF_14', 'stock_CMF_20', 'stock_CMF_28', 'stock_VROC_7',
    'stock_VROC_14', 'stock_VROC_21', 'stock_VROC_28'
]

print(len(INDICATORS))

44


In [6]:
# Afficher les statistiques descriptives des indicateurs
aapl_df[INDICATORS].describe()

Unnamed: 0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,stock_Stoch_21_signal,stock_Stoch_28,stock_Stoch_28_signal,stock_CMF_14,stock_CMF_20,stock_CMF_28,stock_VROC_7,stock_VROC_14,stock_VROC_21,stock_VROC_28
count,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,...,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0
mean,0.996534,0.994587,0.992659,0.981622,0.964131,0.929688,0.996534,0.995768,0.995007,0.990525,...,61.552329,62.347958,62.344457,0.039995,0.038655,0.036773,0.111289,0.126378,0.127097,0.120952
std,0.027302,0.034098,0.039913,0.066427,0.094056,0.130278,0.023057,0.025725,0.028165,0.04002,...,29.081772,30.037666,29.044301,0.177126,0.149679,0.130196,0.576071,0.650056,0.673652,0.650932
min,0.909796,0.900574,0.898222,0.81426,0.731557,0.67105,0.926691,0.92159,0.917837,0.886927,...,1.253199,0.0,1.733466,-0.477755,-0.382779,-0.364944,-0.848471,-0.813931,-0.847502,-0.863293
25%,0.978533,0.971241,0.965414,0.936917,0.900727,0.838616,0.981375,0.978808,0.976258,0.963544,...,35.504822,38.262328,38.964877,-0.09142,-0.070655,-0.057001,-0.25574,-0.284082,-0.288909,-0.281894
50%,0.994572,0.991445,0.988382,0.974174,0.947454,0.895636,0.994204,0.993042,0.992013,0.985699,...,68.995207,69.290424,69.085304,0.034526,0.027419,0.030459,-0.012342,-0.015428,-0.021469,-0.032437
75%,1.011811,1.013812,1.014854,1.016012,1.004877,0.975676,1.009123,1.010031,1.010614,1.012547,...,88.26556,89.573893,89.411457,0.161169,0.148712,0.126036,0.337975,0.351918,0.341566,0.349253
max,1.119828,1.143148,1.158347,1.303117,1.420194,1.414387,1.109108,1.118108,1.12569,1.174955,...,99.165374,100.0,99.338644,0.55172,0.466915,0.372831,5.261044,9.692106,8.827986,6.595671


In [7]:
# Afficher les statistiques descriptives des autres colonnes
aapl_df.drop(INDICATORS, axis=1).describe()

Unnamed: 0,stock_target,news_neg,news_neu,news_pos,sp500_return_pct,gold_return_pct,vix_close,bond_yields_close,stock_reddit_neg,stock_reddit_neu,stock_reddit_pos,sector_reddit_neg,sector_reddit_neu,sector_reddit_pos
count,2516.0,2072.0,2072.0,2072.0,2512.0,2512.0,2512.0,2512.0,2516.0,2516.0,2516.0,2516.0,2516.0,2516.0
mean,0.104052,0.133296,0.754822,0.111882,0.046836,0.017741,16.861692,0.554658,0.067742,0.845922,0.085542,0.133893,0.788055,0.078052
std,1.623597,0.169353,0.187062,0.115457,0.931054,0.995375,5.634105,0.776166,0.041442,0.051882,0.035737,0.073635,0.075993,0.033939
min,-12.355785,0.009295,0.019036,0.007598,-6.663446,-9.353766,9.14,0.003,0.018676,0.591457,0.033084,0.018808,0.442899,0.026085
25%,-0.691267,0.029513,0.674559,0.048351,-0.326374,-0.456111,13.04,0.035,0.036994,0.816677,0.060604,0.077159,0.741247,0.053604
50%,0.089511,0.057951,0.824043,0.071023,0.060024,0.018708,15.475,0.1175,0.052721,0.856306,0.07566,0.12372,0.799101,0.068546
75%,0.979748,0.170293,0.892132,0.125862,0.50572,0.535977,18.9,0.985,0.090252,0.886902,0.100623,0.178913,0.844929,0.092884
max,8.874128,0.967384,0.946458,0.932956,4.959374,4.710198,48.0,2.408,0.277392,0.926306,0.33805,0.425087,0.924679,0.280869


## Gestion des Valeurs Manquantes


In [8]:
# print le nombre de valeurs manquantes
missing_values = aapl_df.isnull().sum()
print("Valeurs manquantes avant imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes avant imputation:
 news_neg             444
news_neu             444
news_pos             444
sp500_return_pct       4
gold_return_pct        4
vix_close              4
bond_yields_close      4
dtype: int64


In [9]:
# Imputation avec la médiane
aapl_df_imputed_median = aapl_df.fillna(aapl_df.median())

missing_values = aapl_df_imputed_median.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [10]:
# Imputation avec la moyenne
aapl_df_imputed_mean = aapl_df.fillna(aapl_df.mean())

missing_values = aapl_df_imputed_mean.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [11]:
# Imputation avec une interpolation linéaire
aapl_df_imputed_interpolate = aapl_df.interpolate(method='linear')

missing_values = aapl_df_imputed_interpolate.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [12]:
# Imputation avec forward fill
aapl_df_imputed_ffill = aapl_df.ffill()

missing_values = aapl_df_imputed_ffill.isnull().sum()
print("Valeurs manquantes après imputation:\n", missing_values[missing_values > 0])

Valeurs manquantes après imputation:
 Series([], dtype: int64)


In [13]:
def impute_missing_values(df, method='ffill'):
    if method == 'median':
        return df.fillna(df.median())
    elif method == 'mean':
        return df.fillna(df.mean())
    elif method == 'interpolate':
        return df.interpolate(method='linear')
    elif method == 'ffill':
        return df.ffill()
    
    return df

## Analyse de la Variable Cible (stock_target)


In [14]:
target = aapl_df['stock_target']
target = target.apply(lambda x: 1 if x > 0 else 0)

target_counts = target.value_counts()
print(target_counts)


stock_target
1    1330
0    1186
Name: count, dtype: int64


## Préparation des Données pour le Modèle


In [15]:
# créer des variables lags pour les indicateurs
def create_lag_variables(data, features, lags=[1, 2, 3, 4, 5, 6, 7]):
    df = data.copy()
    lagged_columns = {}

    for feature in features:
        for lag in lags:
            lagged_columns[f'{feature}_lag_{lag}'] = df[feature].shift(lag)
    
    lagged_df = pd.DataFrame(lagged_columns, index=df.index)
    df = pd.concat([df, lagged_df], axis=1)
    
    return df

aapl_df_lagged = create_lag_variables(aapl_df, aapl_df.columns)
aapl_df_lagged = impute_missing_values(aapl_df_lagged, method='ffill')

In [16]:
# Calculer la corrélation entre les features et la cible
correlations = aapl_df_lagged.corr()['stock_target'].sort_values(ascending=False)
print(correlations)

stock_target              1.000000
stock_VROC_28_lag_2       0.058583
stock_VROC_28_lag_1       0.054718
stock_VROC_28_lag_3       0.052203
stock_reddit_neu_lag_3    0.050971
                            ...   
sp500_return_pct_lag_4   -0.042623
stock_VROC_14_lag_6      -0.047903
stock_VROC_28_lag_6      -0.051079
stock_reddit_neg_lag_3   -0.052926
stock_VROC_7_lag_6       -0.063071
Name: stock_target, Length: 464, dtype: float64


In [17]:
aapl_df_lagged.shape

(2516, 464)

## Feature Selection avec Lasso


In [18]:
data = aapl_df_lagged.copy()
data = data.dropna()

In [19]:
X = data.drop(['stock_target'], axis=1)
y = data['stock_target']

In [20]:
# Standardiser les features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [21]:
X_scaled.head()

Unnamed: 0_level_0,stock_SMA_10,stock_SMA_15,stock_SMA_20,stock_SMA_50,stock_SMA_100,stock_SMA_200,stock_EMA_10,stock_EMA_12,stock_EMA_14,stock_EMA_26,...,sector_reddit_neu_lag_5,sector_reddit_neu_lag_6,sector_reddit_neu_lag_7,sector_reddit_pos_lag_1,sector_reddit_pos_lag_2,sector_reddit_pos_lag_3,sector_reddit_pos_lag_4,sector_reddit_pos_lag_5,sector_reddit_pos_lag_6,sector_reddit_pos_lag_7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-13,0.235393,0.019917,-0.373206,-0.380081,-0.54515,-1.112614,-0.053019,-0.107942,-0.160954,-0.340597,...,1.017705,1.6483,0.102527,-0.81236,-0.087765,-0.624527,-0.978201,-0.771669,-0.733323,-0.867078
2010-01-14,0.410636,0.274583,-0.13921,-0.26737,-0.468233,-1.058338,0.189396,0.123944,0.061412,-0.166359,...,1.592126,1.016357,1.648146,-1.184986,-0.812375,-0.086874,-0.624292,-0.977988,-0.771141,-0.732711
2010-01-15,0.952289,0.811291,0.347472,0.004169,-0.28301,-0.93674,0.785424,0.688164,0.597713,0.250091,...,0.92532,1.590729,1.016126,0.65243,-1.185001,-0.811856,-0.086706,-0.623687,-0.977447,-0.770525
2010-01-19,-0.615915,-0.404076,-0.58593,-0.587711,-0.682567,-1.181495,-0.854706,-0.804122,-0.776979,-0.731318,...,0.981294,0.923981,1.590568,-0.47949,0.652418,-1.184674,-0.811599,-0.085505,-0.623168,-0.976809
2010-01-20,-0.101084,0.046648,-0.109811,-0.340892,-0.510805,-1.070889,-0.13058,-0.154164,-0.182038,-0.312212,...,1.311688,0.97995,0.923738,0.055792,-0.479503,0.653689,-1.184371,-0.811201,-0.08502,-0.622568


In [22]:
# Appliquer LASO pour la sélection des features
from sklearn.linear_model import Lasso

In [23]:
lasso = Lasso(alpha=0.01, max_iter=10000)
lasso.fit(X, y)

# nombre de features sélectionnées
selected_features = X.columns[lasso.coef_ != 0]
print(f'Nombre de features sélectionnées: {len(selected_features)}')

Nombre de features sélectionnées: 95


In [24]:
indic_coef = dict(zip(X.columns, lasso.coef_))
non_zero_coef = {k: v for k, v in indic_coef.items() if v != 0}

In [25]:
# non_zero_coef

In [26]:
# Sélectionner les features sélectionnées
aapl_df_selected = data[selected_features.append(pd.Index(['stock_target']))]

In [27]:
aapl_df_selected.head()

Unnamed: 0_level_0,stock_ADX_14,stock_ADX_14_pos,stock_ADX_20_neg,stock_ADX_30,stock_RSI_7,stock_RSI_21,stock_Stoch_14,stock_Stoch_21,stock_Stoch_28,stock_VROC_14,...,gold_return_pct_lag_2,gold_return_pct_lag_4,gold_return_pct_lag_6,gold_return_pct_lag_7,vix_close_lag_1,vix_close_lag_2,vix_close_lag_4,vix_close_lag_6,vix_close_lag_7,stock_target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-13,20.262939,22.967592,21.49697,15.912205,56.479593,58.118319,66.576332,79.910455,81.64245,0.733519,...,1.098225,-0.246505,0.03579,2.054419,18.25,17.549999,19.059999,19.35,20.040001,-0.579154
2010-01-14,19.00467,22.295619,21.066462,15.669494,51.75666,56.665323,49.673195,74.949138,77.108867,0.252857,...,-1.894493,0.450091,1.591991,0.03579,17.85,18.25,18.129999,19.16,19.35,-1.671203
2010-01-15,18.359467,20.072353,23.405833,15.238556,40.438524,52.696729,15.926798,60.715711,64.102556,0.186026,...,0.664364,1.098225,-0.246505,1.591991,17.629999,17.85,17.549999,19.059999,19.16,4.423831
2010-01-19,17.365634,22.837199,20.947355,15.016756,64.207076,60.29631,95.213118,97.763267,97.954583,0.132557,...,0.545578,-1.894493,0.450091,-0.246505,17.91,17.629999,18.25,18.129999,19.059999,-1.53924
2010-01-20,16.538072,21.245719,19.536115,14.821223,54.917381,56.814126,66.405603,83.210127,84.731036,0.374989,...,-1.093996,0.664364,1.098225,0.450091,17.58,17.91,17.85,17.549999,18.129999,-1.728631


## Model 1: XGBOOST


In [28]:
data = aapl_df_selected.copy()

In [29]:
# !pip install xgboost
# !pip uninstall -y scikit-learn
# !pip install "scikit-learn==1.5.2"

In [30]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Grid Search pour trouver les meilleurs hyperparamètres qui maximisent la métrique F1
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, roc_auc_score
from xgboost import XGBClassifier

# Pour les métriques financières
import scipy.stats as stats

In [31]:
def get_rolling_train_test_data(data= data, start_year = '2010', train_window=5, test_window=1):
    df = data.copy()
    df.reset_index(inplace=True)
    df['Date'] = pd.to_datetime(df['Date'])
    
    # split data into train and test
    train = df[(df['Date'].dt.year >= int(start_year)) & (df['Date'].dt.year < int(start_year) + train_window)]
    test = df[(df['Date'].dt.year >= int(start_year) + train_window) & (df['Date'].dt.year < int(start_year) + train_window + test_window)]

    X_train = train.drop(columns=['Date', 'stock_target']).values
    y_train_return = train['stock_target'].values
    # y_train = np.where(y_train > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    X_test = test.drop(columns=['Date', 'stock_target']).values
    y_test_return = test['stock_target'].values
    # y_test = np.where(y_test > 0, 1, 0) # 0 if stock_target <= 0, 1 otherwise

    print(f'X_train from {train["Date"].dt.date.values[0]} to {train["Date"].dt.date.values[-1]}')
    print(f'X_test from {test["Date"].dt.date.values[0]} to {test["Date"].dt.date.values[-1]}')
    
    return X_train, y_train_return, X_test, y_test_return

In [32]:
def lasso_feature_selection(df, alpha=0.01):
    data = df.copy()
    X = data.drop('stock_target', axis=1)
    y = data['stock_target']
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)
    lasso = Lasso(alpha=alpha, max_iter=10000)
    lasso.fit(X, y)
    selected_features = X.columns[lasso.coef_ != 0]
    data_selected = data[selected_features.append(pd.Index(['stock_target']))]
    print(f'Nombre de features sélectionnées: {len(selected_features)}')
    return data_selected

def xgboost_grid_search(X_train, y_train, params, num_boost_round=300):
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    f1_scorer = make_scorer(f1_score, average='binary')
    recall_scorer = make_scorer(recall_score)
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=params,
        scoring=f1_scorer,
        cv=3,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_, grid_search.best_score_


def grid_search_best_params(df, params, model_grid_search):
    data = df.copy()
    grid_search_params = params.copy()
    grid_search_params.pop('nan_strategy')
    grid_search_params.pop('lasso_alpha')

    best_params = {}
    score = 0
    for nan_strategy in params['nan_strategy']:
        current_params = {'nan_strategy': nan_strategy}
        data_lagged = create_lag_variables(data, data.columns)
        data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
        data_lagged = data_lagged.dropna()
        for lasso_alpha in params['lasso_alpha']:
            current_params['lasso_alpha'] = lasso_alpha

            # Lasso feature selection
            data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

            X = data_selected.drop('stock_target', axis=1)
            y = data_selected['stock_target']

            # get rolling train test data
            X_train, y_train_return, _, _ = get_rolling_train_test_data(
                data_selected,
                start_year='2010',
                train_window=5,
                test_window=1)
            y_train = np.where(y_train_return > 0, 1, 0)

            # grid search
            best_params_, best_score_ = model_grid_search(X_train, y_train, grid_search_params)
            if best_score_ > score:
                best_params = current_params
                best_params.update(best_params_)
                score = best_score_
            
    return best_params, score

In [33]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'max_depth': [3, 4],           # Profondeur maximale de l'arbre
    'learning_rate': [0.02, 0.01],  # Taux d'apprentissage (eta)
    'n_estimators': [100],  # Nombre d'arbres (boost rounds)
    'subsample': [0.5, 0.6],     # Fraction des données pour chaque arbre
    'colsample_bytree': [0.5]  # Fraction des colonnes pour chaque arbre
}

best_params, best_score = grid_search_best_params(data, param_grid, xgboost_grid_search)

  model = cd_fast.enet_coordinate_descent(
Parameters: { "use_label_encoder" } are not used.



Nombre de features sélectionnées: 199
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 46
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 192
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 45
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 345
X_train from 2010-01-25 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 93
X_train from 2010-01-25 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 345
X_train from 2010-01-25 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Nombre de features sélectionnées: 93
X_train from 2010-01-25 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 8 candidates, totalling 24 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

In [34]:
best_params, best_score

({'nan_strategy': 'interpolate',
  'lasso_alpha': 0.1,
  'colsample_bytree': 0.5,
  'learning_rate': 0.01,
  'max_depth': 3,
  'n_estimators': 100,
  'subsample': 0.5},
 0.6306640582011039)

In [35]:
# Entraîner le modèle avec les meilleurs hyperparamètres
data = aapl_df.copy()

# Créer des variables lags
data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=best_params['nan_strategy'])
data_lagged = data_lagged.dropna()

# Feature selection avec Lasso
data_selected = lasso_feature_selection(data_lagged, alpha=best_params['lasso_alpha'])

# Séparer les features et la cible
X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

Nombre de features sélectionnées: 24


In [36]:
# 2010
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree']
)

xgb_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = xgb_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.6862170087976539


Parameters: { "use_label_encoder" } are not used.



## Model 2: Random Forest


In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def random_forest_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    rf_model = RandomForestClassifier(random_state=42)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Vous pouvez également définir d'autres métriques, par exemple :
    # recall_scorer = make_scorer(recall_score, average='binary')

    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        scoring=f1_scorer,
        cv=3,         # Vous pouvez augmenter le nombre de folds (k-fold cross validation)
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_

In [147]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'n_estimators': [100, 200], # Nombre d'arbres
    'max_depth': [None, 5, 10], # Profondeur maximale de l'arbre
    'min_samples_split': [2, 5], # Nombre minimum d'échantillons pour diviser un nœud
    'min_samples_leaf': [1, 2] # Nombre minimum d'échantillons requis à chaque feuille
}

best_params, best_score = grid_search_best_params(data, param_grid, random_forest_grid_search)
best_params, best_score

  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 100
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits


  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 98
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 23
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 96
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 95
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 24 candidates, totalling 72 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for eac

({'nan_strategy': 'mean',
  'lasso_alpha': 0.1,
  'max_depth': 5,
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 200},
 0.6048333738348641)

In [153]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
rf_model = RandomForestClassifier(random_state=42, n_estimators=best_params['n_estimators'], max_depth=best_params['max_depth'], min_samples_split=best_params['min_samples_split'], min_samples_leaf=best_params['min_samples_leaf'])

rf_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = rf_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.667


## Model 3: Regression Logistique


In [154]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, recall_score

def logistic_regression_grid_search(X_train, y_train, param_grid):
    # Définition du modèle
    # Note : pour utiliser la pénalisation L1, vous devez choisir un solver qui la supporte
    # comme 'liblinear' ou 'saga'.
    log_reg_model = LogisticRegression(random_state=42, max_iter=10000)

    # Définition de la métrique de scoring
    f1_scorer = make_scorer(f1_score, average='binary')
    # Exemples d'autres métriques possibles :
    # recall_scorer = make_scorer(recall_score, average='binary')
    # accuracy_scorer = 'accuracy'
    
    # Configuration de la recherche en grille
    grid_search = GridSearchCV(
        estimator=log_reg_model,
        param_grid=param_grid,
        scoring=f1_scorer,  # Vous pouvez mettre 'accuracy', recall_scorer, etc.
        cv=3,               # Nombre de folds pour la cross-validation
        verbose=1
    )

    # Entraînement de GridSearch sur les données
    grid_search.fit(X_train, y_train)

    # Récupération des meilleurs paramètres et du meilleur score
    return grid_search.best_params_, grid_search.best_score_


In [155]:
param_grid = {
    'nan_strategy': ['mean', 'median', 'interpolate', 'ffill'],  # Stratégie d'imputation
    'lasso_alpha': [0.01, 0.1], # Alpha values to explore
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga'] 
}

best_params, best_score = grid_search_best_params(data, param_grid, logistic_regression_grid_search)
best_params, best_score

  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 100
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits


  model = cd_fast.enet_coordinate_descent(


Nombre de features sélectionnées: 98
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 23
X_train from 2010-01-04 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 96
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 95
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for each of 16 candidates, totalling 48 fits
Nombre de features sélectionnées: 24
X_train from 2010-01-13 to 2014-12-31
X_test from 2015-01-02 to 2015-12-31
Fitting 3 folds for eac

({'nan_strategy': 'ffill',
  'lasso_alpha': 0.1,
  'C': 0.01,
  'penalty': 'l1',
  'solver': 'saga'},
 0.6237775752843818)

In [162]:
start_year = '2014'
X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year=start_year, train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

# Entraîner le modèle
log_reg_model = LogisticRegression(random_state=42, penalty=best_params['penalty'], C=best_params['C'], solver=best_params['solver'], max_iter=10000)

log_reg_model.fit(X_train, y_train)

# Prédire les valeurs sur l'ensemble de test
y_pred = log_reg_model.predict(X_test)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31
F1 Score: 0.68


## Model 4: DNN


In [38]:
# !pip install --upgrade pip
# !pip install tensorflow-macos
# !pip install tensorflow-metal
# !pip install scikit-learn

In [46]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [102]:
nan_strategy = 'ffill'
lasso_alpha = 0.01

data_lagged = create_lag_variables(data, data.columns)
data_lagged = impute_missing_values(data_lagged, method=nan_strategy)
data_lagged = data_lagged.dropna()

data_selected = lasso_feature_selection(data_lagged, alpha=lasso_alpha)

X = data_selected.drop('stock_target', axis=1)
y = data_selected['stock_target']

X_train, y_train_return, X_test, y_test_return = get_rolling_train_test_data(data_selected, start_year='2014', train_window=5, test_window=1)
y_train = np.where(y_train_return > 0, 1, 0)
y_test = np.where(y_test_return > 0, 1, 0)

Nombre de features sélectionnées: 95
X_train from 2014-01-02 to 2018-12-31
X_test from 2019-01-02 to 2019-12-31


In [106]:
# Créer le modèle
model = keras.Sequential([
    layers.Dense(256, activation='relu', input_shape=(X_train.shape[1],)),  
    layers.Dropout(0.2),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1, activation='sigmoid')  # pour classification binaire
])

# Compiler le modèle
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    # metrics accuracy, recall, precision, f1-score
    metrics=[keras.metrics.Recall(name='recall'), keras.metrics.BinaryAccuracy(name='accuracy'), keras.metrics.Precision(name='precision')]
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [107]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=10,
    verbose=1
)

Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.4859 - loss: 34.5302 - precision: 0.5010 - recall: 0.5356 - val_accuracy: 0.4444 - val_loss: 12.7325 - val_precision: 0.6667 - val_recall: 0.0822
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5251 - loss: 48.3800 - precision: 0.5655 - recall: 0.4505 - val_accuracy: 0.5238 - val_loss: 5.3796 - val_precision: 0.5915 - val_recall: 0.5753
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.4913 - loss: 57.2961 - precision: 0.5300 - recall: 0.5285 - val_accuracy: 0.4167 - val_loss: 14.4663 - val_precision: 0.4000 - val_recall: 0.0137
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5043 - loss: 64.9395 - precision: 0.5108 - recall: 0.5017 - val_accuracy: 0.5873 - val_loss: 6.2398 - val_precision: 0.5955 - val_recall: 0.8973
Epoch 5/10
[1m40/40[

In [109]:
# Prédire les valeurs sur l'ensemble de test
y_pred = model.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

# Print le f1 score
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {round(f1, 3)}')

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
F1 Score: 0.676
