In [55]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Télécharger les données
data = yf.download("AAPL", start="2020-01-01", end="2024-01-01")



[*********************100%***********************]  1 of 1 completed


In [56]:
data.head()

Price,Close,High,Low,Open,Volume
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2020-01-02,72.400497,72.460761,71.156659,71.409763,135480400
2020-01-03,71.69664,72.455958,71.472462,71.629145,146322800
2020-01-06,72.267952,72.306521,70.568525,70.819223,118387200
2020-01-07,71.928062,72.533103,71.708703,72.277586,108872000
2020-01-08,73.085106,73.386423,71.631552,71.631552,132079200


In [57]:
#création des features 

#returns(variations du prix)
data["return_1d"] = data["Close"].pct_change()
data["return_5d"] = data["Close"].pct_change(5)
data["return_20d"] = data["Close"].pct_change(20)


In [58]:

data.head()

Price,Close,High,Low,Open,Volume,return_1d,return_5d,return_20d
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2020-01-02,72.400497,72.460761,71.156659,71.409763,135480400,,,
2020-01-03,71.69664,72.455958,71.472462,71.629145,146322800,-0.009722,,
2020-01-06,72.267952,72.306521,70.568525,70.819223,118387200,0.007968,,
2020-01-07,71.928062,72.533103,71.708703,72.277586,108872000,-0.004703,,
2020-01-08,73.085106,73.386423,71.631552,71.631552,132079200,0.016086,,


In [59]:
#moving averages(SMA/EMA)
data["ema_10"]= data["Close"].ewm(span=10).mean()
data["ema_20"]= data["Close"].ewm(span=20).mean()
data["ema_30"]= data["Close"].ewm(span=30).mean()

data["sma_10"] = data["Close"].rolling(window=10).mean()
data["sma_20"] = data["Close"].rolling(window=20).mean()
data["sma_30"] = data["Close"].rolling(window=30).mean()

#Momentum indicators
# variation des prix
delta = data['Close'].diff()

# gains (delta positif) et pertes (delta négatif)
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)

# moyenne glissante sur 14 jours
window = 14
avg_gain = gain.rolling(window).mean()
avg_loss = loss.rolling(window).mean()

# division par zéro
rs = avg_gain / avg_loss

# RSI
data["RSI"] = 100 - (100 / (1 + rs))
data["RSA"] = data["Close"].pct_change(12)

#volatility
data["volatility_10"] = data["return_1d"].rolling(10).std()

#ATR
data["H-L"] = data["High"] - data["Low"]
data["H-C"] = abs(data["High"] - data["Close"].shift(1))
data["L-C"] = abs(data["Low"] - data["Close"].shift(1))
data["tr"] = data[["H-L", "H-C", "L-C"]].max(axis=1)
data["atr"] = data["tr"].rolling(14).mean()


#Volume indicators
data["volume_change"] = data["Volume"].pct_change()
data["volume_ma_10"] = data["Volume"].rolling(10).mean()
#volume_change

#volume_ma_10

In [60]:
data

Price,Close,High,Low,Open,Volume,return_1d,return_5d,return_20d,ema_10,ema_20,...,RSI,RSA,volatility_10,H-L,H-C,L-C,tr,atr,volume_change,volume_ma_10
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-01-02,72.400497,72.460761,71.156659,71.409763,135480400,,,,72.400497,72.400497,...,,,,1.304102,,,1.304102,,,
2020-01-03,71.696640,72.455958,71.472462,71.629145,146322800,-0.009722,,,72.013376,72.030972,...,,,,0.983496,0.055461,0.928036,0.983496,,0.080029,
2020-01-06,72.267952,72.306521,70.568525,70.819223,118387200,0.007968,,,72.115714,72.117990,...,,,,1.737996,0.609881,1.128115,1.737996,,-0.190918,
2020-01-07,71.928062,72.533103,71.708703,72.277586,108872000,-0.004703,,,72.053891,72.063161,...,,,,0.824400,0.265151,0.559249,0.824400,,-0.080374,
2020-01-08,73.085106,73.386423,71.631552,71.631552,132079200,0.016086,,,72.349925,72.310361,...,,,,1.754871,1.458361,0.296511,1.754871,,0.213160,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,191.609497,193.400885,190.985970,193.173239,37149600,-0.005547,-0.020094,0.019108,192.952868,191.331203,...,59.246175,0.006656,0.009145,2.414915,0.722540,1.692375,2.414915,2.910481,-0.200783,61175570.0
2023-12-26,191.065109,191.896469,190.847370,191.619349,28919300,-0.002841,-0.014498,0.017177,192.609639,191.305861,...,49.031886,-0.006280,0.008196,1.049099,0.286972,0.762127,1.049099,2.634066,-0.221545,57973130.0
2023-12-27,191.164093,191.510500,189.125276,190.510890,48087700,0.000518,-0.019245,0.014443,192.346813,191.292359,...,52.291508,-0.013081,0.007718,2.385225,0.445391,1.939834,2.385225,2.617100,0.662824,57512210.0
2023-12-28,191.589661,192.658558,191.183873,192.143900,34049900,0.002226,-0.006416,0.022231,192.209149,191.320674,...,47.920448,0.002071,0.004933,1.474686,1.494465,0.019780,1.494465,2.534388,-0.291921,53876780.0


In [61]:
horizon = 60
future_return = data["Close"].shift(-horizon) / data["Close"] - 1

mask_up = future_return.ge(0.05).to_numpy()      # bool array
mask_down = future_return.le(-0.05).to_numpy()   # bool array

data["target"] = np.nan
data.loc[mask_up, "target"] = 1
data.loc[mask_down, "target"] = 0


IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
data


Price,Close,High,Low,Open,Volume,return_1d,return_5d,return_20d,ema_10,ema_20,...,volatility_10,H-L,H-C,L-C,tr,atr,volume_change,volume_ma_10,future_return,target
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-01-02,72.400497,72.460761,71.156659,71.409763,135480400,,,,72.400497,72.400497,...,,1.304102,,,1.304102,,,,0.042098,0
2020-01-03,71.696640,72.455958,71.472462,71.629145,146322800,-0.009722,,,72.013376,72.030972,...,,0.983496,0.055461,0.928036,0.983496,,0.080029,,0.039622,0
2020-01-06,72.267952,72.306521,70.568525,70.819223,118387200,0.007968,,,72.115714,72.117990,...,,1.737996,0.609881,1.128115,1.737996,,-0.190918,,0.037740,0
2020-01-07,71.928062,72.533103,71.708703,72.277586,108872000,-0.004703,,,72.053891,72.063161,...,,0.824400,0.265151,0.559249,0.824400,,-0.080374,,0.036480,0
2020-01-08,73.085106,73.386423,71.631552,71.631552,132079200,0.016086,,,72.349925,72.310361,...,,1.754871,1.458361,0.296511,1.754871,,0.213160,,0.044105,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,191.609497,193.400885,190.985970,193.173239,37149600,-0.005547,-0.020094,0.019108,192.952868,191.331203,...,0.009145,2.414915,0.722540,1.692375,2.414915,2.910481,-0.200783,61175570.0,,0
2023-12-26,191.065109,191.896469,190.847370,191.619349,28919300,-0.002841,-0.014498,0.017177,192.609639,191.305861,...,0.008196,1.049099,0.286972,0.762127,1.049099,2.634066,-0.221545,57973130.0,,0
2023-12-27,191.164093,191.510500,189.125276,190.510890,48087700,0.000518,-0.019245,0.014443,192.346813,191.292359,...,0.007718,2.385225,0.445391,1.939834,2.385225,2.617100,0.662824,57512210.0,,0
2023-12-28,191.589661,192.658558,191.183873,192.143900,34049900,0.002226,-0.006416,0.022231,192.209149,191.320674,...,0.004933,1.474686,1.494465,0.019780,1.494465,2.534388,-0.291921,53876780.0,,0


In [None]:
data = data.dropna()
data = data.drop(columns=["future_return"], errors="ignore")


data

Price,Close,High,Low,Open,Volume,return_1d,return_5d,return_20d,ema_10,ema_20,...,RSA,volatility_10,H-L,H-C,L-C,tr,atr,volume_change,volume_ma_10,target
Ticker,AAPL,AAPL,AAPL,AAPL,AAPL,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-02-13,78.497017,78.823213,78.129748,78.332713,94747600,-0.007121,0.001325,0.045934,77.605053,77.014208,...,0.025028,0.021394,0.693465,0.236792,0.930257,0.930257,2.035065,-0.166911,126407960.0,1
2020-02-14,78.516342,78.765217,78.008926,78.465595,80113600,0.000246,0.015374,0.033248,77.771072,77.163999,...,0.004259,0.014482,0.756290,0.268200,0.488091,0.756290,1.857846,-0.154452,114460480.0,1
2020-02-18,77.078667,77.259886,76.017925,76.199145,152531200,-0.018311,-0.007930,0.003222,77.644975,77.155528,...,-0.012699,0.016170,1.241961,1.256456,2.498417,2.498417,1.873594,0.903936,112334760.0,1
2020-02-19,78.194977,78.424524,77.320292,77.320292,93984000,0.014483,0.012547,0.024696,77.745109,77.258303,...,0.048070,0.013226,1.104232,1.345858,0.241625,1.345858,1.794790,-0.383838,108071520.0,1
2020-02-20,77.392776,78.443851,76.887779,77.955769,100566000,-0.010259,-0.021088,0.010577,77.680978,77.271551,...,0.040174,0.013522,1.556071,0.248874,1.307198,1.556071,1.809688,0.070033,106245440.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-16,174.524643,176.471809,174.455447,175.078154,46964900,-0.004959,-0.007753,-0.093755,178.444256,181.772433,...,-0.099982,0.015992,2.016362,1.077369,0.938993,2.016362,3.205344,0.076619,64395630.0,1
2023-08-17,171.984390,175.453725,171.470410,175.088016,66062900,-0.014555,-0.020987,-0.097836,177.269735,180.840238,...,-0.109274,0.016101,3.983315,0.929082,3.054233,3.983315,3.249440,0.406644,64878400.0,1
2023-08-18,172.468750,173.071684,169.968058,170.304116,61172200,0.002816,-0.018561,-0.089686,176.396829,180.042953,...,-0.092711,0.008840,3.103626,1.087294,2.016332,3.103626,3.384404,-0.074031,59399940.0,1
2023-08-21,173.803085,174.089734,171.727421,173.042016,46311900,0.007737,-0.020172,-0.086498,175.925239,179.448680,...,-0.078948,0.008196,2.362314,1.620984,0.741329,2.362314,3.450907,-0.242926,54273520.0,1


In [None]:
#chronologie respectée
data = data.sort_index()

#Initialisation des sets
n = len(data)

train = data.iloc[:int(0.6*n)]
valid = data.iloc[int(0.6*n):int(0.8*n)]
test  = data.iloc[int(0.8*n):]


X_train = train.drop(columns=["target"])
y_train = train["target"]

X_valid = valid.drop(columns=["target"])
y_valid = valid["target"]

X_test = test.drop(columns=["target"])
y_test = test["target"]


print(train)

Price            Close        High         Low        Open     Volume  \
Ticker            AAPL        AAPL        AAPL        AAPL       AAPL   
Date                                                                    
2020-02-13   78.497017   78.823213   78.129748   78.332713   94747600   
2020-02-14   78.516342   78.765217   78.008926   78.465595   80113600   
2020-02-18   77.078667   77.259886   76.017925   76.199145  152531200   
2020-02-19   78.194977   78.424524   77.320292   77.320292   93984000   
2020-02-20   77.392776   78.443851   76.887779   77.955769  100566000   
...                ...         ...         ...         ...        ...   
2022-03-17  157.373001  157.745324  154.443455  155.403639   75615400   
2022-03-18  160.665070  161.154962  156.530377  157.265216  123511700   
2022-03-21  162.036774  162.987166  159.714674  160.204566   95811400   
2022-03-22  165.407242  165.995104  161.576280  162.164142   81532000   
2022-03-23  166.769135  169.150004  164.260874  164

In [None]:
#normalisation des données
def scale_dataset(X, y, oversample=False, scaler=None):
    if scaler is None:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler()
        X_scaled, y = ros.fit_resample(X_scaled, y)

    return X_scaled, y, scaler

In [None]:
X_train_scaled, y_train, scaler = scale_dataset(X_train, y_train)

X_valid_scaled, y_valid, _ = scale_dataset(X_valid, y_valid, scaler=scaler)

X_test_scaled, y_test, _ = scale_dataset(X_test, y_test, scaler=scaler)


In [None]:
print(X_train_scaled.shape, y_train.shape)
print(X_valid_scaled.shape, y_valid.shape)
print(X_test_scaled.shape, y_test.shape)


(532, 24) (532,)
(177, 24) (177,)
(178, 24) (178,)


In [None]:
from sklearn.metrics import confusion_matrix
knn_model = KNeighborsClassifier(n_neighbors=7 , weights="distance")

knn_model.fit(X_train_scaled, y_train)

y_pred = knn_model.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[13  0  0]
 [48  0  2]
 [15  3 97]]
              precision    recall  f1-score   support

          -1       0.17      1.00      0.29        13
           0       0.00      0.00      0.00        50
           1       0.98      0.84      0.91       115

    accuracy                           0.62       178
   macro avg       0.38      0.61      0.40       178
weighted avg       0.65      0.62      0.61       178



In [None]:
data["target"].value_counts()
data["target"].value_counts(normalize=True)


target
 1    0.611048
 0    0.234498
-1    0.154453
Name: proportion, dtype: float64

In [None]:
baseline = data["target"].value_counts(normalize=True).max()
print("Baseline (majority class):", baseline)


Baseline (majority class): 0.6110484780157835


In [None]:
#On laisse tomber knn car il n'est pas du tout accruate(49)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

lr = LogisticRegression(
    max_iter=3000,
    class_weight="balanced"
)

lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[13  0  0]
 [48  1  1]
 [57 54  4]]
              precision    recall  f1-score   support

          -1       0.11      1.00      0.20        13
           0       0.02      0.02      0.02        50
           1       0.80      0.03      0.07       115

    accuracy                           0.10       178
   macro avg       0.31      0.35      0.09       178
weighted avg       0.53      0.10      0.06       178



In [None]:
print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(X_test.shape, y_test.shape)

print(np.isnan(X_train).sum().sum())  # doit être 0


(532, 24) (532,)
(177, 24) (177,)
(178, 24) (178,)
0


In [None]:
print("future_return dans X_train ?", "future_return" in X_train.columns)
print("target dans X_train ?", "target" in X_train.columns)


future_return dans X_train ? False
target dans X_train ? False


In [None]:
y_train_shuffled = y_train.sample(frac=1, random_state=42).values
lr.fit(X_train_scaled, y_train_shuffled)
print("Score avec y mélangé:", lr.score(X_test_scaled, y_test))

Score avec y mélangé: 0.43820224719101125


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

# Cellule 8 — Entraînement RandomForest
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=5,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)
# Cellule 9 — Prédiction + métriques
y_pred = rf.predict(X_test)

print("Accuracy:", rf.score(X_test, y_test))
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred, digits=3))


Accuracy: 0.29775280898876405
Balanced accuracy: 0.4492753623188406

Confusion matrix:
 [[13  0  0]
 [50  0  0]
 [28 47 40]]

Report:
               precision    recall  f1-score   support

          -1      0.143     1.000     0.250        13
           0      0.000     0.000     0.000        50
           1      1.000     0.348     0.516       115

    accuracy                          0.298       178
   macro avg      0.381     0.449     0.255       178
weighted avg      0.657     0.298     0.352       178



In [None]:
data["target"].value_counts(normalize=True)


target
 1    0.611048
 0    0.234498
-1    0.154453
Name: proportion, dtype: float64

In [None]:
data["target"].value_counts(normalize=True)


target
 1    0.611048
 0    0.234498
-1    0.154453
Name: proportion, dtype: float64

In [None]:
print(len(train[train["target"]==1]))
print(len(train[train["target"]==0]))

380
77
