In [None]:
import keras
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

In [2]:
# Lecture des données
orders_distance_stores_softmax = pd.read_csv("données/orders_distance_stores_softmax.csv", index_col=0)
orders_products_prior_specials = pd.read_csv("données/order_products__prior_specials.csv", index_col=0)

In [3]:
orders = pd.merge(orders_distance_stores_softmax, orders_products_prior_specials, on='order_id', how='inner')

In [4]:
#show columns with their types
orders.columns

Index(['user_id', 'store_id', 'distance', 'order_id', 'eval_set',
       'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'add_to_cart_order',
       'reordered', 'special'],
      dtype='object')

In [5]:
orders = orders.sample(frac=0.05, random_state=42)

In [6]:
#show the first 5 rows
orders.head()

Unnamed: 0,user_id,store_id,distance,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,special
86058,12166,3,2.772836,243435,prior,88,0,13,21.0,48290,10,1,0
1170067,205543,9,0.386416,1425899,prior,22,6,9,0.0,3481,6,1,0
852677,148902,0,0.349984,1455360,prior,36,4,18,4.0,41720,8,1,30
346398,59106,9,1.040265,2683498,prior,83,2,17,3.0,5876,5,1,0
1045727,182401,1,0.804848,2474304,prior,86,5,14,5.0,4562,15,1,15


In [7]:
#TO REDO
orders.dropna(inplace=True)
# drop the eval_set column
orders.drop(columns=['eval_set'], inplace=True)

In [8]:
# Charger vos données depuis votre DataFrame
# Assumons que votre DataFrame est nommé "data"

# Séparer les fonctionnalités (X) de la cible (y)
X = orders.drop(columns=['reordered'])
y = orders['reordered']

# Diviser l'ensemble de données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Diviser l'ensemble d'entraînement en ensembles d'entraînement et de validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Vérifier les formes des ensembles créés
print("Shape of X_train:", X_train.shape)
print("Shape of X_val:", X_val.shape)
print("Shape of X_test:", X_test.shape)

Shape of X_train: (37072, 11)
Shape of X_val: (9269, 11)
Shape of X_test: (11586, 11)


In [9]:
from keras.utils import to_categorical

# Assuming y_train and y_test are your target labels
# Convert them to one-hot encoded vectors
y_train_one_hot = to_categorical(y_train)
y_test_one_hot = to_categorical(y_test)
y_val_one_hot = to_categorical(y_val)

y_train_one_hot

array([[1., 0.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]], dtype=float32)

# Prédiction de l'attribut re-ordered

In [None]:
loss_fn = keras.losses.CategoricalCrossentropy()
metrics_fn = keras.metrics.BinaryAccuracy()

In [None]:
# Define a function to create the model based on hyperparameters
def create_model(optimizer='adam', neurons_layer1=64, neurons_layer2=64):
    model = Sequential()    
    model.add(keras.Input(shape=(X_train.shape[1],)))
    model.add(Dense(neurons_layer1, activation='relu'))
    model.add(Dense(neurons_layer2, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metrics_fn])
    return model

# Wrap the create_model function with KerasClassifier
keras_model = KerasClassifier(model=create_model, neurons_layer1=64, neurons_layer2=64, metrics=[metrics_fn])

# Define the grid of hyperparameters to search
param_grid = {
    'neurons_layer1': [32, 64, 128],
    'neurons_layer2': [32, 64, 128],
    'optimizer': ['adam', 'rmsprop']
}

# Define F1 score as the metric
f1_scorer = make_scorer(f1_score, average='micro')

# Initialize GridSearchCV with F1 score as the metric
grid = GridSearchCV(estimator=keras_model, param_grid=param_grid, cv=10, scoring=f1_scorer)
# Perform grid search
grid_result = grid.fit(X_train, y_train_one_hot, epochs=10, batch_size=64, validation_data=(X_val, y_val_one_hot), verbose=1)

In [24]:
# Display results
print("- Best parameters found: ", grid_result.best_params_)
print("\n")
print("- Best F1 score found: ", round(grid_result.best_score_,2))

- Best parameters found:  {'neurons_layer1': 32, 'neurons_layer2': 32, 'optimizer': 'rmsprop'}


- Best F1 score found:  0.78


## Verification sur les données de test

In [43]:
best_model = grid_result.best_estimator_

# Faire des prédictions sur l'ensemble de test avec le meilleur modèle
y_pred_best = best_model.predict(X_test)

f1_best = f1_score(y_test_one_hot, y_pred_best, average='micro')

# Afficher les résultats pour le meilleur modèle
print("Best model performance:")
print("F1 Score:", round(f1_best,3))

Best model performance:
F1 Score: 0.78
