# Groupe 11 - TP 2 - Rapport

Membres du groupe:
- Oussama Khaloui --- khao1201
- Caroline Wang --- wanc1101

#### Les imports

In [1]:
import pandas as pd

#### Prétraitements

In [2]:
# Load the raw data.
# Forward slashes are used as path separators: they work on Windows as well as
# on POSIX systems, and they avoid the invalid "\o" escape sequence that the
# backslash-separated literals contained.
orders_distance_stores_softmax = pd.read_csv("donnees/orders_distance_stores_softmax.csv")
orders_products_prior_specials = pd.read_csv("donnees/order_products__prior_specials.csv")

In [3]:
# Drop the 'Unnamed: 0' column that came with the CSV.
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise a KeyError
# once the column is already gone.
orders_distance_stores_softmax = orders_distance_stores_softmax.drop(
    columns=['Unnamed: 0'], errors='ignore'
)
orders_distance_stores_softmax.head()

Unnamed: 0,user_id,store_id,distance,order_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,210,0,2.304404,1438665,prior,9,1,18,3.0
1,210,0,2.304404,2850206,prior,36,0,7,2.0
2,210,0,2.304404,2406913,prior,42,0,8,3.0
3,210,0,2.304404,1155933,prior,43,2,23,2.0
4,210,0,2.304404,271697,prior,56,5,17,3.0


In [4]:
# Drop the 'Unnamed: 0' column that came with the CSV.
# errors='ignore' keeps this cell idempotent: because the result is assigned
# back to the same name, re-running the cell would otherwise raise a KeyError
# once the column is already gone.
orders_products_prior_specials = orders_products_prior_specials.drop(
    columns=['Unnamed: 0'], errors='ignore'
)
orders_products_prior_specials.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,special
0,15,19660,1,1,15
1,15,21195,2,1,0
2,15,7461,3,1,50
3,15,2996,4,1,0
4,15,32463,5,1,0


In [5]:
# Inner-join both tables on order_id, then discard every row that still
# contains a missing value.
orders = (
    orders_distance_stores_softmax
    .merge(orders_products_prior_specials, on='order_id', how='inner')
    .dropna()
)

In [6]:
# Keep a reproducible 5% random sample of the merged rows (fixed seed).
orders = orders.sample(
    frac=0.05,
    random_state=42,
)
# Show the remaining column names as a plain list.
orders.columns.tolist()

['user_id',
 'store_id',
 'distance',
 'order_id',
 'eval_set',
 'order_number',
 'order_dow',
 'order_hour_of_day',
 'days_since_prior_order',
 'product_id',
 'add_to_cart_order',
 'reordered',
 'special']

In [7]:
from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set.
# Target: the `reordered` column. Features: every other column, with
# `eval_set` one-hot encoded.
y = orders['reordered']
X = pd.get_dummies(orders.drop(columns=['reordered']), columns=['eval_set'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Choix du modèle de prédiction

In [8]:
# Recherche d'hyperparamètres en fonction du noyau
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Search space per kernel: (label used in the result message, parameter grid).
# Kernels that are not listed here have no grid and are returned unchanged.
_KERNEL_SEARCH_SPACES = {
    'rbf': (
        "RBF",
        dict(gamma=[1, 0.1, 0.001, 0.0001], C=[1, 10, 100, 1000]),
    ),
    'sigmoid': (
        "sigmoïde",
        dict(coef0=[-5, -3, -1, -0.1, 0.0, 0.1, 1]),
    ),
    'poly': (
        "polynomial",
        dict(coef0=[-5, -3, -1, -0.1, 0.0, 0.1, 1], degree=[2, 3, 4, 5, 6, 7]),
    ),
}


def hyper_parameters_search(model, kernel, X_train, y_train, scoring='accuracy'):
    """Grid-search the kernel-specific hyperparameters of ``model``.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    kernel : str
        Kernel name; selects the parameter grid ('rbf', 'sigmoid' or 'poly').
    X_train, y_train
        Training data passed to ``GridSearchCV.fit``.
    scoring : str, default 'accuracy'
        Metric optimised by the grid search. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    estimator
        ``grid.best_estimator_`` when a grid exists for ``kernel``; otherwise
        ``model`` itself, untouched, after printing an error message.
    """
    if kernel not in _KERNEL_SEARCH_SPACES:
        # No grid defined for this kernel (e.g. 'linear'): report it and hand
        # the model back as-is.
        print("Erreur recherche hyper-paramètres")
        return model

    kernel_label, param_grid = _KERNEL_SEARCH_SPACES[kernel]
    # Five stratified shuffle splits with a fixed seed, 20% held out each time.
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
    grid = GridSearchCV(model, param_grid=param_grid, cv=cv, scoring=scoring)
    grid.fit(X_train, y_train)
    print(grid.best_score_, f" pour les hyperparamètres suivants, noyau {kernel_label} : ", grid.best_params_)
    return grid.best_estimator_

In [9]:
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, f1_score

# Kernels to compare. Each one is evaluated twice on the test set: first with
# the SVC default hyperparameters, then with the estimator returned by the
# hyperparameter search.
kernel_values = ['linear', 'rbf', 'sigmoid', 'poly']
for kernel in kernel_values :
    print(f'Modèle de noyau {kernel}')

    # Baseline model with default hyperparameters
    svm_model = SVC(kernel=kernel)
    svm_model.fit(X_train, y_train)
    y_pred = svm_model.predict(X_test)

    # Baseline evaluation
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f'Matrice de confusion : {conf_matrix}')

    f1_svm = f1_score(y_test, y_pred)
    print(f'Score F1 : {f1_svm}')

    # Hyperparameter search.
    # The returned estimator is not fitted again: it is already trained on
    # X_train, being either GridSearchCV's refitted best_estimator_ or, when
    # no grid exists for the kernel, the fitted baseline model itself. A
    # second fit would only repeat an expensive SVC training.
    best_svm_model = hyper_parameters_search(svm_model, kernel, X_train=X_train, y_train=y_train)
    best_y_pred = best_svm_model.predict(X_test)

    # Evaluation of the best model
    best_conf_matrix = confusion_matrix(y_test, best_y_pred)
    print(f'Matrice de confusion : {best_conf_matrix}')

    best_f1_svm = f1_score(y_test, best_y_pred)
    print(f'Score F1 : {best_f1_svm}')

Modèle de noyau linear
