In [1]:
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.stattools import adfuller

from sklearn.metrics import r2_score

## Le but de ce notebook est de construire un modèle VAR et de l'ajuster à nos données de vente.
## On y implémente également les métriques adaptées à l'évaluation des modèles.

## Les fonctions qui vont être utilisées

In [2]:
def r2_ajuste(r2,n,p):
    """Return the adjusted R² of a fit.

    Computes ``1 - (1 - r2) * (n - 1) / (n - p - 1)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n : int
        Number of observations the score was computed on.
    p : int
        Number of predictors of the model.

    Returns
    -------
    float
        The adjusted R², or NaN when ``n - p - 1 <= 0``. In that case the
        statistic is undefined: a zero denominator used to raise
        ``ZeroDivisionError`` and a negative one produced a meaningless value.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        return float("nan")
    return 1 - (((1 - r2) * (n - 1)) / residual_dof)

def determine_petit_truc(series):
    """Return a tiny offset proportional to the smallest strictly positive value.

    The offset is ``1e-5`` times the minimum of the entries of ``series`` that
    are greater than zero.
    """
    scale = 0.00001
    strictly_positive = series[series > 0]
    return strictly_positive.min() * scale

# Fonction pour tester la stationnarité avec le test ADF
def test_stationarity(series):
    """Run ``adfuller`` on ``series`` (missing values dropped) and return its p-value.

    The p-value is the second element of the tuple returned by ``adfuller``.
    """
    _, p_value, *_ = adfuller(series.dropna())
    return p_value

# Fonction pour appliquer la transformation logarithmique
def apply_log(series, p):
    """Return the natural logarithm of ``series`` shifted by the offset ``p``."""
    shifted = series + p
    return np.log(shifted)

## reconstruction pour un dataframe 
def deDiff(orig, diff,order,log_order):
    """Rebuild every column of a DataFrame from its lag-``order`` differences.

    Parameters
    ----------
    orig : pandas.DataFrame
        Frame supplying the index, the columns and the first ``order`` values
        of each column, which seed the reconstruction.
    diff : pandas.DataFrame
        Differences, read positionally: row ``k`` of ``diff`` is added to
        reconstructed row ``k`` to give row ``k + order``.
    order : int
        Lag used when differencing.
    log_order : int
        1 to exponentiate the rebuilt frame, any other value to return it as is.

    Returns
    -------
    pandas.DataFrame
        Float frame with the index and columns of ``orig``.

    Notes
    -----
    The exponential is applied once, after every column has been rebuilt.
    Applying it inside the column loop re-exponentiated the whole frame,
    unfilled columns included, once per family.
    """
    n_rows = len(orig)
    rebuilt = {}

    for family in orig.columns:
        seed = orig[family].to_numpy(dtype=float)
        steps = diff[family].to_numpy(dtype=float)

        # Work on a plain array: chained ``frame[col].iloc[i] = ...`` writes
        # are not guaranteed to reach the parent frame.
        values = np.full(n_rows, np.nan)
        values[:order] = seed[:order]
        for i in range(order, n_rows):
            values[i] = values[i - order] + steps[i - order]
        rebuilt[family] = values

    dfout = pd.DataFrame(rebuilt, index=orig.index, columns=orig.columns)
    if log_order == 1:
        dfout = np.exp(dfout)
    return dfout

## reconstruction pour une colonne de dataframe 

def deDiffFamily(orig, diff, order,log_order):
    """Rebuild a single column from its lag-``order`` differences.

    orig      : Series supplying the index, the dtype and the first ``order``
                values that seed the reconstruction
    diff      : Series of differences, read positionally (element ``k`` is
                added to rebuilt element ``k`` to give element ``k + order``)
    order     : lag used when differencing the column
    log_order : 1 to exponentiate the rebuilt series before returning it,
                anything else to return it unchanged
    """
    rebuilt = pd.Series(index=orig.index, dtype=orig.dtype)
    rebuilt.iloc[:order] = orig.iloc[:order]

    # Each new value is the value `order` steps earlier plus the matching difference.
    for offset in range(len(orig) - order):
        rebuilt.iloc[offset + order] = rebuilt.iloc[offset] + diff.iloc[offset]

    return np.exp(rebuilt) if log_order == 1 else rebuilt

## Chargement du Jeu de données de ventes, des informations de cluster et des étapes de différenciation

In [3]:
# Sales table: indexed by 'Product Family' then transposed, so every product
# family becomes a column.
dfOrig = (
    pd.read_csv("Product families over time.csv")
    .set_index('Product Family')
    .T
)

# Cluster assignment: 'clusters.norm' is renamed to 'clusters' and the
# 'Unnamed: 0' column is discarded.
df_clust = (
    pd.read_csv("pf+clusters+dtw+complete.csv")
    .rename(columns={"clusters.norm": "clusters"})
    .drop(columns="Unnamed: 0")
)

# Transformation steps, looked up by family name.
df_list_diffs = pd.read_csv("stationnarity_steps.csv")
dictTransformations = df_list_diffs.set_index('family')

## Calcul des différentes transformations (diff et log ) pour rendre les données stationnaires 

In [5]:
# Threshold on the ADF p-value: at or below it a series is treated as stationary.
ALPHA = 0.05
# Largest differencing lag tried on a series.
MAX_DIFF_LAG = 5


def _first_stationary_diff(series):
    """Return ``(lag, differenced)`` for the smallest lag in 1..MAX_DIFF_LAG whose
    lag-difference of ``series`` has an ADF p-value <= ALPHA, or ``(0, None)``
    when no lag qualifies."""
    for lag in range(1, MAX_DIFF_LAG + 1):
        candidate = series.diff(periods=lag).dropna()
        if test_stationarity(candidate) <= ALPHA:
            return lag, candidate
    return 0, None


df2 = dfOrig.copy()

results_list = list()
fams = df2.columns

stat_data = dict()
# Process each family of time series.
for family in fams:
    series = df2[family].dropna()
    ordreP = 0
    loga = 0

    # The same "<= ALPHA" rule is used at every stage. Previously the raw
    # series was tested with "< 0.05" / "> 0.05", so a p-value of exactly
    # 0.05 matched neither branch and the family was silently skipped.
    if test_stationarity(series) <= ALPHA:
        # Already stationary: keep the series untouched.
        stat_data[family] = series
    else:
        # Try successive differencing lags on the raw series.
        ordreP, diff_series = _first_stationary_diff(series)
        if diff_series is not None:
            stat_data[family] = diff_series
        else:
            # Differencing alone was not enough: move to the log scale.
            petit_truc = determine_petit_truc(df2[family])
            log_series = apply_log(series, p=petit_truc)
            loga = 1

            if test_stationarity(log_series) <= ALPHA:
                # NOTE(review): a family whose log is stationary without any
                # differencing is only reported, not stored in stat_data
                # (behaviour kept as is) - confirm this is intended.
                print("mauvaise fam")
            else:
                # Try successive differencing lags on the log series.
                ordreP, diff_log_series = _first_stationary_diff(log_series)
                if diff_log_series is not None:
                    stat_data[family] = diff_log_series
    results_list.append({"family": family, "ordreP": ordreP, "loga": loga})

stat_data = pd.DataFrame(stat_data)
results_df = pd.DataFrame(results_list)
df_diff = stat_data.copy()

## Division en jeu de données d'entrainement et de test 

In [6]:
#le point de coupure 
# Number of trailing observations held out as the test set.
n = 3
# Split the data; both slices use `n` so changing it moves the cut-off
# consistently (the slices used to hard-code 3).
df_train = df_diff.iloc[:-n]
df_test = df_diff.iloc[-n:]

# Check the sizes of the training and test sets.
print('Training set:', len(df_train))
print('Test set:', len(df_test))

Training set: 50
Test set: 3


## Initialisation du modèle, ajustement sur les données, calcul des prédictions des différences, puis reconstruction des valeurs réelles prédites

In [None]:
# Number of VAR lags fitted for each cluster id.
ordervarclust = { 2: 2,  3: 1, 1:1 }
# family -> start row (in dfOrig) of the segment used to seed the training
# reconstruction; 0 for families that were not differenced.
decalages = dict()
train_size = len(df_train)
test_pred_df2 = pd.DataFrame()
reelPred = pd.DataFrame()


def _undo_lag_diff(history, steps, lag):
    """Turn forecast lag-``lag`` differences back into levels.

    ``history`` holds the last ``lag`` known levels; each forecast level is
    the level ``lag`` steps earlier plus the forecast difference.
    """
    levels = list(history)
    forecast = []
    for step in steps:
        next_value = levels[-lag] + step
        forecast.append(next_value)
        levels.append(next_value)
    return forecast


# NOTE(review): the positional slices below assume the rows of dfOrig and
# df_diff line up one-to-one - confirm.
for cluster in df_clust['clusters'].unique():

    families_in_cluster = df_clust[df_clust['clusters'] == cluster]['productfamilies']

    # Columns of the stationary data belonging to this cluster.
    cluster_data_train = df_train[families_in_cluster].dropna()
    cluster_data_test  = df_test[families_in_cluster]

    model = VAR(cluster_data_train)
    order = ordervarclust[cluster]
    results = model.fit(order)
    lag_order = results.k_ar

    train_pred = results.fittedvalues
    test_pred = results.forecast(cluster_data_train.values[-lag_order:], steps=len(cluster_data_test))
    test_pred_df = pd.DataFrame(test_pred, index=cluster_data_test.index, columns=cluster_data_test.columns)

    # Largest differencing lag within the cluster.
    max_diff_order = max(
        (dictTransformations.loc[family, 'ordreP'] for family in families_in_cluster),
        default=0,
    )

    for family in families_in_cluster:

        transfo_family = dictTransformations.loc[family]
        diff_order = int(transfo_family['ordreP'])
        log_order = transfo_family['loga']

        if diff_order > 0:
            # Work on the scale the differences were computed on. Seeding with
            # raw values while adding log-differences (and exponentiating
            # afterwards) mixed two scales.
            levels = dfOrig[family]
            if log_order == 1:
                # presumably the same offset that produced the stored
                # transformation steps - verify against that procedure
                petit_truc = determine_petit_truc(dfOrig[family])
                levels = apply_log(levels, p=petit_truc)

            # Reconstruction of the training part.
            decalage = max_diff_order - diff_order
            start = int(lag_order + decalage)
            reconsTrain = deDiffFamily(orig = levels.iloc[start:train_size], diff = train_pred[family], order = diff_order, log_order = log_order)
            decalages[family] = start

            # Reconstruction of the test part. The seed is the last
            # `diff_order` training levels: seeding with rows taken from
            # train_size onwards leaked test observations, and adding each
            # step to the latest level was only valid for lag 1.
            history = levels.iloc[train_size - diff_order:train_size]
            reconstructed_forecast = _undo_lag_diff(history, test_pred_df[family].to_numpy(), diff_order)
            reconsout = pd.Series(reconstructed_forecast, index=test_pred_df.index, dtype=float)

            # Undo the log on the series that are actually stored
            # (deDiffFamily already exponentiated the training part).
            if log_order == 1:
                reconsTrain = reconsTrain - petit_truc
                reconsout = np.exp(reconsout) - petit_truc

        elif diff_order == 0 :

            decalages[family] = 0
            reconsTrain = train_pred[family]
            reconsout = test_pred_df[family]

        # Gather everything in the two result frames.
        reelPred[family] = reconsTrain
        test_pred_df2[family] = reconsout
reelPred = reelPred.dropna()

## Calcul des métriques d'évaluation du modèle 

In [18]:
def _family_scores(y_true, y_pred, nb_predictors):
    """Return [MAPE, SMAPE, SMAPE_ADJ, R2, adjusted R2] for one family."""
    nb_obs = len(y_true)
    abs_err = np.abs(y_pred - y_true)
    abs_sum = np.abs(y_true) + np.abs(y_pred)

    mape = np.mean(abs_err/np.abs(y_true))
    smape = 1/nb_obs * (np.sum(2 * abs_err / abs_sum*100))
    # Same ratio as SMAPE but without the factor 2.
    smape_adj = 1/nb_obs * (np.sum( abs_err / abs_sum*100))
    r2 = r2_score(y_true, y_pred)
    return [mape, smape, smape_adj, r2, r2_ajuste(r2, nb_obs, nb_predictors)]


TRAIN_COLUMNS = ['MAPE_train' ,'SMAPE_train', 'SMAPE_ADJ_train', 'R2_train', 'R2_ajuste_train']
TEST_COLUMNS = ['MAPE_test', 'SMAPE_test', 'SMAPE_ADJ_test','R2_test', 'R2_ajuste_test']

metrics_list_train = list()
metrics_list_test = list()
for cluster in df_clust['clusters'].unique():
    families_in_cluster = df_clust[df_clust['clusters'] == cluster]['productfamilies']
    family_index = dfOrig[families_in_cluster].columns
    metrics_df_train = pd.DataFrame(index=family_index, columns=TRAIN_COLUMNS)
    metrics_df_test  = pd.DataFrame(index=family_index, columns=TEST_COLUMNS)

    for family in families_in_cluster:
        x = decalages[family]
        y_train_pred = reelPred[family]
        # Trim the true values to the length of the predictions.
        y_train_true = dfOrig[x:train_size][family][(-y_train_pred.shape[0]):]
        y_test_true = dfOrig[train_size:][family]
        y_test_pred = test_pred_df2[family]

        # Predictor count passed to the adjusted R2: the number of columns of dfOrig.
        p = dfOrig.shape[1]

        metrics_df_train.loc[family] = _family_scores(y_train_true, y_train_pred, p)
        metrics_df_test.loc[family] = _family_scores(y_test_true, y_test_pred, p)

    metrics_df_train['Cluster'] = cluster
    metrics_df_test['Cluster'] = cluster

    metrics_list_train.append(metrics_df_train)
    metrics_list_test.append(metrics_df_test)

final_metrics_train = pd.concat(metrics_list_train)
final_metrics_test = pd.concat(metrics_list_test)

final_metrics_train.to_csv("metrics_train.csv")
final_metrics_test.to_csv("metric_test.csv")

## Tracé des courbes des valeurs vraies et des prédictions

In [None]:
# One figure per family: actual vs predicted values on the training and test periods.
for cluster in df_clust['clusters'].unique():

    families_in_cluster = df_clust[df_clust['clusters'] == cluster]['productfamilies']

    y_train_pred = reelPred[families_in_cluster]
    # Keep as many trailing training rows as there are predictions. The window
    # used to start at `x`, a variable left over from the metrics loop, so the
    # cell depended on which family that loop happened to process last.
    y_train_true = dfOrig.iloc[:train_size][families_in_cluster].tail(len(y_train_pred))
    y_test_true = dfOrig.iloc[train_size:][families_in_cluster]
    y_test_pred = test_pred_df2[families_in_cluster]

    for column in families_in_cluster:
        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(y_train_true.index, y_train_true[column], label='Train Actual')
        ax.plot(y_train_pred.index, y_train_pred[column], label='Train Predicted', linestyle='--')
        ax.plot(y_test_true.index, y_test_true[column], label='Test Actual')
        ax.plot(y_test_pred.index, y_test_pred[column], label='Test Predicted', linestyle='--')

        ax.set_title(f'Cluster {cluster} - {column}')
        ax.set_xlabel('Date')
        ax.set_ylabel('Values')
        ax.legend()
        plt.show()
        # Release the figure so a long family list does not accumulate open figures.
        plt.close(fig)
