In [11]:
# data_preprocessing.py

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime

# Modèles
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Prétraitement et pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Modèles de validation
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


In [12]:
# Charger les données
def load_data():
    df_data = pd.read_csv('train.csv', parse_dates=['Date'])
    df_stores = pd.read_csv('store.csv')
    df = pd.merge(df_data, df_stores, on='Store', how='left')
    return df

df = load_data()

# Prétraitement des données
def preprocess_data(df):
    # Filtrer les magasins ouverts et les ventes supérieures à zéro
    df = df[(df['Open'] == 1) & (df['Sales'] > 0)]
    
    # Remplacer les valeurs manquantes
    df['CompetitionDistance'].fillna(df['CompetitionDistance'].median(), inplace=True)
    df.fillna(0, inplace=True)
    
    # Convertir les colonnes catégoriques
    categorical_cols = ['StoreType', 'Assortment', 'StateHoliday']
    df[categorical_cols] = df[categorical_cols].astype(str)
    
    # Ajouter des features temporelles
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['WeekOfYear'] = df['Date'].dt.isocalendar().week
    df['IsWeekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)
    
    # Créer des features de décalage (lags) pour les modèles ML
    df.sort_values(['Store', 'Date'], inplace=True)
    for lag in range(1, 8):
        df[f'Sales_Lag_{lag}'] = df.groupby('Store')['Sales'].shift(lag)
    
    # Supprimer les lignes avec des valeurs manquantes après le décalage
    df.dropna(inplace=True)
    
    return df

df = preprocess_data(df)


In [13]:
# Séparation des données
def train_test_split(df, test_size=0.2):
    # Nous allons prendre les dernières dates comme ensemble de test
    train_index = int(len(df) * (1 - test_size))
    df_train = df.iloc[:train_index]
    df_test = df.iloc[train_index:]
    return df_train, df_test

df_train, df_test = train_test_split(df)


In [14]:
# Sélection des features et de la cible
features = [
    'Store', 'DayOfWeek', 'Promo', 'Year', 'Month', 'Day', 'WeekOfYear', 'CompetitionDistance', 
    'Promo2', 'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_5', 'Sales_Lag_6', 'Sales_Lag_7',
    'StoreType', 'Assortment', 'StateHoliday', 'IsWeekend'
]
target = 'Sales'

X_train = df_train[features]
y_train = df_train[target]
X_test = df_test[features]
y_test = df_test[target]

# Préparation du préprocesseur
numerical_cols = ['Store', 'DayOfWeek', 'Promo', 'Year', 'Month', 'Day', 'WeekOfYear', 'CompetitionDistance', 
                  'Promo2', 'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_5', 'Sales_Lag_6', 'Sales_Lag_7', 'IsWeekend']
categorical_cols = ['StoreType', 'Assortment', 'StateHoliday']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


In [15]:
# Pipeline de Régression Linéaire
from sklearn.linear_model import LinearRegression

lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Entraîner le modèle
lr_pipeline.fit(X_train, y_train)

# Sauvegarder le modèle
joblib.dump(lr_pipeline, 'models/linear_regression_pipeline.joblib')


['models/linear_regression_pipeline.joblib']

In [16]:
# Pipeline de Random Forest
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1))
])

# Entraîner le modèle
rf_pipeline.fit(X_train, y_train)

# Sauvegarder le modèle
joblib.dump(rf_pipeline, 'models/random_forest_pipeline.joblib')


['models/random_forest_pipeline.joblib']

In [17]:
# Pipeline de XGBoost
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(n_estimators=100, random_state=42, n_jobs=-1))
])

# Entraîner le modèle
xgb_pipeline.fit(X_train, y_train)

# Sauvegarder le modèle
joblib.dump(xgb_pipeline, 'models/xgboost_pipeline.joblib')


['models/xgboost_pipeline.joblib']

In [18]:
# Entraîner un modèle SARIMA pour un magasin spécifique
import warnings
warnings.filterwarnings("ignore")

from statsmodels.tsa.statespace.sarimax import SARIMAX

def train_sarima(store_id):
    store_data = df[df['Store'] == store_id].sort_values('Date')
    sales_series = store_data.set_index('Date')['Sales']
    
    # Déterminer les paramètres p, d, q, P, D, Q, s (ici, nous utilisons des valeurs par défaut)
    p, d, q = 1, 1, 1
    P, D, Q, s = 1, 1, 1, 7  # s=7 pour la saisonnalité hebdomadaire
    
    model = SARIMAX(sales_series, order=(p, d, q), seasonal_order=(P, D, Q, s), enforce_stationarity=False, enforce_invertibility=False)
    sarima_model = model.fit(disp=False)
    
    # Sauvegarder le modèle
    joblib.dump(sarima_model, f'models/sarima_model_store_{store_id}.joblib')
    return sarima_model

# Entraîner pour un magasin (exemple: Store 1)
sarima_model = train_sarima(1)


In [19]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

def train_holt_winters(store_id):
    store_data = df[df['Store'] == store_id].sort_values('Date')
    sales_series = store_data.set_index('Date')['Sales']
    
    model = ExponentialSmoothing(sales_series, trend='add', seasonal='add', seasonal_periods=7)
    hw_model = model.fit()
    
    # Sauvegarder le modèle
    joblib.dump(hw_model, f'models/holt_winters_model_store_{store_id}.joblib')
    return hw_model

# Entraîner pour un magasin (exemple: Store 1)
hw_model = train_holt_winters(1)


In [20]:
def train_prophet(store_id):
    store_data = df[df['Store'] == store_id][['Date', 'Sales']].rename(columns={'Date': 'ds', 'Sales': 'y'})
    
    model = Prophet()
    model.fit(store_data)
    
    # Sauvegarder le modèle
    joblib.dump(model, f'models/prophet_model_store_{store_id}.joblib')
    return model

# Entraîner pour un magasin (exemple: Store 1)
prophet_model = train_prophet(1)


16:51:02 - cmdstanpy - INFO - Chain [1] start processing
16:51:05 - cmdstanpy - INFO - Chain [1] done processing


In [21]:
# Créer un dossier pour les artefacts
import os
os.makedirs('artefacts', exist_ok=True)

def evaluate_and_log_model(model_name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    
    # Enregistrer les métriques
    with open('artefacts/model_performance.txt', 'a') as f:
        f.write(f"{model_name} - MAE: {mae:.2f}, RMSE: {rmse:.2f}, MAPE: {mape:.2%}\n")
    
    # Enregistrer le graphe des prédictions vs vraies valeurs
    plt.figure(figsize=(10, 6))
    plt.plot(y_true.values, label='Vraies Valeurs')
    plt.plot(y_pred, label='Prédictions')
    plt.title(f'Prédictions vs Vraies Valeurs - {model_name}')
    plt.legend()
    plt.savefig(f'artefacts/{model_name}_predictions.png')
    plt.close()
    
    return {'Model': model_name, 'MAE': mae, 'RMSE': rmse, 'MAPE': mape}


In [22]:
# Prédictions
y_pred_lr = lr_pipeline.predict(X_test)

# Évaluation
lr_results = evaluate_and_log_model('Linear Regression', y_test, y_pred_lr)


In [23]:
# Prédictions
y_pred_rf = rf_pipeline.predict(X_test)

# Évaluation
rf_results = evaluate_and_log_model('Random Forest', y_test, y_pred_rf)


In [24]:
# Prédictions
y_pred_xgb = xgb_pipeline.predict(X_test)

# Évaluation
xgb_results = evaluate_and_log_model('XGBoost', y_test, y_pred_xgb)


In [30]:
# Prédictions pour le magasin 1
store_data = df[df['Store'] == 1].sort_values('Date')
sales_series = store_data.set_index('Date')['Sales']
start = sales_series.index[-1]
end = df_test[df_test['Store'] == 1]['Date'].iloc[-1]
sarima_forecast = sarima_model.predict(start=start, end=end)
y_true_sarima = df_test[df_test['Store'] == 1]['Sales']
sarima_results = evaluate_and_log_model('SARIMA (Store 1)', y_true_sarima, sarima_forecast)


IndexError: single positional indexer is out-of-bounds

In [None]:
# Prédictions pour le magasin 1
hw_forecast = hw_model.forecast(steps=len(df_test[df_test['Store'] == 1]))
y_true_hw = df_test[df_test['Store'] == 1]['Sales']
hw_results = evaluate_and_log_model('Holt-Winters (Store 1)', y_true_hw, hw_forecast)


In [None]:
# Préparation des données de test pour Prophet
future_dates = df_test[df_test['Store'] == 1]['Date'].reset_index(drop=True)
future = pd.DataFrame({'ds': future_dates})

# Prédictions
forecast = prophet_model.predict(future)
y_pred_prophet = forecast['yhat']
y_true_prophet = df_test[df_test['Store'] == 1]['Sales'].reset_index(drop=True)

# Évaluation
prophet_results = evaluate_and_log_model('Prophet (Store 1)', y_true_prophet, y_pred_prophet)


No test data available for Store 1


In [32]:
# Créer un tableau des résultats
results_df = pd.DataFrame([
    lr_results,
    rf_results,
    xgb_results,
    #sarima_results,
    #hw_results,
    #prophet_results
])

# Afficher le tableau
print(results_df)

# Enregistrer le tableau
results_df.to_csv('artefacts/model_comparison.csv', index=False)


               Model          MAE         RMSE      MAPE
0  Linear Regression  1125.635677  1578.452839  0.179172
1      Random Forest   675.440375   997.560018  0.105766
2            XGBoost   637.729809   940.666691  0.098542
