In [None]:
import pandas as pd
import numpy as np

# Load the three CSV inputs, in this order: training waiting times, weather
# observations, and the waiting times to predict.
# Note: the third variable keeps its `_val` name although the file it reads
# is 'waiting_times_X_test_final.csv'.
waiting_times_train, weather_data, waiting_times_X_test_val = (
    pd.read_csv(csv_name)
    for csv_name in (
        'waiting_times_train.csv',
        'weather_data.csv',
        'waiting_times_X_test_final.csv',
    )
)

In [None]:
# Feature engineering: derive calendar / time-of-day columns from DATETIME
def add_time_features(df):
    """Add calendar and time-of-day feature columns derived from ``df['DATETIME']``.

    The frame is modified **in place** and is also returned, so both
    ``add_time_features(df)`` and ``df = add_time_features(df)`` work.
    Calling the function again on an already processed frame recomputes
    the same columns.

    Columns written (in this order):
        DATETIME       -- the input column parsed with ``pd.to_datetime``
        DAY_OF_WEEK    -- ``dt.dayofweek`` (pandas convention: 0 = Monday ... 6 = Sunday)
        MONTH          -- calendar month, 1-12
        HOUR           -- hour of day, 0-23
        IS_WEEKEND     -- 1 when DAY_OF_WEEK >= 5, else 0
        WEEK_OF_YEAR   -- ISO week number
        SEASON         -- 0 for months 3-5, 1 for 6-8, 2 for 9-11, 3 otherwise
        DAY_OF_YEAR    -- ``dt.dayofyear``
        QUARTER_HOUR   -- index of the 15-minute slot in the day: hour * 4 + minute // 15
        PERIOD_OF_DAY  -- 0 for hours [8, 11), 1 for [11, 14), 2 for [14, 18), 3 otherwise
        HOUR_SIN       -- sin(2 * pi * hour / 24)
        WEEKEND_NUMBER -- ISO week number on weekend rows, 0 on every other row

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``DATETIME`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the columns above added
        or overwritten.
    """
    df['DATETIME'] = pd.to_datetime(df['DATETIME'])
    stamps = df['DATETIME'].dt

    df['DAY_OF_WEEK'] = stamps.dayofweek
    df['MONTH'] = stamps.month
    df['HOUR'] = stamps.hour
    # Vectorised comparison instead of a per-row lambda; a missing day of week
    # compares False and therefore still maps to 0.
    df['IS_WEEKEND'] = (df['DAY_OF_WEEK'] >= 5).astype(int)

    # isocalendar() returns the nullable UInt32 dtype. Compute it once and, when
    # nothing is missing, store a plain int64 column so downstream consumers
    # receive an ordinary NumPy integer dtype.
    # The ISO week of a date close to New Year may belong to the adjacent ISO
    # year (e.g. 2022-01-01 is ISO week 52).
    iso_week = stamps.isocalendar().week
    df['WEEK_OF_YEAR'] = iso_week if iso_week.isna().any() else iso_week.astype('int64')

    # Season code by month; anything not listed (December-February, or a
    # missing month) falls through to the default code 3.
    month = df['MONTH']
    df['SEASON'] = np.select(
        [
            month.isin([3, 4, 5]).to_numpy(),
            month.isin([6, 7, 8]).to_numpy(),
            month.isin([9, 10, 11]).to_numpy(),
        ],
        [0, 1, 2],
        default=3,
    )

    df['DAY_OF_YEAR'] = stamps.dayofyear
    df['QUARTER_HOUR'] = stamps.hour * 4 + stamps.minute // 15

    # Period-of-day code; hours outside [8, 18) (or missing) get the default 3.
    hour = df['HOUR']
    df['PERIOD_OF_DAY'] = np.select(
        [
            ((hour >= 8) & (hour < 11)).to_numpy(),
            ((hour >= 11) & (hour < 14)).to_numpy(),
            ((hour >= 14) & (hour < 18)).to_numpy(),
        ],
        [0, 1, 2],
        default=3,
    )

    # Only the sine component of the cyclic hour encoding is produced here.
    df['HOUR_SIN'] = np.sin(2 * np.pi * hour / 24)

    # ISO week number on weekend rows, 0 elsewhere. Rows with a missing
    # timestamp are never flagged as weekend, so no missing value survives the
    # `where` and the cast to int64 is safe.
    df['WEEKEND_NUMBER'] = iso_week.where(df['IS_WEEKEND'] == 1, 0).astype('int64')

    return df

# Derive the time features on every frame that takes part in the merge.
# weather_data goes through the same function, which also parses its DATETIME
# column so the join key is a datetime on both sides.
waiting_times_train = add_time_features(waiting_times_train)
waiting_times_X_test_val = add_time_features(waiting_times_X_test_val)
weather_data = add_time_features(weather_data)


def _attach_weather(waiting_times, weather):
    """Left-join ``weather`` onto ``waiting_times`` by DATETIME.

    When a non-key column exists in both frames, the weather copy receives the
    '_drop' suffix and is removed afterwards, so the waiting-time version wins.
    """
    merged = pd.merge(waiting_times, weather, on='DATETIME', how='left', suffixes=('', '_drop'))
    kept_columns = [name for name in merged.columns if not name.endswith('_drop')]
    return merged[kept_columns]


train_data = _attach_weather(waiting_times_train, weather_data)
test_data = _attach_weather(waiting_times_X_test_val, weather_data)

# Single source of truth for the model inputs, so the train and test matrices
# always share the same columns in the same order.
FEATURE_COLUMNS = [
    'ADJUST_CAPACITY', 'HOUR_SIN', 'PERIOD_OF_DAY', 'QUARTER_HOUR',
    'DAY_OF_YEAR', 'WEEK_OF_YEAR', 'WEEKEND_NUMBER', 'SEASON', 'DOWNTIME',
    'TIME_TO_PARADE_1', 'TIME_TO_PARADE_2', 'CURRENT_WAIT_TIME',
    'TIME_TO_NIGHT_SHOW', 'temp', 'dew_point', 'feels_like', 'pressure',
    'humidity', 'wind_speed', 'rain_1h', 'snow_1h', 'clouds_all',
    'DAY_OF_WEEK', 'MONTH', 'IS_WEEKEND', 'HOUR',
]

# Missing values (including weather columns of rows with no matching weather
# record) are replaced by 0.
X_train = train_data[FEATURE_COLUMNS].fillna(0)
y_train = train_data['WAIT_TIME_IN_2H']

X_test = test_data[FEATURE_COLUMNS].fillna(0)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# Gradient-boosted regressor; every hyper-parameter value below is the one the
# author marked as optimal.
xgb_regressor = XGBRegressor(
    n_estimators=1000,
    random_state=30,
    learning_rate=0.025,
    max_depth=8,
    subsample=0.95,
    colsample_bytree=0.56,
    gamma=1,
    reg_alpha=200,
    reg_lambda=10,
    min_child_weight=150,
)

# Pipeline stages: impute -> scale to [0, 1] -> regress.
# NOTE(review): X_train / X_test are built with .fillna(0) earlier in this
# notebook, so the imputer presumably has nothing left to fill -- confirm.
steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('scaler', MinMaxScaler()),
    ('model', xgb_regressor),
]
pipeline = Pipeline(steps)

# Train on the training matrix, then predict the test matrix.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Submission table: identifying columns taken from test_data, followed by the
# prediction and the submission key.
# Use 'Validation' as KEY for the validation test; change it as needed for the
# final test.
# NOTE(review): the key is hard-coded here; consider loading it from
# configuration if it is sensitive.
predictions_df = test_data[['DATETIME', 'ENTITY_DESCRIPTION_SHORT']].assign(
    y_pred=y_pred,
    KEY='c57d53a31f68e864e929524b80c3dfe31190a5e431187fa12f',
)

# Fix the column order explicitly before writing the file.
predictions_df = predictions_df[['DATETIME', 'ENTITY_DESCRIPTION_SHORT', 'y_pred', 'KEY']]
predictions_df.to_csv('predictions_final.csv', index=False)