<a href="https://www.kaggle.com/code/averma111/timeseries-split?scriptVersionId=136908900" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [81]:
%reload_ext autoreload

from time import time

import numpy as np
import pandas as pd
pd.options.display.float_format = '{:,.5f}'.format
from IPython.display import display

# Sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Plotting
import matplotlib.pyplot as plt


import warnings
warnings.filterwarnings('ignore')

# Visualizations
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.io as pio

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s3e19/sample_submission.csv
/kaggle/input/playground-series-s3e19/train.csv
/kaggle/input/playground-series-s3e19/test.csv


In [82]:
class DataAcquisition:
    
    def __init__(self):
        self.X_train=None
        self.X_val=None
        self.y_train=None
        self.y_val=None
    
    def prepare_data(self):
        path = '/kaggle/input/playground-series-s3e19/train.csv'
        df = pd.read_csv(
            path, 
            sep=',', 
            parse_dates=['date'], 
            infer_datetime_format=True, 
            low_memory=False
        )
        
        X = df.copy()
        return X
    
    def prepare_data_test(self):
        path = '/kaggle/input/playground-series-s3e19/test.csv'
        df = pd.read_csv(
            path, 
            sep=',', 
            parse_dates=['date'], 
            infer_datetime_format=True, 
            low_memory=False
        )
        
        X = df.copy()
        return X
    
    
    def new_features(self,df):
        df['month'] = df['date'].dt.month
        df['day'] = df['date'].dt.day
        df['year'] = df['date'].dt.year
        df['dayofweek'] = df['date'].dt.dayofweek
        df['quarter'] = df['date'].dt.quarter
        df['dayofmonth'] = df['date'].dt.day
        df['weekofyear'] = df['date'].dt.weekofyear
        df['month_sin'] = np.sin(2*np.pi*df.month/12)
        df['month_cos'] = np.cos(2*np.pi*df.month/12)
        df['day_sin'] = np.sin(2*np.pi*df.day/24)
        df['day_cos'] = np.cos(2*np.pi*df.day/24)
        return df
    
    
    
    def preprocessing(self,df):
        X = self.new_features(df)
        X = X.loc[:, X.columns!='id']
        y = X['num_sold']
        X.drop(columns=['date','num_sold'],axis=1,inplace=True)
        return X,y
        
    def preprocessing_test(self,df):
        X = self.new_features(df)
        X = X.loc[:, X.columns!='id']
        X.drop(columns=['date'],axis=1,inplace=True)
        return  X
    
        
acq = DataAcquisition()

train_df = acq.prepare_data()
test_df = acq.prepare_data_test()

In [83]:
train_df.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49


In [84]:
train_df['product'].value_counts()

Using LLMs to Improve Your Coding                 27390
Using LLMs to Train More LLMs                     27390
Using LLMs to Win Friends and Influence People    27390
Using LLMs to Win More Kaggle Competitions        27390
Using LLMs to Write Better                        27390
Name: product, dtype: int64

In [85]:
train_df.head()
X,y=acq.preprocessing(train_df)

In [86]:
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(
    n_splits=5,
    gap=48,
    max_train_size=10000,
    test_size=1000,
)

In [87]:
all_splits = list(ts_cv.split(X, y))
train_0, test_0 = all_splits[0]

In [102]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor,RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeRegressor

categorical_columns = [
    "country",
    "store",
    "product",
]
categories = [
    ["Argentina", "Canada", "Estonia","Japan","Spain"],
    ["Kaggle Learn", "Kaggle Store", "Kagglazon"],
    ["Using LLMs to Improve Your Coding", "Using LLMs to Train More LLMs",
    "Using LLMs to Win Friends and Influence People","Using LLMs to Win More Kaggle Competitions",
    "Using LLMs to Write Better"]
]
ordinal_encoder = OrdinalEncoder(categories=categories)


gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    HistGradientBoostingRegressor(
        categorical_features=categorical_columns,
        random_state=42,
    ),
).set_output(transform="pandas")

In [89]:
decisiontreee_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    RandomForestRegressor(
        random_state=42,
    ),
).set_output(transform="pandas")

In [90]:
random_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    DecisionTreeRegressor(
        random_state=42,
    ),
).set_output(transform="pandas")

In [100]:
gb_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    GradientBoostingRegressor(
        random_state=42,
    ),
).set_output(transform="pandas")

In [105]:
ada_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
        ],
        remainder="passthrough",
        verbose_feature_names_out=False,
    ),
    AdaBoostRegressor(
         n_estimators=100, learning_rate=0.001,
        random_state=42,
    ),
).set_output(transform="pandas")

In [106]:
def evaluate(model, X, y, cv):
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )

In [107]:
evaluate(gbrt_pipeline, X, y, cv=ts_cv)

Mean Absolute Error:     15.627 +/- 10.269
Root Mean Squared Error: 31.774 +/- 24.508


In [108]:
evaluate(random_pipeline, X, y, cv=ts_cv)

Mean Absolute Error:     19.015 +/- 9.734
Root Mean Squared Error: 38.445 +/- 22.360


In [109]:
evaluate(decisiontreee_pipeline, X, y, cv=ts_cv)

Mean Absolute Error:     16.537 +/- 10.265
Root Mean Squared Error: 33.601 +/- 24.099


In [110]:
evaluate(gb_pipeline, X, y, cv=ts_cv)

Mean Absolute Error:     29.734 +/- 8.984
Root Mean Squared Error: 54.630 +/- 20.502


In [111]:
evaluate(ada_pipeline, X, y, cv=ts_cv)

Mean Absolute Error:     70.867 +/- 9.986
Root Mean Squared Error: 113.850 +/- 19.580


In [None]:
gb_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
gb_pipeline_prediction = gbrt_pipeline.predict(X.iloc[test_0])


decisiontreee_pipeline.fit(X.iloc[train_0], y.iloc[train_0])
decisiontreee_pipeline_predictions = decisiontreee_pipeline.predict(X.iloc[test_0])

In [None]:
last_hours = slice(-96, None)
fig, ax = plt.subplots(figsize=(12, 4))
fig.suptitle("Predictions by linear models")
ax.plot(
    y.iloc[test_0].values[last_hours],
    "x-",
    alpha=0.2,
    label="Actual demand",
    color="black",
)
ax.plot(
    gbrt_pipeline_prediction[last_hours], 
    "x-", 
    label="HistGradientBoostingRegressor"
)


_ = ax.legend()

In [None]:
X_test=acq.preprocessing_test(test_df)

In [None]:
gbrt_pipeline_predictions = gbrt_pipeline.predict(X_test)

In [None]:
class Submit:
    
    def submit_predictions(self):        
        df_submit = pd.DataFrame(data={'id': test_df['id'],'num_sold':gbrt_pipeline_predictions})
        df_submit.to_csv('submission.csv',index=False)
        print('Submission Completed!!')
        return df_submit
        
        
submit = Submit()
df_submit=submit.submit_predictions()

In [None]:
df_submit

In [None]:
test_df['num_sold']=gbrt_pipeline_predictions

In [None]:
plt = px.line(test_df, x="date", y="num_sold",height=500,title='Test Data Prediction')
plt.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1

))
plt.show()