In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import missingno as msno

import seaborn as sns

from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer

from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor



from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# sns.set(style='whitegrid')

# Apply seaborn's default theme to the figures drawn in the rest of the notebook
sns.set_theme()

# Importing training data

In [None]:
# Read the competition training file and discard its 'id' column in one step
df = pd.read_csv('/kaggle/input/playground-series-s3e19/train.csv').drop(columns='id')
df

In [None]:
# Convert the 'date' column to pandas datetimes so date components can be extracted later
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Summary statistics for every column.
# `datetime_is_numeric` is intentionally not passed: the keyword was removed in
# pandas 2.0 (datetime columns are always summarised numerically there), so
# passing it raises a TypeError on current pandas versions.
df.describe(include='all')

In [None]:
# Print the frame's concise summary (columns, dtypes, non-null counts)
df.info()

# Visualizing missing values

In [None]:
# Draw missingno's matrix view of the training frame
msno.matrix(df)

In [None]:
df.isna().mean() * 100  # share of missing values per column, expressed in percent

# Feature engineering

In [None]:
# Split the sale date into calendar components with the vectorised `.dt`
# accessor rather than a per-row Python lambda.
df['saleYear'] = df['date'].dt.year
df['saleMonth'] = df['date'].dt.month
df['saleDay'] = df['date'].dt.day

# Put the date-derived columns first. The original 'date' column is kept for
# the time-series plots below and is dropped just before encoding.
df = df[['date', 'saleYear', 'saleMonth', 'saleDay', 'country', 'store', 'product', 'num_sold']]

df

# Exploratory Data Analysis

In [None]:

# Two stacked-bar panels of total units sold per country:
# the first stacked by store, the second stacked by product.
fig, ax = plt.subplots(2, 1, figsize=(15, 15))
fig.set_figwidth(12)  # Adjust the figure width to a reasonable value

panels = [
    (ax[0], 'store', 'Total Sales by Country and Store', 'Stores'),
    (ax[1], 'product', 'Total Sales by Country and Products', 'Products'),
]

for panel, split_by, heading, legend_title in panels:
    panel.title.set_text(heading)
    # Total sales for every (country, store/product) combination
    sales_by_country_store = df.groupby(['country', split_by])['num_sold'].sum().reset_index()
    # One row per country, one column per store/product
    sales_pivot = sales_by_country_store.pivot_table(index='country', columns=split_by, values='num_sold', fill_value=0)
    sales_plot = sales_pivot.plot(kind='bar', stacked=True, ax=panel)
    panel.ticklabel_format(style='plain', axis='y')
    # Legend sits outside the axes, anchored to their upper-right corner
    panel.legend(title=legend_title, loc='upper left', bbox_to_anchor=(1, 1))

plt.subplots_adjust(hspace=0.5)
plt.show()

<span style='color: red'><b>Question 1: Most successful store?</b></span>

In [None]:

# Question 1 figure: total units per store (bars) and per-store sales over time (lines).
fig, ax = plt.subplots(2, 1, figsize=(15, 12))
fig.set_figwidth(12)  # Adjust the figure width to a reasonable value

ax[0].title.set_text('Total sales of each stores')
sns.barplot(
    data=df[['store', 'num_sold']].groupby('store').sum().reset_index(),
    x="store",
    y="num_sold",
    ax=ax[0]
)
ax[0].bar_label(ax[0].containers[0], fmt='%.1f')
ax[0].set_ylabel("Number of products sold")
# The x axis lists stores, not products (the label was copied from the product chart)
ax[0].set_xlabel("Stores")
ax[0].tick_params(axis='x', rotation=20)
ax[0].ticklabel_format(style='plain', axis='y')


ax[1].title.set_text('Number of items sold by each store')
sns.lineplot(x="date", y="num_sold", hue="store", data=df, ax=ax[1], marker='o', linewidth=1, errorbar=None)
# The hue variable is the store, so the legend is titled accordingly
ax[1].legend(title='Stores', loc='upper left', bbox_to_anchor=(1, 1))
ax[1].set_ylabel("Number of products sold")
ax[1].set_xlabel("Date of sale")


plt.subplots_adjust(hspace=0.5)
# plt.xticks acts on the current axes, i.e. the bottom (date) panel
plt.xticks(rotation=90)
plt.show()

<span style='color: red'><b>Question 2: Most successful product?</b></span>

In [None]:

# Question 2 figure.
# Top panel: units sold per product over the whole data set.
# Bottom panel: one line per product along the date axis (seaborn aggregates
# the rows that share a date; no error band is drawn).
fig, ax = plt.subplots(2, 1, figsize=(15, 18))
fig.set_figwidth(12)  # Adjust the figure width to a reasonable value

product_totals = df[['product', 'num_sold']].groupby('product').sum().reset_index()

ax[0].title.set_text('Total sales of each product')
sns.barplot(data=product_totals, x="product", y="num_sold", ax=ax[0])
ax[0].bar_label(ax[0].containers[0], fmt='%.1f')
ax[0].set_xlabel("Products")
ax[0].set_ylabel("Number of products sold")
ax[0].ticklabel_format(style='plain', axis='y')
ax[0].tick_params(axis='x', rotation=90)

ax[1].title.set_text('Total product sales by date')
sns.lineplot(data=df, x="date", y="num_sold", hue="product", linewidth=1, errorbar=None, ax=ax[1])
# Legend is placed outside the axes, to the right
ax[1].legend(title='Products', loc='upper left', bbox_to_anchor=(1, 1))
ax[1].set_xlabel("Date of sale")
ax[1].set_ylabel("Number of products sold")

plt.subplots_adjust(hspace=1.3)
plt.xticks(rotation=90)
plt.show()

<span style='color: red'><b>Question 3: Which product was most sold in 2020?</b></span>

In [None]:

# Question 3 figure: product sales restricted to the year 2020.
fig, ax = plt.subplots(2, 1, figsize=(15, 18))
fig.set_figwidth(12)  # Adjust the figure width to a reasonable value

# Select the 2020 rows once, by comparing the year directly instead of
# converting every date to a string and searching it for '2020'.
sales_2020 = df[df['date'].dt.year == 2020]

ax[0].title.set_text('Number of each product sold in 2020')
sns.barplot(
    data=sales_2020[['product', 'num_sold']].groupby('product').sum().reset_index(),
    x="product",
    y="num_sold",
    ax=ax[0]
)
ax[0].set_ylabel("Number of products sold")
ax[0].set_xlabel("Products")
ax[0].tick_params(axis='x', rotation=90)
ax[0].ticklabel_format(style='plain', axis='y')


ax[1].title.set_text('Number of each product sold in 2020 by date')
sns.lineplot(
    data=sales_2020[['date', 'product', 'num_sold']],
    x="date", y="num_sold", hue="product", ax=ax[1], linewidth=1, errorbar=None
)
# Add a legend
ax[1].legend(title='Products', loc='upper left', bbox_to_anchor=(1, 1))
# These labels belong to the bottom (date) panel; they were previously applied
# to ax[0], which overwrote the bar chart's "Products" x label.
ax[1].set_ylabel("Number of products sold")
ax[1].set_xlabel("Date of sale")


plt.subplots_adjust(hspace=1.3)
plt.xticks(rotation=90)
plt.show()

<span style='color: red'><b>Question 4: Total products sold by each store in 2017?</b></span>

In [None]:

# Question 4 figure: store sales restricted to the year 2017.
fig, ax = plt.subplots(2, 1, figsize=(15, 13))
fig.set_figwidth(12)  # Adjust the figure width to a reasonable value

# Select the 2017 rows once, by comparing the year directly instead of
# converting every date to a string and searching it for '2017'.
sales_2017 = df[df['date'].dt.year == 2017]

ax[0].title.set_text('Number of each store sales in 2017')
sns.barplot(
    data=sales_2017[['store', 'num_sold']].groupby('store').sum().reset_index(),
    x="store",
    y="num_sold",
    ax=ax[0]
)
ax[0].set_ylabel("Number of products sold")
ax[0].set_xlabel("Stores")
ax[0].tick_params(axis='x', rotation=90)
ax[0].ticklabel_format(style='plain', axis='y')


ax[1].title.set_text('Number of product sold by each store in 2017 by date')
sns.lineplot(
    data=sales_2017[['date', 'store', 'num_sold']],
    x="date", y="num_sold", hue="store", ax=ax[1], linewidth=1, errorbar=None
)
# Add a legend
ax[1].legend(title='Stores', loc='upper left', bbox_to_anchor=(1, 1))
# These labels belong to the bottom (date) panel; they were previously applied
# to ax[0], which overwrote the bar chart's "Stores" x label.
ax[1].set_ylabel("Number of products sold")
ax[1].set_xlabel("Date of sale")


plt.subplots_adjust(hspace=0.5)

plt.xticks(rotation=90)
plt.show()

<span style='color: red'><b>Further Analysis</b></span>

In [None]:
# Top row: distribution of each date-derived feature.
# Bottom row: the same feature plotted against the target.
fig, ax = plt.subplots(2, 3, figsize=(15, 10))

for i, column in enumerate(['saleYear', 'saleMonth', 'saleDay']):
    ax[0, i].title.set_text(f'Distribution of {column}')
    # Plot the loop's column; the histogram used to be hard-coded to
    # "saleMonth", so all three panels showed the same distribution.
    sns.histplot(data=df, x=column, ax=ax[0, i])

    ax[1, i].title.set_text(f'Scatterplot - {column} vs num_sold')
    sns.scatterplot(data=df, x=column, y="num_sold", ax=ax[1, i])

plt.tight_layout()
plt.show()

# Label encoding

In [None]:
# Drop the original date column; saleYear/saleMonth/saleDay were derived from it earlier
df = df.drop('date', axis=1)

In [None]:
df

In [None]:
# One-hot encode the three categorical columns
df_dummies = pd.get_dummies(df, columns=['country', 'store', 'product'])

# Column names built from category values may contain spaces; swap them for underscores
df_dummies.columns = [name.replace(' ', '_') for name in df_dummies.columns]

In [None]:
# Target vector: the 'num_sold' column of the encoded frame
y = df_dummies['num_sold']
y

In [None]:
# Feature matrix: every encoded column except the target
X = df_dummies.drop(columns='num_sold')
X

<span style='color: red'><b>'y'</b> is the dependent (target) variable, and <b>'X'</b> holds the independent features.</span>

In [None]:
# List the feature columns, one per line, in the order they appear in X
print('Independent features in order:\n')

for col in X.columns:
    print(col)

# Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

# Creating base models & selecting the best

**The competition will evaluate submissions based on SMAPE. So, I'll use the same for evaluating my models.**

In [None]:
def calculate_smape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE) in percent.

    SMAPE is the mean of ``|predicted - actual| / ((|predicted| + |actual|) / 2)``
    multiplied by 100, rounded to two decimals.

    Args:
        actual: Array-like of true values (list, Series, ndarray, ...).
        predicted: Array-like of predictions, broadcastable against ``actual``.

    Returns:
        The SMAPE as a percentage in the range [0, 200].
    """
    # Accept any array-like input. The previous conversion
    # (`actual, predicted = np.array(actual),` followed by a stray expression)
    # built a one-element tuple and raised ValueError for non-ndarray inputs.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)

    numerator = np.abs(predicted - actual)
    denominator = (np.abs(predicted) + np.abs(actual)) / 2

    # When both values are 0 the term is 0/0; count it as a perfect prediction
    # (0) instead of letting NaN poison the mean.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )

    return round(np.mean(ratio) * 100, 2)

In [None]:
'''Linear Regression'''

LinearRegression_pipeline = Pipeline([
    # Seeded so the transformer's random subsampling of rows is repeatable
    ('quantile_transformer', QuantileTransformer(random_state=42)),
    ('linear_regression', LinearRegression())
])

# training the pipeline on the training set
LinearRegression_pipeline.fit(X_train, np.array(y_train))

# predicting on the testing set
y_pred = LinearRegression_pipeline.predict(X_test)

# evaluating the performance
smape = calculate_smape(np.array(y_test), y_pred)

# Metric label corrected from the misspelled "SMPAE"
print("SMAPE:", smape, '%')

In [None]:
# '''XGBRegressor'''

# # specifing the parameters for XGBoost
# params = {
#     'objective': 'reg:squarederror',  # specify the objective function
#     'eval_metric': 'mae',  # specify the evaluation metric
#     'tree_method': 'gpu_hist',  # use GPU to build trees
#     'gpu_id': 0  # specify the GPU device to use
# }

# # creating an XGBoost regressor
# xgb_model = xgb.XGBRegressor(**params)

# # fitting the model on the training data
# xgb_model.fit(X_train, np.array(y_train))

# # predicting on the test data
# y_pred = xgb_model.predict(X_test)

# # evaluating the model
# smape = calculate_smape(np.array(y_test), y_pred)

# print("SMPAE:",smape, '%')

In [None]:
'''Decision Tree'''

# Creating a decision tree regressor (seeded, so the fitted tree is repeatable)
tree = DecisionTreeRegressor(random_state=42)

# Fitting the model on the training data
tree.fit(X_train, np.array(y_train))

# Making predictions on the test data
y_pred = tree.predict(X_test)

# Evaluating the model
smape = calculate_smape(np.array(y_test), y_pred)

# Metric label corrected from the misspelled "SMPAE"
print("SMAPE:", smape, '%')

In [None]:
'''Random Forest'''

# Both steps are stochastic (row subsampling in the transformer, bootstrapping
# and feature sampling in the forest); seeding them makes the score repeatable.
RandomForestRegressor_pipeline = make_pipeline(
    QuantileTransformer(random_state=42),
    RandomForestRegressor(random_state=42)
)

# Fit the pipeline on the training data
RandomForestRegressor_pipeline.fit(X_train, np.array(y_train))

# Predict on the testing data
y_pred = RandomForestRegressor_pipeline.predict(X_test)

# Evaluate the model
smape = calculate_smape(np.array(y_test), y_pred)

# Metric label corrected from the misspelled "SMPAE"
print("SMAPE:", smape, '%')

In [None]:
# '''Support Vector Regression (SVR)'''

# SVR_pipeline = Pipeline([
#     ('quantile_transformer', QuantileTransformer()),
#     ('regressor', SVR())
# ])

# # Fit the pipeline on the training data
# SVR_pipeline.fit(X_train, np.array(y_train))

# # Predict on the testing data
# y_pred = SVR_pipeline.predict(X_test)

# # Evaluate the model
# smape = calculate_smape(np.array(y_test), y_pred)

# print("SMPAE:",smape, '%')

In [None]:
'''Neural network'''

# Seed the transformer's subsampling and the network's weight initialisation /
# batch shuffling so the reported score is repeatable.
NN_pipeline = Pipeline([
    ('transformer', QuantileTransformer(random_state=42)),
    ('estimator', MLPRegressor(learning_rate_init=0.06, random_state=42))
])

# Fitting the pipeline on the training data
NN_pipeline.fit(X_train, np.array(y_train))

# Predict on the testing data
y_pred = NN_pipeline.predict(X_test)

# Evaluate the model
smape = calculate_smape(np.array(y_test), y_pred)

# Metric label corrected from the misspelled "SMPAE"
print("SMAPE:", smape, '%')

# RandomForestRegressor - Hyperparameter Tuning

## Grid Search

In [None]:
# pipeline = Pipeline([
#     ('quantile_transformer', QuantileTransformer()),
#     ('random_forest', RandomForestRegressor())
# ])

# param_grid = {
#     'quantile_transformer__n_quantiles': [1000, 2000],
#     'quantile_transformer__output_distribution': ['uniform', 'normal'],
#     'random_forest__n_estimators': [25, 50, 100, 150],
#     'random_forest__max_features': ['sqrt', 'log2', None],
#     'random_forest__max_depth': [3, 6, 9],
#     'random_forest__max_leaf_nodes': [3, 6, 9],
# }

# grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=3, verbose=1)  # GridSearchCV has no n_iter argument (that belongs to RandomizedSearchCV)
# grid_search.fit(X_train, np.array(y_train))

In [None]:
# print(grid_search.best_estimator_)

## Random Search

In [None]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 80, stop = 150, num = 10)]
# # Number of features to consider at every split
# max_features = [1.0, 'sqrt']
# # Maximum number of levels in tree
# max_depth = [None]
# max_depth = max_depth + [int(x) for x in np.linspace(5, 50, num = 10)]


# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [1, 2, 5, 7]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 3, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]


# pipeline = Pipeline([
#     ('quantile_transformer', QuantileTransformer()),
#     ('random_forest', RandomForestRegressor())
# ])

# param_distributions = {
# #     'quantile_transformer__n_quantiles': [500, 1000, 1500, 2000],
# #     'quantile_transformer__output_distribution': ['uniform', 'normal'],
#     'random_forest__n_estimators': n_estimators,  # Set n_estimators for RandomForestRegressor
#     'random_forest__max_features': max_features,
#     'random_forest__max_depth': max_depth,
#     'random_forest__min_samples_leaf': min_samples_leaf,
#     'random_forest__min_samples_split': min_samples_split,
#     'random_forest__bootstrap': bootstrap,
# }


# rf_random = RandomizedSearchCV(estimator=pipeline, param_distributions=param_distributions, n_iter=100, cv=10, verbose=0, random_state=42, n_jobs=-1, scoring=make_scorer(calculate_smape, greater_is_better=False))
# rf_random.fit(X_train, np.array(y_train))

In [None]:
# rf_random.best_params_

In [None]:
# # Evaluate the model
# smape = calculate_smape(
#     np.array(y_test), 
#     rf_random.predict(X_test)
# )

# print("SMPAE:",smape, '%')

In [None]:
# # Evaluate the model
# print("SMPAE:",
#       calculate_smape(
#             np.array(y_train), 
#             rf_random.predict(X_train)
#         ), 
#       '%')

# Neural network - Hyperparameter tuning

In [None]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 80, stop = 150, num = 10)]
# # Number of features to consider at every split
# max_features = [1.0, 'sqrt']
# # Maximum number of levels in tree
# max_depth = [None]
# max_depth = max_depth + [int(x) for x in np.linspace(5, 50, num = 10)]

# # Minimum number of samples required to split a node
# min_samples_split = [1, 2, 5, 7]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 3, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]


# pipeline = Pipeline([
#     ('quantile_transformer', QuantileTransformer()),
#     ('min_max_scaler', MinMaxScaler()),
#     ('random_forest', RandomForestRegressor())
# ])

# param_distributions = {
#     'quantile_transformer__n_quantiles': [500, 1000, 1500, 2000],
#     'quantile_transformer__output_distribution': ['uniform', 'normal'],
#     'quantile_transformer__subsample': [5000, 10000, 20000],
#     'random_forest__n_estimators': n_estimators,  # Set n_estimators for RandomForestRegressor
#     'random_forest__max_features': max_features,
#     'random_forest__max_depth': max_depth,
#     'random_forest__min_samples_leaf': min_samples_leaf,
#     'random_forest__min_samples_split': min_samples_split,
#     'random_forest__bootstrap': bootstrap,
# }


# rf_random = RandomizedSearchCV(estimator=pipeline, param_distributions=param_distributions, n_iter=100, cv=10, verbose=0, random_state=42, n_jobs=-1, scoring=make_scorer(calculate_smape, greater_is_better=False))
# rf_random.fit(X_train, np.array(y_train))

In [None]:
# import tensorflow as tf
# from tensorflow import keras
# import keras_tuner as kt
# import numpy as np

# import tensorflow.keras.backend as K

# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.optimizers import Adam


# def smape(y_true, y_pred):
#     return 200 * K.mean(K.abs(y_pred - y_true) / (K.abs(y_pred) + K.abs(y_true) + K.epsilon()))


# def model_builder(hp):
#     model = keras.Sequential()

#     model.add(Dense(hp.Int('units_1', min_value=16, max_value=1024, step=16), 
#                     activation='relu', 
#                     kernel_regularizer=keras.regularizers.l2(hp.Choice('l2_reg_1', values=[0.01, 0.1, 0.2])),
#                     input_dim=16
#     ))
    
#     model.add(Dense(hp.Int('units_2', min_value=32, max_value=1024, step=32), 
#                     activation='relu'
#     ))
    
#     model.add(Dropout(hp.Float('dropout_1', min_value=0.0, max_value=0.5, step=0.1)))
    
#     model.add(Dense(hp.Int('units_3', min_value=8, max_value=512, step=8), 
#                     activation='relu'
#     ))
    
#     model.add(Dropout(hp.Float('dropout_2', min_value=0.0, max_value=0.5, step=0.1)))
    
#     model.add(Dense(hp.Int('units_4', min_value=8, max_value=1024, step=8), 
#                     activation='relu'
#     ))
    
#     model.add(Dense(hp.Int('units_5', min_value=16, max_value=512, step=16), 
#                     activation='relu', 
#                     kernel_regularizer=keras.regularizers.l2(hp.Choice('l2_reg_1', values=[0.01, 0.1, 0.2])),
#     ))
    
#     model.add(Dense(hp.Int('units_6', min_value=16, max_value=1024, step=16), 
#                     activation='relu', 
#                     kernel_regularizer=keras.regularizers.l2(hp.Choice('l2_reg_1', values=[0.01, 0.1, 0.2])),
#     ))
    
#     model.add(keras.layers.Dense(1, activation='linear'))

#     # Tune the learning rate for the optimizer
#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
#         loss=tf.keras.losses.MeanAbsolutePercentageError(),
#         metrics=[tf.keras.metrics.MeanAbsoluteError()]
#     )

#     return model


# # Apply QuantileTransformer to preprocess the data
# quantile_transformer = QuantileTransformer()
# X_train_preprocessed = quantile_transformer.fit_transform(X_train)
# X_test_preprocessed = quantile_transformer.transform(X_test)

# # Apply MinMaxScaler to preprocess the data
# min_max_scaler = MinMaxScaler()
# X_train_preprocessed = min_max_scaler.fit_transform(X_train_preprocessed)
# X_test_preprocessed = min_max_scaler.transform(X_test_preprocessed)

# tuner = kt.Hyperband(
#     model_builder,
#     objective=kt.Objective("val_loss", direction="min"),
#     max_epochs=30,
#     directory='keras_tuner_dir',
#     project_name='dnn'
# )

# tuner.search(X_train_preprocessed, y_train, epochs=40, batch_size=1024, validation_data=(X_test_preprocessed, y_test))

In [None]:
# # Get the optimal hyperparameters
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

# best_hps.values

In [None]:
'''previous best'''

# {'units_1': 864,
#  'l2_reg_1': 0.01,
#  'units_2': 864,
#  'dropout_1': 0.1,
#  'units_3': 392,
#  'dropout_2': 0.1,
#  'units_4': 520,
#  'units_5': 368,
#  'units_6': 928,
#  'learning_rate': 0.001,
#  'tuner/epochs': 30,
#  'tuner/initial_epoch': 10,
#  'tuner/bracket': 3,
#  'tuner/round': 3,
#  'tuner/trial_id': '0049'}

**I'll rebuild the model myself and try different learning rates**

In [None]:
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt
import numpy as np

import tensorflow.keras.backend as K

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed TensorFlow so weight initialisation and dropout are more repeatable between runs
tf.random.set_seed(42)

# Apply QuantileTransformer to preprocess the data.
# Fitted on the training split only; seeded so its row subsampling is repeatable.
quantile_transformer = QuantileTransformer(random_state=42)
X_train_preprocessed = quantile_transformer.fit_transform(X_train)
X_test_preprocessed = quantile_transformer.transform(X_test)

# Apply MinMaxScaler to preprocess the data (again fitted on the training split only)
min_max_scaler = MinMaxScaler()
X_train_preprocessed = min_max_scaler.fit_transform(X_train_preprocessed)
X_test_preprocessed = min_max_scaler.transform(X_test_preprocessed)

# Take the input width from the data instead of hard-coding 16, so the network
# still builds if the set of encoded feature columns changes.
n_features = X_train_preprocessed.shape[1]

# Layer widths, dropout rates and L2 strength follow the "previous best"
# tuner result recorded above; only the learning rate is varied by hand.
model = keras.Sequential()

model.add(Dense(864,
                activation='relu',
                kernel_regularizer=keras.regularizers.l2(0.01),
                input_dim=n_features
))

model.add(Dense(864, activation='relu'))

model.add(Dropout(0.1))

model.add(Dense(392, activation='relu'))

model.add(Dropout(0.1))

model.add(Dense(520, activation='relu'))

model.add(Dense(368, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01),))

model.add(Dense(928, activation='relu', kernel_regularizer=keras.regularizers.l2(0.01),))

# Single linear unit: one regression output per row
model.add(keras.layers.Dense(1, activation='linear'))

# Learning rate chosen manually for this run
hp_learning_rate = 0.0001

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate),
    loss=tf.keras.losses.MeanAbsolutePercentageError(),
    metrics=[tf.keras.metrics.MeanAbsoluteError()]
)

In [None]:
# Build the model with the optimal hyperparameters and train it on the data
# model = tuner.hypermodel.build(best_hps)
# The hold-out split is passed as validation_data, so validation metrics are reported after each epoch
history = model.fit(X_train_preprocessed, y_train, epochs=200, batch_size=4096, validation_data=(X_test_preprocessed, y_test))

In [None]:
# Predictions for the hold-out rows, flattened to 1-D for plotting
pred = model.predict(X_test_preprocessed).reshape(-1)

# Flat NumPy view of the true values. `y_test` itself is left untouched:
# rebinding it to an ndarray (as before) made this cell fail on a second run,
# because an ndarray has no `.values` attribute.
y_true = np.asarray(y_test).reshape(-1)

# Create a scatter plot of true against predicted values
plt.figure(figsize=(8, 6))
plt.scatter(y_true, pred, c='blue', alpha=0.5)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('Scatter Plot of True vs. Predicted Values')
plt.grid(True)
plt.show()

# Submission

In [None]:
# Read the competition test file (its 'id' column is still present at this point)
test_df = pd.read_csv('/kaggle/input/playground-series-s3e19/test.csv')
test_df

In [None]:
def df_transformer(df, columns=None):
    """Turn a raw competition frame into the one-hot encoded feature frame.

    The result contains ``saleYear``, ``saleMonth`` and ``saleDay`` (taken from
    ``date``) followed by dummy columns for ``country``, ``store`` and
    ``product``; spaces in column names are replaced with underscores. Any
    other input column (such as ``id``) is left out. The input frame is not
    modified.

    Args:
        df: Frame with at least the columns ``date``, ``country``, ``store``
            and ``product``.
        columns: Optional sequence of expected feature names (for example the
            training frame's columns). When given, the result is reindexed to
            exactly these columns in this order; columns missing from ``df``
            are filled with 0 and unexpected ones are dropped.

    Returns:
        pandas.DataFrame: the encoded features, indexed like ``df``.
    """
    dates = pd.to_datetime(df['date'])

    # Vectorised `.dt` access instead of a per-row lambda
    features = pd.DataFrame({
        'saleYear': dates.dt.year,
        'saleMonth': dates.dt.month,
        'saleDay': dates.dt.day,
        'country': df['country'],
        'store': df['store'],
        'product': df['product'],
    })

    # Create dummy variables for the categorical columns
    df_dummies = pd.get_dummies(features, columns=['country', 'store', 'product'])

    # Replace spaces in the column names with underscores
    df_dummies.columns = df_dummies.columns.str.replace(' ', '_')

    if columns is not None:
        # Guarantee the same feature set and order the model was trained on,
        # even if a category is absent from (or new in) this frame.
        df_dummies = df_dummies.reindex(columns=list(columns), fill_value=0)

    return df_dummies


In [None]:
# Run the test frame through df_transformer to build the submission features
Submission_transformed_df = df_transformer(test_df)
Submission_transformed_df

In [None]:
# Align the submission features with the columns the transformers were fitted
# on (same names, same order); a category missing from the test file becomes
# an all-zero column instead of shifting every feature after it.
submission_features = Submission_transformed_df.reindex(columns=X_train.columns, fill_value=0)

# Apply the already-fitted preprocessing steps directly, in training order.
# Wrapping them with the Keras model in an sklearn Pipeline that is never
# fitted relies on the Pipeline not checking its own fitted state.
submission_scaled = min_max_scaler.transform(
    quantile_transformer.transform(submission_features)
)

# The network has a single output unit, so flatten the predictions to one
# value per row before they are written into the submission column.
Submission_y = model.predict(submission_scaled).reshape(-1)

In [None]:
# Start from the sample submission file and overwrite its 'num_sold' column with the predictions
submission = pd.read_csv('/kaggle/input/playground-series-s3e19/sample_submission.csv')
submission['num_sold'] = Submission_y
submission

In [None]:
# Write the submission file without the DataFrame index
submission.to_csv('submission.csv', index=False)