In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

In [2]:
# Load the dataset files
store_data = pd.read_csv('store.csv')
train_data=pd.read_csv('train.csv', low_memory=False)
test_data = pd.read_csv('test.csv')

# Display the first few rows of each dataset
display(store_data.head())
display(train_data.head())
display(test_data.head())

Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


In [3]:
# Merge the store data with the train and test datasets
train = pd.merge(train_data, store_data, on='Store', how='left')
test = pd.merge(test_data, store_data, on='Store', how='left')

# Display merged train dataset structure
train.head()

Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,2015-07-31,5263,555,1,1,0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,5,2015-07-31,6064,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,5,2015-07-31,8314,821,1,1,0,1,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,5,2015-07-31,13995,1498,1,1,0,1,c,c,620.0,9.0,2009.0,0,,,
4,5,5,2015-07-31,4822,559,1,1,0,1,a,a,29910.0,4.0,2015.0,0,,,


In [4]:
# Check for missing values and data types in the merged train dataset
train.info()

# Fill missing values if any, for example in the 'train' dataset
# (You may need to adjust based on specific columns and the extent of missing data)
train['Promo2SinceWeek'].fillna(0, inplace=True)
train['Promo2SinceYear'].fillna(0, inplace=True)
train['PromoInterval'].fillna('None', inplace=True)

# Convert Date column to datetime format
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])

# Filter rows where the store was open, since closed stores have no sales
train = train[train['Open'] != 0]
train = train.drop(columns=['Open'])  # Drop 'Open' as it's not needed for prediction

# Drop rows with zero sales as they are not useful for the model
train = train[train['Sales'] > 0]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 18 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   Store                      1017209 non-null  int64  
 1   DayOfWeek                  1017209 non-null  int64  
 2   Date                       1017209 non-null  object 
 3   Sales                      1017209 non-null  int64  
 4   Customers                  1017209 non-null  int64  
 5   Open                       1017209 non-null  int64  
 6   Promo                      1017209 non-null  int64  
 7   StateHoliday               1017209 non-null  object 
 8   SchoolHoliday              1017209 non-null  int64  
 9   StoreType                  1017209 non-null  object 
 10  Assortment                 1017209 non-null  object 
 11  CompetitionDistance        1014567 non-null  float64
 12  CompetitionOpenSinceMonth  693861 non-null   float64
 13  CompetitionO

In [5]:
from sklearn.preprocessing import LabelEncoder

# Check if 'Date' column is in the dataset
if 'Date' in train.columns:
    # Convert 'Date' column to datetime format if it isn't already
    train['Date'] = pd.to_datetime(train['Date'], format='%d %m %Y', errors='coerce')

    # Extract date-related features from the Date column
    train['Year'] = train['Date'].dt.year
    train['Month'] = train['Date'].dt.month
    train['Day'] = train['Date'].dt.day
    train['WeekOfYear'] = train['Date'].dt.isocalendar().week
    
    # Drop the 'Date' column after extracting features
    train = train.drop(columns=['Date'])
else:
    print("The 'Date' column is not found in the dataset.")

# Handle categorical columns
categorical_cols = ['StateHoliday', 'PromoInterval']  # Add any other categorical columns if needed

# Label encode categorical columns
label_encoders = {}
for col in categorical_cols:
    if col in train.columns:
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col].astype(str))
        label_encoders[col] = le  # Save label encoder if needed for inverse transform
    else:
        print(f"Column '{col}' not found in the dataset.")

In [11]:
# Define features (X) and target (y)
X = train.drop(columns=['Sales'])
y = train['Sales']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode both training and validation sets consistently
X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)

# Align the columns of X_val with X_train
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)

In [12]:
from sklearn.ensemble import HistGradientBoostingRegressor

model = HistGradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

In [13]:
# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate model performance
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 581595.2036872958
Mean Absolute Error (MAE): 561.8853285511218
R-squared (R2): 0.9397283576686897


In [19]:
# Define a parameter grid for hyperparameter tuning
param_grid = {
    'learning_rate': [0.01, 0.05],
    'max_depth': [3, 5],           
    'max_iter': [100],             
    'min_samples_leaf': [1],     
}

# Initialize GridSearchCV with cross-validation
grid_search = GridSearchCV(
    estimator=HistGradientBoostingRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,  # Reduced from 5 or 10
    n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters:", best_params)

Best Parameters: {'learning_rate': 0.05, 'max_depth': 5, 'max_iter': 100, 'min_samples_leaf': 1}


In [20]:
# Evaluate the model with the best found parameters
y_pred_best = best_model.predict(X_val)

mse_best = mean_squared_error(y_val, y_pred_best)
mae_best = mean_absolute_error(y_val, y_pred_best)
r2_best = r2_score(y_val, y_pred_best)

print(f"Best Model MSE: {mse_best}")
print(f"Best Model MAE: {mae_best}")
print(f"Best Model R2: {r2_best}")

Best Model MSE: 946864.7399293866
Best Model MAE: 723.1259125825436
Best Model R2: 0.9018748906811184


In [21]:
# Save the trained model for later use
joblib.dump(best_model, 'sales_prediction_model.pkl')
print("Model saved as 'sales_prediction_model.pkl'")

Model saved as 'sales_prediction_model.pkl'
