**Downloading Kaggle data sets directly into Colab**

Install the kaggle python library

In [None]:
! pip install kaggle



Mount the Google drive so you can store your kaggle API credentials for future use

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Make a directory for kaggle at the temporary instance location on Colab drive.

Download your kaggle API key (.json file). You can do this by going to your kaggle account page and clicking 'Create new API token' under the API section.

In [2]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


Upload the json file to Google Drive and then copy to the temporary location.

In [3]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

Change the file permissions to read/write to the owner only

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

**Competitions and Datasets are the two types of Kaggle data**

**1. Download competition data**

If you get 403 Forbidden error, you need to click 'Late Submission' on the Kaggle page for that competition.

In [5]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)


Unzip, in case the downloaded file is zipped. Refresh the files on the left hand side to update the view.

In [7]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [8]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder # For Type encoding if not using category dtype directly
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

In [9]:
stores = pd.read_csv('stores.csv')
train = pd.read_csv("train.csv.zip")
features = pd.read_csv('features.csv.zip')
sample = pd.read_csv('sampleSubmission.csv.zip')
test = pd.read_csv('test.csv.zip')

In [10]:
# Convert 'Date' columns to datetime objects for easier manipulation
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge features with train and test data.
# Note: 'IsHoliday' is present in both train/test and features.csv.
# We'll merge on it to ensure consistency, but if there were discrepancies,
# we'd need a more careful merge strategy.
train_df = pd.merge(train, features, on=['Store', 'Date', 'IsHoliday'], how='left')
test_df = pd.merge(test, features, on=['Store', 'Date', 'IsHoliday'], how='left')

# Merge store information
train_df = pd.merge(train_df, stores, on='Store', how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

print("\n--- Merged Train Data Info ---")
print(train_df.info())
print("\n--- Merged Test Data Info ---")
print(test_df.info())

# Free up memory
del train, test, features, stores
gc.collect()


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

0

## **DATA CLEANING**


In [11]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to handle missing values for specific columns.
    - MarkDown columns: fill with 0.
    - Other specified numerical columns: fill with ffill then bfill, fallback to mean.
    """
    def __init__(self, markdown_cols=None, numerical_cols_to_impute=None):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        # Calculate means for fallback imputation from the training data
        for col in self.numerical_cols_to_impute:
            if col in X.columns:
                self.means[col] = X[col].mean()
        return self

    def transform(self, X):
        X_copy = X.copy()


        for col in self.markdown_cols:
          if col in X_copy.columns:
            X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
            X_copy[col] = X_copy[col].fillna(0)


        # Impute other numerical columns with ffill then bfill, fallback to mean
        for col in self.numerical_cols_to_impute:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')
                # Fallback to mean if NaNs still exist (e.g., if all values were NaN in a column)
                if X_copy[col].isnull().any() and col in self.means:
                    X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [12]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to extract temporal features from the 'Date' column.
    """
    def __init__(self, date_column='Date'):
        self.date_column = date_column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        if self.date_column not in X_copy.columns:
            raise ValueError(f"Date column '{self.date_column}' not found in DataFrame.")

        X_copy[self.date_column] = pd.to_datetime(X_copy[self.date_column])

        X_copy['Year'] = X_copy[self.date_column].dt.year
        X_copy['Month'] = X_copy[self.date_column].dt.month
        X_copy['Month_sin'] = np.sin(2 * np.pi * X_copy['Month'] / 12)
        X_copy['Month_cos'] = np.cos(2 * np.pi * X_copy['Month'] / 12)

        # Using .dt.isocalendar().week for consistent week numbering across years
        X_copy['Week'] = X_copy[self.date_column].dt.isocalendar().week.astype(int)
        X_copy['Day'] = X_copy[self.date_column].dt.day
        X_copy['DayOfWeek'] = X_copy[self.date_column].dt.dayofweek

        X_copy['Week_sin'] = np.sin(2 * np.pi * X_copy['Week'] / 52)
        X_copy['Week_cos'] = np.cos(2 * np.pi * X_copy['Week'] / 52)

        # Markdown aggregation
        X_copy['Total_MarkDown'] = X_copy[[f'MarkDown{i}' for i in range(1, 6)]].sum(axis=1)
        X_copy['MarkDown_Intensity'] = X_copy['Total_MarkDown'] / (X_copy['Total_MarkDown'].mean() + 1)

        # Economic indicators
        X_copy['Fuel_CPI_Ratio'] = X_copy['Fuel_Price'] / X_copy['CPI']
        X_copy['Economic_Index'] = (X_copy['CPI'] * 0.4 + (100 - X_copy['Unemployment']) * 0.6) / 100


        # Convert IsHoliday to integer if it exists and is boolean
        if 'IsHoliday' in X_copy.columns and X_copy['IsHoliday'].dtype == bool:
            X_copy['IsHoliday'] = X_copy['IsHoliday'].astype(int)

        # Keep the 'Date' column for ARIMA
        return X_copy # Removed .drop(columns=[self.date_column, "Month", "Week"])

In [13]:
class CategoricalFeatureConverter(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to convert specified columns to 'category' dtype
    for LightGBM to handle them efficiently.
    """
    def __init__(self, categorical_cols=None):
        self.categorical_cols = categorical_cols if categorical_cols is not None else ['Store', 'Dept', 'Type']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        for col in self.categorical_cols:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].astype('category')
        return X_copy

In [14]:
y_train = train_df['Weekly_Sales']
X_train = train_df.drop(columns=['Weekly_Sales', 'Id'], errors='ignore')

temp_train_df = X_train.copy()
temp_train_df['Date'] = pd.to_datetime(train_df['Date']) # Get original dates back for sorting
temp_train_df['Weekly_Sales'] = y_train

temp_train_df = temp_train_df.sort_values(by='Date').reset_index(drop=True)

# Define a cutoff date for validation
validation_cutoff_date = pd.to_datetime('2012-09-01')

X_train_split = temp_train_df[temp_train_df['Date'] < validation_cutoff_date]
y_train_split = temp_train_df[temp_train_df['Date'] < validation_cutoff_date]['Weekly_Sales']

X_val_split = temp_train_df[temp_train_df['Date'] >= validation_cutoff_date]
y_val_split = temp_train_df[temp_train_df['Date'] >= validation_cutoff_date]['Weekly_Sales']

def weighted_mean_absolute_error(y_true, y_pred, weights):
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

val_weights = np.where(X_val_split['IsHoliday'] == 1, 5, 1)


In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from statsmodels.tsa.arima.model import ARIMA
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
from statsmodels.tools.sm_exceptions import ConvergenceWarning, ValueWarning
from joblib import Parallel, delayed


class ARIMAModelWrapper(BaseEstimator, TransformerMixin):
    def __init__(self, order=(1,0,1), seasonal_order=(0,0,0,0), verbose=False):
        self.order = order
        self.seasonal_order = seasonal_order
        self.verbose = verbose
        self.models = {}
        self.last_values = {}

    def fit(self, X, y=None):
        # Suppress all statsmodels warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=ConvergenceWarning)
            warnings.simplefilter("ignore", category=UserWarning)
            warnings.simplefilter("ignore", category=ValueWarning)

            grouped = X.groupby(['Store', 'Dept'])

            # Create progress bar if verbose
            if self.verbose:
                groups = tqdm(grouped, desc="Training ARIMA models", unit="store-dept")
            else:
                groups = grouped

            for (store, dept), group in groups:
                ts_data = group.set_index('Date')['Weekly_Sales']

                # Force weekly frequency to prevent warnings
                ts_data = ts_data.asfreq('W-FRI')

                if len(ts_data) > 0:
                    try:
                        with warnings.catch_warnings():
                            warnings.simplefilter("ignore")
                            model = ARIMA(ts_data,
                                         order=self.order,
                                         seasonal_order=self.seasonal_order)
                            fitted_model = model.fit()
                            self.models[(store, dept)] = fitted_model
                            self.last_values[(store, dept)] = ts_data.iloc[-1]
                    except:
                        if self.verbose:
                            print(f"Failed on Store {store}, Dept {dept}")
                        continue
        return self

    def transform(self, X):
        # This will return ARIMA predictions for the existing dates
        # For production, you might want a separate predict method
        return X

    def predict(self, X):
        # Make predictions for each store-dept combination in X
        predictions = []
        for _, row in X.iterrows():
            store = row['Store']
            dept = row['Dept']
            date = row['Date']

            if (store, dept) in self.models and self.models[(store, dept)] is not None:
                try:
                     # Get the forecast for this specific date
                    model = self.models[(store, dept)]
                    # Calculate the number of steps from the last training data point to the prediction date
                    # Assuming weekly data frequency
                    # Find the last date the model was trained on
                    last_train_date = model.model.data.dates[-1]
                    steps = (date - last_train_date).days // 7


                    if steps >= 0: # Predict from the last training date onwards
                        forecast = model.forecast(steps=steps + 1) # Forecast up to the prediction date
                        pred = forecast.iloc[-1]
                    else: # If the date is before the last training date, use the observed value
                         # This case should ideally not happen in a standard forecast scenario,
                         # but included for robustness if predict is used on past dates.
                         # We would need to find the closest date in the training data
                        print(f"Warning: Predicting for a date before the last training date for Store {store}, Dept {dept}, Date {date}")
                        pred = self.last_values[(store, dept)] # Use last value as a fallback

                except Exception as e:
                    print(f"Prediction failed for Store {store}, Dept {dept}, Date {date}: {str(e)}")
                    pred = self.last_values[(store, dept)] if (store, dept) in self.last_values else np.nan # Fallback to last value or NaN
            else:
                # Fallback - use last known value or NaN if no model was fitted
                pred = self.last_values[(store, dept)] if (store, dept) in self.last_values else np.nan


            predictions.append(pred)

        return np.array(predictions)

In [None]:
# Define the full pipeline
arima_order = (1,1,1)  # Simplified order

arima_seasonal_order=(0,0,0,0)
# Preprocessing steps
preprocessing = Pipeline([
    ('missing_value_imputer', MissingValueImputer()),
    ('date_feature_extractor', DateFeatureExtractor()),
    ('categorical_converter', CategoricalFeatureConverter()),
])



# Full pipeline with ARIMA
full_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('arima_model', ARIMAModelWrapper(order=arima_order, seasonal_order=arima_seasonal_order))
])


full_pipeline.fit(X_train_split, y_train_split)
predictions = full_pipeline.predict(X_val_split)










  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')
  grouped = X.groupby(['Store', 'Dept'])
  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')


In [None]:
def weighted_mean_absolute_error(y_true, y_pred, weights):
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

val_weights = np.where(X_val_split['IsHoliday'] == 1, 5, 1)
print (weighted_mean_absolute_error(y_val_split, predictions, val_weights))


2153.6490189275673


In [21]:
%pip install -q dagshub


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
!pip install mlflow==2.7.1


Collecting mlflow==2.7.1
  Downloading mlflow-2.7.1-py3-none-any.whl.metadata (12 kB)
Collecting cloudpickle<3 (from mlflow==2.7.1)
  Downloading cloudpickle-2.2.1-py3-none-any.whl.metadata (6.9 kB)
Collecting databricks-cli<1,>=0.8.7 (from mlflow==2.7.1)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl.metadata (4.0 kB)
Collecting protobuf<5,>=3.12.0 (from mlflow==2.7.1)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytz<2024 (from mlflow==2.7.1)
  Downloading pytz-2023.4-py2.py3-none-any.whl.metadata (22 kB)
Collecting packaging<24 (from mlflow==2.7.1)
  Downloading packaging-23.2-py3-none-any.whl.metadata (3.2 kB)
Collecting importlib-metadata!=4.7.0,<7,>=3.7.0 (from mlflow==2.7.1)
  Downloading importlib_metadata-6.11.0-py3-none-any.whl.metadata (4.9 kB)
Collecting alembic!=1.10.0,<2 (from mlflow==2.7.1)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<7,>=4.0.0 (from mlflow==2.7.1)
  Downlo

In [16]:

import dagshub, mlflow
# Try to get credentials from environment first
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)
mlflow.set_experiment("ARIMA_Training")


* 'schema_extra' has been renamed to 'json_schema_extra'


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=823af23c-bba2-4fdb-a914-02bcdc65ff03&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=058227cc7554af37798b0463c388a763bf9dd9bcbe077a999e75933c83f9ef46




<Experiment: artifact_location='mlflow-artifacts:/6e01db02c8e240aebfa89d3184cdf829', creation_time=1751573162919, experiment_id='2', last_update_time=1751573162919, lifecycle_stage='active', name='ARIMA_Training', tags={}>

In [None]:
with mlflow.start_run(run_name="ARIMA_Data_Cleaning"):
    # Log data cleaning parameters
    mlflow.log_param("missing_value_strategy", "MarkDowns->0, others->ffill/bfill/mean")
    mlflow.log_param("date_features_extracted", True)


    # Log metrics about data quality
    mlflow.log_metric("cleaned_missing_values", train_df.isna().sum().sum())


In [None]:
with mlflow.start_run(run_name="ARIMA_Feature_Engineering"):
    # Log feature engineering parameters
    mlflow.log_params({
        "temporal_features": ["Year", "Month", "Week", "DayOfWeek"],
        "cyclical_features": ["Month_sin", "Month_cos", "Week_sin", "Week_cos"],
        "economic_features": ["Fuel_CPI_Ratio", "Economic_Index"],
        "markdown_features": ["Total_MarkDown", "MarkDown_Intensity"]
    })

    # Your feature engineering
    feature_pipeline = Pipeline([
        ('date_extractor', DateFeatureExtractor())
    ])

    X_featured = feature_pipeline.fit_transform(X_train_split)

    # Log results
    mlflow.log_metric("total_features", len(X_featured.columns))
    mlflow.log_metric("time_span_days", (X_featured['Date'].max() - X_featured['Date'].min()).days)

In [19]:


with mlflow.start_run(run_name="ARIMA_Model_Training"):
    # Log model parameters
    arima_params = {
        'order': (1,0,1),
        'seasonal_order': (0,0,0,0),
        'trend': 'c'
    }
    mlflow.log_params(arima_params)

    preprocessing = Pipeline([
        ('missing_value_imputer', MissingValueImputer()),
        ('date_feature_extractor', DateFeatureExtractor()),
        ('categorical_converter', CategoricalFeatureConverter()),
    ])

    # Full pipeline with ARIMA
    full_pipeline = Pipeline([
        ('preprocessing', preprocessing),
        ('arima_model', ARIMAModelWrapper(order=arima_params['order'], seasonal_order=arima_params['seasonal_order']))
    ])


    full_pipeline.fit(X_train_split, y_train_split)
    val_preds = full_pipeline.predict(X_val_split)


    val_wmae = weighted_mean_absolute_error(y_val_split, val_preds, val_weights)
        # Log metrics
    mlflow.log_metrics({
        "train_samples": len(X_train_split),
        "val_samples": len(X_val_split),
        "val_wmae": val_wmae,
    })

    # Log model (as artifact since statsmodels doesn't have native MLflow support)
    import joblib
    joblib.dump(full_pipeline, "arima_pipeline.joblib")
    mlflow.log_artifact("arima_pipeline.joblib")


  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')
  grouped = X.groupby(['Store', 'Dept'])
  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')


In [None]:
# param_grid = {
#     'order': [(1,1,1), (2,1,2), (1,0,1)],
# }

# best_score = float('inf')
# best_params = None

# for params in ParameterGrid(param_grid):
#     with mlflow.start_run(run_name=f"ARIMA_Tuning_{params['order']}_{params['seasonal_order']}", nested=True):
#         mlflow.log_params(params)

#         model = ARIMAModelWrapper(
#             order=params['order'],
#             seasonal_order=params['seasonal_order']
#         )
#         model.fit(X_train_split, y_train_split)

#         val_preds = model.predict(X_val_split)
#         val_wmae = weighted_mean_absolute_error(y_val_split, val_preds, val_weights)

#         mlflow.log_metric("val_wmae", val_wmae)

#         if val_wmae < best_score:
#             best_score = val_wmae
#             best_params = params
#             best_model = model