In [None]:
from google.colab import drive

# Mount Google Drive; later cells read the git token and the project checkout
# from paths under /content/drive/MyDrive.
mount_point = '/content/drive'
drive.mount(mount_point)

In [None]:
# `os` is not used in this cell itself; the training cell further down calls os.makedirs.
import os

# Read the GitHub access token from a file on the mounted Drive rather than
# hard-coding it in the notebook; strip() drops the trailing newline.
with open('/content/drive/MyDrive/MLFinal/git_token.env', 'r') as f:
    token = f.read().strip()

username = "badrilosaberidze"

# Switch into the project checkout, point `origin` at an authenticated URL and
# pull the latest commits. {username}/{token} are interpolated by IPython.
# NOTE(review): `git remote set-url` persists the token-bearing URL in the
# checkout's git config on Drive — consider a credential helper instead, and
# confirm the token has the narrowest scope that still allows pull/push.
%cd /content/drive/MyDrive/MLFinal/walmart-sales-forecasting
!git remote set-url origin https://{username}:{token}@github.com/{username}/Walmart-Recruiting---Store-Sales-Forecasting.git
!git pull

In [None]:
!pip install dagshub mlflow pmdarima

In [None]:
import numpy as np
from IPython.display import display, HTML
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import LabelEncoder

# Display configuration: no column limit, auto-detected width, and wide frames
# printed on one line instead of being wrapped into several blocks.
display_settings = {
    'display.max_columns': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Load and Merge Data

In [None]:
# Read the three raw CSV files; `Date` is parsed to datetime where present.
stores = pd.read_csv("data/stores.csv")
features = pd.read_csv("data/features.csv", parse_dates=["Date"])
train = pd.read_csv("data/train.csv", parse_dates=["Date"])

# Left joins keep every training row: first attach the features keyed on
# (Store, Date, IsHoliday), then the store attributes keyed on Store.
df = pd.merge(train, features, how="left", on=["Store", "Date", "IsHoliday"])
df = pd.merge(df, stores, how="left", on="Store")

In [None]:
# Missing markdown values are replaced with 0.
cols_to_fill = ["MarkDown1", "MarkDown2", "MarkDown3", "MarkDown4", "MarkDown5"]
df[cols_to_fill] = df[cols_to_fill].fillna(value=0)

# Order rows by store, department and date, then derive the model features:
#   Type -> integer codes from LabelEncoder
#   Week -> ISO calendar week of the Date column
df = df.sort_values(by=["Store", "Dept", "Date"]).assign(
    Type=lambda frame: LabelEncoder().fit_transform(frame["Type"]),
    Week=lambda frame: frame["Date"].dt.isocalendar().week,
)

df.head()

In [None]:
def weighted_mae(y_true, y_pred, is_holiday):
    """Weighted mean absolute error in which holiday observations count 5x.

    Computes ``sum(w * |y_true - y_pred|) / sum(w)`` with ``w = 5`` where
    ``is_holiday`` is truthy and ``w = 1`` otherwise.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values. Any array-like is accepted (list, tuple,
        ndarray, pandas Series); values are compared by position, so the index
        of a pandas Series is ignored.
    is_holiday : array-like of bool
        Holiday flag for each observation.

    Returns
    -------
    float
        The weighted MAE, or NaN when there are no observations.
    """
    # Convert up front: plain lists do not support `-`, and pandas Series would
    # otherwise be aligned by index label instead of by position.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    weights = np.where(np.asarray(is_holiday, dtype=bool), 5, 1)

    # With no observations the ratio is 0/0; return NaN explicitly rather than
    # relying on a division that also emits a RuntimeWarning.
    if weights.size == 0:
        return float("nan")

    return np.sum(weights * np.abs(actual - predicted)) / np.sum(weights)

# Train

In [None]:
import os

import mlflow
import dagshub
from tqdm.notebook import tqdm

# --- Modelling configuration -------------------------------------------------
ARIMA_ORDER = (1, 1, 1)
SEASONAL_ORDER = (1, 1, 1, 52)   # seasonal period of 52 observations
MIN_SERIES_LENGTH = 80           # departments with fewer dates are skipped
TRAIN_FRACTION = 0.8             # chronological train/validation split
MODEL_DIR = "models"
MAX_PARAM_CHARS = 250            # MLflow param values are length-limited

# --- Outputs consumed by later cells ------------------------------------------
results = []            # one {"Dept", "WMAE"} record per successfully fitted department
series_results = {}     # dept_id -> validation actuals, forecasts and WMAE
best_wmae = float('inf')
best_model_path = None  # local path of the model with the lowest WMAE so far

# --- MLflow / DagsHub tracking setup ------------------------------------------
dagshub.init(repo_owner="losaberidzebadri", repo_name="Walmart-Recruiting---Store-Sales-Forecasting", mlflow=True)
mlflow.set_tracking_uri("https://dagshub.com/losaberidzebadri/Walmart-Recruiting---Store-Sales-Forecasting.mlflow")

experiment_name = "ARIMA_Department_models"
try:
    experiment_id = mlflow.create_experiment(experiment_name)
    print(f"✓ Created new experiment: {experiment_name}")
except mlflow.exceptions.MlflowException:
    # create_experiment raised, which the original flow treats as "already exists".
    experiment_id = mlflow.get_experiment_by_name(experiment_name).experiment_id
    print(f"✓ Using existing experiment: {experiment_name}")

mlflow.set_experiment(experiment_name)

# Create the output directory once, outside the per-department try/except, so
# a filesystem problem surfaces immediately instead of failing every run.
os.makedirs(MODEL_DIR, exist_ok=True)

unique_depts = df["Dept"].unique()

for dept_id in tqdm(unique_depts):
    # One series per department: sales summed over all stores for each date,
    # and a holiday flag that is set if any row on that date is a holiday.
    ts = (
        df[df["Dept"] == dept_id]
        .groupby("Date")
        .agg(Weekly_Sales=("Weekly_Sales", "sum"), IsHoliday=("IsHoliday", "max"))
        .sort_index()
    )

    if len(ts) < MIN_SERIES_LENGTH:
        continue

    y = ts["Weekly_Sales"].fillna(0)
    is_holiday = ts["IsHoliday"]

    # Chronological split: the earliest TRAIN_FRACTION of dates is used for
    # fitting, the remainder for validation.
    split_idx = int(len(y) * TRAIN_FRACTION)
    y_train, y_val = y.iloc[:split_idx], y.iloc[split_idx:]
    h_val = is_holiday.iloc[split_idx:]

    with mlflow.start_run(run_name=f"Dept_{dept_id}"):
        try:
            model = SARIMAX(y_train, order=ARIMA_ORDER, seasonal_order=SEASONAL_ORDER,
                            enforce_stationarity=False, enforce_invertibility=False)
            fitted = model.fit(disp=False)
            preds = fitted.predict(start=y_val.index[0], end=y_val.index[-1])

            wmae = weighted_mae(y_val.values, preds.values, h_val.values)

            mlflow.log_metric("WMAE", wmae)
            mlflow.log_param("order", ARIMA_ORDER)
            mlflow.log_param("seasonal_order", SEASONAL_ORDER)
            mlflow.log_param("series_length", len(y))

            # Persist with the results object's own pickle-based save(); this
            # avoids depending on joblib, which no visible cell imports.
            model_path = f"{MODEL_DIR}/arima_dept_{dept_id}.pkl"
            fitted.save(model_path)
            mlflow.log_artifact(model_path)

            results.append({"Dept": dept_id, "WMAE": wmae})
            series_results[dept_id] = {
                "y_true": y_val,
                "y_pred": preds,
                "wmae": wmae
            }

            if wmae < best_wmae:
                best_wmae = wmae
                best_model_path = model_path

        except Exception as e:
            # Best effort: one failing department must not stop the sweep.
            # Record the failure on the run; the message is truncated so an
            # over-long value cannot itself raise and abort the loop.
            mlflow.log_param("status", "failed")
            mlflow.log_param("error", str(e)[:MAX_PARAM_CHARS])

# Visualisations of Results

In [None]:
# Rank departments from lowest (best) to highest validation WMAE and show the
# ten best.
result_df = pd.DataFrame(results).sort_values(by="WMAE", ascending=True)
display(result_df.iloc[:10])

In [None]:
# Plot validation actuals against the forecast for the best-scoring department
# (result_df is sorted by WMAE, so the first row has the lowest error).
# Read the id from the Dept column: taking the whole row with iloc[0] first
# would upcast the integer id to float and the title would read "Dept 12.0".
dept_ex = result_df["Dept"].iloc[0]
y_true = series_results[dept_ex]["y_true"]
y_pred = series_results[dept_ex]["y_pred"]

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(y_true.index, y_true, label="Actual", linewidth=2)
ax.plot(y_pred.index, y_pred, label="Forecast", linestyle="--")
ax.set_title(f"Dept {dept_ex} Forecast (All Stores Summed)")
ax.set_xlabel("Date")
ax.set_ylabel("Weekly Sales")
ax.grid(True)
ax.legend()
fig.tight_layout()
plt.show()


In [None]:
# Histogram of the validation WMAE over all fitted departments.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(result_df["WMAE"], bins=30, color="skyblue", edgecolor="black")
ax.set_title("Distribution of WMAE across Departments")
ax.set_xlabel("WMAE")
ax.set_ylabel("Number of Departments")
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Write the per-department WMAE table to CSV and attach the file to MLflow.
# NOTE(review): no run is opened in this cell, so the fluent API logs to
# whichever run is currently active (or implicitly starts one) — confirm that
# this is the intended destination for the summary file.
wmae_csv_path = "dept_arima_wmae_results.csv"
result_df.to_csv(wmae_csv_path, index=False)
mlflow.log_artifact(wmae_csv_path)

## Save Best ARIMA Model

In [None]:
import os

from mlflow.models.signature import infer_signature
from mlflow import pyfunc

# Register the model file of the best-scoring department in the MLflow Model
# Registry under the name "Best_ARIMA_Dept_Model".
if best_model_path:
    print("bla")

    # The per-department runs were closed by their `with` blocks, so the run
    # that holds the model has to be looked up again. Runs were named
    # "Dept_<id>"; take the most recent one for the best department.
    best_dept = min(series_results, key=lambda dept: series_results[dept]["wmae"])
    best_runs = mlflow.search_runs(
        experiment_names=[experiment_name],
        filter_string=f"tags.mlflow.runName = 'Dept_{best_dept}'",
        order_by=["attributes.start_time DESC"],
        max_results=1,
    )
    if best_runs.empty:
        raise RuntimeError(
            f"No MLflow run named 'Dept_{best_dept}' found in experiment '{experiment_name}'"
        )
    best_run_id = best_runs.iloc[0]["run_id"]

    # log_artifact(model_path) was called without an artifact_path, so the file
    # sits at the artifact root under its base name (no "models/" prefix).
    artifact_name = os.path.basename(best_model_path)
    mlflow.register_model(f"runs:/{best_run_id}/{artifact_name}", "Best_ARIMA_Dept_Model")
else:
    # No department produced a saved model, so there is nothing to register.
    print("ae")