In [1]:
! pip install kaggle
! pip install onnx
! pip install dagshub
! pip install mlflow

from google.colab import drive
drive.mount('/content/drive')
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
# ! kaggle competitions download -c walmart-recruiting-store-sales-forecasting
# ! unzip walmart-recruiting-store-sales-forecasting

Mounted at /content/drive


In [2]:
# ! unzip test.csv.zip
# ! unzip train.csv.zip
# ! unzip sampleSubmission.csv.zip
# ! unzip features.csv.zip

In [3]:
# ! rm -rf test.csv.zip
# ! rm -rf train.csv.zip
# ! rm -rf sampleSubmission.csv.zip
# ! rm -rf features.csv.zip

In [4]:
# 1. Clone your GitHub repo
!git clone https://github.com/arazm21/ML-final_project

# 2. Add the cloned repo (or specific folder) to sys.path
import sys
sys.path.insert(0, '/content/ML-final_project')


Cloning into 'ML-final_project'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (24/24), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 24 (delta 0), reused 21 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (24/24), 3.72 MiB | 10.99 MiB/s, done.


In [7]:
import torch # Main PyTorch Library
from torch import nn # Used for creating the layers and loss function
from torch.optim import Adam # Adam Optimizer
import torchvision.transforms as transforms # Transform function used to modify and preprocess all the images
from torch.utils.data import Dataset, DataLoader # Dataset class and DataLoader for creating the objects
from sklearn.preprocessing import LabelEncoder # Label Encoder to encode the classes from strings to numbers
import matplotlib.pyplot as plt # Used for visualizing the images and plotting the training progress
from PIL import Image # Used to read the images from the directory
import pandas as pd # Used to read/create dataframes (csv) and process tabular data
import numpy as np # preprocessing and numerical/mathematical operations
import os # Used to read the images path from the directory

# Choose the compute device: the GPU when CUDA is available, otherwise the CPU.
# (Original note: use "mps" instead of "cuda" on a Mac.)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device available: ", device)

Device available:  cpu


In [8]:
# Sales tables read from the cloned repository.
train_df = pd.read_csv('ML-final_project/data/train.csv/train.csv')
test_df = pd.read_csv('ML-final_project/data/test.csv/test.csv')

# Side tables that are merged onto the sales rows further down
# (features on Store + Date, stores on Store).
features_df = pd.read_csv('ML-final_project/data/features.csv/features.csv')
stores_df = pd.read_csv('ML-final_project/data/stores.csv')

# Sample submission file shipped with the data.
sample_submission_df = pd.read_csv('ML-final_project/data/sampleSubmission.csv/sampleSubmission.csv')

In [9]:
display(train_df.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


In [14]:
# Drop IsHoliday from the sales tables before merging; the merged frame still
# carries an IsHoliday column afterwards (it comes in with the merged tables).
# errors='ignore' lets this cell be re-run after the column is already gone.
train_df = train_df.drop(columns=['IsHoliday'], errors='ignore')
test_df = test_df.drop(columns=['IsHoliday'], errors='ignore')

# Left-join the per-(Store, Date) features, then the per-Store attributes.
train_merged = (
    train_df
    .merge(features_df, on=['Store', 'Date'], how='left')
    .merge(stores_df, on='Store', how='left')
)
test_merged = (
    test_df
    .merge(features_df, on=['Store', 'Date'], how='left')
    .merge(stores_df, on='Store', how='left')
)

In [15]:
display(train_merged.head())

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday,Type,Size
0,1,1,2010-02-05,24924.5,42.31,2.572,,,,,,211.096358,8.106,False,A,151315
1,1,1,2010-02-12,46039.49,38.51,2.548,,,,,,211.24217,8.106,True,A,151315
2,1,1,2010-02-19,41595.55,39.93,2.514,,,,,,211.289143,8.106,False,A,151315
3,1,1,2010-02-26,19403.54,46.63,2.561,,,,,,211.319643,8.106,False,A,151315
4,1,1,2010-03-05,21827.9,46.5,2.625,,,,,,211.350143,8.106,False,A,151315


## helper functions

### 1. Feature engineering functions

In [None]:
def add_holiday_lookahead(df):
    """
    Add holiday look-ahead flags, computed per store and per week.

    Adds:
    - IsHolidayNextWeek: 1 if this week OR next week is a holiday, else 0.
    - IsHolidayIn2Weeks: 1 if this week OR either of the next 2 weeks is a
      holiday, else 0.

    "Next week" means the next distinct 'Date' present for the same 'Store'.
    The flags are computed once per (Store, Date) and then broadcast to every
    row of that store/week, so frames holding several rows per store and week
    (e.g. one row per department) get the same, correct value on each row.
    Shifting the raw rows instead would look at a neighbouring row of the
    same week rather than at the following week.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Store', 'Date' and 'IsHoliday'. 'Date' must sort
        chronologically (datetime values or ISO 'YYYY-MM-DD' strings).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` sorted by ['Store', 'Date'] with the two integer flag
        columns added. The input frame is not modified.
    """
    df = df.sort_values(by=['Store', 'Date']).copy()

    curr = df['IsHoliday'].astype(bool)

    # Collapse to one flag per (Store, Date); groupby sorts the keys, so the
    # weeks of each store are in chronological order.
    weekly = curr.groupby([df['Store'], df['Date']]).any()

    # Shift within each store by whole weeks. fill_value=False marks the weeks
    # past the end of a store's history as non-holidays and keeps a bool dtype.
    per_store = weekly.groupby(level='Store')
    next_1 = per_store.shift(-1, fill_value=False).astype(bool)
    next_2 = per_store.shift(-2, fill_value=False).astype(bool)

    lookahead = pd.DataFrame({
        'IsHolidayNextWeek': (weekly | next_1).astype(int),
        'IsHolidayIn2Weeks': (weekly | next_1 | next_2).astype(int),
    })

    # Broadcast the weekly flags back onto every row of the sorted frame.
    # Rows whose key is missing (NaN Store/Date) fall back to 0.
    row_keys = pd.MultiIndex.from_arrays([df['Store'], df['Date']])
    per_row = lookahead.reindex(row_keys).fillna(0).astype(int)

    df['IsHolidayNextWeek'] = per_row['IsHolidayNextWeek'].to_numpy()
    df['IsHolidayIn2Weeks'] = per_row['IsHolidayIn2Weeks'].to_numpy()

    return df

def add_date_features(df):
    """
    Return a copy of `df` with calendar features derived from its 'Date' column.

    'Date' is parsed with pd.to_datetime and the following columns are added:
    - Year, Month, Day: calendar components of the date.
    - WeekOfYear: ISO week number.
    - DayOfWeek: Monday=0, Sunday=6.

    The input frame is not modified, matching the other helpers in this
    section, so re-running a cell never changes a frame that was already
    displayed or used elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Date' column that pd.to_datetime can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with 'Date' as datetime and the five feature columns.
    """
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    dates = df['Date'].dt

    df['Year'] = dates.year
    df['Month'] = dates.month

    # isocalendar() returns the nullable UInt32 extension dtype. Cast it to a
    # plain NumPy dtype so the column behaves like the other date features
    # (float only when missing dates force a NaN).
    iso_week = dates.isocalendar().week
    df['WeekOfYear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['Day'] = dates.day
    df['DayOfWeek'] = dates.weekday  # Monday=0, Sunday=6
    return df

# Use this helper for models that are not sequential, e.g. XGBoost.
def drop_unneeded_columns(df, drop_date=True):
    """
    Return a copy of `df` without the columns a tabular model cannot use.

    Currently this only concerns 'Date': it is removed when `drop_date` is
    true and the column exists. The caller's frame is never modified.
    """
    result = df.copy()
    if not drop_date:
        return result
    if 'Date' not in result.columns:
        return result
    return result.drop(columns=['Date'])


## connecting wandb and dagshub

In [17]:
# Identifiers shared by the experiment-tracking cells below.
project_name = 'ML-final_project'  # passed to dagshub.init as repo_name
run_name = "test_run_1"  # run label; referenced by the commented-out wandb.init call
repo_owner = "arazm21"  # passed to dagshub.init as repo_owner

In [18]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33marazm21[0m ([33marazm21-free-university-of-tbilisi-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [19]:
import dagshub
import mlflow
import mlflow.sklearn
dagshub.init(repo_owner=repo_owner, repo_name=project_name, mlflow=True)



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=cd25537e-680f-44a0-9c17-2795377d273b&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=6081d3ccba1aaba46d9c80c9162f53fae7ff11afa8bf92adafb899c73f4b1b7a




Output()

In [None]:
# with wandb.init(project=project_name,
#                 config=hyperparameters,
#                 name = run_name):
