In [1]:
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
import pandas as pd
from mlflow.models import infer_signature
from sklearn.model_selection import train_test_split

## Load the datasets

In [2]:
# Input data files are available in the "../data/" directory.
# First, let us load the datasets into separate DataFrames.
def load_data(datapath, n_preview=10):
    """Read a CSV file into a DataFrame, preview it, and return it.

    Prints the frame's shape and displays a random sample of its rows so the
    reader can eyeball the contents right after loading.

    Parameters
    ----------
    datapath : str, path-like or file-like
        Passed straight to ``pandas.read_csv``.
    n_preview : int, default 10
        Upper bound on the number of randomly sampled rows to display.

    Returns
    -------
    pandas.DataFrame
        The full, unmodified frame read from ``datapath``.
    """
    data = pd.read_csv(datapath)
    # Dimensions
    print('Shape:', data.shape)
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds, so cap the preview at the number of rows actually loaded.
    preview_rows = min(n_preview, len(data))
    display(data.sample(preview_rows))
    return data
    
    
# Load the training and test CSV files; each call prints the shape and shows a row sample.
train_df = load_data(datapath='../data/train.csv')
test_df = load_data(datapath='../data/test.csv')

Shape: (913000, 4)


Unnamed: 0,date,store,item,sales
527007,2016-01-25,9,29,32
49879,2014-08-01,8,3,57
649331,2016-01-07,6,36,43
399036,2015-08-27,9,22,81
706126,2016-07-14,7,39,45
92141,2015-04-22,1,6,60
693563,2017-02-18,10,38,88
237089,2017-03-16,10,13,110
453320,2014-04-18,9,25,94
657647,2013-10-15,1,37,14


Shape: (45000, 4)


Unnamed: 0,id,date,store,item
43360,43360,2018-03-12,2,49
34063,34063,2018-02-13,9,38
34354,34354,2018-03-06,2,39
4000,4000,2018-02-10,5,5
2195,2195,2018-02-05,5,3
14921,14921,2018-03-13,6,17
4342,4342,2018-01-23,9,5
40093,40093,2018-02-13,6,45
19025,19025,2018-02-05,2,22
41755,41755,2018-03-27,4,47


# LIGHTGBM

In [3]:
def split_data(train_data, test_data, test_size=0.2, random_state=2018):
    """Add calendar features to both frames and split the training rows.

    Side effect: ``train_data`` and ``test_data`` are modified IN PLACE. Their
    ``date`` column is converted to datetime and ``month``, ``day`` and
    ``year`` columns are added. Code elsewhere in this notebook reads those
    derived columns from the original frames, so the mutation is intentional.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must contain ``date``, ``sales`` and every feature column present in
        ``test_data``.
    test_data : pandas.DataFrame
        Must contain ``date``; its columns (minus ``date`` and ``id``) define
        the feature list.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 2018
        Forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(train_x, test_x, train_y, test_y, col)`` where ``col`` is the list
        of feature column names and the other four items are the outputs of
        ``train_test_split`` on ``train_data[col]`` and ``train_data['sales']``.
    """
    def _add_calendar_features(frame):
        # Parse the date once, then derive the integer calendar parts from it.
        frame['date'] = pd.to_datetime(frame['date'])
        frame['month'] = frame['date'].dt.month
        # NOTE: despite its name, 'day' holds the day of the WEEK (Monday=0).
        frame['day'] = frame['date'].dt.dayofweek
        frame['year'] = frame['date'].dt.year

    for frame in (train_data, test_data):
        _add_calendar_features(frame)

    # Features are whatever the test set offers, minus the raw date and row id.
    col = [name for name in test_data.columns if name not in ('date', 'id')]
    y = 'sales'
    train_x, test_x, train_y, test_y = train_test_split(
        train_data[col],
        train_data[y],
        test_size=test_size,
        random_state=random_state,
    )
    return (train_x, test_x, train_y, test_y, col)

# Derive the calendar features on both frames and hold out part of the training rows.
(train_x, test_x, train_y, test_y, col) = split_data(
    train_data=train_df,
    test_data=test_df,
)

In [4]:
# Shapes of the training features, training target and held-out features, in that order.
tuple(part.shape for part in (train_x, train_y, test_x))

((730400, 5), (730400,), (182600, 5))

In [5]:
def model(train_x,train_y,test_x,test_y,col):
    """Train a LightGBM booster and log/register it with MLflow.

    The function logs the hyper-parameters with ``mlflow.log_params``, trains
    for 3000 boosting rounds while evaluating on both the training and the
    held-out data, then logs the booster under the artifact path ``'model'``
    and registers it as ``'lgbm_model'``.

    Only the arguments are used: the signature and the input example are
    built from a few rows of ``train_x`` rather than from notebook-level
    frames, so the function does not depend on hidden global state and the
    stored example stays small.

    Parameters
    ----------
    train_x, train_y
        Training features and target, wrapped in an ``lgb.Dataset``.
    test_x, test_y
        Held-out features and target, passed to ``lgb.train`` as the second
        validation set.
    col : list of str
        Feature column names; used to select the columns of the input example.

    Returns
    -------
    object
        Whatever ``mlflow.lightgbm.log_model`` returns; callers in this
        notebook read its ``model_uri`` attribute.
    """
    params = {
        'nthread': 10,
        # NOTE(review): a depth-5 tree has at most 2**5 = 32 leaves, so
        # num_leaves=64 below is never reached - confirm this is intended.
        'max_depth': 5,
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'metric': 'mape', # this is abs(a-e)/max(1,a)
        'num_leaves': 64,
        'learning_rate': 0.2,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'lambda_l1': 3.097758978478437,
        'lambda_l2': 2.9482537987198496,
        'verbose': 1,
        'min_child_weight': 6.996211413900573,
        'min_split_gain': 0.037310344962162616,
        }

    mlflow.log_params(params)

    lgb_train = lgb.Dataset(train_x,train_y)
    lgb_valid = lgb.Dataset(test_x,test_y)
    # Named `booster` so the local does not shadow this function's own name.
    booster = lgb.train(params, lgb_train, 3000, valid_sets=[lgb_train, lgb_valid])

    # A handful of rows is enough to infer column names/dtypes and to serve as
    # the stored input example; logging the whole training frame would bloat
    # the model artifact and require a full-dataset prediction pass.
    example_rows = 5
    example = train_x[col].head(example_rows)

    # Infer the model signature
    signature = infer_signature(example, booster.predict(example))

    model_info = mlflow.lightgbm.log_model(booster, artifact_path='model', signature=signature, input_example=example, registered_model_name='lgbm_model')

    return model_info

In [6]:
# Enable LightGBM autologging for lgb.train.
# log_models=False: model() already logs the booster explicitly (with a
# signature and input example) and registers it, so autolog's default of
# logging the model as well would store the artifact a second time.
mlflow.lightgbm.autolog(log_models=False)
mlflow.set_experiment("Store Item Demand Forecasting")

<Experiment: artifact_location='file:///home/overlord/Documents/PythonProjects/store-item-demand-forecasting/notebooks/mlruns/425162585312985292', creation_time=1743257861506, experiment_id='425162585312985292', last_update_time=1743257861506, lifecycle_stage='active', name='Store Item Demand Forecasting', tags={}>

In [7]:
%%time


# Train inside an MLflow run; `run` and `model_info` are reused by later cells.
with mlflow.start_run() as run:
    model_info = model(
        train_x=train_x,
        train_y=train_y,
        test_x=test_x,
        test_y=test_y,
        col=col,
    )



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 88
[LightGBM] [Info] Number of data points in the train set: 730400, number of used features: 5
[LightGBM] [Info] Start training from score 47.000000




CPU times: user 9min 37s, sys: 1.48 s, total: 9min 38s
Wall time: 54.4 s


Registered model 'lgbm_model' already exists. Creating a new version of this model...
Created version '2' of model 'lgbm_model'.


In [8]:
# Reload the logged model through MLflow's generic pyfunc interface
loaded_model = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)

# Score the held-out features with the reloaded model
predictions = loaded_model.predict(data=test_x)

# Wrap the predictions in a single-column DataFrame and show the first five rows
predictions_df = pd.DataFrame(data=predictions, columns=['Predicted Sales'])
predictions_df.head(n=5)


Unnamed: 0,Predicted Sales
0,31.697849
1,22.309596
2,68.606352
3,33.558955
4,91.928857


In [9]:
# Report where the run and its model were recorded (one line per field).
print(
    "Model logged with the following details:",
    f"Experiment ID:  {run.info.experiment_id}",
    f"Run Name:  {run.info.run_name}",
    f"Run ID:  {run.info.run_id}",
    f"Model URI:  {model_info.model_uri}",
    sep="\n",
)


Model logged with the following details:
Experiment ID:  425162585312985292
Run Name:  intrigued-asp-464
Run ID:  f9b9715adcfa4145bd1cec58ddc70fc8
Model URI:  runs:/f9b9715adcfa4145bd1cec58ddc70fc8/model
