In [1]:
import numpy as np  
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv("../dataset/kalimati.csv")

In [3]:
df.head()

Unnamed: 0,SN,Commodity,Date,Unit,Minimum,Maximum,Average
0,0,Tomato Big(Nepali),6/16/2013,Kg,35,40,37.5
1,1,Tomato Small(Local),6/16/2013,Kg,26,32,29.0
2,2,Potato Red,6/16/2013,Kg,20,21,20.5
3,3,Potato White,6/16/2013,Kg,15,16,15.5
4,4,Onion Dry (Indian),6/16/2013,Kg,28,30,29.0


In [4]:
df = df.drop(columns=["Minimum", "Maximum", "Unit", "SN"])

In [5]:
df = df.rename(columns={"Average": "Price"})

In [6]:
df.head()

Unnamed: 0,Commodity,Date,Price
0,Tomato Big(Nepali),6/16/2013,37.5
1,Tomato Small(Local),6/16/2013,29.0
2,Potato Red,6/16/2013,20.5
3,Potato White,6/16/2013,15.5
4,Onion Dry (Indian),6/16/2013,29.0


In [7]:
df["Date"] = pd.to_datetime(df["Date"])

In [8]:
df.head()

Unnamed: 0,Commodity,Date,Price
0,Tomato Big(Nepali),2013-06-16,37.5
1,Tomato Small(Local),2013-06-16,29.0
2,Potato Red,2013-06-16,20.5
3,Potato White,2013-06-16,15.5
4,Onion Dry (Indian),2013-06-16,29.0


In [9]:
df = df[["Date", "Commodity", "Price"]]

In [10]:
df.head()

Unnamed: 0,Date,Commodity,Price
0,2013-06-16,Tomato Big(Nepali),37.5
1,2013-06-16,Tomato Small(Local),29.0
2,2013-06-16,Potato Red,20.5
3,2013-06-16,Potato White,15.5
4,2013-06-16,Onion Dry (Indian),29.0


In [11]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df["Item"] = label_encoder.fit_transform(df["Commodity"])


In [12]:
df.head()

Unnamed: 0,Date,Commodity,Price,Item
0,2013-06-16,Tomato Big(Nepali),37.5,122
1,2013-06-16,Tomato Small(Local),29.0,124
2,2013-06-16,Potato Red,20.5,96
3,2013-06-16,Potato White,15.5,99
4,2013-06-16,Onion Dry (Indian),29.0,83


In [13]:
# df = df.drop(columns=["Commodity"])

In [14]:
train = df[(df["Date"] >= "2013-06-16") & (df["Date"] <= "2019-12-31")]
test = df[(df["Date"] >= "2020-01-01") & (df["Date"] <= "2021-05-13")]


In [15]:
train.to_csv("../dataset/processed/train.csv", index=False)
test.to_csv("../dataset/processed/test.csv", index=False)


In [16]:
print(train.shape, test.shape, df.shape)



(157439, 4) (39722, 4) (197161, 4)


In [17]:
train.head()

Unnamed: 0,Date,Commodity,Price,Item
0,2013-06-16,Tomato Big(Nepali),37.5,122
1,2013-06-16,Tomato Small(Local),29.0,124
2,2013-06-16,Potato Red,20.5,96
3,2013-06-16,Potato White,15.5,99
4,2013-06-16,Onion Dry (Indian),29.0,83


In [18]:
test.head()

Unnamed: 0,Date,Commodity,Price,Item
157439,2020-01-01,Tomato Big(Nepali),62.5,122
157440,2020-01-01,Tomato Big(Indian),62.5,121
157441,2020-01-01,Tomato Small(Local),40.0,124
157442,2020-01-01,Tomato Small(Tunnel),40.0,126
157443,2020-01-01,Tomato Small(Indian),42.5,123


In [19]:
df.head()

Unnamed: 0,Date,Commodity,Price,Item
0,2013-06-16,Tomato Big(Nepali),37.5,122
1,2013-06-16,Tomato Small(Local),29.0,124
2,2013-06-16,Potato Red,20.5,96
3,2013-06-16,Potato White,15.5,99
4,2013-06-16,Onion Dry (Indian),29.0,83


In [20]:
train.Item.nunique(), test.Item.nunique()

(128, 131)

In [21]:
train["Date"].min(), train["Date"].max(), test["Date"].min(), test["Date"].max()

(Timestamp('2013-06-16 00:00:00'),
 Timestamp('2019-12-31 00:00:00'),
 Timestamp('2020-01-01 00:00:00'),
 Timestamp('2021-05-13 00:00:00'))

In [22]:
df.groupby(["Item"]).agg(
    {"Price": ["count", "sum", "mean", "median", "std", "min", "max"]}
)

Unnamed: 0_level_0,Price,Price,Price,Price,Price,Price,Price
Unnamed: 0_level_1,count,sum,mean,median,std,min,max
Item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,395,110610.0,280.025316,290.00,39.922188,145.0,345.0
1,2683,328087.5,122.283824,105.00,36.720520,55.0,290.0
2,2020,86056.5,42.602228,42.50,8.996921,11.0,67.5
3,1055,339337.5,321.646919,305.00,148.461336,65.0,1900.0
4,1131,75852.0,67.066313,65.00,23.557621,19.0,125.0
...,...,...,...,...,...,...,...
127,420,24357.0,57.992857,57.50,16.226395,27.5,85.0
128,2538,141773.5,55.860323,55.00,16.650512,22.5,145.0
129,42,1382.0,32.904762,28.75,7.563744,26.5,52.5
130,2679,123674.5,46.164427,47.50,14.457301,13.5,175.0


Feature Engineering

In [23]:
import warnings

warnings.filterwarnings("ignore")


In [24]:
# Time-related feature == creating time related features


def create_date_features(df):
    df["month"] = df.Date.dt.month
    df["day_of_month"] = df.Date.dt.day
    df["day_of_year"] = df.Date.dt.dayofyear
    # df["week_of_year"] = df.Date.dt.weekofyear
    df["day_of_week"] = df.Date.dt.dayofweek + 1
    df["year"] = df.Date.dt.year
    df["is_wknd"] = (
        df.Date.dt.weekday // 4
    )  ### df.date.dt.weekday => Starts from '0' means '0' = 'Monday'.
    ##So, '// 4' will give '1' when day number equals
    ## so does for 5 and 6 gives '1' for //4
    df["is_month_start"] = df.Date.dt.is_month_start.astype(int)
    df["is_month_end"] = df.Date.dt.is_month_end.astype(int)
    df["is_year_start"] = df.Date.dt.is_year_start.astype(int)
    df["is_year_end"] = df.Date.dt.is_year_end.astype(int)
    return df


df = create_date_features(train)


In [25]:
df.head()

Unnamed: 0,Date,Commodity,Price,Item,month,day_of_month,day_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end
0,2013-06-16,Tomato Big(Nepali),37.5,122,6,16,167,7,2013,1,0,0,0,0
1,2013-06-16,Tomato Small(Local),29.0,124,6,16,167,7,2013,1,0,0,0,0
2,2013-06-16,Potato Red,20.5,96,6,16,167,7,2013,1,0,0,0,0
3,2013-06-16,Potato White,15.5,99,6,16,167,7,2013,1,0,0,0,0
4,2013-06-16,Onion Dry (Indian),29.0,83,6,16,167,7,2013,1,0,0,0,0


In [26]:
df.sort_values(by=["Item", "Date"], axis=0, inplace=True)


# Define the function to add random noise
def random_noise(dataframe):
    return np.random.normal(scale=1.6, size=(len(dataframe),))


# Define the function to create lag features
def lag_features(dataframe, lags):
    for lag in lags:
        dataframe["price_lag_" + str(lag)] = dataframe.groupby("Item")[
            "Price"
        ].transform(lambda x: x.shift(lag)) + random_noise(
            dataframe
        )  
    return dataframe

lags_list = [91, 98, 105, 112, 119, 126, 182, 364, 546, 728]

df = lag_features(df, lags_list)



In [27]:
df.tail()

Unnamed: 0,Date,Commodity,Price,Item,month,day_of_month,day_of_year,day_of_week,year,is_wknd,...,price_lag_91,price_lag_98,price_lag_105,price_lag_112,price_lag_119,price_lag_126,price_lag_182,price_lag_364,price_lag_546,price_lag_728
157057,2019-12-27,Yam,55.0,131,12,27,361,5,2019,1,...,46.494588,38.859868,48.277765,47.580477,47.717697,45.084427,41.132164,54.403045,31.528148,27.619826
157143,2019-12-28,Yam,55.0,131,12,28,362,6,2019,1,...,48.966025,42.19638,45.586769,46.091931,44.760053,48.43381,37.915759,55.537659,25.192925,29.259594
157228,2019-12-29,Yam,55.0,131,12,29,363,7,2019,1,...,47.578774,41.18258,43.617128,48.407534,47.902035,41.003363,38.09986,53.169355,22.597669,31.369924
157313,2019-12-30,Yam,55.0,131,12,30,364,1,2019,0,...,48.076528,44.620034,44.044264,44.609514,47.215021,37.069896,36.694648,55.985702,21.084736,33.406285
157398,2019-12-31,Yam,55.0,131,12,31,365,2,2019,0,...,51.159513,51.335273,46.543351,47.898791,48.346783,38.385326,37.985693,52.438533,24.339702,28.763372


In [28]:
def roll_mean_features(dataframe, windows):
    for window in windows:
        dataframe["price_roll_mean_" + str(window)] = dataframe.groupby("Item")[
            "Price"
        ].transform(
            lambda x: x.shift(1)
            .rolling(window=window, min_periods=10, win_type="triang")
            .mean()
        ) + random_noise(dataframe)
    return dataframe


# List of windows to create rolling mean features
roll_mean_list = [365, 546]

# Create rolling mean features
df = roll_mean_features(df, roll_mean_list)


In [29]:
df.tail()


Unnamed: 0,Date,Commodity,Price,Item,month,day_of_month,day_of_year,day_of_week,year,is_wknd,...,price_lag_105,price_lag_112,price_lag_119,price_lag_126,price_lag_182,price_lag_364,price_lag_546,price_lag_728,price_roll_mean_365,price_roll_mean_546
157057,2019-12-27,Yam,55.0,131,12,27,361,5,2019,1,...,48.277765,47.580477,47.717697,45.084427,41.132164,54.403045,31.528148,27.619826,43.078937,44.041177
157143,2019-12-28,Yam,55.0,131,12,28,362,6,2019,1,...,45.586769,46.091931,44.760053,48.43381,37.915759,55.537659,25.192925,29.259594,42.678216,45.806177
157228,2019-12-29,Yam,55.0,131,12,29,363,7,2019,1,...,43.617128,48.407534,47.902035,41.003363,38.09986,53.169355,22.597669,31.369924,43.822826,43.403821
157313,2019-12-30,Yam,55.0,131,12,30,364,1,2019,0,...,44.044264,44.609514,47.215021,37.069896,36.694648,55.985702,21.084736,33.406285,46.103901,45.02946
157398,2019-12-31,Yam,55.0,131,12,31,365,2,2019,0,...,46.543351,47.898791,48.346783,38.385326,37.985693,52.438533,24.339702,28.763372,44.135316,44.888362


In [30]:
def ewm_features(dataframe, alphas, lags):
    for alpha in alphas:
        for lag in lags:
            dataframe[
                "price_ewm_alpha_" + str(alpha).replace(".", "") + "_lag_" + str(lag)
            ] = dataframe.groupby("Item")["Price"].transform(
                lambda x: x.shift(lag).ewm(alpha=alpha).mean()
            )
    return dataframe


# Alphas and lags to create exponentially weighted mean features
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

# Create exponentially weighted mean features
df = ewm_features(df, alphas, lags)


In [31]:
df.tail()

Unnamed: 0,Date,Commodity,Price,Item,month,day_of_month,day_of_year,day_of_week,year,is_wknd,...,price_ewm_alpha_07_lag_728,price_ewm_alpha_05_lag_91,price_ewm_alpha_05_lag_98,price_ewm_alpha_05_lag_105,price_ewm_alpha_05_lag_112,price_ewm_alpha_05_lag_180,price_ewm_alpha_05_lag_270,price_ewm_alpha_05_lag_365,price_ewm_alpha_05_lag_546,price_ewm_alpha_05_lag_728
157057,2019-12-27,Yam,55.0,131,12,27,361,5,2019,1,...,29.426847,47.362213,44.863277,47.499475,47.43286,38.129958,38.122664,54.902954,32.796447,29.556925
157143,2019-12-28,Yam,55.0,131,12,28,362,6,2019,1,...,29.128054,47.431107,43.681639,46.249738,47.46643,37.814979,37.811332,54.951477,30.148224,29.278462
157228,2019-12-29,Yam,55.0,131,12,29,363,7,2019,1,...,29.038416,47.465553,43.090819,45.624869,47.483215,37.657489,37.655666,54.975739,26.324112,29.139231
157313,2019-12-30,Yam,55.0,131,12,30,364,1,2019,0,...,29.711525,47.482777,45.29541,45.312434,47.491607,37.578745,37.577833,54.987869,24.412056,29.569616
157398,2019-12-31,Yam,55.0,131,12,31,365,2,2019,0,...,29.213457,47.491388,46.397705,46.406217,47.495804,37.539372,37.538916,54.993935,23.456028,29.284808


In [32]:
df.shape

(157439, 71)

In [33]:
df.isnull().sum()

Date                              0
Commodity                         0
Price                             0
Item                              0
month                             0
                              ...  
price_ewm_alpha_05_lag_180    20138
price_ewm_alpha_05_lag_270    28414
price_ewm_alpha_05_lag_365    36410
price_ewm_alpha_05_lag_546    51290
price_ewm_alpha_05_lag_728    65850
Length: 71, dtype: int64

In [34]:
df = pd.get_dummies(df, columns=["Item"])

In [35]:
df

Unnamed: 0,Date,Commodity,Price,month,day_of_month,day_of_year,day_of_week,year,is_wknd,is_month_start,...,Item_122,Item_123,Item_124,Item_125,Item_126,Item_127,Item_128,Item_129,Item_130,Item_131
141053,2019-05-31,Apple(Fuji),255.0,5,31,151,5,2019,1,0,...,False,False,False,False,False,False,False,False,False,False
141130,2019-06-01,Apple(Fuji),255.0,6,1,152,6,2019,1,1,...,False,False,False,False,False,False,False,False,False,False
141203,2019-06-02,Apple(Fuji),255.0,6,2,153,7,2019,1,0,...,False,False,False,False,False,False,False,False,False,False
141278,2019-06-03,Apple(Fuji),255.0,6,3,154,1,2019,0,0,...,False,False,False,False,False,False,False,False,False,False
141358,2019-06-04,Apple(Fuji),345.0,6,4,155,2,2019,0,0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157057,2019-12-27,Yam,55.0,12,27,361,5,2019,1,0,...,False,False,False,False,False,False,False,False,False,True
157143,2019-12-28,Yam,55.0,12,28,362,6,2019,1,0,...,False,False,False,False,False,False,False,False,False,True
157228,2019-12-29,Yam,55.0,12,29,363,7,2019,1,0,...,False,False,False,False,False,False,False,False,False,True
157313,2019-12-30,Yam,55.0,12,30,364,1,2019,0,0,...,False,False,False,False,False,False,False,False,False,True


In [36]:
df.fillna(method="ffill", inplace=True)

# Backward fill any remaining missing values
df.fillna(method="bfill", inplace=True)

# Check for any remaining missing values
print(df.isnull().sum())


Date            0
Commodity       0
Price           0
month           0
day_of_month    0
               ..
Item_127        0
Item_128        0
Item_129        0
Item_130        0
Item_131        0
Length: 198, dtype: int64


In [37]:
df["Price"] = np.log1p(df["Price"].values)

In [38]:
df.drop(columns=["Commodity"], inplace=True, errors="ignore")

In [39]:
#  train and validation set.
train = df.loc[(df["Date"] < "2019-01-01"), :]  # Until beginning of 2017
val = df.loc[
    (df["Date"] >= "2019-01-01") & (df["Date"] < "2019-04-01"), :
]  # First 3 months of 2019

# dropping useless column
cols = [col for col in train.columns if col not in ["Date", "id", "Price", "year"]]

Y_train = train["Price"]
X_train = train[cols]

Y_val = val["Price"]
X_val = val[cols]

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape


((131242,), (131242, 194), (5868,), (5868, 194))

In [41]:
# train = train.drop(columns=["Commodity"])

In [42]:
train.head()

Unnamed: 0,Date,Price,month,day_of_month,day_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,...,Item_122,Item_123,Item_124,Item_125,Item_126,Item_127,Item_128,Item_129,Item_130,Item_131
50,2013-06-16,4.70953,6,16,167,7,2013,1,0,0,...,False,False,False,False,False,False,False,False,False,False
123,2013-06-17,4.795791,6,17,168,1,2013,0,0,0,...,False,False,False,False,False,False,False,False,False,False
196,2013-06-18,4.75359,6,18,169,2,2013,0,0,0,...,False,False,False,False,False,False,False,False,False,False
269,2013-06-19,4.75359,6,19,170,3,2013,0,0,0,...,False,False,False,False,False,False,False,False,False,False
343,2013-06-20,4.75359,6,20,171,4,2013,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [44]:
# val = val.drop(columns=["Commodity"])

Lgbm Model

In [45]:
# light bgm model
import lightgbm as lgb
import shap
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer
from lightgbm import LGBMRegressor


In [46]:
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    mean_absolute_percentage_error,
)

In [47]:
##SMAPE score


def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds == 0) & (target == 0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val


# Calculating SMAPE for LightGBM output:
def lgbm_smape(preds, train_data):
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return "SMAPE", smape_val, False


In [48]:
# lgbm in without any parameters
first_model = lgb.LGBMRegressor().fit(X_train, Y_train)

print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))
pred = first_model.predict(X_train)
print("MAE:", mean_absolute_error(Y_train, pred))
print("RMSE:", mean_squared_error(Y_train, pred))
print("R2 Score:", r2_score(Y_train, pred))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.059035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15026
[LightGBM] [Info] Number of data points in the train set: 131242, number of used features: 159
[LightGBM] [Info] Start training from score 4.168860
TRAIN SMAPE: 4.700816118559255
VALID SMAPE: 6.394375636594548
MAE: 0.1883774577614332
RMSE: 0.0643785349166808
R2 Score: 0.8651558395284946


Catboost Regressor

In [51]:
# from catboost import CatBoostRegressor
# first_model = CatBoostRegressor()
# first_model.fit(X_train, Y_train)

# print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
# print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))
# pred = first_model.predict(X_train)
# print("MAE:",mean_absolute_error(Y_train, pred))
# print("RMSE:",mean_squared_error(Y_train, pred))
# print("R2 Score:",r2_score(Y_train, pred))

Linear Regression

In [None]:
# from sklearn.linear_model import LinearRegression

# lr_model = LinearRegression()
# lr_model.fit(X_train, Y_train)


In [None]:
# train_preds = lr_model.predict(X_train)
# val_preds = lr_model.predict(X_val)


In [None]:
# train_smape = smape(train_preds, Y_train)
# val_smape = smape(val_preds, Y_val)


In [None]:
# # Calculate MAE
# train_mae = mean_absolute_error(Y_train, train_preds)
# val_mae = mean_absolute_error(Y_val, val_preds)

# # Calculate RMSE
# train_rmse = mean_squared_error(
#     Y_train, train_preds, squared=False
# )  # squared=False gives RMSE
# val_rmse = mean_squared_error(Y_val, val_preds, squared=False)

# # Calculate R2 Score
# train_r2 = r2_score(Y_train, train_preds)
# val_r2 = r2_score(Y_val, val_preds)


In [None]:
# print("TRAIN SMAPE:", train_smape)
# print("VALID SMAPE:", val_smape)
# print("MAE:", train_mae)
# print("RMSE:", train_rmse)
# print("R2 Score:", train_r2)


TRAIN SMAPE: 6.23038382226299
VALID SMAPE: 7.725451523421956
MAE: 0.250133925734053
RMSE: 0.3320287764054428
R2 Score: 0.7690901258628


In [None]:
# first_model = LinearRegression()
# first_model.fit(X_train, Y_train)
# print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
# print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))
# pred = first_model.predict(X_train)
# print("MAE:", mean_absolute_error(Y_train, pred))
# print("RMSE:", mean_squared_error(Y_train, pred))
# print("R2 Score:", r2_score(Y_train, pred))


TRAIN SMAPE: 6.23038382226299
VALID SMAPE: 7.725451523421956
MAE: 0.250133925734053
RMSE: 0.11024310836129553
R2 Score: 0.7690901258628


In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from sklearn.model_selection import ShuffleSplit
# import seaborn as sns
