Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.set_option('mode.chained_assignment', None)

Reading the Dataset

In [3]:
# Location of the raw price data, relative to the notebook's working directory.
DATASET_PATH = 'dataset/dataset_final.csv'

# Load the raw records and preview the first rows.
df = pd.read_csv(DATASET_PATH)
df.head()

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,589,Jhabua,Jhabua,Cotton,Desi,FAQ,4800.0,4900.0,4850.0,2020-01-01
1,590,Jhabua,Jhabua,Cotton,DCH-32 (Ginned),FAQ,5800.0,5900.0,5850.0,2020-01-01
2,992,Dhar,Manawar,Cotton,Other,FAQ,4800.0,5300.0,5150.0,2020-01-01
3,1268,Badwani,Sendhwa,Cotton,H4,FAQ,4400.0,5401.0,4911.0,2020-01-01
4,2222,Harda,Harda,Lentil (Masur)(Whole),Kala Masoor New,FAQ,3400.0,3400.0,3400.0,2020-01-01


In [4]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266382 entries, 0 to 266381
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sl no.                     266382 non-null  int64  
 1   District Name              266382 non-null  object 
 2   Market Name                266382 non-null  object 
 3   Commodity                  266382 non-null  object 
 4   Variety                    266382 non-null  object 
 5   Grade                      266382 non-null  object 
 6   Min Price (Rs./Quintal)    266382 non-null  float64
 7   Max Price (Rs./Quintal)    266382 non-null  float64
 8   Modal Price (Rs./Quintal)  266382 non-null  float64
 9   Price Date                 266382 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 20.3+ MB


In [5]:
# Count the missing values in every column.
df.isna().sum()

Sl no.                       0
District Name                0
Market Name                  0
Commodity                    0
Variety                      0
Grade                        0
Min Price (Rs./Quintal)      0
Max Price (Rs./Quintal)      0
Modal Price (Rs./Quintal)    0
Price Date                   0
dtype: int64

In [6]:
# List the column labels available for feature selection.
df.columns

Index(['Sl no.', 'District Name', 'Market Name', 'Commodity', 'Variety',
       'Grade', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)',
       'Modal Price (Rs./Quintal)', 'Price Date'],
      dtype='object')

Pre-processing of Dataset

In [7]:
# Keep only the columns used for modelling: the three categorical descriptors,
# the target price and the observation date.
# `.copy()` makes `required_data` an independent frame, so the column
# assignments in the following cells modify this frame explicitly instead of
# writing into a selection of `df` (the situation pandas' chained-assignment
# warning is about).
required_data = df[['Market Name', 'Commodity', 'Variety', 'Modal Price (Rs./Quintal)', 'Price Date']].copy()
required_data.head()

Unnamed: 0,Market Name,Commodity,Variety,Modal Price (Rs./Quintal),Price Date
0,Jhabua,Cotton,Desi,4850.0,2020-01-01
1,Jhabua,Cotton,DCH-32 (Ginned),5850.0,2020-01-01
2,Manawar,Cotton,Other,5150.0,2020-01-01
3,Sendhwa,Cotton,H4,4911.0,2020-01-01
4,Harda,Lentil (Masur)(Whole),Kala Masoor New,3400.0,2020-01-01


Normalizing the 'Modal Price' Column

In [8]:
# Rescale the target price into the [0, 1] interval.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split further down, so the test rows influence the scaling range.
scaler = MinMaxScaler(feature_range=(0, 1))

# MinMaxScaler expects a 2-D array, hence the single-column reshape.
modal_price_2d = required_data['Modal Price (Rs./Quintal)'].values.reshape(-1, 1)
required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(modal_price_2d)

Encoding Market Name, Variety, and Commodity values

In [9]:
# One independent LabelEncoder per categorical column, so each can be
# persisted and reused separately.
encode_market, encode_variety, encode_commodity = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (source column, encoded column, encoder) -- the order fixes the order in
# which the new columns are appended to the frame.
encoding_plan = (
    ('Market Name', 'Market Name_Encoded', encode_market),
    ('Variety', 'Variety_Encoded', encode_variety),
    ('Commodity', 'Commodity_Encoded', encode_commodity),
)
for source_column, encoded_column, encoder in encoding_plan:
    required_data[encoded_column] = encoder.fit_transform(required_data[source_column].values)

In [10]:
# Inspect the frame after scaling the price and adding the three encoded columns.
required_data

Unnamed: 0,Market Name,Commodity,Variety,Modal Price (Rs./Quintal),Price Date,Market Name_Encoded,Variety_Encoded,Commodity_Encoded
0,Jhabua,Cotton,Desi,0.341407,2020-01-01,106,13,1
1,Jhabua,Cotton,DCH-32 (Ginned),0.413283,2020-01-01,106,7,1
2,Manawar,Cotton,Other,0.362970,2020-01-01,152,41,1
3,Sendhwa,Cotton,H4,0.345792,2020-01-01,216,15,1
4,Harda,Lentil (Masur)(Whole),Kala Masoor New,0.237188,2020-01-01,88,19,3
...,...,...,...,...,...,...,...,...
266377,Ujjain,Wheat,Lokwan,0.176094,2022-12-31,253,27,8
266378,Ujjain,Wheat,Other,0.183282,2022-12-31,253,41,8
266379,Unhel,Wheat,Lok -1 (Nilami Rate),0.193991,2022-12-31,255,25,8
266380,Vidisha,Wheat,Sharbati,0.179329,2022-12-31,257,48,8


In [11]:
# Show the distinct integer codes assigned to the commodities, in order of first appearance.
print(required_data['Commodity_Encoded'].unique())

[1 3 4 5 6 7 8 0 2]


In [12]:
# Distinct market codes, in the order in which they first occur in the data.
encoded_var = required_data['Market Name_Encoded'].unique()

# Number of distinct markets.
num_classes = len(encoded_var)

# Renumber the codes by order of first appearance: 1, 2, ..., num_classes (inclusive).
# NOTE(review): after this step the column no longer matches what
# `encode_market.transform` returns, so the pickled encoder alone cannot
# reproduce these values -- confirm how the codes are looked up at inference time.
mapped_values = {code: position for position, code in enumerate(encoded_var, start=1)}

# Overwrite the column with the renumbered codes.
required_data['Market Name_Encoded'] = required_data['Market Name_Encoded'].map(mapped_values)

# Show the resulting distinct codes.
print(required_data['Market Name_Encoded'].unique())

[  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18
  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36
  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  54
  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71  72
  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144
 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216
 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 243 244 245 246 24

In [13]:
# Persist the encoded frame (original labels, scaled price and integer codes).
# The row index is written too, as the first, unnamed CSV column (pandas' default).
required_data.to_csv('datasetEncoded.csv')

Segregating the dependent and independent variables

In [14]:
# Column groups: model inputs (encoded categoricals plus the raw date string)
# and the scaled target price.
independent_columns = ['Market Name_Encoded', 'Commodity_Encoded', 'Variety_Encoded', 'Price Date']
dependent_columns = ['Modal Price (Rs./Quintal)']

# Both selections are DataFrames; `y` is a single-column frame at this point.
x = required_data[independent_columns]
y = required_data[dependent_columns]

In [32]:
# NOTE(review): this cell carries execution count 32, so it was run out of order.
# On a fresh top-to-bottom run `y` is still the one-column DataFrame created in the
# previous cell at this point, not the NumPy array shown in the saved output.
y

array([0.34140732, 0.41328254, 0.36296988, ..., 0.19399123, 0.17932869,
       0.1682599 ])

In [15]:
# Target: scaled modal price as a flat NumPy array (replaces the DataFrame `y`).
y = required_data['Modal Price (Rs./Quintal)'].to_numpy()

# Integer-coded categorical features.
X_market = required_data['Market Name_Encoded'].to_numpy()
X_variety = required_data['Variety_Encoded'].to_numpy()
X_commodity = required_data['Commodity_Encoded'].to_numpy()

# Date feature: whole seconds since the Unix epoch.
# Computed by subtracting the epoch and floor-dividing by one second, which
# gives the same integers as the nanosecond cast `astype('int64') // 10**9`
# but does not depend on the datetime column having nanosecond resolution.
price_dates = pd.to_datetime(required_data['Price Date'])
X_date = (price_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

In [16]:
# Feature matrix with one column per feature, in this fixed order:
# market code, commodity code, variety code, date in epoch seconds.
feature_arrays = (X_market, X_commodity, X_variety, X_date)
X = np.column_stack(feature_arrays)
X

array([[         1,          1,         13, 1577836800],
       [         1,          1,          7, 1577836800],
       [         2,          1,         41, 1577836800],
       ...,
       [       194,          8,         25, 1672444800],
       [        12,          8,         48, 1672444800],
       [        12,          8,         41, 1672444800]], dtype=int64)

In [17]:
# Confirm the target is now a flat array of scaled modal prices.
y

array([0.34140732, 0.41328254, 0.36296988, ..., 0.19399123, 0.17932869,
       0.1682599 ])

Train-Test Split

In [18]:
# Fixed seed so the 80/20 split -- and every metric computed from it -- is
# identical on each Restart & Run All.
RANDOM_STATE = 42

# NOTE(review): rows are shuffled at random although one feature is the date,
# so training rows can be later in time than test rows; confirm a random split
# (rather than a chronological one) is what is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

In [19]:
# Wrap the training features in a DataFrame purely for display; columns 0-3 are
# market code, commodity code, variety code and date in epoch seconds (the
# order used when X was assembled).
X_train_pd = pd.DataFrame(X_train)
X_train_pd

Unnamed: 0,0,1,2,3
0,128,5,53,1605830400
1,143,8,34,1616716800
2,149,3,35,1615248000
3,53,8,27,1643241600
4,65,3,36,1589500800
...,...,...,...,...
213100,70,3,41,1613433600
213101,23,1,41,1640995200
213102,236,8,37,1616371200
213103,229,5,41,1639353600


LightGBM (Light Gradient-Boosting Machine) Model

In [20]:
# Hyperparameters for the gradient-boosted regressor, collected in one place
# so they are easy to read and tune.
lgbm_params = {
    'objective': 'regression',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 2000,
    'max_depth': -1,
}
model = lgb.LGBMRegressor(**lgbm_params)

In [21]:
# Train on the 80% training split; the target is the MinMax-scaled modal price.
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 213105, number of used features: 4
[LightGBM] [Info] Start training from score 0.226382


In [22]:
# Predict for the held-out rows. Predictions are in the MinMax-scaled space,
# like `y`; `scaler.inverse_transform` would convert them back to Rs./Quintal.
y_pred = model.predict(X_test)

In [23]:
# pd_y_pred = pd.DataFrame(y_pred)
# unscaled_pred = scaler.inverse_transform(pd_y_pred)
# unscaled_pred

In [24]:
# pd_y_test = pd.DataFrame(y_test)
# unscaled_test = scaler.inverse_transform(pd_y_test)
# unscaled_test

In [25]:
# Root-mean-squared error on the held-out split, in scaled-price units.
# The square root is taken explicitly rather than via `squared=False`, which
# newer scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# The original name is kept so existing references still work; note that it
# has always held the RMSE, not the MSE.
mse = rmse
rmse

0.018932117986222293

In [26]:
# Coefficient of determination on the held-out split (computed on the scaled prices).
r2_score(y_test,y_pred)

0.9814388357922806

In [27]:
import pickle

# Persist the trained regressor. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [28]:
# Persist the fitted price scaler (needed to convert predictions back to
# Rs./Quintal); the context manager closes the file deterministically.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [29]:
# Persist the fitted market-name encoder; the context manager closes the file
# deterministically.
# NOTE(review): the model was trained on market codes that were renumbered
# after encoding (via `mapped_values`), so this encoder's output differs from
# the training feature -- confirm how market names are encoded at inference.
with open('market.pkl', 'wb') as market_file:
    pickle.dump(encode_market, market_file)

In [30]:
import os
import pickle

# Folder intended to hold the pickled encoders.
folder_path = 'encoders'

# Create it if needed. `exist_ok=True` replaces the separate existence check,
# so there is no gap between checking and creating.
os.makedirs(folder_path, exist_ok=True)

# Assuming you have your encoders already initialized and named marketName_pkl, commodity_pkl, variety_pkl

# Save encoders inside the "encoders" folder
# with open(os.path.join(folder_path, 'marketName_encoder.pkl'), 'wb') as f:
#     pickle.dump(marketName_pkl, f)

# with open('encode_market.pkl', 'wb') as f:
#     pickle.dump(encode_market, f)

# Save encode_variety
# with open('encode_variety.pkl', 'wb') as f:
#     pickle.dump(encode_variety, f)

# # Save encode_commodity
# with open('encode_commodity.pkl', 'wb') as f:
#     pickle.dump(encode_commodity, f)

In [31]:
# Print the pickle module's format version string for this interpreter
# (presumably to check compatibility of the saved .pkl files -- confirm).
print(pickle.format_version)

4.0
