Importing Libraries

In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [17]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook by setting the corresponding option to None.
pd.set_option('mode.chained_assignment', None)

Reading the Dataset

In [18]:
# Load the price dataset from its relative path and preview the first rows.
df = pd.read_csv(filepath_or_buffer='dataset/dataset_final.csv')
df.head(n=5)

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,589,Jhabua,Jhabua,Cotton,Desi,FAQ,4800.0,4900.0,4850.0,2020-01-01
1,590,Jhabua,Jhabua,Cotton,DCH-32 (Ginned),FAQ,5800.0,5900.0,5850.0,2020-01-01
2,992,Dhar,Manawar,Cotton,Other,FAQ,4800.0,5300.0,5150.0,2020-01-01
3,1268,Badwani,Sendhwa,Cotton,H4,FAQ,4400.0,5401.0,4911.0,2020-01-01
4,2222,Harda,Harda,Lentil (Masur)(Whole),Kala Masoor New,FAQ,3400.0,3400.0,3400.0,2020-01-01


In [19]:
# Print the column names, non-null counts, dtypes and memory usage of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266382 entries, 0 to 266381
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sl no.                     266382 non-null  int64  
 1   District Name              266382 non-null  object 
 2   Market Name                266382 non-null  object 
 3   Commodity                  266382 non-null  object 
 4   Variety                    266382 non-null  object 
 5   Grade                      266382 non-null  object 
 6   Min Price (Rs./Quintal)    266382 non-null  float64
 7   Max Price (Rs./Quintal)    266382 non-null  float64
 8   Modal Price (Rs./Quintal)  266382 non-null  float64
 9   Price Date                 266382 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 20.3+ MB


In [20]:
# Count missing values per column (`isna` is the same check as `isnull`).
df.isna().sum()

Sl no.                       0
District Name                0
Market Name                  0
Commodity                    0
Variety                      0
Grade                        0
Min Price (Rs./Quintal)      0
Max Price (Rs./Quintal)      0
Modal Price (Rs./Quintal)    0
Price Date                   0
dtype: int64

In [21]:
# List the column labels of `df`; the exact spellings are used as keys below.
df.columns

Index(['Sl no.', 'District Name', 'Market Name', 'Commodity', 'Variety',
       'Grade', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)',
       'Modal Price (Rs./Quintal)', 'Price Date'],
      dtype='object')

Pre-processing of Dataset

In [22]:
# Keep only the columns used for modelling. The explicit .copy() makes
# `required_data` an independent frame, so the column assignments performed in
# later cells write to this frame directly rather than to a slice of `df`.
required_data = df[['Market Name', 'Commodity', 'Variety', 'Modal Price (Rs./Quintal)', 'Price Date']].copy()
required_data.head()

Unnamed: 0,Market Name,Commodity,Variety,Modal Price (Rs./Quintal),Price Date
0,Jhabua,Cotton,Desi,4850.0,2020-01-01
1,Jhabua,Cotton,DCH-32 (Ginned),5850.0,2020-01-01
2,Manawar,Cotton,Other,5150.0,2020-01-01
3,Sendhwa,Cotton,H4,4911.0,2020-01-01
4,Harda,Lentil (Masur)(Whole),Kala Masoor New,3400.0,2020-01-01


Normalizing the 'Modal Price' Column

In [23]:
# Scale the modal price into the [0, 1] range.
# The scaler is fitted on the raw prices held in `df`, not on `required_data`,
# because this cell overwrites the price column of `required_data`: reading the
# input from the frame being overwritten would make a second run of the cell
# re-fit the scaler on already-scaled values.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the target's min/max include the held-out rows.
scaler = MinMaxScaler(feature_range=(0, 1))
raw_modal_price = df['Modal Price (Rs./Quintal)'].to_numpy().reshape(-1, 1)
required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(raw_modal_price).ravel()

Encoding Market Name, Variety, and Commodity values

In [24]:
# One LabelEncoder per categorical column; each encoder stays bound to its own
# name so it can be reused later to map labels to the same integer codes.
encode_market, encode_variety, encode_commodity = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (source column, fitted encoder, target column) -- the order fixes the order
# in which the encoded columns are appended to the frame.
for source_col, encoder, target_col in (
    ('Market Name', encode_market, 'Market Name_Encoded'),
    ('Variety', encode_variety, 'Variety_Encoded'),
    ('Commodity', encode_commodity, 'Commodity_Encoded'),
):
    required_data[target_col] = encoder.fit_transform(required_data[source_col].values)

In [25]:
# Display the frame after scaling and encoding (pandas truncates the repr).
required_data

Unnamed: 0,Market Name,Commodity,Variety,Modal Price (Rs./Quintal),Price Date,Market Name_Encoded,Variety_Encoded,Commodity_Encoded
0,Jhabua,Cotton,Desi,0.341407,2020-01-01,106,13,1
1,Jhabua,Cotton,DCH-32 (Ginned),0.413283,2020-01-01,106,7,1
2,Manawar,Cotton,Other,0.362970,2020-01-01,152,41,1
3,Sendhwa,Cotton,H4,0.345792,2020-01-01,216,15,1
4,Harda,Lentil (Masur)(Whole),Kala Masoor New,0.237188,2020-01-01,88,19,3
...,...,...,...,...,...,...,...,...
266377,Ujjain,Wheat,Lokwan,0.176094,2022-12-31,253,27,8
266378,Ujjain,Wheat,Other,0.183282,2022-12-31,253,41,8
266379,Unhel,Wheat,Lok -1 (Nilami Rate),0.193991,2022-12-31,255,25,8
266380,Vidisha,Wheat,Sharbati,0.179329,2022-12-31,257,48,8


In [35]:
# Write the scaled/encoded frame to disk. The row index is written too (the
# to_csv default), so the file gains a leading unnamed index column.
required_data.to_csv(path_or_buf='datasetEncoded.csv')

Segregating the dependent and independent variables

In [26]:
# DataFrame views of the predictors and the target.
# `y` is rebound to a plain NumPy array in the following cell.
feature_columns = ['Market Name_Encoded', 'Commodity_Encoded', 'Variety_Encoded', 'Price Date']
target_columns = ['Modal Price (Rs./Quintal)']
x = required_data[feature_columns]
y = required_data[target_columns]

In [28]:
# Target and per-feature arrays used to build the model matrix.
y = required_data['Modal Price (Rs./Quintal)'].values
X_market = required_data['Market Name_Encoded'].values
X_variety = required_data['Variety_Encoded'].values
X_commodity = required_data['Commodity_Encoded'].values

# Price date as whole seconds since the Unix epoch. Dividing the offset from
# 1970-01-01 by a one-second Timedelta yields 64-bit integers regardless of the
# platform's default `int` width or the datetime resolution pandas infers,
# unlike `.astype(int) // 10**9`, which assumes nanoseconds and a 64-bit int.
price_dates = pd.to_datetime(required_data['Price Date'])
X_date = (price_dates - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

In [29]:
# Assemble the feature matrix: one row per observation, columns in the order
# market, commodity, variety, date.
feature_arrays = [X_market, X_commodity, X_variety, X_date]
X = np.stack(feature_arrays, axis=1)
X

array([[       106,          1,         13, 1577836800],
       [       106,          1,          7, 1577836800],
       [       152,          1,         41, 1577836800],
       ...,
       [       255,          8,         25, 1672444800],
       [       257,          8,         48, 1672444800],
       [       257,          8,         41, 1672444800]])

In [30]:
# Display the target array (modal prices after min-max scaling).
y

array([0.34140732, 0.41328254, 0.36296988, ..., 0.19399123, 0.17932869,
       0.1682599 ])

Train-Test Split

In [14]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# shuffle, and therefore every metric computed below, reproducible across
# kernel restarts; without it each run evaluates on a different test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

LightGBM (Light Gradient-Boosting Machine) Model

In [15]:
# Gradient-boosted regression trees; hyper-parameters are listed one per line
# so they are easy to scan and tune. max_depth=-1 leaves tree depth unlimited.
model = lgb.LGBMRegressor(
    objective='regression',
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=2000,
    max_depth=-1,
)

In [16]:
# Train the regressor on the training partition.
model.fit(X=X_train, y=y_train)

In [17]:
# Predict the (scaled) modal price for every held-out row.
y_pred = model.predict(X=X_test)

In [18]:
# pd_y_pred = pd.DataFrame(y_pred)
# unscaled_pred = scaler.inverse_transform(pd_y_pred)
# unscaled_pred

In [19]:
# pd_y_test = pd.DataFrame(y_test)
# unscaled_test = scaler.inverse_transform(pd_y_test)
# unscaled_test

In [20]:
# Root-mean-squared error on the min-max-scaled target.
# The square root is taken explicitly because the `squared=False` shortcut of
# mean_squared_error is deprecated and removed in newer scikit-learn releases.
# The result is named `rmse` because it is the root of the MSE, not the MSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

0.01813275927129915

In [21]:
# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.982694327971859

In [33]:
import pickle
# pickle.dump(model, open('model.pkl', 'wb'))

In [34]:
# Persist the fitted price scaler. The context manager guarantees the file is
# flushed and closed even if pickling raises; a bare open() leaves the handle
# open until it happens to be garbage-collected.
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [38]:
# Persist the fitted market-name encoder, closing the file deterministically
# via the context manager instead of leaving the handle to the garbage collector.
# NOTE(review): only the market encoder is saved in this notebook; the variety
# and commodity encoders are not pickled here -- confirm that is intended.
with open('market.pkl', 'wb') as market_file:
    pickle.dump(encode_market, market_file)