Importing Libraies

In [107]:
import numpy as np
import seaborn as sns
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [108]:
df = pd.read_csv('wheat.csv')
df.head()

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,1,Ratlam,A lot,Wheat,Lokwan,FAQ,1725.0,1800.0,1785.0,2020-06-30
1,2,Ratlam,A lot,Wheat,Lokwan,FAQ,1689.0,1810.0,1790.0,2020-06-29
2,3,Ratlam,A lot,Wheat,Lokwan,FAQ,1601.0,1761.0,1730.0,2020-06-25
3,4,Ratlam,A lot,Wheat,Lokwan,FAQ,1620.0,1731.0,1707.0,2020-06-23
4,5,Ratlam,A lot,Wheat,Lokwan,FAQ,1624.0,2010.0,1725.0,2020-06-22


In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123369 entries, 0 to 123368
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sl no.                     123369 non-null  int64  
 1   District Name              123369 non-null  object 
 2   Market Name                123369 non-null  object 
 3   Commodity                  123369 non-null  object 
 4   Variety                    123369 non-null  object 
 5   Grade                      123369 non-null  object 
 6   Min Price (Rs./Quintal)    123369 non-null  float64
 7   Max Price (Rs./Quintal)    123369 non-null  float64
 8   Modal Price (Rs./Quintal)  123369 non-null  float64
 9   Price Date                 123369 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 9.4+ MB


In [110]:
df.isnull().sum()

Sl no.                       0
District Name                0
Market Name                  0
Commodity                    0
Variety                      0
Grade                        0
Min Price (Rs./Quintal)      0
Max Price (Rs./Quintal)      0
Modal Price (Rs./Quintal)    0
Price Date                   0
dtype: int64

In [111]:
df.columns

Index(['Sl no.', 'District Name', 'Market Name', 'Commodity', 'Variety',
       'Grade', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)',
       'Modal Price (Rs./Quintal)', 'Price Date'],
      dtype='object')

In [112]:
required_data = df[['Market Name', 'Variety', 'Modal Price (Rs./Quintal)', 'Price Date']]
required_data.head()

Unnamed: 0,Market Name,Variety,Modal Price (Rs./Quintal),Price Date
0,A lot,Lokwan,1785.0,2020-06-30
1,A lot,Lokwan,1790.0,2020-06-29
2,A lot,Lokwan,1730.0,2020-06-25
3,A lot,Lokwan,1707.0,2020-06-23
4,A lot,Lokwan,1725.0,2020-06-22


In [113]:
scaler = MinMaxScaler(feature_range=(0,1))
required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(required_data['Modal Price (Rs./Quintal)'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(required_data['Modal Price (Rs./Quintal)'].values.reshape(-1,1))


In [114]:
encode_market = LabelEncoder()
encode_variety = LabelEncoder()
required_data['Market Name'] = encode_market.fit_transform(required_data['Market Name'].values)
required_data['Variety'] = encode_variety.fit_transform(required_data['Variety'].values) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Market Name'] = encode_market.fit_transform(required_data['Market Name'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Variety'] = encode_variety.fit_transform(required_data['Variety'].values)


In [115]:
required_data.head()

Unnamed: 0,Market Name,Variety,Modal Price (Rs./Quintal),Price Date
0,0,8,0.179255,2020-06-30
1,0,8,0.179787,2020-06-29
2,0,8,0.173404,2020-06-25
3,0,8,0.170957,2020-06-23
4,0,8,0.172872,2020-06-22


In [116]:
x = required_data[['Market Name', 'Variety', 'Price Date']]
y = required_data[['Modal Price (Rs./Quintal)']]

In [117]:
y = required_data['Modal Price (Rs./Quintal)'].values
X_market = required_data['Market Name'].values
X_variety = required_data['Variety'].values
X_date = pd.to_datetime(required_data['Price Date']).astype(int) // 10**9

In [118]:
X = np.column_stack((X_market, X_variety, X_date))
X

array([[         0,          8, 1593475200],
       [         0,          8, 1593388800],
       [         0,          8, 1593043200],
       ...,
       [       255,         18, 1657238400],
       [       255,         18, 1657152000],
       [       255,         18, 1657065600]])

In [119]:
y = y.reshape(-1, 1)
y

array([[0.17925532],
       [0.17978723],
       [0.17340426],
       ...,
       [0.19468085],
       [0.19574468],
       [0.19468085]])

In [120]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [121]:
model = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=500, max_depth=-1)

In [122]:
model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [123]:
y_pred = model.predict(X_test)

In [124]:
y_pred.reshape(-1,1)

array([[0.18691205],
       [0.21807138],
       [0.17295171],
       ...,
       [0.20364935],
       [0.16148719],
       [0.21040576]])

In [125]:
y_pred.reshape(-1,1)

array([[0.18691205],
       [0.21807138],
       [0.17295171],
       ...,
       [0.20364935],
       [0.16148719],
       [0.21040576]])

In [126]:
pd_y_pred = pd.DataFrame(y_pred)
unscaled_pred = scaler.inverse_transform(pd_y_pred)
unscaled_pred

array([[1856.97331328],
       [2149.87095743],
       [1725.74611966],
       ...,
       [2014.30388526],
       [1617.97957631],
       [2077.81417361]])

In [127]:
pd_y_test = pd.DataFrame(y_test)
unscaled_test = scaler.inverse_transform(pd_y_test)
unscaled_test

array([[1980.],
       [2140.],
       [1610.],
       ...,
       [2100.],
       [1560.],
       [2151.]])

In [128]:
mse = mean_squared_error(y_test, y_pred, squared=False)
mse

0.013375563885914847

In [129]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8174592330735737