Importing Libraries

In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [2]:
df = pd.read_csv('wheat.csv')
df

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,1,Ratlam,A lot,Wheat,Lokwan,FAQ,1725.0,1800.0,1785.0,2020-06-30
1,2,Ratlam,A lot,Wheat,Lokwan,FAQ,1689.0,1810.0,1790.0,2020-06-29
2,3,Ratlam,A lot,Wheat,Lokwan,FAQ,1601.0,1761.0,1730.0,2020-06-25
3,4,Ratlam,A lot,Wheat,Lokwan,FAQ,1620.0,1731.0,1707.0,2020-06-23
4,5,Ratlam,A lot,Wheat,Lokwan,FAQ,1624.0,2010.0,1725.0,2020-06-22
...,...,...,...,...,...,...,...,...,...,...
123364,26997,Sheopur,Vijaypur,Wheat,Other,FAQ,1900.0,1950.0,1930.0,2022-07-11
123365,26998,Sheopur,Vijaypur,Wheat,Other,FAQ,1900.0,1950.0,1930.0,2022-07-09
123366,26999,Sheopur,Vijaypur,Wheat,Other,FAQ,1900.0,1960.0,1930.0,2022-07-08
123367,27000,Sheopur,Vijaypur,Wheat,Other,FAQ,1900.0,1950.0,1940.0,2022-07-07


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123369 entries, 0 to 123368
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Sl no.                     123369 non-null  int64  
 1   District Name              123369 non-null  object 
 2   Market Name                123369 non-null  object 
 3   Commodity                  123369 non-null  object 
 4   Variety                    123369 non-null  object 
 5   Grade                      123369 non-null  object 
 6   Min Price (Rs./Quintal)    123369 non-null  float64
 7   Max Price (Rs./Quintal)    123369 non-null  float64
 8   Modal Price (Rs./Quintal)  123369 non-null  float64
 9   Price Date                 123369 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 9.4+ MB


In [4]:
df.isnull().sum()

Sl no.                       0
District Name                0
Market Name                  0
Commodity                    0
Variety                      0
Grade                        0
Min Price (Rs./Quintal)      0
Max Price (Rs./Quintal)      0
Modal Price (Rs./Quintal)    0
Price Date                   0
dtype: int64

In [5]:
df.columns

Index(['Sl no.', 'District Name', 'Market Name', 'Commodity', 'Variety',
       'Grade', 'Min Price (Rs./Quintal)', 'Max Price (Rs./Quintal)',
       'Modal Price (Rs./Quintal)', 'Price Date'],
      dtype='object')

In [6]:
required_data = df[['Market Name', 'Variety', 'Modal Price (Rs./Quintal)', 'Price Date']]
required_data

Unnamed: 0,Market Name,Variety,Modal Price (Rs./Quintal),Price Date
0,A lot,Lokwan,1785.0,2020-06-30
1,A lot,Lokwan,1790.0,2020-06-29
2,A lot,Lokwan,1730.0,2020-06-25
3,A lot,Lokwan,1707.0,2020-06-23
4,A lot,Lokwan,1725.0,2020-06-22
...,...,...,...,...
123364,Vijaypur,Other,1930.0,2022-07-11
123365,Vijaypur,Other,1930.0,2022-07-09
123366,Vijaypur,Other,1930.0,2022-07-08
123367,Vijaypur,Other,1940.0,2022-07-07


In [7]:
scaler = MinMaxScaler(feature_range=(0,1))
required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(required_data['Modal Price (Rs./Quintal)'].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Modal Price (Rs./Quintal)'] = scaler.fit_transform(required_data['Modal Price (Rs./Quintal)'].values.reshape(-1,1))


In [8]:
encode_market = LabelEncoder()
encode_variety = LabelEncoder()
required_data['Market Name'] = encode_market.fit_transform(required_data['Market Name'].values)
required_data['Variety'] = encode_variety.fit_transform(required_data['Variety'].values) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Market Name'] = encode_market.fit_transform(required_data['Market Name'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  required_data['Variety'] = encode_variety.fit_transform(required_data['Variety'].values)


In [9]:
x = required_data[['Market Name', 'Variety', 'Price Date']]
y = required_data[['Modal Price (Rs./Quintal)']]

In [10]:
y = required_data['Modal Price (Rs./Quintal)'].values
X_market = required_data['Market Name'].values
X_variety = required_data['Variety'].values
X_date = pd.to_datetime(required_data['Price Date']).astype(int) // 10**9

In [11]:
X = np.column_stack((X_market, X_variety, X_date))
X

array([[         0,          8, 1593475200],
       [         0,          8, 1593388800],
       [         0,          8, 1593043200],
       ...,
       [       255,         18, 1657238400],
       [       255,         18, 1657152000],
       [       255,         18, 1657065600]])

In [12]:
y = y.reshape(-1, 1)
y

array([[0.17925532],
       [0.17978723],
       [0.17340426],
       ...,
       [0.19468085],
       [0.19574468],
       [0.19468085]])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
model = RandomForestRegressor(max_depth=500, random_state=2)
model.fit(X_train,y_train)

  model.fit(X_train,y_train)


In [21]:
y_pred = model.predict(X_test)

In [22]:
y_pred.reshape(-1,1)

array([[0.21985319],
       [0.17162128],
       [0.19787553],
       ...,
       [0.20868404],
       [0.19326596],
       [0.21594255]])

In [23]:
y_pred.reshape(-1,1)

array([[0.21985319],
       [0.17162128],
       [0.19787553],
       ...,
       [0.20868404],
       [0.19326596],
       [0.21594255]])

In [30]:
mse = mean_squared_error(y_test, y_pred, squared=False)
mse

0.015894587058593843

In [26]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.7608383571911255