In [1]:
import os,sys,inspect
# Put the parent of the notebook's working directory on sys.path so the
# `common` package imported below can be found (assumes `common/` lives there).
# The working directory is used instead of the current frame's filename: inside
# a kernel a cell has no real source file, and the frame's filename may be a
# placeholder or a temp path that has nothing to do with the project layout.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard the insert so re-running this cell does not stack duplicate entries.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor

from common.preprocessing import CropDataProcessor

import pickle

# Load data

In [3]:
# Read the training split from the data directory next to this notebook's folder.
train_csv_path = "../data/Crop_Data__train.csv"
train_data = pd.read_csv(train_csv_path)

In [4]:
# (rows, columns) of the raw training frame as loaded from the CSV.
train_data.shape

(164552, 25)

# Preprocessing

In [5]:
# Columns handed to the preprocessor as `excluded_features`.
excluded_columns = ['Dew_Frost_Point', 'Year', 'State', 'District']
data_preprocessor = CropDataProcessor(train_data, excluded_features=excluded_columns)

In [6]:
# Shape of the feature matrix (first element returned by get_training_data())
# before process_to_train() has been run.
data_preprocessor.get_training_data()[0].shape



(164552, 19)

In [7]:
# Run the preprocessor's training-time pipeline; the stages it performs are
# reported in the log it prints.
# NOTE(review): appears to update the preprocessor's stored training data in
# place (the feature count changes afterwards) - confirm in CropDataProcessor.
data_preprocessor.process_to_train()

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.


In [8]:
# Fetch features and target again now that process_to_train() has run,
# then display both shapes as a sanity check.
X, y = data_preprocessor.get_training_data()
X.shape, y.shape

((164552, 28), (164552,))

# Model Training

From the hyper-parameter search, we found that the following settings give the best and most stable train/test scores.

{'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 110}

In [9]:
# Hyper-parameters chosen by the earlier search.
rf_params = {
    "n_estimators": 110,
    "max_depth": 20,
    "min_samples_leaf": 2,
    "min_samples_split": 4,
}
# n_jobs=-1 uses all available cores; the fixed random_state keeps runs repeatable.
reg = RandomForestRegressor(**rf_params, n_jobs=-1, random_state=101)

In [10]:
# Fit the forest on the full processed training set; no hold-out split is made
# here because evaluation uses the separate test CSV loaded later.
reg.fit(X, y)

RandomForestRegressor(max_depth=20, min_samples_leaf=2, min_samples_split=4,
                      n_estimators=110, n_jobs=-1, random_state=101)

## Save model

In [11]:
# Persist the fitted estimator so it can be reloaded without retraining.
with open('../model/model.pkl', 'wb') as model_file:
    pickle.dump(reg, model_file)

## Load model

In [12]:
# Reload the estimator from disk so the scores below are computed with the
# saved artifact rather than the in-memory `reg`.
# No `model = None` pre-assignment: if the load fails, the cell raises and no
# misleading placeholder is left behind for later cells.
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files that you created or otherwise trust.
with open('../model/model.pkl', 'rb') as f:
    model = pickle.load(f)

## Training score

In [13]:
# For a regressor, score() returns R^2; here it is measured on the data the model was fit on.
train_r2 = model.score(X, y)
print("Training score: ", train_r2)

Training score:  0.9562725656456335


# Model Testing

## Load test data

In [14]:
# Read the held-out test split from the same data directory as the training CSV.
test_csv_path = "../data/Crop_Data__test.csv"
test_data = pd.read_csv(test_csv_path)

In [15]:
# Preview the first rows of the raw test frame.
test_data.head()

Unnamed: 0,State,District,Lat,Long,GW_depth_Min,GW_depth_Max,Wells_depth_0_to_2,Wells_depth_2_to_5,Wells_depth_5_to_10,Wells_depth_10_to_20,...,Humidity,Temp_Max,Temp_Min,Dew_Frost_Point,Wind_Speed,Year,Season,Crop,Area,Production
0,GUJARAT,NAVSARI,20.85,72.9167,3.4,13.4,0,6,5,4.0,...,83.652,29.72,26.858,25.084,3.964,2010,Kharif,Rice,62600.0,161300.0
1,HIMACHAL PRADESH,MANDI,31.64519,76.99529,0.74,8.94,3,3,2,0.0,...,69.244,25.618,17.336,14.262,1.442,2010,Kharif,Moong(Green Gram),16.0,6.0
2,UTTAR PRADESH,BULANDSHAHR,28.39912,78.02749,8.12,12.4,0,0,2,4.0,...,73.872,32.682,24.36,22.78,1.576,2013,Kharif,Arhar/Tur,8911.0,6728.0
3,KARNATAKA,BELGAUM,15.85483,74.5042,1.6,18.0,2,12,31,20.0,...,83.258,27.862,20.812,20.664,3.034,2000,Kharif,Groundnut,80929.0,77319.0
4,KARNATAKA,HAVERI,14.54387,76.23089,2.0,13.8,1,1,8,5.0,...,42.308,33.314,17.434,10.032,2.698,2012,Rabi,Jowar,31611.0,18889.0


In [16]:
# (rows, columns) of the raw test frame.
test_data.shape

(41139, 25)

In [17]:
# Turn the raw test frame into features and target with the preprocessor that
# was built from the training data.
# NOTE(review): presumably reuses the transforms fitted by process_to_train()
# rather than refitting on the test data - confirm in CropDataProcessor.
X_test, y_test = data_preprocessor.process_to_test(test_data)

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.


In [18]:
# Shapes of the processed test features and target. Note that y_test comes back
# 2-D, whereas the training target y was 1-D.
X_test.shape, y_test.shape

((41139, 28), (41139, 1))

## Test score

In [19]:
# R^2 of the reloaded model on the processed held-out test set.
test_r2 = model.score(X_test, y_test)
print(f"Test score: {test_r2}")

Test score: 0.8968860397220535


In [20]:
# Show the raw test row at position 1 as a column -> value mapping.
dict(test_data.iloc[1])

{'State': 'HIMACHAL PRADESH',
 'District': 'MANDI',
 'Lat': 31.645190000000003,
 'Long': 76.99529,
 'GW_depth_Min': 0.74,
 'GW_depth_Max': 8.94,
 'Wells_depth_0_to_2': 3,
 'Wells_depth_2_to_5': 3,
 'Wells_depth_5_to_10': 2,
 'Wells_depth_10_to_20': 0.0,
 'Wells_depth_20_to_40': 0.0,
 'Wells_depth_40_and_above': 0.0,
 'Precipitation': 235.352,
 'Solar_Radiation': 18.558,
 'Surface_Pressure': 84.21799999999998,
 'Humidity': 69.244,
 'Temp_Max': 25.618000000000002,
 'Temp_Min': 17.336,
 'Dew_Frost_Point': 14.262,
 'Wind_Speed': 1.442,
 'Year': 2010,
 'Season': 'Kharif',
 'Crop': 'Moong(Green Gram)',
 'Area': 16.0,
 'Production': 6.0}

In [21]:
# A single hand-written input record for an ad-hoc prediction. It carries only
# raw columns that are not in the preprocessor's excluded list and has no target.
test_feature = {
    # Location
    'Lat': 14.543870000000002,
    'Long': 76.23089,
    # Groundwater depth range
    'GW_depth_Min': 2.0,
    'GW_depth_Max': 13.8,
    # Well counts per depth band
    'Wells_depth_0_to_2': 1,
    'Wells_depth_2_to_5': 1,
    'Wells_depth_5_to_10': 8,
    'Wells_depth_10_to_20': 5.0,
    'Wells_depth_20_to_40': 0.0,
    'Wells_depth_40_and_above': 0.0,
    # Weather
    'Precipitation': 37.56416666666667,
    'Solar_Radiation': 19.804166666666667,
    'Surface_Pressure': 94.11333333333334,
    'Humidity': 56.123999999999995,
    'Temp_Max': 32.937,
    'Temp_Min': 20.16,
    'Wind_Speed': 3.511,
    # Categorical inputs
    'Season': 'Whole Year',
    'Crop': 'Arcanut (Processed)',
}

In [22]:
# Wrap the record in a one-row frame, run it through the preprocessor's
# prediction path, then ask the model for its output.
single_row = pd.DataFrame(test_feature, index=[1])
single_row_features = data_preprocessor.process_to_predict(single_row)
prediction = model.predict(single_row_features)

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.


In [23]:
# Map the model output back through the preprocessor's target transformer.
# reshape(-1, 1) gives the transformer one row per prediction, so this also
# works when `prediction` holds more than one value. Wrapping the array in a
# list only produces that layout when there is exactly one prediction.
# NOTE(review): the trailing "- 1" presumably undoes a +1 offset applied to the
# target before it was transformed - confirm in CropDataProcessor.
data_preprocessor.target_transformer.inverse_transform(prediction.reshape(-1, 1)) - 1

array([[1.36202686]])