In [1]:
import os,sys,inspect
# Make the project root (the parent of the notebook's directory) importable so
# that `from model import ...` resolves.
#
# The kernel's working directory is used instead of
# inspect.getfile(inspect.currentframe()): inside a notebook the current frame's
# "file" is a synthetic cell name (on recent ipykernel versions a temp path such
# as /tmp/ipykernel_<pid>/<hash>.py), so deriving the parent from it can point
# at an unrelated directory. The working directory is also what the relative
# "../data/..." paths used when loading the CSV files are resolved against.
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
# Guard keeps sys.path free of duplicate entries when the cell is re-run.
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

In [2]:
import pandas as pd

from model import CropYielEstimator

In [3]:
# Create the estimator with its default constructor arguments; it is trained,
# scored and queried in the cells that follow.
estimator = CropYielEstimator()

# Load data

In [4]:
# Read the training split; the path is relative to the notebook's working
# directory.
train_csv = "../data/Crop_Data__train.csv"
train_data = pd.read_csv(train_csv)

In [5]:
# Sanity check: (rows, columns) of the training frame.
train_data.shape

(164550, 25)

# Model Training

From the hyper-parameter search, we found that the following settings give the best and most stable train/test scores.

{'max_depth': 20,
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 110}

In [6]:
# Fit the estimator on the training frame; train() prints its preprocessing
# steps and the training score.
# NOTE(review): the hyper-parameters listed above are not passed in this call —
# presumably they are defaults inside CropYielEstimator; confirm in the model
# module.
estimator.train(train_data)

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.
Model trained!
Training score:  0.9564366821732151


# Model Testing

## Load test data

In [7]:
# Read the held-out test split; the path is relative to the notebook's working
# directory.
test_csv = "../data/Crop_Data__test.csv"
test_data = pd.read_csv(test_csv)

In [8]:
# Preview the first rows to confirm the columns parsed as expected.
test_data.head()

Unnamed: 0,State,District,Lat,Long,GW_depth_Min,GW_depth_Max,Wells_depth_0_to_2,Wells_depth_2_to_5,Wells_depth_5_to_10,Wells_depth_10_to_20,...,Humidity,Temp_Max,Temp_Min,Dew_Frost_Point,Wind_Speed,Year,Season,Crop,Area,Production
0,TAMIL NADU,VIRUDHUNAGAR,8.381,77.6132,3.25,17.25,0,2,7,4.0,...,87.086,27.972,25.18,24.018,4.942,1998,Kharif,Horse-gram,62.0,31.0
1,KARNATAKA,KOPPAL,14.54387,76.23089,2.8,20.15,0,6,7,6.0,...,62.672,31.426,20.175,16.552,3.351,2009,Whole Year,Banana,1183.0,50410.0
2,KARNATAKA,BANGALORE RURAL,13.17776,77.34056,2.95,17.8,0,10,17,8.0,...,75.542,28.968,19.71,19.064,3.588,2014,Kharif,Small millets,17.0,11.0
3,TAMIL NADU,NAGAPATTINAM,10.7639,79.8445,2.26,4.66,0,7,0,0.0,...,74.752,30.696,25.539,22.816,3.758,2008,Whole Year,Moong(Green Gram),34588.0,2933.0
4,CHHATTISGARH,SURGUJA,23.369,83.32876,3.16,16.46,0,5,55,10.0,...,42.092,28.338,13.756,5.292,1.502,2010,Rabi,Khesari,382.0,237.0


In [9]:
# Sanity check: (rows, columns) of the test frame.
test_data.shape

(41138, 25)

## Test score

In [10]:
# Score the trained estimator on the test frame; test() prints its
# preprocessing steps followed by the test score.
estimator.test(test_data)

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.
Test score: 0.898934720649277


## Predict

In [11]:
# Single hand-written input record for predict(). The values match row 1 of
# `test_data` (KARNATAKA / KOPPAL, Banana, 2009); State, District, Area and
# Production are left out.
test_feature = {
    # Location
    'Lat': 14.543870000000002,
    'Long': 76.23089,

    # GW_depth_* and Wells_depth_* columns
    'GW_depth_Min': 2.8,
    'GW_depth_Max': 20.15,
    'Wells_depth_0_to_2': 0,
    'Wells_depth_2_to_5': 6,
    'Wells_depth_5_to_10': 7,
    'Wells_depth_10_to_20': 6.0,
    'Wells_depth_20_to_40': 1.0,
    'Wells_depth_40_and_above': 0.0,

    # Weather readings
    'Precipitation': 70.70416666666667,
    'Solar_Radiation': 20.460833333333333,
    'Surface_Pressure': 94.08083333333332,
    'Humidity': 62.672,
    'Temp_Max': 31.426,
    'Temp_Min': 20.175,
    'Dew_Frost_Point': 16.552,
    'Wind_Speed': 3.3510000000000004,

    # Year, season and crop
    'Year': 2009,
    'Season': 'Whole Year',
    'Crop': 'Banana',
}

In [12]:
# predict() is given a list holding a single feature dict; the result shown
# below is a 1x1 array.
estimator.predict([test_feature])

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.


array([[0.69980791]])

In [13]:
# Observed yield (Production / Area) for test row 1, whose feature values
# match `test_feature`.
# NOTE(review): this ratio (~42.6) is far from the value predict() returned
# for the same features (~0.70) — confirm whether the model predicts a
# transformed target before comparing the two numbers.
sample_row = test_data.iloc[1]
sample_row['Production'] / sample_row['Area']

42.61200338123415

In [17]:
# Show test row 1 as a dict for field-by-field comparison with `test_feature`.
# NOTE(review): this cell's execution count (17) is out of sequence with the
# previous cell (13); re-run with Restart & Run All to confirm the outputs.
dict(test_data.iloc[1])

{'State': 'KARNATAKA',
 'District': 'KOPPAL',
 'Lat': 14.543870000000002,
 'Long': 76.23089,
 'GW_depth_Min': 2.8,
 'GW_depth_Max': 20.15,
 'Wells_depth_0_to_2': 0,
 'Wells_depth_2_to_5': 6,
 'Wells_depth_5_to_10': 7,
 'Wells_depth_10_to_20': 6.0,
 'Wells_depth_20_to_40': 1.0,
 'Wells_depth_40_and_above': 0.0,
 'Precipitation': 70.70416666666667,
 'Solar_Radiation': 20.460833333333333,
 'Surface_Pressure': 94.08083333333332,
 'Humidity': 62.672,
 'Temp_Max': 31.426,
 'Temp_Min': 20.175,
 'Dew_Frost_Point': 16.552,
 'Wind_Speed': 3.3510000000000004,
 'Year': 2009,
 'Season': 'Whole Year',
 'Crop': 'Banana',
 'Area': 1183.0,
 'Production': 50410.0}