In [1]:
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

In [2]:
import pandas as pd
import numpy as np

from dask.distributed import Client
import joblib

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split

from common.preprocessing import CropDataProcessor

import time

In [3]:
client = Client(processes=False)

  next(self.gen)


In [3]:
train_data = pd.read_csv("../data/Crop_Data__train.csv")

In [4]:
train_data.shape

(164986, 25)

In [5]:
data_preprocessor = CropDataProcessor(train_data, excluded_features=['Dew_Frost_Point', 'Year'])



In [6]:
data_preprocessor.get_training_data()[0].shape



(164986, 21)

In [7]:
data_preprocessor.process_to_train()

Clustered Lat-Long to Geo Region.
Encoded Crop using WoE.
Transforming numerical features.
Transforming categorical features.


In [8]:
X, y = data_preprocessor.get_training_data()
X.shape, y.shape

((164986, 28), (164986,))

In [9]:
# print(data_preprocessor.process_to_predict(train_data.loc[100:105].drop(['Area', 'Production'], axis=1)).shape)
# data_preprocessor.process_to_predict(train_data.loc[100:105].drop(['Area', 'Production'], axis=1))[:10]

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=101)

In [11]:
X_train.shape, X_val.shape

((131988, 28), (32998, 28))

In [13]:
start_time = time.time()

reg = RandomForestRegressor(n_estimators=25)

with joblib.parallel_backend('dask'):
    reg.fit(X_train, y_train)
    
print(f"Finished in {time.time() - start_time} secs")

Finished in 8.314243078231812 secs


In [14]:
reg.score(X_train, y_train)

0.9836441415509415

In [15]:
reg.score(X_val, y_val)

0.8960234659535898

## RandomizedSearchCV

### Iter 1

In [29]:
param_space = {
    "n_estimators": [5, 15, 30, 50, 75, 100, 200],
    "max_depth": [10, 50, 100, 150, 200],
    "min_samples_split": [2, 5, 10, 15],
    "min_samples_leaf": [1, 2, 3, 5, 8],
    "max_features": ['auto', 'sqrt']
}

In [30]:
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(), 
    param_distributions=param_space,
    cv=5, 
    n_iter=100, 
    verbose=5)

In [31]:
with joblib.parallel_backend('dask'):
    random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 5/5; 3/100] END max_depth=150, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=  54.2s
[CV 4/5; 1/100] END max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=30; total time=  55.9s
[CV 3/5; 1/100] END max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=30; total time=  56.7s
[CV 1/5; 1/100] END max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=30; total time=  57.2s
[CV 2/5; 1/100] END max_depth=100, max_features=auto, min_samples_leaf=2, min_samples_split=15, n_estimators=30; total time=  58.5s
[CV 2/5; 3/100] END max_depth=150, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=  28.2s
[CV 3/5; 3/100] END max_depth=150, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=30; total time=  28.5s
[CV 1/5; 3/100] 

Exception ignored in: <function _WeakKeyDictionary.__setitem__.<locals>.on_destroy at 0x7fbb1addbb70>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/joblib/_dask.py", line 74, in on_destroy
    del self._data[key]
KeyError: (140442495798688,)


In [32]:
random_search.best_estimator_

RandomForestRegressor(max_depth=50, min_samples_leaf=2)

In [33]:
random_search.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 50}

In [34]:
random_search.best_score_

0.8904260112124378

In [36]:
pd.DataFrame(random_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,87.176944,2.987074,0.352893,0.052287,30,10,2,auto,50,"{'n_estimators': 30, 'min_samples_split': 10, ...",0.883741,0.884130,0.882628,0.888429,0.889513,0.885688,0.002747,10
1,501.962215,3.427178,2.392382,0.697858,200,10,2,auto,150,"{'n_estimators': 200, 'min_samples_split': 10,...",0.886434,0.887549,0.885019,0.891197,0.893073,0.888655,0.003012,3
2,15.671151,8.202118,0.195996,0.064425,15,15,2,sqrt,100,"{'n_estimators': 15, 'min_samples_split': 15, ...",0.824301,0.829181,0.829282,0.833480,0.836325,0.830514,0.004110,59
3,513.489431,3.756711,1.463211,0.196531,200,5,5,auto,200,"{'n_estimators': 200, 'min_samples_split': 5, ...",0.882595,0.882532,0.880453,0.886334,0.888332,0.884049,0.002860,13
4,12.421960,12.372773,0.177599,0.052431,15,2,1,sqrt,10,"{'n_estimators': 15, 'min_samples_split': 2, '...",0.736228,0.745794,0.721875,0.744873,0.739510,0.737656,0.008635,98
5,233.439733,3.035414,0.519358,0.123196,100,2,2,sqrt,10,"{'n_estimators': 100, 'min_samples_split': 2, ...",0.751049,0.743855,0.750091,0.742634,0.744230,0.746372,0.003481,91
6,441.816108,12.330156,2.143538,0.306637,200,10,1,sqrt,50,"{'n_estimators': 200, 'min_samples_split': 10,...",0.839473,0.842292,0.840789,0.844065,0.846411,0.842606,0.002442,37
7,228.458030,15.544140,0.584133,0.386582,100,5,1,auto,10,"{'n_estimators': 100, 'min_samples_split': 5, ...",0.812385,0.811115,0.811733,0.809323,0.815982,0.812107,0.002189,71
8,3.964958,0.565347,0.158269,0.061941,15,10,3,auto,10,"{'n_estimators': 15, 'min_samples_split': 10, ...",0.810613,0.808163,0.808619,0.806754,0.811787,0.809188,0.001794,82
9,70.005068,7.170433,0.327424,0.126534,30,15,8,auto,50,"{'n_estimators': 30, 'min_samples_split': 15, ...",0.872826,0.874186,0.871368,0.876528,0.878793,0.874740,0.002644,32


### Iter 2

In [13]:
param_space_2 = {
    "n_estimators": np.arange(75, 125, 5),
    "max_depth": [30, 40, 50, 60, 70],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3]
}

random_search_2 = RandomizedSearchCV(
    estimator=RandomForestRegressor(), 
    param_distributions=param_space_2,
    cv=5, 
    n_iter=100, 
    verbose=2)

with joblib.parallel_backend('dask'):
    random_search_2.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=50, min_samples_leaf=2, min_samples_split=2, n_estimators=90; total time= 6.2min
[CV] END max_depth=50, min_samples_leaf=2, min_samples_split=2, n_estimators=90; total time= 6.4min
[CV] END max_depth=50, min_samples_leaf=2, min_samples_split=2, n_estimators=90; total time= 6.4min
[CV] END max_depth=50, min_samples_leaf=2, min_samples_split=2, n_estimators=90; total time= 6.4min
[CV] END max_depth=30, min_samples_leaf=3, min_samples_split=3, n_estimators=80; total time= 6.1min
[CV] END max_depth=30, min_samples_leaf=3, min_samples_split=3, n_estimators=80; total time= 6.1min
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=4, n_estimators=80; total time= 6.0min
[CV] END max_depth=50, min_samples_leaf=2, min_samples_split=2, n_estimators=90; total time= 6.1min
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=4, n_estimators=80; total time= 6.1min
[CV] END max_depth=60, min_samples_le

Exception ignored in: <function _WeakKeyDictionary.__setitem__.<locals>.on_destroy at 0x7f5d46e8bd90>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/joblib/_dask.py", line 74, in on_destroy
    del self._data[key]
KeyError: (140038550304608,)


In [14]:
random_search_2.best_params_

{'n_estimators': 120,
 'min_samples_split': 4,
 'min_samples_leaf': 1,
 'max_depth': 70}

In [15]:
random_search_2.best_score_

0.8918147785634218

In [16]:
pd.DataFrame(random_search_2.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,376.200562,7.141133,1.087619,0.286823,90,2,2,50,"{'n_estimators': 90, 'min_samples_split': 2, '...",0.888370,0.889369,0.886997,0.892671,0.894453,0.890372,0.002769,56
1,484.232859,12.543976,1.042364,0.054259,105,4,2,70,"{'n_estimators': 105, 'min_samples_split': 4, ...",0.888501,0.889988,0.886615,0.892973,0.894707,0.890557,0.002938,46
2,366.794357,22.149013,0.897202,0.312903,80,3,3,30,"{'n_estimators': 80, 'min_samples_split': 3, '...",0.886167,0.887290,0.884773,0.890998,0.892472,0.888340,0.002922,83
3,355.827008,5.826697,0.791352,0.077966,80,4,1,50,"{'n_estimators': 80, 'min_samples_split': 4, '...",0.889334,0.890576,0.888058,0.893436,0.895213,0.891323,0.002637,17
4,380.619831,4.020677,0.988713,0.159279,95,4,3,60,"{'n_estimators': 95, 'min_samples_split': 4, '...",0.886802,0.887737,0.884934,0.891210,0.892155,0.888567,0.002715,73
5,471.705379,4.527807,0.965433,0.032911,115,2,3,50,"{'n_estimators': 115, 'min_samples_split': 2, ...",0.886140,0.888012,0.884854,0.890711,0.892679,0.888479,0.002879,76
6,463.314799,3.077675,1.002584,0.057013,105,3,1,70,"{'n_estimators': 105, 'min_samples_split': 3, ...",0.889379,0.890264,0.888059,0.893971,0.895998,0.891534,0.002973,9
7,365.126457,5.959621,0.830760,0.048391,85,4,1,50,"{'n_estimators': 85, 'min_samples_split': 4, '...",0.888376,0.890319,0.887179,0.893933,0.895501,0.891062,0.003187,29
8,452.143302,2.531989,1.035036,0.051433,120,4,1,70,"{'n_estimators': 120, 'min_samples_split': 4, ...",0.889948,0.890836,0.888320,0.894516,0.895453,0.891815,0.002727,1
9,330.728668,8.958723,0.737815,0.051031,75,3,2,60,"{'n_estimators': 75, 'min_samples_split': 3, '...",0.887874,0.888967,0.886426,0.892684,0.894663,0.890123,0.003074,66


### Iter 3

In [16]:
param_space_3 = {
    "n_estimators": [50, 80, 120, 150],
    "max_depth": [50, 75, 100],
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 4]
}

random_search_3 = RandomizedSearchCV(
    estimator=RandomForestRegressor(), 
    param_distributions=param_space_3,
    cv=3, 
    n_iter=50, 
    verbose=2)

with joblib.parallel_backend('dask'):
    random_search_3.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time= 2.9min
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time= 3.2min
[CV] END max_depth=50, min_samples_leaf=1, min_samples_split=3, n_estimators=50; total time= 3.5min
[CV] END max_depth=75, min_samples_leaf=2, min_samples_split=3, n_estimators=80; total time= 4.5min
[CV] END max_depth=75, min_samples_leaf=1, min_samples_split=4, n_estimators=80; total time= 4.6min
[CV] END max_depth=75, min_samples_leaf=2, min_samples_split=3, n_estimators=80; total time= 4.7min
[CV] END max_depth=75, min_samples_leaf=2, min_samples_split=3, n_estimators=80; total time= 4.7min
[CV] END max_depth=75, min_samples_leaf=1, min_samples_split=4, n_estimators=80; total time= 4.7min
[CV] END max_depth=75, min_samples_leaf=1, min_samples_split=4, n_estimators=80; total time= 3.9min
[CV] END max_depth=100, min_samples_le

In [17]:
random_search_3.best_params_

{'n_estimators': 150,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_depth': 75}

In [18]:
random_search_3.best_score_

0.8861842435120607

## GridSearchCV

In [None]:
grid_param = {
    "n_estimators": [150, 160, 170, 180],
    "max_depth": [50, 60, 70],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1]
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(n_jobs=-1),
    param_grid=grid_param,
    cv=3,
    n_jobs=-1,
    return_train_score=True,
    verbose=2)

# with joblib.parallel_backend('dask'):
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


