## Wild Blueberry Yield Prediction (Hyperparameter Optimization)

Given data about *wild blueberries*, let's try to predict the *yield* for a given record.

We will use a random forest regression model to make our predictions, then tune its hyperparameters with a cross-validated grid search.

Data source: https://www.kaggle.com/datasets/shashwatwork/wild-blueberry-yield-prediction-dataset

### Getting Started 

In [14]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestRegressor

In [15]:
# Load the dataset from a CSV file; the path is relative, so it is resolved
# against the notebook's working directory.
data = pd.read_csv('WildBlueberryPollinationSimulationData.csv')
# Bare trailing expression: the notebook renders the frame as the cell output.
data

Unnamed: 0,Row#,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds,yield
0,0,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,16.00,0.26,0.410652,0.408159,31.678898,3813.165795
1,1,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,1.00,0.10,0.444254,0.425458,33.449385,4947.605663
2,2,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,16.00,0.26,0.383787,0.399172,30.546306,3866.798965
3,3,37.5,0.750,0.250,0.250,0.250,94.6,57.2,79.0,68.2,33.0,55.9,1.00,0.10,0.407564,0.408789,31.562586,4303.943030
4,4,37.5,0.750,0.250,0.250,0.250,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.354413,0.382703,28.873714,3436.493543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,772,10.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.486815,0.428012,33.447471,5333.873335
773,773,40.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.342841,0.377915,28.462005,3373.436842
774,774,20.0,0.537,0.117,0.409,0.058,86.0,52.0,71.9,62.0,30.0,50.8,24.00,0.39,0.404617,0.401670,30.748240,4203.027624
775,775,20.0,0.537,0.117,0.409,0.058,89.0,39.0,65.6,66.0,28.0,45.3,3.77,0.06,0.401538,0.399935,30.582161,4166.299735


In [16]:
# Print each column's dtype and non-null count to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Row#                  777 non-null    int64  
 1   clonesize             777 non-null    float64
 2   honeybee              777 non-null    float64
 3   bumbles               777 non-null    float64
 4   andrena               777 non-null    float64
 5   osmia                 777 non-null    float64
 6   MaxOfUpperTRange      777 non-null    float64
 7   MinOfUpperTRange      777 non-null    float64
 8   AverageOfUpperTRange  777 non-null    float64
 9   MaxOfLowerTRange      777 non-null    float64
 10  MinOfLowerTRange      777 non-null    float64
 11  AverageOfLowerTRange  777 non-null    float64
 12  RainingDays           777 non-null    float64
 13  AverageRainingDays    777 non-null    float64
 14  fruitset              777 non-null    float64
 15  fruitmass             7

### Preprocessing

In [17]:
def preprocess_inputs(df, *, target='yield', id_column='Row#', train_size=0.7, random_state=1):
    """Split a raw DataFrame into train/test feature matrices and target vectors.

    The caller's DataFrame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records containing the target column and, optionally, a row
        identifier column.
    target : str, default 'yield'
        Name of the column to predict. It is removed from the features.
    id_column : str or None, default 'Row#'
        Row identifier column to discard before modelling. Pass ``None``
        when the frame has no such column.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int or None, default 1
        Forwarded to ``train_test_split`` as ``random_state`` so the shuffled
        split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.

    Raises
    ------
    KeyError
        If ``target`` or a non-``None`` ``id_column`` is not a column of ``df``.
    """
    df = df.copy()

    # Drop the row identifier: it only enumerates records and is not a feature
    if id_column is not None:
        df = df.drop(id_column, axis=1)

    # Split df into X and y
    y = df[target]
    X = df.drop(target, axis=1)

    # Shuffled train-test split; the seed keeps the partition stable across runs
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [18]:
# Build the train/test partitions from the raw frame.
X_train, X_test, y_train, y_test = preprocess_inputs(data)
# Bare trailing expression: display the training features as the cell output.
X_train

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
214,12.5,0.25,0.250,0.50,0.50,94.6,57.2,79.0,68.2,33.0,55.9,1.00,0.10,0.582954,0.488176,40.559770
88,12.5,0.25,0.250,0.25,0.50,86.0,52.0,71.9,62.0,30.0,50.8,34.00,0.56,0.435969,0.419720,32.815794
479,25.0,0.50,0.250,0.38,0.63,94.6,57.2,79.0,68.2,33.0,55.9,24.00,0.39,0.364565,0.391617,29.908518
602,25.0,0.50,0.250,0.75,0.50,86.0,52.0,71.9,62.0,30.0,50.8,1.00,0.10,0.523846,0.460305,37.277297
147,12.5,0.25,0.250,0.38,0.38,86.0,52.0,71.9,62.0,30.0,50.8,16.00,0.26,0.553730,0.471250,38.534569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,25.0,0.50,0.380,0.50,0.63,94.6,57.2,79.0,68.2,33.0,55.9,16.00,0.26,0.527592,0.464639,37.782288
767,20.0,0.00,0.585,0.00,0.00,86.0,52.0,71.9,62.0,30.0,50.8,3.77,0.06,0.599984,0.529791,46.585105
72,12.5,0.25,0.250,0.25,0.38,86.0,52.0,71.9,62.0,30.0,50.8,34.00,0.56,0.416271,0.409438,31.577558
235,12.5,0.25,0.250,0.50,0.63,77.4,46.8,64.7,55.8,27.0,45.8,16.00,0.26,0.589306,0.488616,40.546480


In [19]:
# Summary statistics for every training feature.
# NOTE(review): in the recorded output `honeybee` has a max of 18.43 against a
# 75th percentile of 0.5 — possibly an outlier or data-entry error; confirm.
X_train.describe()

Unnamed: 0,clonesize,honeybee,bumbles,andrena,osmia,MaxOfUpperTRange,MinOfUpperTRange,AverageOfUpperTRange,MaxOfLowerTRange,MinOfLowerTRange,AverageOfLowerTRange,RainingDays,AverageRainingDays,fruitset,fruitmass,seeds
count,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0,543.0
mean,18.775322,0.389118,0.283116,0.476162,0.559987,82.078637,49.56372,68.547514,59.167956,28.617495,48.487845,18.121418,0.317827,0.503151,0.446645,36.201995
std,6.922487,0.787501,0.063583,0.160474,0.162965,9.19762,5.606868,7.678822,6.652775,3.209863,5.418443,12.188437,0.171467,0.078537,0.039988,4.319438
min,12.5,0.0,0.0,0.0,0.0,69.7,39.0,58.2,50.2,24.3,41.2,1.0,0.06,0.192732,0.311921,22.079199
25%,12.5,0.25,0.25,0.38,0.5,77.4,46.8,64.7,55.8,27.0,45.8,1.0,0.1,0.458556,0.417076,33.225444
50%,12.5,0.25,0.25,0.5,0.63,86.0,52.0,71.9,62.0,30.0,50.8,16.0,0.26,0.508992,0.446576,36.250838
75%,25.0,0.5,0.38,0.63,0.63,86.0,52.0,71.9,62.0,30.0,50.8,24.0,0.39,0.562291,0.476229,39.285339
max,40.0,18.43,0.585,0.75,0.75,94.6,57.2,79.0,68.2,33.0,55.9,34.0,0.56,0.645641,0.532772,46.585105


In [21]:
# Training targets (the `yield` column); they carry the same row index as X_train.
y_train

214    7243.226111
88     4684.893205
479    3723.523376
602    6521.291119
147    6683.200614
          ...     
715    6327.477365
767    7575.801245
72     4350.424670
235    7560.205645
37     4356.945873
Name: yield, Length: 543, dtype: float64

### Training

In [22]:
# Baseline: a random forest with library-default hyperparameters; only the seed
# is fixed so the fit is reproducible. `fit` returns the estimator itself.
model = RandomForestRegressor(random_state=1).fit(X_train, y_train)

# Predicted yields for the held-out test rows.
y_pred = model.predict(X_test)

In [23]:
# Display the raw predicted yields for the test set.
y_pred

array([8347.89770723, 5617.43353994, 6040.91770472, 4173.93171129,
       8473.76616715, 7234.76702628, 6225.91778631, 6859.34549124,
       6637.02444783, 6580.87958142, 5669.79024112, 5777.39852827,
       4465.31845819, 7805.50049886, 6675.22715612, 5921.94715828,
       4530.86721559, 7435.78687431, 7823.00351099, 5233.51084744,
       6296.30291283, 3738.70103395, 7624.08015921, 5626.33850482,
       5209.72012965, 5977.9733837 , 3630.90959874, 4334.47925919,
       5753.88279328, 4614.79823169, 6710.7406748 , 3790.68113273,
       5144.07202304, 8407.99085595, 7423.64841492, 6837.64713186,
       6755.38550009, 5670.1757635 , 5379.44014731, 8732.92815879,
       8170.96906526, 6758.20597224, 2611.7916052 , 5807.17117923,
       7103.17276231, 5410.07050787, 4690.53921909, 5777.89016355,
       4335.91768002, 5830.45196381, 5043.65935806, 7566.67442886,
       7822.88966909, 7255.44162752, 3539.58488246, 6306.49081446,
       4780.87702349, 5272.44565973, 5129.63675496, 6462.19274

In [33]:
# Test-set error: root-mean-squared error and coefficient of determination.
residuals = y_test - y_pred
ss_res = np.sum(residuals**2)                  # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean())**2)   # total sum of squares about the mean

rmse = np.sqrt(np.mean(residuals**2))
r2 = 1 - (ss_res / ss_tot)

print("     RMSE: {:.2f}".format(rmse))
print("R^2 Score: {:.5f}".format(r2))

     RMSE: 186.23
R^2 Score: 0.98244


#### Hyperparameter Optimization

In [35]:
# Candidate hyperparameter values: 4 forest sizes x 5 depth limits = 20 combinations.
params = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[2, 4, 6, 8, 10],
)

# Exhaustive cross-validated search over the grid, using the same seeded
# random forest as the baseline. `fit` returns the fitted search object.
model = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=params,
).fit(X_train, y_train)

# Predict the held-out test rows with the fitted search object.
y_pred = model.predict(X_test)

# Same test-set metrics as for the baseline model.
residuals = y_test - y_pred
ss_res = np.sum(residuals**2)                  # residual sum of squares
ss_tot = np.sum((y_test - y_test.mean())**2)   # total sum of squares about the mean

rmse = np.sqrt(np.mean(residuals**2))
r2 = 1 - (ss_res / ss_tot)

print("     RMSE: {:.2f}".format(rmse))
print("R^2 Score: {:.5f}".format(r2))

     RMSE: 185.37
R^2 Score: 0.98260


In [36]:
# Hyperparameter combination selected by the grid search.
# NOTE(review): the recorded output ({'max_depth': 10, 'n_estimators': 200}) is
# the largest value in both searched ranges, so the optimum may lie outside the
# grid — consider widening it; confirm on re-run.
model.best_params_

{'max_depth': 10, 'n_estimators': 200}