# Library

In [1]:
# Native library
import copy
import warnings
warnings.filterwarnings('ignore')

# Save object
import joblib

# Data management
import numpy as np
import pandas as pd

# Data visualization
import plotly.express as px

# Data preprocessing
# from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Hyperoptimization
from sklearn.model_selection import GridSearchCV

# Regressor models
from xgboost import XGBRegressor

# Model evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [2]:
# Name of the column the regressor is trained to predict
TARGET = "Rice Yield (kg/ha)"

# Import Data

In [3]:
# Load the training table stored in "train_vi.csv" (field attributes, a `date`
# column and index columns such as `ndvi`).
df_vi = pd.read_csv("../../data/processed/fixed_0-00146/train_vi.csv")

# Schema check: column names, non-null counts and dtypes.
df_vi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13368 entries, 0 to 13367
Data columns (total 17 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   District                                        13368 non-null  object 
 1   Latitude                                        13368 non-null  float64
 2   Longitude                                       13368 non-null  float64
 3   Season(SA = Summer Autumn, WS = Winter Spring)  13368 non-null  object 
 4   Rice Crop Intensity(D=Double, T=Triple)         13368 non-null  object 
 5   Date of Harvest                                 13368 non-null  object 
 6   Field size (ha)                                 13368 non-null  float64
 7   Rice Yield (kg/ha)                              13368 non-null  int64  
 8   date                                            13368 non-null  object 
 9   ndvi                                   

In [4]:
# Load the training table stored in "train.csv" (same field attributes and
# `date` column, plus band columns such as `red`).
df_sat = pd.read_csv("../../data/processed/fixed_0-00146/train.csv")

# Schema check: column names, non-null counts and dtypes.
df_sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13368 entries, 0 to 13367
Data columns (total 17 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   District                                        13368 non-null  object 
 1   Latitude                                        13368 non-null  float64
 2   Longitude                                       13368 non-null  float64
 3   Season(SA = Summer Autumn, WS = Winter Spring)  13368 non-null  object 
 4   Rice Crop Intensity(D=Double, T=Triple)         13368 non-null  object 
 5   Date of Harvest                                 13368 non-null  object 
 6   Field size (ha)                                 13368 non-null  float64
 7   Rice Yield (kg/ha)                              13368 non-null  int64  
 8   date                                            13368 non-null  object 
 9   red                                    

In [5]:
# Columns present in both training tables; together they identify a row
# (field attributes, target value and observation date).
train_merge_keys = [
    'District', 'Latitude', 'Longitude',
    'Season(SA = Summer Autumn, WS = Winter Spring)',
    'Rice Crop Intensity(D=Double, T=Triple)', 'Date of Harvest',
    'Field size (ha)', 'Rice Yield (kg/ha)', 'date',
]

# Inner join (the pandas default) of the two tables on the shared columns.
df = df_vi.merge(df_sat, on=train_merge_keys)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13368 entries, 0 to 13367
Data columns (total 25 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   District                                        13368 non-null  object 
 1   Latitude                                        13368 non-null  float64
 2   Longitude                                       13368 non-null  float64
 3   Season(SA = Summer Autumn, WS = Winter Spring)  13368 non-null  object 
 4   Rice Crop Intensity(D=Double, T=Triple)         13368 non-null  object 
 5   Date of Harvest                                 13368 non-null  object 
 6   Field size (ha)                                 13368 non-null  float64
 7   Rice Yield (kg/ha)                              13368 non-null  int64  
 8   date                                            13368 non-null  object 
 9   ndvi                                   

In [6]:
# Preview the first rows of the merged training table.
df.head()

Unnamed: 0,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Rice Yield (kg/ha),date,ndvi,...,mtvi1,lswi,red,green,blue,rededge1,rededge2,rededge3,nir,swir
0,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.4,5500,14-07-2022,0.182807,...,3062.295791,0.276005,3707.568604,4139.405273,4338.941406,4399.774414,5063.293945,5313.82666,5366.339844,3044.820312
1,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.4,5500,09-07-2022,0.453044,...,5106.577866,0.266163,1954.647095,2239.111084,1972.908447,2674.83667,4337.316895,5363.189453,5192.712402,3009.568604
2,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.4,5500,04-07-2022,0.125385,...,2728.251445,0.210415,5195.346191,5569.202637,5768.026367,5971.241699,6526.993652,6799.486816,6684.954102,4360.764648
3,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.4,5500,29-06-2022,0.176642,...,3024.090264,0.345635,3937.790771,4316.660156,4571.54248,4673.444336,5374.594727,5831.339844,5627.411621,2736.535889
4,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.4,5500,24-06-2022,0.452227,...,4482.325225,0.234559,1723.215698,1970.081665,1815.310425,2227.43457,3854.986816,4699.708984,4568.503418,2832.526123


## Renaming & Typing

In [7]:
# Store the crop-intensity flag as a pandas categorical so it can be
# one-hot encoded later; every other column keeps its inferred dtype.
intensity_col = "Rice Crop Intensity(D=Double, T=Triple)"
df = df.astype({intensity_col: "category"}, copy=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13368 entries, 0 to 13367
Data columns (total 25 columns):
 #   Column                                          Non-Null Count  Dtype   
---  ------                                          --------------  -----   
 0   District                                        13368 non-null  object  
 1   Latitude                                        13368 non-null  float64 
 2   Longitude                                       13368 non-null  float64 
 3   Season(SA = Summer Autumn, WS = Winter Spring)  13368 non-null  object  
 4   Rice Crop Intensity(D=Double, T=Triple)         13368 non-null  category
 5   Date of Harvest                                 13368 non-null  object  
 6   Field size (ha)                                 13368 non-null  float64 
 7   Rice Yield (kg/ha)                              13368 non-null  int64   
 8   date                                            13368 non-null  object  
 9   ndvi                        

# Feature Engineering

## Numerical data

In [8]:
# Columns that identify one field record; all per-date rows sharing these
# values are collapsed into a single row.
train_group_keys = [
    'District', 'Latitude', 'Longitude',
    'Season(SA = Summer Autumn, WS = Winter Spring)',
    'Rice Crop Intensity(D=Double, T=Triple)', 'Date of Harvest',
    'Field size (ha)', 'Rice Yield (kg/ha)',
]

# Summarise every remaining column per group with describe()
# (count, mean, std, min, quartiles, max).
df = df.drop(columns="date").groupby(train_group_keys).describe()

# describe() produces two-level column labels (feature, statistic);
# flatten them into single names such as "ndvi_mean".
df.columns = ["_".join(label).strip("_") for label in df.columns]

In [9]:
# Move the modelling attributes (and the target) out of the row index and back
# into regular columns; the remaining index levels only identify the field.
df = df.reset_index(
    level=["Field size (ha)", "Rice Yield (kg/ha)", "Rice Crop Intensity(D=Double, T=Triple)"]
)

# Data Preparation

In [10]:
# Preview the aggregated training table (one row per group).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,"Rice Crop Intensity(D=Double, T=Triple)",Field size (ha),Rice Yield (kg/ha),ndvi_count,ndvi_mean,ndvi_std,ndvi_min,ndvi_25%,ndvi_50%,ndvi_75%,...,nir_75%,nir_max,swir_count,swir_mean,swir_std,swir_min,swir_25%,swir_50%,swir_75%,swir_max
District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)",Date of Harvest,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Chau_Phu,10.445962,105.123164,WS,07-04-2022,D,3.0,7200,24.0,0.313879,0.278279,-0.070971,0.064601,0.358543,0.476366,...,5589.562134,10878.248047,24.0,3227.041785,1665.606455,376.104584,2252.296509,2833.02124,3857.703369,7925.820312
Chau_Phu,10.44784,105.141425,SA,14-07-2022,D,2.0,6600,24.0,0.227551,0.197603,-0.070756,0.060892,0.15641,0.405029,...,5081.133179,11035.708984,24.0,3673.413935,1694.433618,1749.273315,2659.774231,2851.572632,4190.267456,8245.587891
Chau_Phu,10.44784,105.141425,WS,03-04-2022,D,2.0,6400,24.0,0.304275,0.289463,-0.070756,0.048735,0.275382,0.458366,...,6984.020752,10454.006836,24.0,3350.425494,1916.764685,15.114187,2226.315796,3184.835693,5003.049438,7359.14209
Chau_Phu,10.455882,105.126628,WS,10-04-2022,D,1.9,6800,24.0,0.297431,0.28366,-0.079871,0.068008,0.29703,0.435336,...,5786.396118,12227.654297,24.0,3294.997261,1854.282382,573.214539,2258.987915,2829.714478,4440.691284,8655.439453
Chau_Phu,10.458071,105.162504,WS,03-04-2022,D,2.0,6400,24.0,0.321277,0.266027,-0.077437,0.085025,0.250383,0.571228,...,6050.918335,11387.712891,24.0,3167.711067,1972.669878,233.767975,2155.306458,2790.387207,3963.142944,7883.153809


In [11]:
# Separate the target from the predictors.
# y stays a one-column DataFrame (double brackets), not a Series.
y = df[[TARGET]]
X = df.drop(columns=TARGET)

## Convert Categorical features

In [12]:
# One-hot encode the categorical column(s). drop_first=True drops the first
# level of each categorical, so the D/T intensity flag becomes the single
# indicator column "Rice Crop Intensity(D=Double, T=Triple)_T".
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Field size (ha),ndvi_count,ndvi_mean,ndvi_std,ndvi_min,ndvi_25%,ndvi_50%,ndvi_75%,ndvi_max,savi_count,...,nir_max,swir_count,swir_mean,swir_std,swir_min,swir_25%,swir_50%,swir_75%,swir_max,"Rice Crop Intensity(D=Double, T=Triple)_T"
District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)",Date of Harvest,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
Chau_Phu,10.445962,105.123164,WS,07-04-2022,3.0,24.0,0.313879,0.278279,-0.070971,0.064601,0.358543,0.476366,0.828294,24.0,...,10878.248047,24.0,3227.041785,1665.606455,376.104584,2252.296509,2833.02124,3857.703369,7925.820312,0
Chau_Phu,10.44784,105.141425,SA,14-07-2022,2.0,24.0,0.227551,0.197603,-0.070756,0.060892,0.15641,0.405029,0.519165,24.0,...,11035.708984,24.0,3673.413935,1694.433618,1749.273315,2659.774231,2851.572632,4190.267456,8245.587891,0
Chau_Phu,10.44784,105.141425,WS,03-04-2022,2.0,24.0,0.304275,0.289463,-0.070756,0.048735,0.275382,0.458366,0.87439,24.0,...,10454.006836,24.0,3350.425494,1916.764685,15.114187,2226.315796,3184.835693,5003.049438,7359.14209,0
Chau_Phu,10.455882,105.126628,WS,10-04-2022,1.9,24.0,0.297431,0.28366,-0.079871,0.068008,0.29703,0.435336,0.871526,24.0,...,12227.654297,24.0,3294.997261,1854.282382,573.214539,2258.987915,2829.714478,4440.691284,8655.439453,0
Chau_Phu,10.458071,105.162504,WS,03-04-2022,2.0,24.0,0.321277,0.266027,-0.077437,0.085025,0.250383,0.571228,0.823035,24.0,...,11387.712891,24.0,3167.711067,1972.669878,233.767975,2155.306458,2790.387207,3963.142944,7883.153809,0


## Split Data

In [13]:
# Hold out 20% of the rows as a test set; random_state=0 fixes the shuffle.
# NOTE(review): the split is purely random - no `stratify` argument is passed,
# so it is not balanced on TARGET, and no separate validation set is created
# here (the GridSearchCV cell below cross-validates on the training part).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## Scale Data

Because XGBoost is a decision-tree-based algorithm, it does not require its inputs to be normalized.

In [14]:
# # Depending on the model used, data scaling may be required
# scaler = StandardScaler()
# X_train_scale = scaler.fit_transform(X_train)

# # X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
# # X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)
# X_test_scale = scaler.transform(X_test)

# # Save scaler for a future use
# scaler_filename = "scaler.save"
# joblib.dump(scaler, scaler_filename) 
# # to load it
# # scaler = joblib.load(scaler_filename) 

# X_train.head()

# Machine Learning Regression

## XGBoost

### Train

In [15]:
# Base estimator with a fixed seed.
xgbr = XGBRegressor(random_state=0)

# Search space: 5 evenly spaced values for each of 4 hyper-parameters
# (5**4 = 625 candidate combinations).
param_grid = dict(
    max_depth=np.linspace(2, 10, 5, dtype=int),
    n_estimators=np.linspace(100, 1000, 5, dtype=int),
    learning_rate=np.linspace(0.01, 0.3, 5, dtype=float),
    colsample_bytree=np.linspace(0.5, 1, 5, dtype=float),
)

# Exhaustive grid search with the default cross-validation and scoring;
# n_jobs=-1 requests all available cores, verbose=1 prints the fit count.
cv_xgbr = GridSearchCV(estimator=xgbr, param_grid=param_grid, n_jobs=-1, verbose=1)
cv_xgbr.fit(X_train, y_train)

Fitting 5 folds for each of 625 candidates, totalling 3125 fits


In [16]:
# Hyper-parameter combination selected by the grid search.
cv_xgbr.best_params_

{'colsample_bytree': 0.625,
 'learning_rate': 0.01,
 'max_depth': 2,
 'n_estimators': 550}

### Evaluate

In [17]:
# Score the tuned model on the held-out test set.
y_pred = cv_xgbr.predict(X_test)

# RMSE is computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error is deprecated since scikit-learn 1.4 and removed in 1.6,
# so this form works on both old and new versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'RMSE: {rmse:.0f} | MAPE: {100*mape:.1f}% | R2 score: {r2:.3f}')

RMSE: 483 | MAPE: 6.0% | R2 score: 0.690


In [18]:
# Rank the features by the importance reported by the best estimator
# and plot the ten highest-ranked ones.
df_fi = pd.DataFrame(
    {
        'Feature': X.columns,
        'Importance': cv_xgbr.best_estimator_.feature_importances_,
    }
).sort_values('Importance', ascending=False)

fig = px.bar(df_fi.head(10), x='Feature', y='Importance', title="Feature importance")
fig.show()

# Save the model

In [19]:
# Persist the whole fitted GridSearchCV object (not only the best estimator)
# with joblib; the call returns the list of written file names.
model_filename = "../model/fixed_0-00146/cv_xgboost.save"
joblib.dump(cv_xgbr, model_filename)

['../model/fixed_0-00146/cv_xgboost.save']

# Train Final Model

In [21]:
# Reuse the hyper-parameters selected by the grid search and pin the seed.
param = {**cv_xgbr.best_params_, "random_state": 0}

# Refit a single model on the complete dataset (X and y before the
# train/test split).
xgbr = XGBRegressor(**param)

xgbr.fit(X, y)

In [22]:
# Persist the final model (fitted on all rows) with joblib.
model_filename = "../model/fixed_0-00146/xgboost.save"
joblib.dump(xgbr, model_filename)

['../model/fixed_0-00146/xgboost.save']

# Test file

In [23]:
# Load the test counterpart of the "train_vi.csv" table.
# Note: this rebinds `df_vi`, so the training frame of the same name is no
# longer reachable from here on.
df_vi = pd.read_csv("../../data/processed/fixed_0-00146/test_vi.csv")

# Schema check: column names, non-null counts and dtypes.
df_vi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 18 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   ID No                                           2400 non-null   int64  
 1   District                                        2400 non-null   object 
 2   Latitude                                        2400 non-null   float64
 3   Longitude                                       2400 non-null   float64
 4   Season(SA = Summer Autumn, WS = Winter Spring)  2400 non-null   object 
 5   Rice Crop Intensity(D=Double, T=Triple)         2400 non-null   object 
 6   Date of Harvest                                 2400 non-null   object 
 7   Field size (ha)                                 2400 non-null   float64
 8   Predicted Rice Yield (kg/ha)                    0 non-null      float64
 9   date                                     

In [24]:
# Load the test counterpart of the "train.csv" table.
# Note: this rebinds `df_sat`, replacing the training frame of the same name.
df_sat = pd.read_csv("../../data/processed/fixed_0-00146/test.csv")

# Schema check: column names, non-null counts and dtypes.
df_sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 18 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   ID No                                           2400 non-null   int64  
 1   District                                        2400 non-null   object 
 2   Latitude                                        2400 non-null   float64
 3   Longitude                                       2400 non-null   float64
 4   Season(SA = Summer Autumn, WS = Winter Spring)  2400 non-null   object 
 5   Rice Crop Intensity(D=Double, T=Triple)         2400 non-null   object 
 6   Date of Harvest                                 2400 non-null   object 
 7   Field size (ha)                                 2400 non-null   float64
 8   Predicted Rice Yield (kg/ha)                    0 non-null      float64
 9   date                                     

In [25]:
# Columns present in both test tables; together they identify a row
# (field attributes, placeholder prediction column, observation date, field ID).
test_merge_keys = [
    'District', 'Latitude', 'Longitude',
    'Season(SA = Summer Autumn, WS = Winter Spring)',
    'Rice Crop Intensity(D=Double, T=Triple)', 'Date of Harvest',
    'Field size (ha)', 'Predicted Rice Yield (kg/ha)', 'date', 'ID No',
]

# Inner join (the pandas default) of the two tables on the shared columns.
df = df_vi.merge(df_sat, on=test_merge_keys)

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2400 entries, 0 to 2399
Data columns (total 26 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   ID No                                           2400 non-null   int64  
 1   District                                        2400 non-null   object 
 2   Latitude                                        2400 non-null   float64
 3   Longitude                                       2400 non-null   float64
 4   Season(SA = Summer Autumn, WS = Winter Spring)  2400 non-null   object 
 5   Rice Crop Intensity(D=Double, T=Triple)         2400 non-null   object 
 6   Date of Harvest                                 2400 non-null   object 
 7   Field size (ha)                                 2400 non-null   float64
 8   Predicted Rice Yield (kg/ha)                    0 non-null      float64
 9   date                                     

In [26]:
# Preview the first rows of the merged test table.
df.head()

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Predicted Rice Yield (kg/ha),date,...,mtvi1,lswi,red,green,blue,rededge1,rededge2,rededge3,nir,swir
0,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,10-04-2022,...,648.663369,-0.060031,3381.496826,3313.45752,3360.209229,3762.447754,3857.467285,4083.084961,3905.666748,4404.532715
1,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,05-04-2022,...,2375.34939,0.063495,2023.441162,1984.607788,1656.869263,2676.281006,3438.506592,3771.326904,3715.058838,3271.450928
2,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,31-03-2022,...,3644.730498,0.148839,2205.650391,2361.464111,2057.800537,2986.781006,4082.418213,4653.062012,4567.915039,3384.313721
3,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,26-03-2022,...,3737.763252,0.168246,1706.290894,1841.61438,1480.372559,2472.673096,3676.718994,4131.21875,4155.359375,2958.483643
4,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,21-03-2022,...,3085.443384,0.198814,1734.813721,1941.666626,1757.101318,2280.630615,3256.418213,3770.830078,3653.39209,2441.62085


## Renaming & Typing

In [27]:
# Store the crop-intensity flag as a pandas categorical, mirroring the
# typing applied to the training table.
intensity_col = "Rice Crop Intensity(D=Double, T=Triple)"
df = df.astype({intensity_col: "category"}, copy=False)

df.head()

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Predicted Rice Yield (kg/ha),date,...,mtvi1,lswi,red,green,blue,rededge1,rededge2,rededge3,nir,swir
0,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,10-04-2022,...,648.663369,-0.060031,3381.496826,3313.45752,3360.209229,3762.447754,3857.467285,4083.084961,3905.666748,4404.532715
1,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,05-04-2022,...,2375.34939,0.063495,2023.441162,1984.607788,1656.869263,2676.281006,3438.506592,3771.326904,3715.058838,3271.450928
2,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,31-03-2022,...,3644.730498,0.148839,2205.650391,2361.464111,2057.800537,2986.781006,4082.418213,4653.062012,4567.915039,3384.313721
3,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,26-03-2022,...,3737.763252,0.168246,1706.290894,1841.61438,1480.372559,2472.673096,3676.718994,4131.21875,4155.359375,2958.483643
4,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,,21-03-2022,...,3085.443384,0.198814,1734.813721,1941.666626,1757.101318,2280.630615,3256.418213,3770.830078,3653.39209,2441.62085


# Feature Engineering

## Numerical data

In [28]:
# Columns that identify one test field record (including its "ID No").
test_group_keys = [
    'District', 'Latitude', 'Longitude',
    'Season(SA = Summer Autumn, WS = Winter Spring)',
    'Rice Crop Intensity(D=Double, T=Triple)', 'Date of Harvest',
    'Field size (ha)', 'ID No',
]

# Drop the date and the prediction placeholder column, then summarise
# every remaining column per group with describe().
df = (
    df.drop(columns=["date", 'Predicted Rice Yield (kg/ha)'])
    .groupby(test_group_keys)
    .describe()
)

# Flatten the two-level (feature, statistic) labels into names like "ndvi_mean".
df.columns = ["_".join(label).strip("_") for label in df.columns]

In [29]:
# Turn the two modelling attributes back into regular columns; the other
# index levels (including "ID No") stay in the row index.
df = df.reset_index(
    level=["Field size (ha)", "Rice Crop Intensity(D=Double, T=Triple)"]
)

In [30]:
# Preview the aggregated test table (one row per group).
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,"Rice Crop Intensity(D=Double, T=Triple)",Field size (ha),ndvi_count,ndvi_mean,ndvi_std,ndvi_min,ndvi_25%,ndvi_50%,ndvi_75%,ndvi_max,...,nir_75%,nir_max,swir_count,swir_mean,swir_std,swir_min,swir_25%,swir_50%,swir_75%,swir_max
District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)",Date of Harvest,ID No,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Chau_Phu,10.441423,105.115088,SA,20-07-2022,17,D,4.0,24.0,0.245517,0.230892,-0.084224,0.058425,0.138671,0.464724,0.605914,...,5097.635498,12277.045898,24.0,3405.934484,1429.477502,1497.418335,2482.147095,2934.687866,3645.849609,6408.545898
Chau_Phu,10.469839,105.211568,WS,01-04-2022,85,T,4.0,24.0,0.298268,0.24633,-0.064239,0.054491,0.357424,0.498373,0.741627,...,6540.147217,11266.352539,24.0,3165.494672,1950.585987,496.484436,2010.956787,2366.728394,4527.582397,8102.975586
Chau_Phu,10.473786,105.190479,SA,14-07-2022,13,T,1.7,24.0,0.250739,0.231802,-0.115478,0.057744,0.202655,0.454684,0.600343,...,5234.571045,11876.013672,24.0,3425.606079,1791.30292,1317.425659,2629.262878,3019.295898,3277.140198,8455.74707
Chau_Phu,10.473786,105.190479,WS,03-04-2022,35,T,1.7,24.0,0.334348,0.268628,-0.064958,0.048183,0.46044,0.573372,0.666078,...,5720.296021,11876.013672,24.0,3185.026774,2035.427175,858.84082,1853.473175,2603.242188,3338.344238,8455.74707
Chau_Phu,10.474439,105.216928,SA,15-07-2022,81,T,7.0,24.0,0.233725,0.203865,-0.070384,0.058992,0.229992,0.439704,0.526516,...,5592.332031,11409.107422,24.0,3073.574534,1403.103937,1547.906616,2435.92041,2790.167847,3080.892822,8002.72998


# Data Preparation

In [31]:
# Split Features & Target
X = df

## Convert Categorical features

In [32]:
# Convert categorical variable into dummy/indicator variables
X = pd.get_dummies(X, drop_first=True)
X.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Field size (ha),ndvi_count,ndvi_mean,ndvi_std,ndvi_min,ndvi_25%,ndvi_50%,ndvi_75%,ndvi_max,savi_count,...,nir_max,swir_count,swir_mean,swir_std,swir_min,swir_25%,swir_50%,swir_75%,swir_max,"Rice Crop Intensity(D=Double, T=Triple)_T"
District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)",Date of Harvest,ID No,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
Chau_Phu,10.441423,105.115088,SA,20-07-2022,17,4.0,24.0,0.245517,0.230892,-0.084224,0.058425,0.138671,0.464724,0.605914,24.0,...,12277.045898,24.0,3405.934484,1429.477502,1497.418335,2482.147095,2934.687866,3645.849609,6408.545898,0
Chau_Phu,10.469839,105.211568,WS,01-04-2022,85,4.0,24.0,0.298268,0.24633,-0.064239,0.054491,0.357424,0.498373,0.741627,24.0,...,11266.352539,24.0,3165.494672,1950.585987,496.484436,2010.956787,2366.728394,4527.582397,8102.975586,1
Chau_Phu,10.473786,105.190479,SA,14-07-2022,13,1.7,24.0,0.250739,0.231802,-0.115478,0.057744,0.202655,0.454684,0.600343,24.0,...,11876.013672,24.0,3425.606079,1791.30292,1317.425659,2629.262878,3019.295898,3277.140198,8455.74707,1
Chau_Phu,10.473786,105.190479,WS,03-04-2022,35,1.7,24.0,0.334348,0.268628,-0.064958,0.048183,0.46044,0.573372,0.666078,24.0,...,11876.013672,24.0,3185.026774,2035.427175,858.84082,1853.473175,2603.242188,3338.344238,8455.74707,1
Chau_Phu,10.474439,105.216928,SA,15-07-2022,81,7.0,24.0,0.233725,0.203865,-0.070384,0.058992,0.229992,0.439704,0.526516,24.0,...,11409.107422,24.0,3073.574534,1403.103937,1547.906616,2435.92041,2790.167847,3080.892822,8002.72998,1


# Load the model

In [33]:
# Reload the final model saved earlier in this notebook.
# NOTE: joblib.load unpickles the file, so only load model files that come
# from a trusted source.
model_filename = "../model/fixed_0-00146/xgboost.save"
model = joblib.load(model_filename)

In [34]:
# Predict the target for every row of the test feature matrix.
y_pred = model.predict(X)

In [35]:
# Wrap the prediction array in a named Series that reuses df's row index
# (which still carries the "ID No" level), so predictions can be joined back
# to the submission template by ID.
s_pred = pd.Series(y_pred, index=df.index, name='Predicted Rice Yield (kg/ha)')

# Load Submissions file

In [36]:
# Load the raw submission template and discard its placeholder prediction
# column; the column is re-created when the predictions are merged in.
df_sub = pd.read_csv("../../data/raw/test.csv").drop(columns='Predicted Rice Yield (kg/ha)')
df_sub.head()

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha)
0,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4
1,2,Chau_Thanh,10.400189,105.331053,SA,T,15-07-2022,1.32
2,3,Chau_Phu,10.505489,105.203926,SA,D,14-07-2022,1.4
3,4,Chau_Phu,10.52352,105.138274,WS,D,10-04-2022,1.8
4,5,Thoai_Son,10.29466,105.248528,SA,T,20-07-2022,2.2


In [37]:
# Attach the predictions to the submission template by field ID
# ("ID No" is a column of df_sub and an index level of s_pred).
# how="left" keeps every template row: the default inner join would silently
# drop any field without a prediction. validate="many_to_one" raises if an ID
# carries more than one prediction, which would otherwise duplicate rows.
df_sub = pd.merge(df_sub, s_pred, on="ID No", how="left", validate="many_to_one")

# A submission needs a prediction for every field; fail loudly otherwise.
assert df_sub['Predicted Rice Yield (kg/ha)'].notna().all(), "missing predictions for some IDs"
df_sub.head()

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Predicted Rice Yield (kg/ha)
0,1,Chau_Phu,10.542192,105.18792,WS,T,10-04-2022,1.4,7184.517578
1,2,Chau_Thanh,10.400189,105.331053,SA,T,15-07-2022,1.32,5890.771973
2,3,Chau_Phu,10.505489,105.203926,SA,D,14-07-2022,1.4,5966.685547
3,4,Chau_Phu,10.52352,105.138274,WS,D,10-04-2022,1.8,7173.996094
4,5,Thoai_Son,10.29466,105.248528,SA,T,20-07-2022,2.2,5812.468262


In [38]:
# Write the submission file: header row included, row index omitted.
sub_filename = '../model/fixed_0-00146/submission.csv'
df_sub.to_csv(sub_filename, index=False, header=True)