# Sorghum Yield Predictions with Ensemble Models

In [None]:
#important libraries
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor,RandomForestRegressor,\
StackingRegressor,VotingRegressor,HistGradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error,explained_variance_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, RepeatedKFold
from matplotlib import pyplot
from scipy.stats import sem
from numpy import mean
from numpy import std

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import plotly.express as px

import itertools
from itertools import permutations
# Lift pandas' display limits so DataFrames are shown in full (no row/column truncation).
pd.set_option("display.max_rows", None, "display.max_columns", None)
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

%matplotlib inline

## Import Train and Test Data

In [None]:
# Read the data set; the first CSV column becomes the row index.
df = pd.read_csv(
    '../../../data/data-1.3-red.csv',
    index_col=0,
)
# Preview the first rows.
df.head()

### Calculate Principal Components

In [None]:
# Reflectance data: columns from position 18 up to (not including) the last 8.
sub = df.loc[:, df.columns[18:-8]]
sub.head()

In [None]:
# Project the reflectance columns onto their leading principal components and
# wrap the resulting scores in a DataFrame with columns PC1..PC<n_components>.
n_components = 50
pca = PCA(n_components=n_components)
sub_trans = pca.fit_transform(sub.to_numpy())
sub_df = pd.DataFrame(
    sub_trans,
    columns=[f"PC{k}" for k in range(1, n_components + 1)],
)
sub_df.head()

In [None]:
# Variance captured by the retained components: the overall share (in percent)
# and the running total after each successive component.
total_var = 100 * pca.explained_variance_ratio_.sum()
exp_var_cumul = pca.explained_variance_ratio_.cumsum()

# Bar chart of cumulative explained variance against component number.
plt.figure(figsize=(15, 8))
sns.set_context('paper')
sns.barplot(
    x=list(range(1, n_components + 1)),
    y=exp_var_cumul,
)

In [None]:
total_var  # total variance explained by the retained components, in percent

In [None]:
# sub_df was built from a bare NumPy array, so it carries a fresh 0..n-1 index,
# while df keeps the index read from the CSV (index_col=0). Assigning a Series
# aligns on index labels, which can silently produce NaN or misplaced rows when
# the two indexes differ. The row order of sub_df matches df (it was derived
# from df's columns without reordering), so copy the values positionally.
sub_df['yield'] = df['yield'].values
sub_df['treatment'] = df['Treatment'].values

In [None]:
sub_df.head()

In [None]:
sub_df.shape

### Train Test Split

In [None]:
#df.rename(columns = {'Treatment':'treatment'},inplace = True)

In [None]:
## Data Split
# Predictors are all columns except the trailing 'yield' and 'treatment';
# both of those are kept in y so the treatment label follows each row
# through the split and can be counted per partition below.
X = sub_df[sub_df.columns[:-2]]
y = sub_df[['treatment', 'yield']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Training partition: size and number of rows per treatment.
# NOTE(review): the "F"/"N" labels count 'HN' and 'LN' rows respectively —
# presumably "full" vs. "nitrogen-deprived"; confirm the intended wording.
train_data = pd.concat([X_train, y_train], axis=1)
print(f"Train Dataset: {train_data.shape[0]} row/records. F: {train_data[train_data.treatment=='HN'].shape[0]} and N: {train_data[train_data.treatment =='LN'].shape[0]}\n")

# Test partition: same report (previously mislabelled as "Train Dataset").
test_data = pd.concat([X_test, y_test], axis=1)
print(f"Test Dataset: {test_data.shape[0]} row/records. F: {test_data[test_data.treatment=='HN'].shape[0]} and N: {test_data[test_data.treatment =='LN'].shape[0]}")



In [None]:
# Predictor column names: every column of sub_df except the last two.
features = sub_df.columns[:-2].tolist()

## Important Functions

In [None]:
# evaluate a model with a given number of repeats
def evaluate_model(X, y, repeats, n_splits=10, n_estimators=100, random_state=1):
    """Score a random forest regressor with repeated k-fold cross-validation.

    Args:
        X: Feature matrix handed to ``cross_val_score``.
        y: Target values, one per row of ``X``.
        repeats: How many times the k-fold procedure is repeated.
        n_splits: Number of folds per repeat (default 10, the former
            hard-coded value).
        n_estimators: Number of trees in the forest (default 100, the former
            hard-coded value).
        random_state: Seed for both the fold shuffling and the forest.
            The forest was previously unseeded, so repeated calls gave
            different scores even though the folds were fixed; pass ``None``
            to restore an unseeded run.

    Returns:
        The array returned by ``cross_val_score``: one R^2 score per fold per
        repeat.
    """
    # prepare the cross-validation procedure
    cv = RepeatedKFold(
        n_splits=n_splits, n_repeats=repeats, random_state=random_state
    )
    # create model (seeded so the scores are reproducible run to run)
    model = RandomForestRegressor(
        n_estimators=n_estimators, random_state=random_state
    )
    # evaluate model; n_jobs=-1 uses all available CPU cores
    return cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)

## RF - Both Treatments

In [None]:
#check the head of training dataset
train_data.head()

In [None]:
sub_df.head()

In [None]:
## Random forest with default parameters on all rows (both treatments)
# Predictors are every column except the trailing two; the target is 'yield'.
X = sub_df[sub_df.columns[:-2]]
y = sub_df['yield']

# Run the repeated cross-validation with 1 through 10 repeats, printing the
# mean score and its standard error each time and keeping every score array.
results = []
for i in range(1, 11):
    scores = evaluate_model(X, y, i)
    print('>%d mean=%.4f se=%.3f' % (i, mean(scores), sem(scores)))
    results.append(scores)

# cv = KFold(n_splits=10, random_state=1, shuffle=True)
# # evaluate model
# scores = cross_val_score(model, X, y, scoring='r2', cv=cv, n_jobs=-1)
# scores

In [None]:
# Box plot of the cross-validated R^2 scores, one box per repeat count (1-10),
# for the model fitted on both treatments together.
plt.figure(figsize=(15, 8))
plt.boxplot(results, labels=[str(r) for r in range(1, 11)], showmeans=True)
plt.ylabel("r2")
# The x axis is the number of CV repeats, not a fold count.
plt.xlabel("Number of repeats of K-fold cross-validation")
# This cell uses the combined data, so the former "... Only" title was wrong.
plt.title("RF - Both Treatments (HN and LN)")
plt.show()

## RF - Full Nitrogen


In [None]:
# Restrict to the rows whose treatment is "HN"; predictors are every column
# except the trailing two, the target is 'yield'.
X = sub_df.loc[sub_df.treatment == "HN", sub_df.columns[:-2]]
y = sub_df.loc[sub_df.treatment == "HN", 'yield']

# Run the repeated cross-validation with 1 through 10 repeats, printing the
# mean score and its standard error each time and keeping every score array.
results = []
for i in range(1, 11):
    scores = evaluate_model(X, y, i)
    print('>%d mean=%.4f se=%.3f' % (i, mean(scores), sem(scores)))
    results.append(scores)

In [None]:
# Box plot of the cross-validated R^2 scores, one box per repeat count (1-10),
# for the model fitted on the "HN" rows.
plt.figure(figsize=(15, 8))
plt.boxplot(results, labels=[str(r) for r in range(1, 11)], showmeans=True)
plt.ylabel("r2")
# The x axis is the number of CV repeats, not a fold count.
plt.xlabel("Number of repeats of K-fold cross-validation")
plt.title("RF - High Nitrogen Only")
plt.show()

## RF - Nitrogen Deprived

In [None]:
# Restrict to the rows whose treatment is "LN"; predictors are every column
# except the trailing two, the target is 'yield'.
X = sub_df.loc[sub_df.treatment == "LN", sub_df.columns[:-2]]
y = sub_df.loc[sub_df.treatment == "LN", 'yield']

# Run the repeated cross-validation with 1 through 10 repeats, printing the
# mean score and its standard error each time and keeping every score array.
results = []
for i in range(1, 11):
    scores = evaluate_model(X, y, i)
    print('>%d mean=%.4f se=%.3f' % (i, mean(scores), sem(scores)))
    results.append(scores)

In [None]:
# Box plot of the cross-validated R^2 scores, one box per repeat count (1-10),
# for the model fitted on the "LN" rows.
plt.figure(figsize=(15, 8))
plt.boxplot(results, labels=[str(r) for r in range(1, 11)], showmeans=True)
plt.ylabel("r2")
# The x axis is the number of CV repeats, not a fold count.
plt.xlabel("Number of repeats of K-fold cross-validation")
plt.title("RF - Low Nitrogen Only")
plt.show()