# Advanced Regression Models
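Using XGBoost and Random Forest regressors to predict NOx (`f.nox`), temperature (`tcf`), and CO2 (`f.co2`) from the air and fuel inputs (`air.flow`, `air.temp`, `air.frac`, `fuel.flow`).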

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# load the pre-cleaned dataset from HDF5 and preview the first rows
clean_data_path = '../data/clean-data.h5'
df = pd.read_hdf(clean_data_path)
df.head()

Unnamed: 0,timestamp,air.flow,air.temp,air.frac,fuel.flow,tc1,tc2,tc3,tc4,tcf,...,f.co2,f.o2,f.ch4,f.nox,f.co,spec,stoppage,hub,shift,trial
0,2020-01-01 00:00:00,2479,28.65,0.24,390,29,1450,1866,1832,1758,...,0.031799,2.784636e-13,0.003849,0.000788,0.000289,CIA,False,22,B,1.0
1,2020-01-01 00:01:00,2479,28.65,0.24,390,26,1450,1863,1835,1760,...,0.032406,2.678304e-13,0.003662,0.00079,0.000314,CIA,False,22,B,1.0
2,2020-01-01 00:02:00,2479,28.65,0.24,390,26,1446,1867,1833,1760,...,0.031569,2.608832e-13,0.003831,0.000793,0.000308,CIA,False,22,B,1.0
3,2020-01-01 00:03:00,2479,28.65,0.24,390,27,1448,1862,1837,1760,...,0.032599,2.557492e-13,0.003604,0.000792,0.000306,CIA,False,22,B,1.0
4,2020-01-01 00:04:00,2479,28.65,0.24,390,26,1448,1867,1836,1755,...,0.031396,2.694801e-13,0.003752,0.000799,0.0003,CIA,False,22,B,1.0


In [3]:
# feature columns fed to the models (used for every train/test split below)
inputs = [
    'air.flow',
    'air.temp',
    'air.frac',
    'fuel.flow',
]

# NOx target column; kept as a list so df[output_nox] is a one-column DataFrame
output_nox = ['f.nox']

## XGBoost for NOx prediction

In [4]:
# hold out 20% of the rows for testing; the fixed seed makes the split repeatable
# NOTE(review): the previewed rows are consecutive one-minute samples and
# train_test_split shuffles by default, so neighbouring rows can land on both
# sides of the split -- confirm a random (rather than time-ordered) split is intended
nox_split = train_test_split(df[inputs], df[output_nox], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = nox_split

In [5]:
# hyperparameters for the NOx gradient-boosting regressor
xgb_nox_params = dict(n_estimators=100, learning_rate=0.1, random_state=42)
model_xgb_nox = xgb.XGBRegressor(**xgb_nox_params)

# fit on the training split; the fitted estimator is the cell's displayed value
model_xgb_nox.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [6]:
# R2 of the XGBoost NOx model on the held-out rows
score_xgb_nox = model_xgb_nox.score(X_test, y_test)

# keep the raw test-set predictions for the parity plot and MAE
predictions_xgb_nox = model_xgb_nox.predict(X_test)

display(f'XGBoost R2 for NOx: {score_xgb_nox!s}')

'XGBoost R2 for NOx: 0.9999462962150574'

In [7]:
# Wrap the test-set predictions in a one-column frame named after the target.
# Reuse y_test's index so every prediction stays paired with the actual value it
# was made for: the shuffled split leaves y_test with a non-sequential index, and
# a default RangeIndex would mis-pair rows in any index-aligned comparison.
predictionsDF_xgb_nox = pd.DataFrame(
    predictions_xgb_nox,
    index=y_test.index,
    columns=output_nox,
)

In [8]:
# parity plot of actual versus predicted NOx for the XGBoost model
parity_xgb_nox = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_xgb_nox.add_trace(
    go.Scatter(
        x=y_test['f.nox'],
        y=predictionsDF_xgb_nox['f.nox'],
        mode='markers',
        name='results'
    )
)

# y = x reference line. Two end points spanning the actual range draw the same
# diagonal as passing every (unsorted) test value, without embedding a second
# full-length trace; mode is set explicitly so it is always rendered as a line.
nox_range = [float(y_test['f.nox'].min()), float(y_test['f.nox'].max())]
parity_xgb_nox.add_trace(
    go.Scatter(
        x=nox_range,
        y=nox_range,
        mode='lines',
        name='parity'
    )
)

# square canvas with title and axis labels
parity_xgb_nox.update_layout(height=800, width=800, title="XGBoost NOx Actual vs. Predicted")
parity_xgb_nox.update_xaxes(title='Actual NOx')
parity_xgb_nox.update_yaxes(title='Predicted NOx')

# display figure
parity_xgb_nox.show()

## XGBoost for Temperature prediction

In [9]:
# target column for the temperature models
output_tcf = ['tcf']

# 80/20 split using the same seed as the NOx split
tcf_split = train_test_split(df[inputs], df[output_tcf], test_size=0.2, random_state=42)
X_train_tcf, X_test_tcf, y_train_tcf, y_test_tcf = tcf_split

In [10]:
# gradient-boosting regressor for temperature
xgb_tcf_params = dict(n_estimators=100, learning_rate=0.1, random_state=42)
model_xgb_tcf = xgb.XGBRegressor(**xgb_tcf_params)
model_xgb_tcf.fit(X_train_tcf, y_train_tcf)

# evaluate on the held-out rows: R2 plus the raw predictions for later plots
score_xgb_tcf = model_xgb_tcf.score(X_test_tcf, y_test_tcf)
predictions_xgb_tcf = model_xgb_tcf.predict(X_test_tcf)
display(f'XGBoost R2 for TCF: {score_xgb_tcf!s}')

'XGBoost R2 for TCF: 0.9999863505363464'

In [11]:
# One-column frame of test-set predictions. It reuses y_test_tcf's index so each
# prediction stays paired with its actual value (the shuffled split leaves the
# test index non-sequential; a default RangeIndex would not line up with it).
predictionsDF_xgb_tcf = pd.DataFrame(
    predictions_xgb_tcf,
    index=y_test_tcf.index,
    columns=output_tcf,
)

# parity plot of actual versus predicted temperature for the XGBoost model
parity_xgb_tcf = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_xgb_tcf.add_trace(
    go.Scatter(
        x=y_test_tcf['tcf'],
        y=predictionsDF_xgb_tcf['tcf'],
        mode='markers',
        name='results'
    )
)

# y = x reference line drawn from two end points spanning the actual range,
# instead of a second trace through every unsorted test value
tcf_range = [float(y_test_tcf['tcf'].min()), float(y_test_tcf['tcf'].max())]
parity_xgb_tcf.add_trace(
    go.Scatter(
        x=tcf_range,
        y=tcf_range,
        mode='lines',
        name='parity'
    )
)

parity_xgb_tcf.update_layout(height=800, width=800, title="XGBoost Temperature Actual vs. Predicted")
parity_xgb_tcf.update_xaxes(title='Actual TCF (°C)')
parity_xgb_tcf.update_yaxes(title='Predicted TCF (°C)')
parity_xgb_tcf.show()

## XGBoost for CO2 prediction

In [12]:
# target column for the CO2 models
output_co2 = ['f.co2']

# 80/20 split using the same seed as the NOx and temperature splits
co2_split = train_test_split(df[inputs], df[output_co2], test_size=0.2, random_state=42)
X_train_co2, X_test_co2, y_train_co2, y_test_co2 = co2_split

In [13]:
# gradient-boosting regressor for CO2
xgb_co2_params = dict(n_estimators=100, learning_rate=0.1, random_state=42)
model_xgb_co2 = xgb.XGBRegressor(**xgb_co2_params)
model_xgb_co2.fit(X_train_co2, y_train_co2)

# evaluate on the held-out rows: R2 plus the raw predictions for later plots
score_xgb_co2 = model_xgb_co2.score(X_test_co2, y_test_co2)
predictions_xgb_co2 = model_xgb_co2.predict(X_test_co2)
display(f'XGBoost R2 for CO2: {score_xgb_co2!s}')

'XGBoost R2 for CO2: 0.9833304286003113'

In [14]:
# One-column frame of test-set predictions. It reuses y_test_co2's index so each
# prediction stays paired with its actual value (the shuffled split leaves the
# test index non-sequential; a default RangeIndex would not line up with it).
predictionsDF_xgb_co2 = pd.DataFrame(
    predictions_xgb_co2,
    index=y_test_co2.index,
    columns=output_co2,
)

# parity plot of actual versus predicted CO2 for the XGBoost model
parity_xgb_co2 = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_xgb_co2.add_trace(
    go.Scatter(
        x=y_test_co2['f.co2'],
        y=predictionsDF_xgb_co2['f.co2'],
        mode='markers',
        name='results'
    )
)

# y = x reference line drawn from two end points spanning the actual range,
# instead of a second trace through every unsorted test value
co2_range = [float(y_test_co2['f.co2'].min()), float(y_test_co2['f.co2'].max())]
parity_xgb_co2.add_trace(
    go.Scatter(
        x=co2_range,
        y=co2_range,
        mode='lines',
        name='parity'
    )
)

parity_xgb_co2.update_layout(height=800, width=800, title="XGBoost CO2 Actual vs. Predicted")
parity_xgb_co2.update_xaxes(title='Actual CO2 (tonne/hr)')
parity_xgb_co2.update_yaxes(title='Predicted CO2 (tonne/hr)')
parity_xgb_co2.show()

## Random Forest for NOx prediction

In [15]:
# reuses X_train / y_train from the NOx split made for the XGBoost model

# Random Forest regressor for NOx (100 trees, fixed seed for repeatability)
model_rf_nox = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so fit()
# no longer emits the "column-vector y was passed" DataConversionWarning.
# The fitted estimator is the cell's displayed value.
model_rf_nox.fit(X_train, y_train.values.ravel())


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
# R2 of the Random Forest NOx model on the held-out rows
score_rf_nox = model_rf_nox.score(X_test, y_test)

# keep the raw test-set predictions for the parity plot and MAE
predictions_rf_nox = model_rf_nox.predict(X_test)

display(f'Random Forest R2 for NOx: {score_rf_nox!s}')

'Random Forest R2 for NOx: 0.9999476716022937'

In [17]:
# Wrap the test-set predictions in a one-column frame named after the target.
# Reuse y_test's index so every prediction stays paired with the actual value it
# was made for: the shuffled split leaves y_test with a non-sequential index, and
# a default RangeIndex would mis-pair rows in any index-aligned comparison.
predictionsDF_rf_nox = pd.DataFrame(
    predictions_rf_nox,
    index=y_test.index,
    columns=output_nox,
)

In [18]:
# parity plot of actual versus predicted NOx for the Random Forest model
parity_rf_nox = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_rf_nox.add_trace(
    go.Scatter(
        x=y_test['f.nox'],
        y=predictionsDF_rf_nox['f.nox'],
        mode='markers',
        name='results'
    )
)

# y = x reference line drawn from two end points spanning the actual range,
# instead of a second trace through every unsorted test value
nox_range = [float(y_test['f.nox'].min()), float(y_test['f.nox'].max())]
parity_rf_nox.add_trace(
    go.Scatter(
        x=nox_range,
        y=nox_range,
        mode='lines',
        name='parity'
    )
)

parity_rf_nox.update_layout(height=800, width=800, title="Random Forest NOx Actual vs. Predicted")
parity_rf_nox.update_xaxes(title='Actual NOx')
parity_rf_nox.update_yaxes(title='Predicted NOx')
parity_rf_nox.show()

## Random Forest for Temperature prediction

In [19]:
# Random Forest regressor for temperature (100 trees, fixed seed for repeatability)
model_rf_tcf = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

# y_train_tcf is a one-column DataFrame; flatten it to shape (n_samples,) so fit()
# no longer emits the "column-vector y was passed" DataConversionWarning
model_rf_tcf.fit(X_train_tcf, y_train_tcf.values.ravel())

# predict on the held-out rows and report R2
predictions_rf_tcf = model_rf_tcf.predict(X_test_tcf)
score_rf_tcf = model_rf_tcf.score(X_test_tcf, y_test_tcf)
display('Random Forest R2 for TCF: ' + str(score_rf_tcf))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



'Random Forest R2 for TCF: 0.9999863512423506'

In [20]:
# One-column frame of test-set predictions. It reuses y_test_tcf's index so each
# prediction stays paired with its actual value (the shuffled split leaves the
# test index non-sequential; a default RangeIndex would not line up with it).
predictionsDF_rf_tcf = pd.DataFrame(
    predictions_rf_tcf,
    index=y_test_tcf.index,
    columns=output_tcf,
)

# parity plot of actual versus predicted temperature for the Random Forest model
parity_rf_tcf = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_rf_tcf.add_trace(
    go.Scatter(
        x=y_test_tcf['tcf'],
        y=predictionsDF_rf_tcf['tcf'],
        mode='markers',
        name='results'
    )
)

# y = x reference line drawn from two end points spanning the actual range,
# instead of a second trace through every unsorted test value
tcf_range = [float(y_test_tcf['tcf'].min()), float(y_test_tcf['tcf'].max())]
parity_rf_tcf.add_trace(
    go.Scatter(
        x=tcf_range,
        y=tcf_range,
        mode='lines',
        name='parity'
    )
)

parity_rf_tcf.update_layout(height=800, width=800, title="Random Forest Temperature Actual vs. Predicted")
parity_rf_tcf.update_xaxes(title='Actual TCF (°C)')
parity_rf_tcf.update_yaxes(title='Predicted TCF (°C)')
parity_rf_tcf.show()

## Random Forest for CO2 prediction

In [21]:
# Random Forest regressor for CO2 (100 trees, fixed seed for repeatability)
model_rf_co2 = RandomForestRegressor(
    n_estimators=100,
    random_state=42
)

# y_train_co2 is a one-column DataFrame; flatten it to shape (n_samples,) so fit()
# no longer emits the "column-vector y was passed" DataConversionWarning
model_rf_co2.fit(X_train_co2, y_train_co2.values.ravel())

# predict on the held-out rows and report R2
predictions_rf_co2 = model_rf_co2.predict(X_test_co2)
score_rf_co2 = model_rf_co2.score(X_test_co2, y_test_co2)
display('Random Forest R2 for CO2: ' + str(score_rf_co2))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



'Random Forest R2 for CO2: 0.9833326342022021'

In [22]:
# One-column frame of test-set predictions. It reuses y_test_co2's index so each
# prediction stays paired with its actual value (the shuffled split leaves the
# test index non-sequential; a default RangeIndex would not line up with it).
predictionsDF_rf_co2 = pd.DataFrame(
    predictions_rf_co2,
    index=y_test_co2.index,
    columns=output_co2,
)

# parity plot of actual versus predicted CO2 for the Random Forest model
parity_rf_co2 = go.Figure()

# one marker per test sample: actual value on x, model prediction on y
parity_rf_co2.add_trace(
    go.Scatter(
        x=y_test_co2['f.co2'],
        y=predictionsDF_rf_co2['f.co2'],
        mode='markers',
        name='results'
    )
)

# y = x reference line drawn from two end points spanning the actual range,
# instead of a second trace through every unsorted test value
co2_range = [float(y_test_co2['f.co2'].min()), float(y_test_co2['f.co2'].max())]
parity_rf_co2.add_trace(
    go.Scatter(
        x=co2_range,
        y=co2_range,
        mode='lines',
        name='parity'
    )
)

parity_rf_co2.update_layout(height=800, width=800, title="Random Forest CO2 Actual vs. Predicted")
parity_rf_co2.update_xaxes(title='Actual CO2 (tonne/hr)')
parity_rf_co2.update_yaxes(title='Predicted CO2 (tonne/hr)')
parity_rf_co2.show()

## Calculate MAE for comparison

In [23]:
# mean absolute error of the XGBoost predictions on each target's test split
mae_xgb_nox = mean_absolute_error(
    y_true=y_test['f.nox'], y_pred=predictionsDF_xgb_nox['f.nox']
)
mae_xgb_tcf = mean_absolute_error(
    y_true=y_test_tcf['tcf'], y_pred=predictionsDF_xgb_tcf['tcf']
)
mae_xgb_co2 = mean_absolute_error(
    y_true=y_test_co2['f.co2'], y_pred=predictionsDF_xgb_co2['f.co2']
)

# report the three errors in the order NOx, TCF, CO2
for xgb_mae_label, xgb_mae_value in (
    ('XGBoost MAE for NOx: ', mae_xgb_nox),
    ('XGBoost MAE for TCF: ', mae_xgb_tcf),
    ('XGBoost MAE for CO2: ', mae_xgb_co2),
):
    display(xgb_mae_label + str(xgb_mae_value))

'XGBoost MAE for NOx: 4.347557552798146e-05'

'XGBoost MAE for TCF: 1.4925434589385986'

'XGBoost MAE for CO2: 0.0010354702215279012'

In [24]:
# mean absolute error of the Random Forest predictions on each target's test split
mae_rf_nox = mean_absolute_error(
    y_true=y_test['f.nox'], y_pred=predictionsDF_rf_nox['f.nox']
)
mae_rf_tcf = mean_absolute_error(
    y_true=y_test_tcf['tcf'], y_pred=predictionsDF_rf_tcf['tcf']
)
mae_rf_co2 = mean_absolute_error(
    y_true=y_test_co2['f.co2'], y_pred=predictionsDF_rf_co2['f.co2']
)

# report the three errors in the order NOx, TCF, CO2
for rf_mae_label, rf_mae_value in (
    ('Random Forest MAE for NOx: ', mae_rf_nox),
    ('Random Forest MAE for TCF: ', mae_rf_tcf),
    ('Random Forest MAE for CO2: ', mae_rf_co2),
):
    display(rf_mae_label + str(rf_mae_value))

'Random Forest MAE for NOx: 4.057618906622531e-05'

'Random Forest MAE for TCF: 1.4924897444061087'

'Random Forest MAE for CO2: 0.001035358311736602'

## Save parity plots

In [25]:
# make sure the output folder exists before exporting, so a fresh checkout
# without ../fig does not fail on the first write
Path('../fig').mkdir(parents=True, exist_ok=True)

# save XGBoost plots
parity_xgb_nox.write_image('../fig/xgboost_nox_parity.jpg')
parity_xgb_tcf.write_image('../fig/xgboost_temperature_parity.jpg')
parity_xgb_co2.write_image('../fig/xgboost_co2_parity.jpg')

# save Random Forest plots
parity_rf_nox.write_image('../fig/randomforest_nox_parity.jpg')
parity_rf_tcf.write_image('../fig/randomforest_temperature_parity.jpg')
parity_rf_co2.write_image('../fig/randomforest_co2_parity.jpg')