# All the data used in this example can be downloaded

# Import all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity

# Load the prediction model

In [2]:
## Load the model
## NOTE: unpickling can execute arbitrary code, so only load model files that come from a trusted source.
## The context manager guarantees the file handle is closed once the model has been read.
with open('Model_GI (Extra Trees).pkl', 'rb') as model_file:
    model_GI = pickle.load(model_file)

# Input sample data

In [3]:
## Read the example dataset from the Excel workbook and preview its first five rows
df = pd.read_excel(io='Example_data.xlsx')
df.head(n=5)

Unnamed: 0,Biochar type,Pyrolysis temperature (℃),C/N,Surface area (m2/g),Pore volume (cm3/g),Amendment rate (%),Compost type,Initial pH,Initial C/N,Initial moisture content (%),Compost time (day),pH,Temperature (℃),Moisture content (%),GI
0,Bamboo biochar,550,123.3,166.2,0.108165,8,Pig manure,7.56,24.87,60,0,8.01,38.09,58.13,14.63
1,Bamboo biochar,550,123.3,166.2,0.108165,8,Pig manure,7.56,24.87,60,4,7.67,56.83,57.92,35.87
2,Bamboo biochar,550,123.3,166.2,0.108165,8,Pig manure,7.56,24.87,60,9,8.17,58.29,54.05,35.05
3,Bamboo biochar,550,123.3,166.2,0.108165,8,Pig manure,7.56,24.87,60,16,8.32,59.11,51.46,62.15
4,Bamboo biochar,550,123.3,166.2,0.108165,8,Pig manure,7.56,24.87,60,23,8.82,63.0,49.31,75.37


# Data preprocessing procedure

Target encoding is used to convert the categorical variables (biochar type and compost type) into numeric values that the machine-learning model can accept.

In [4]:
## Load the lookup table that holds the target-encoded value of every category
Target_encoding_explanation = pd.read_excel('Target_encoding_explanation.xlsx')

## Create a dictionary for mapping.
## Rows with a blank category or blank encoding are dropped first so that an empty cell
## (presumably padding when the two category lists differ in length) never becomes a NaN key.
biochar_table = Target_encoding_explanation[['Biochar type', 'Target_encoding_Biochar type']].dropna()
compost_table = Target_encoding_explanation[['Compost type', 'Target_encoding_Compost type']].dropna()
biochar_mapping = dict(zip(biochar_table['Biochar type'], biochar_table['Target_encoding_Biochar type']))
compost_mapping = dict(zip(compost_table['Compost type'], compost_table['Target_encoding_Compost type']))

## Replace types with numbers (work on a copy so the raw `df` stays untouched)
df_encoding = df.copy()
for encoded_column, category_mapping in (('Biochar type', biochar_mapping),
                                         ('Compost type', compost_mapping)):
    encoded_values = df[encoded_column].map(category_mapping)
    # `Series.map` silently yields NaN for categories missing from the lookup table;
    # fail here with a clear message instead of letting NaN reach the model.
    unknown_mask = encoded_values.isna() & df[encoded_column].notna()
    if unknown_mask.any():
        unknown_categories = sorted(df.loc[unknown_mask, encoded_column].astype(str).unique())
        raise ValueError(
            f"No target encoding available for '{encoded_column}' value(s): {unknown_categories}"
        )
    df_encoding[encoded_column] = encoded_values
df_encoding.head()

Unnamed: 0,Biochar type,Pyrolysis temperature (℃),C/N,Surface area (m2/g),Pore volume (cm3/g),Amendment rate (%),Compost type,Initial pH,Initial C/N,Initial moisture content (%),Compost time (day),pH,Temperature (℃),Moisture content (%),GI
0,21.667634,550,123.3,166.2,0.108165,8,22.935523,7.56,24.87,60,0,8.01,38.09,58.13,14.63
1,21.667634,550,123.3,166.2,0.108165,8,22.935523,7.56,24.87,60,4,7.67,56.83,57.92,35.87
2,21.667634,550,123.3,166.2,0.108165,8,22.935523,7.56,24.87,60,9,8.17,58.29,54.05,35.05
3,21.667634,550,123.3,166.2,0.108165,8,22.935523,7.56,24.87,60,16,8.32,59.11,51.46,62.15
4,21.667634,550,123.3,166.2,0.108165,8,22.935523,7.56,24.87,60,23,8.82,63.0,49.31,75.37


# Calculate the cosine similarity between the example data and the modeling data

In [5]:
## Read the feature table of the modeling (training) dataset
X_train_df_GI = pd.read_excel(io='X_train_df_GI.xlsx')

## Features compared between the example data and the training data.
## Every name listed here must exist as a column in both tables.
con_feature = [
    'Biochar type',
    'Pyrolysis temperature (℃)',
    'C/N',
    'Surface area (m2/g)',
    'Pore volume (cm3/g)',
    'Amendment rate (%)',
    'Compost type',
    'Initial C/N',
    'Compost time (day)',
    'pH',
    'Temperature (℃)',
    'Moisture content (%)',
    'Initial pH',
]

## Restrict both tables to the shared feature columns
con_df_new = df_encoding.loc[:, con_feature]  # assumes df_encoding has already been created
con_df_train = X_train_df_GI.loc[:, con_feature]

## Define a function to compute the cosine similarity
def cosine_ad(X_test, X_train):
    """Return the mean cosine similarity of each test row to all training rows.

    Parameters
    ----------
    X_test : array-like
        Rows to evaluate; passed as the first argument to ``cosine_similarity``.
    X_train : array-like
        Reference rows; passed as the second argument to ``cosine_similarity``.

    Returns
    -------
    pandas.Series
        One value per row of ``X_test`` (the row-wise mean of the pairwise
        similarity matrix), carrying a default integer index.
    """
    pairwise_similarity = cosine_similarity(X_test, X_train)
    # Averaging along axis 1 collapses the training dimension, leaving one score per test row.
    per_row_mean = pairwise_similarity.mean(axis=1)
    return pd.Series(per_row_mean)

## Mean cosine similarity of every example row to the modeling dataset
cosine_similarity_result = cosine_ad(con_df_new, con_df_train)

## Wrap the scores in a single-column DataFrame, rounded to four decimals
cosine_similarity_result_df = pd.DataFrame(
    {'Cosine Similarity': cosine_similarity_result}
).round(4)

## Re-label the rows so they line up with the example data
cosine_similarity_result_df.index = con_df_new.index

## Output result
cosine_similarity_result_df

Unnamed: 0,Cosine Similarity
0,0.9399
1,0.9397
2,0.9399
3,0.9401
4,0.9399
5,0.9404
6,0.9395


# Perform predictions and calculate performance

In [6]:
## Define input features and target column
X = df_encoding.iloc[:, :14]  # Input features: the first 14 columns
Y_column = df_encoding.columns[14]  # Target column: the 15th column

## Get actual and predicted values
Y_actual = df_encoding[Y_column]  # Actual values
Y_pred = model_GI.predict(X).round(2)  # Predicted values, rounded to two decimals

## Create a DataFrame with both actual and predicted values
Predicted_result = pd.DataFrame({
    f'{Y_column}_Predicted': Y_pred,
    f'{Y_column}_Actual': Y_actual
})

## Calculate performance metrics for the target column
## (the metrics are computed on the rounded predictions shown in the table above)
r2 = round(r2_score(Y_actual, Y_pred), 2)
## RMSE is computed as sqrt(MSE). The `squared=False` keyword of `mean_squared_error`
## was deprecated in scikit-learn 1.4 and is scheduled for removal in 1.6, so taking
## the square root explicitly keeps this cell working on every scikit-learn version.
rmse = round(np.sqrt(mean_squared_error(Y_actual, Y_pred)), 3)
mae = round(mean_absolute_error(Y_actual, Y_pred), 3)

## Create a summary DataFrame for metrics
Performance_metrics = pd.DataFrame({
    "Metric": ["R2", "RMSE", "MAE"],
    "Value": [r2, rmse, mae]
})

## Display both DataFrames separately
print("Predicted vs Actual Values:")
display(Predicted_result)

print("\nPerformance Metrics:")
display(Performance_metrics)

Predicted vs Actual Values:


Unnamed: 0,GI_Predicted,GI_Actual
0,24.55,14.63
1,30.11,35.87
2,37.98,35.05
3,42.93,62.15
4,48.73,75.37
5,79.97,102.98
6,84.98,114.55



Performance Metrics:


Unnamed: 0,Metric,Value
0,R2,0.68
1,RMSE,19.358
2,MAE,16.721


# Save the results to csv files

In [7]:
## Write the prediction table and the metric summary to CSV files (row index omitted)
Predicted_result.to_csv(path_or_buf='df_example_predicted_result.csv', index=False)
Performance_metrics.to_csv(path_or_buf='df_example_performance_result.csv', index=False)