# Pre-process

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Load the raw long-format table (one row per sample/variable pair is assumed
# by the later `sample_id` logic -- TODO confirm against the CSV layout).
DATA_FILE = "synthetic_data.csv"
df = pd.read_csv(DATA_FILE)

## Construct feature matrix

In [None]:
# Column labels of the time axis: every fifth year from 2020 through 2100,
# kept as strings because they are used to select DataFrame columns.
years = list(map(str, range(2020, 2101, 5)))

# Variables whose time series are turned into model features.
# Three features are later derived per entry of this list.
variables = [
    'Emissions|CO2',
    'Final Energy|Industry|Solids|Coal',
    'Final Energy|Industry|Solids|Biomass',
    'Final Energy|Residential and Commercial|Solids|Coal',
    'Emissions|CO2|Energy|Demand|Industry',
    'Secondary Energy|Electricity|Coal',
    'Emissions|CO2|Energy|Supply|Electricity',
    'Primary Energy|Coal',
    'Emissions|CO2|Energy and Industrial Processes',
    'Emissions|CO2|AFOLU',
]

In [None]:
# Group rows into samples by position: each run of ROWS_PER_SAMPLE consecutive
# rows of the default integer index receives the same id.
# NOTE(review): presumably 10 because there are ten variables per sample -- confirm.
ROWS_PER_SAMPLE = 10
df['sample_id'] = df.index // ROWS_PER_SAMPLE

In [None]:
# Build one three-column feature frame per variable, indexed by sample_id:
#   '<var> Cumulative' -- row-wise sum over all `years` columns
#   '<var> 2020-2030'  -- (value_2030 - value_2020) / 10, missing values as 0
#   '<var> 2030-2040'  -- (value_2040 - value_2030) / 10, missing values as 0
feature_dfs = []
for var in variables:
    sub = df.loc[df['Variable'] == var].set_index('sample_id')
    # Non-numeric cells become NaN instead of raising.
    data = sub[years].apply(pd.to_numeric, errors='coerce')

    cumulative = data.sum(axis=1).rename(f'{var} Cumulative')
    decade_slopes = [
        ((data[end].fillna(0) - data[start].fillna(0)) / 10).rename(f'{var} {start}-{end}')
        for start, end in (('2020', '2030'), ('2030', '2040'))
    ]

    feature_dfs.append(pd.concat([cumulative, *decade_slopes], axis=1))

In [None]:
# Join the per-variable feature frames side by side (aligned on sample_id)
# and order the rows by sample_id.
X = pd.concat(feature_dfs, axis='columns')
X = X.sort_index()
print('X shape =', X.shape)

In [None]:
# One label per sample: take the first 'Label' value found in each sample's
# rows, ordered by sample_id.
labels_by_sample = df.groupby('sample_id')['Label']
y = labels_by_sample.first().sort_index()
print('y shape =', y.shape)

# XGBoost

In [8]:
import seaborn as sns  
import matplotlib.pyplot as plt  
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import re  

In [None]:
# Map the class labels to consecutive integers; the fitted encoder is kept so
# the original labels remain available via `label_encoder.classes_`.
label_encoder = LabelEncoder().fit(y)
y = label_encoder.transform(y)

## Fitting XGBoost

In [None]:
# Base estimator whose hyper-parameters are tuned below.
classifier = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# Exhaustive grid: 4 * 3 * 3 * 4 = 144 combinations.
parameters = dict(
    n_estimators=[100, 200, 400, 800],
    max_depth=[6, 8, 10],
    min_child_weight=[3, 4, 5],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
)

# Shuffled, stratified 5-fold split with a fixed seed so folds are repeatable.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Starting GridSearchCV for hyperparameter tuning...")
gridsearch = GridSearchCV(
    classifier,
    parameters,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
    scoring='accuracy',
)
gridsearch.fit(X, y)

In [None]:
# Keep handles on the winning configuration and the estimator fitted with it.
xgb_best_estimator = gridsearch.best_estimator_
xgb_best_params = gridsearch.best_params_

print(f"\nBest Parameters Found by GridSearchCV: {xgb_best_params}")
print(f"Best cross-validated accuracy score during GridSearchCV: {gridsearch.best_score_:.4f}")

In [None]:
# Score the tuned estimator fold by fold, using the same fold definition
# (`cv_strategy`) that the grid search used.
cv_scores = cross_val_score(
    xgb_best_estimator,
    X,
    y,
    cv=cv_strategy,
    scoring='accuracy',
    n_jobs=-1,
)
print(f"\nCross-Validation Accuracy Scores for each fold: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_scores.std():.4f}")

In [None]:
# Fresh classifier with the tuned hyper-parameters, fitted on every sample.
# This is the model that the importance table and SHAP analysis use.
xgb_classifier = XGBClassifier(
    **gridsearch.best_params_,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
)
xgb_classifier.fit(X, y)
print("Final model trained successfully on all data.")

In [None]:
# Tabulate the model's built-in feature importances, largest first.
importance = xgb_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    .sort_values('Importance', ascending=False)
)
print(feature_importance_df)

# SHAP

In [None]:
import shap  
import matplotlib.pyplot as plt  
import numpy as np  
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns

In [22]:
# Build a tree explainer for the final XGBoost model and evaluate it on the
# full feature matrix; the result is unpacked in the following cells.
explainer = shap.TreeExplainer(xgb_classifier)  
shap_output = explainer(X) 

In [23]:
# Unwrap the SHAP result: an Explanation object carries the numbers in
# `.values`; anything else is used as-is.
raw_vals = shap_output.values if isinstance(shap_output, shap.Explanation) else shap_output

In [24]:
# A 3-D array is split along its last axis into a list of 2-D slices
# (presumably one per class -- confirm against the SHAP version in use);
# any other input is passed through unchanged.
_has_class_axis = isinstance(raw_vals, np.ndarray) and raw_vals.ndim == 3
shap_values = (
    [raw_vals[:, :, k] for k in range(raw_vals.shape[2])]
    if _has_class_axis
    else raw_vals
)

In [None]:
# Initial plot styling: seaborn grid theme first, then individual matplotlib
# settings on top of it (font choice last so the theme cannot reset it).
sns.set_style("whitegrid")
plt.rcParams["font.size"] = 12
plt.rcParams["figure.dpi"] = 120
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 4
plt.rcParams["ytick.labelsize"] = 2
plt.rcParams.update({
    'font.family': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})

In [None]:
# Variables that receive their own tick-label colour (same list as used for
# feature construction).
variables = [
    'Emissions|CO2',
    'Final Energy|Industry|Solids|Coal',
    'Final Energy|Industry|Solids|Biomass',
    'Final Energy|Residential and Commercial|Solids|Coal',
    'Emissions|CO2|Energy|Demand|Industry',
    'Secondary Energy|Electricity|Coal',
    'Emissions|CO2|Energy|Supply|Electricity',
    'Primary Energy|Coal',
    'Emissions|CO2|Energy and Industrial Processes',
    'Emissions|CO2|AFOLU',
]

# One "tab10" colour per variable, plus pure red for the aggregated row.
palette_var = sns.color_palette("tab10", len(variables))
color_map = dict(zip(variables, palette_var))
color_map['Sum of the rest'] = (1.0, 0.0, 0.0)

In [None]:
# Plot settings: number of individually shown features, the feature labels in
# column order of X, and the display names of the classes.
top_k = 15
feature_names = list(X.columns)
class_names = ['P1', 'P2', 'P3']

In [None]:
# Final plot styling used for the SHAP figures.
# The seaborn theme must be applied FIRST: its style dictionary also contains
# 'font.family', so calling it after the font assignment would silently
# replace the font chosen below.
sns.set_style("whitegrid")
plt.rcParams['font.family'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams.update({
    "font.size": 10,
    "figure.dpi": 120,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "ytick.labelsize": 8
})

In [None]:
def color_yticks(ax, colors=None):
    """Colour each y tick label of ``ax`` according to the variable it belongs to.

    Feature labels in this notebook have the form ``'<variable> <suffix>'``
    (e.g. ``'Primary Energy|Coal Cumulative'``), so the variable is recovered
    by dropping the last space-separated token. The older
    ``'<variable>_<suffix>'`` form is still recognised. A label that is itself
    a key of the colour mapping (such as ``'Sum of the rest'``) is matched
    directly. Labels that match nothing are drawn in black.

    Parameters
    ----------
    ax : object exposing ``get_yticklabels()``; each label must provide
        ``get_text()`` and ``set_color()``.
    colors : mapping from name to colour, optional
        Defaults to the notebook-level ``color_map``.
    """
    palette = color_map if colors is None else colors
    for lbl in ax.get_yticklabels():
        txt = lbl.get_text()
        # Variable names contain spaces themselves, hence rsplit with maxsplit=1.
        candidates = (txt, txt.rsplit(' ', 1)[0], txt.split('_')[0])
        key = next((c for c in candidates if c in palette), None)
        lbl.set_color(palette[key] if key is not None else 'black')

In [None]:
# One SHAP violin summary per class: the `top_k` features with the largest
# mean |SHAP| are shown individually, all remaining features are collapsed
# into a single 'Sum of the rest' row. Each figure is written to
# "SHAP of Synthetic <class>.png".
for idx, cls in enumerate(class_names):
    # assumes shap_values[idx] is the (samples x features) matrix of class idx
    # -- TODO confirm for the SHAP version in use
    sv = shap_values[idx]

    # Rank features by mean absolute SHAP value, most influential first.
    abs_mean = np.abs(sv).mean(axis=0)
    order = np.argsort(abs_mean)[::-1]
    top_idx = order[:top_k]
    rest_idx = order[top_k:]

    cols = [feature_names[i] for i in top_idx] + ['Sum of the rest']

    # Aggregated row: per-sample sum of |SHAP| over the non-top features.
    # The same column serves as that row's "feature value" for colouring.
    rest_col = np.abs(sv[:, rest_idx]).sum(axis=1, keepdims=True)
    shap_mat = np.concatenate([sv[:, top_idx], rest_col], axis=1)
    X_mat = np.concatenate([X.iloc[:, top_idx].values, rest_col], axis=1)

    # NOTE(review): summary_plot's default plot_size may resize the current
    # figure, in which case this figsize has no effect -- confirm.
    plt.figure(figsize=(10/1.46, 10))
    shap.summary_plot(
        shap_mat, X_mat,
        feature_names=cols,
        plot_type="violin",
        sort=False,               # keep our own ranking
        max_display=len(cols),    # never drop rows when top_k is raised
        show=False
    )

    ax = plt.gca()
    ax.tick_params(axis='y', labelsize=8)
    for label in ax.get_yticklabels():
        label.set_fontweight('bold')
    color_yticks(ax)
    ax.set_xlabel(
        "SHAP value (impact on model output)",
        fontsize=12,
        fontweight='bold',
        labelpad=15
    )

    # The colour-bar caption is the y label of a separate axes in the figure
    # (not a text artist of `ax`); match it case-insensitively and embolden it.
    for fig_ax in ax.figure.axes:
        if fig_ax is not ax and fig_ax.get_ylabel().strip().lower() == 'feature value':
            fig_ax.yaxis.label.set_fontweight('bold')

    # Lay out before saving so the file matches what is displayed.
    plt.tight_layout()
    plt.savefig(f"SHAP of Synthetic {cls}.png", dpi=600, bbox_inches = 'tight')
    plt.show()
