# Data Pre-processing

In [1]:
import pandas as pd
import numpy as np
import os

## load df_target & target_variables

In [2]:
# Load the working table and the array of target variable names.
df_target = pd.read_csv("df_target.csv")
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, which can
# execute code from a tampered file; only load target_variables.npy from a trusted source.
target_variables = np.load('target_variables.npy', allow_pickle=True)

In [3]:
# Distinct target variable names and how many there are.
target_vars_set = set(target_variables)
num_target_vars = len(target_vars_set)
# Grouping of the raw table by (Model, Scenario); df_grouped is reassigned from
# df_merged in a later cell.
df_grouped = df_target.groupby(['Model', 'Scenario'])

### Merge Model

In [4]:
def _models_containing(token):
    """Return the distinct df_target['Model'] values whose name contains `token`."""
    return list({model_name for model_name in df_target['Model'] if token in model_name})


# One list of raw model names per model family, selected by substring match.
MESSAGEix = _models_containing('MESSAGE')
WITCH = _models_containing('WITCH')
COFFEE = ['COFFEE 1.1']  # single hard-coded model name
REMIND = _models_containing('REM')
TIA = _models_containing('TIAM-ECN')
POL = _models_containing('POL')
AIM = _models_containing('AIM')
IMA = _models_containing('IMAGE')
GCA = _models_containing('GCA')

In [5]:
# Family member lists and their display names, kept in the same order.
Model = [MESSAGEix, WITCH, COFFEE, REMIND, TIA, POL, AIM, IMA, GCA]
Model_names = ['MESSAGEix', 'WITCH', 'COFFEE', 'REMIND', 'TIA', 'POL', 'AIM', 'IMA', 'GCA']
# Flat list of every raw model name that belongs to one of the families.
Model_List = [raw_name for family_members in Model for raw_name in family_members]

In [None]:
# Keep only rows whose model belongs to one of the families, as an independent copy.
df_merged = df_target.loc[df_target['Model'].isin(Model_List)].copy()

In [7]:
# Raw model name -> family name.
mapping = {
    raw_name: family_name
    for family_name, family_members in zip(Model_names, Model)
    for raw_name in family_members
}
# Collapse model versions into their family and renumber the rows from 0.
df_merged = (
    df_merged
    .assign(Model=df_merged['Model'].replace(mapping))
    .reset_index(drop=True)
)

In [8]:
# Re-group after merging model families (replaces the earlier grouping of df_target).
df_grouped = df_merged.groupby(['Model', 'Scenario'])

## Retain scenarios with all target-variables

In [None]:
# nunique() counts DISTINCT variables per pair, so compare against the number of
# distinct target names; a duplicate entry in target_variables would otherwise make
# the equality impossible and silently discard every pair.
required_variable_count = len(set(target_variables))
# Number of distinct variables reported by each (Model, Scenario) pair.
pair_counts = df_merged.groupby(['Model', 'Scenario'])['Variable'].nunique()
# Pairs that report every target variable.
common_pairs_index = pair_counts[pair_counts == required_variable_count].index
print(f"ÂÖ±Êúâ {len(common_pairs_index)} ‰∏™ (Model, Scenario) ÂØπÊã•ÊúâÂÖ®ÈÉ® {required_variable_count} ‰∏™ÂèòÈáèÁöÑÊï∞ÊçÆ„ÄÇ")

In [None]:
# (Model, Scenario) key of every row, used to test membership in the complete pairs.
row_pairs = pd.MultiIndex.from_frame(df_merged[['Model', 'Scenario']])
df_common = df_merged[row_pairs.isin(common_pairs_index)]
print(f"Á≠õÈÄâÂÖ±ÂêåÂØπÂêéÁöÑÊï∞ÊçÆÂΩ¢Áä∂: {df_common.shape}")

In [19]:
# One-off export of the filtered table, left disabled:
#df_common.to_csv("df_common.csv", index = False)
# NOTE(review): this replaces the df_common computed above with whatever is stored in
# df_common.csv; a fresh Restart & Run All needs that file to exist and to be up to date.
df_common = pd.read_csv("df_common.csv")

In [20]:
# Index rows by (Model, Scenario, Variable) so one variable's rows can be selected by label.
df_paired = df_common.set_index(['Model', 'Scenario', 'Variable'])

## Construct feature matrix

In [22]:
# Columns whose name is all digits are treated as year columns.
# NOTE(review): taken from df_target, not df_common; assumes both share the same year columns.
year_cols = [col for col in df_target if col.isdigit()]

In [None]:
def _decadal_trend(var_data, var, start, end, feature_number):
    """Return the per-year change between columns `start` and `end` as a named Series.

    Missing values are replaced by 0 before differencing. When either column is
    absent a warning is printed and an all-NaN Series is returned instead.
    The Series is named "<var> <start>-<end>".
    """
    if start in var_data.columns and end in var_data.columns:
        trend = (var_data[end].fillna(0) - var_data[start].fillna(0)) / 10
    else:
        print(f"Warning: Variable '{var}' missing '{start}' or '{end}' data, Feature {feature_number} set to all NaN.")
        trend = pd.Series(np.nan, index=var_data.index)
    trend.name = f"{var} {start}-{end}"
    return trend


all_new_features_list = []

# The (Model, Scenario) pairs exactly as they occur in common_pairs_index. Pairing the
# two level arrays element-wise avoids the quadratic Model x Scenario cross product.
pair_models = common_pairs_index.get_level_values('Model')
pair_scenarios = common_pairs_index.get_level_values('Scenario')

for var in target_variables:
    print(f"  Start processing Variable '{var}'...")
    var_idx = pd.MultiIndex.from_arrays(
        [pair_models, pair_scenarios, [var] * len(common_pairs_index)],
        names=['Model', 'Scenario', 'Variable'],
    )

    # Only keep keys that are actually present in df_paired.
    valid_var_idx = var_idx.intersection(df_paired.index)
    if valid_var_idx.empty:
        print(f"Warning: Variable '{var}' has no data in df_paired. Skipping.")
        continue

    # One row per (Model, Scenario), one numeric column per year.
    var_data = df_paired.loc[valid_var_idx, year_cols]
    var_data = var_data.reset_index(level='Variable', drop=True)
    var_data = var_data.apply(pd.to_numeric, errors='coerce')

    # feature1: sum over every year column (NaN skipped).
    # NOTE(review): year_cols holds every all-digit column; confirm it only spans 2020-2100.
    if not year_cols:
        feature1 = pd.Series(0.0, index=var_data.index)
        print(f"Warning: Variable '{var}' has no 2020-2100 year columns. Feature 1 (Sum) set to 0.")
    else:
        feature1 = var_data[year_cols].sum(axis=1, skipna=True)
    # Name the column in both branches so the concatenated frame never gets an unnamed column.
    feature1.name = f"{var} Cumulative"

    # feature2: (2030 - 2020) / 10
    feature2 = _decadal_trend(var_data, var, '2020', '2030', 2)

    # feature3: (2040 - 2030) / 10
    feature3 = _decadal_trend(var_data, var, '2030', '2040', 3)

    var_features_df = pd.concat([feature1, feature2, feature3], axis=1)
    all_new_features_list.append(var_features_df)

In [None]:
# Feature matrix: the per-variable feature frames placed side by side, aligned on their index.
X = pd.concat(all_new_features_list, axis=1)

In [26]:
# First 'PC_m' value of each (Model, Scenario) pair, used as the label for that pair.
# NOTE(review): presumably PC_m is constant within a pair - confirm.
xy_map = df_common.groupby(['Model', 'Scenario'])['PC_m'].first()

In [None]:
# Labels in the same row order as X (raises KeyError if a pair in X has no label).
y = xy_map.loc[X.index]

# Random Forest Classifier

In [28]:
import seaborn as sns  
import matplotlib.pyplot as plt  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import re  

## Fitting RF

In [None]:
# Hyperparameter grid explored by the search (4 x 3 x 3 combinations).
parameters = dict(
    n_estimators=[100, 150, 200, 300],
    max_depth=[10, 12, 14],
    min_samples_split=[4, 6, 8],
)
classifier = RandomForestClassifier(random_state=42)
# Shuffled, stratified 10-fold split with a fixed seed.
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
gridsearch = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
)
print("Starting GridSearchCV for hyperparameter tuning...")
gridsearch.fit(X, y)

In [None]:
# Keep handles on the winning configuration and the estimator refitted with it.
rf_best_estimator = gridsearch.best_estimator_
rf_best_params = gridsearch.best_params_

print(f"\nBest Parameters Found by GridSearchCV: {rf_best_params}")
print(f"Best cross-validated accuracy score during GridSearchCV: {gridsearch.best_score_:.4f}")

In [None]:
# Per-fold accuracy of the selected configuration, using the same folds as the grid search.
cv_scores = cross_val_score(
    estimator=rf_best_estimator,
    X=X,
    y=y,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
)

print(f"\nCross-Validation Accuracy Scores for each fold: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_scores.std():.4f}")

In [None]:
print("\nTraining the final model on the entire dataset...")
# Best grid-search parameters plus the fixed seed, fitted on all rows.
final_model_params = dict(gridsearch.best_params_)
rf_classifier = RandomForestClassifier(random_state=42, **final_model_params)
rf_classifier.fit(X, y)
print("Final model trained successfully on all data.")


## Feature Importance

In [34]:
# Output folder for the importance table and figures; created if it does not exist.
feature_importance_path = './feature_importance_20_to_40'
os.makedirs(feature_importance_path, exist_ok=True)

In [None]:
# Importance of each column of X according to the fitted forest, most important first.
importance = rf_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

In [36]:
# Persist the ranked importance table (without the row index) in the output folder.
feature_importance_df.to_csv(os.path.join(feature_importance_path, 'Feature_Importance.csv'), index = False)

### Importance rank by feature

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import textwrap

In [None]:
def extract_target_variables_from_features(feature_names):
    """Return the sorted, de-duplicated base variable names behind `feature_names`.

    A feature name is reduced to its base variable by removing one of the known
    suffixes (' Cumulative', ' 2020-2030', ' 2030-2040'). Names that end with
    none of them are ignored.
    """
    known_suffixes = (' Cumulative', ' 2020-2030', ' 2030-2040')
    # No suffix is a tail of another, so at most one can match a given name.
    base_names = {
        feature[:-len(tail)]
        for feature in feature_names
        for tail in known_suffixes
        if feature.endswith(tail)
    }
    return sorted(base_names)

In [None]:
# Base variables recovered from the feature names in the importance table.
target_variables_from_features = extract_target_variables_from_features(feature_importance_df['Feature'].tolist())

In [None]:
def wrap_feature_name(name, max_chars_per_line=20):
    """Break a long feature name into two lines at a separator.

    NOTE(review): a later cell defines another `wrap_feature_name`; when the cells
    run top to bottom that later definition replaces this one.

    Parameters
    ----------
    name : str
        Feature name to wrap.
    max_chars_per_line : int, default 20
        Names no longer than this are returned unchanged; it is also the length
        limit applied to the second line when choosing a split point.

    Returns
    -------
    str
        `name` itself, or `name` with a single '\\n' inserted. For the '|', '_'
        and '-' separators the separator is kept at the end of the first line;
        for ' ' it is replaced by the line break.
    """
    if len(name) <= max_chars_per_line:
        return name

    # (separator, keep the separator at the end of the first line?)
    separators = [
        ('|', True),
        ('_', True),
        (' ', False),
        ('-', True)
    ]

    best_split = None
    best_score = float('inf')

    # Pass 1: among all separators, search the split (after at least two parts)
    # whose second line fits; prefer balanced lines, heavily penalise a too-long first line.
    for sep, keep_separator in separators:
        if sep in name:
            parts = name.split(sep)

            if len(parts) < 3:
                continue

            for split_idx in range(2, len(parts)):
                first_line = sep.join(parts[:split_idx])
                second_line = sep.join(parts[split_idx:])

                if len(second_line) <= max_chars_per_line:
                    if len(first_line) <= max_chars_per_line:
                        score = abs(len(first_line) - len(second_line))
                    else:
                        score = len(first_line) - max_chars_per_line + 100

                    if score < best_score:
                        best_score = score
                        best_split = (first_line, second_line, sep, keep_separator)

    if best_split:
        first_line, second_line, sep, keep_separator = best_split
        if keep_separator:
            return first_line + sep + '\n' + second_line
        else:
            return first_line + '\n' + second_line

    # Pass 2 (fallback): break after the first two parts. A name with exactly two
    # parts is broken after the first part instead; otherwise the whole name would
    # land on line 1 followed by a dangling separator and an empty second line.
    for sep, keep_separator in separators:
        if sep in name:
            parts = name.split(sep)
            split_at = 2 if len(parts) > 2 else 1
            first_line = sep.join(parts[:split_at])
            second_line = sep.join(parts[split_at:])

            if len(second_line) <= max_chars_per_line * 1.5:
                if keep_separator:
                    return first_line + sep + '\n' + second_line
                else:
                    return first_line + '\n' + second_line
    return name

In [None]:

def wrap_feature_name(name, max_chars_per_line=20):
    """Wrap a long feature name onto two or three lines of similar length.

    Names no longer than `max_chars_per_line` are returned unchanged. Otherwise
    the name is split on one of '|', '_', ' ' or '-' into evenly sized groups of
    parts; the candidate with the lowest score (overflow penalty plus variance of
    the line lengths) wins. The separator at each line break is replaced by '\\n'.
    If no candidate qualifies, the name is halved on the first separator found,
    and if it contains no separator it is returned unchanged.
    """
    if len(name) <= max_chars_per_line:
        return name

    # Decide the target number of lines from the total length: two when the name
    # fits within two lines' worth of characters, three otherwise.
    line_budget = 2 if len(name) <= max_chars_per_line * 2 else 3

    candidate_separators = ('|', '_', ' ', '-')

    def chunk_evenly(text, n_lines, sep):
        """Split `text` on `sep` into `n_lines` runs of parts; None if not possible."""
        if sep not in text:
            return None
        tokens = text.split(sep)
        if len(tokens) < n_lines:
            return None

        base_width, leftover = divmod(len(tokens), n_lines)
        rows = []
        cursor = 0
        for row_number in range(n_lines):
            # The first `leftover` rows take one extra part each.
            width = base_width + 1 if row_number < leftover else base_width
            rows.append(sep.join(tokens[cursor:cursor + width]))
            cursor += width
        return rows

    def split_cost(rows, limit):
        """Lower is better; infinite for a missing split or one with a blank line."""
        if not rows:
            return float('inf')
        if any(len(row.strip()) == 0 for row in rows):
            return float('inf')

        lengths = [len(row) for row in rows]
        overflow = max(0, max(lengths) - limit) * 100
        mean_length = sum(lengths) / len(lengths)
        spread = sum((length - mean_length) ** 2 for length in lengths) / len(lengths)
        return overflow + spread

    winner = None
    lowest_cost = float('inf')
    line_counts = (line_budget, max(2, line_budget - 1))

    for sep in candidate_separators:
        for n_lines in line_counts:
            rows = chunk_evenly(name, n_lines, sep)
            if not rows:
                continue
            cost = split_cost(rows, max_chars_per_line)
            if cost < lowest_cost:
                lowest_cost = cost
                winner = rows

    if winner:
        return '\n'.join(winner)

    # Fallback: halve the parts on the first separator that occurs in the name.
    for sep in candidate_separators:
        if sep in name:
            tokens = name.split(sep)
            if len(tokens) >= 2:
                half = len(tokens) // 2
                return '\n'.join([sep.join(tokens[:half]), sep.join(tokens[half:])])
    return name


In [None]:
def generate_distinct_colors(n):
    """Return `n` hex colour strings intended to be visually distinct.

    Up to 10 colours come from the 'tab10' palette, up to 20 from 'tab20'; for
    larger `n` the 'hsv' colormap is sampled at `n` evenly spaced positions.
    """
    if n <= 10:
        colors = plt.cm.tab10(np.linspace(0, 1, 10))[:n]
    elif n <= 20:
        colors = plt.cm.tab20(np.linspace(0, 1, 20))[:n]
    else:
        # hsv is cyclic (it starts and ends on red), so the end point is excluded;
        # otherwise the first and last colours would be almost indistinguishable.
        colors = plt.cm.hsv(np.linspace(0, 1, n, endpoint=False))

    return [mcolors.rgb2hex(color) for color in colors]

In [None]:
def plot_top_n_features_optimized(feature_importance_df, n, custom_text_colors=None, custom_bar_colors=None, 
                                  max_chars_per_line=30, ylabel_position=(0.02, 0.5), ylabel_rotation=0,
                                  font_size=8, bar_height=0.6, fig_height=None, fig_width=12):
    """Draw, save and show a horizontal bar chart of the `n` most important features.

    Tick labels are coloured by base variable and bars by feature type (the
    ' Cumulative', ' 2020-2030' or ' 2030-2040' suffix of the feature name).
    No legend is drawn. The figure is saved at 600 dpi as
    'Compact_Top_<n>_Feature_Importance.png' inside the module-level
    `feature_importance_path` folder, then shown.

    Parameters
    ----------
    feature_importance_df : DataFrame with 'Feature' and 'Importance' columns.
    n : number of top features to plot.
    custom_text_colors : optional {base variable: colour}; entries for variables
        that are not plotted are ignored.
    custom_bar_colors : optional {suffix: colour}; only the three known suffixes
        are honoured.
    max_chars_per_line : passed to `wrap_feature_name` for the tick labels.
    ylabel_position, ylabel_rotation : figure-coordinate position and rotation of
        the 'Features' label.
    font_size : tick-label font size.
    bar_height : bar thickness.
    fig_height : figure height; derived from `n`, `bar_height` and `font_size`
        when None.
    fig_width : figure width.

    Returns
    -------
    (text_color_mapping, bar_color_mapping) : the colour dictionaries actually used.
    """
    suffixes = [' Cumulative', ' 2020-2030', ' 2030-2040']

    def extract_target_variable_and_suffix(feature_name):
        """Split a feature name into (base variable, suffix); suffix is None if unknown."""
        for suffix in suffixes:
            if feature_name.endswith(suffix):
                return feature_name[:-len(suffix)], suffix
        return feature_name, None

    # Ascending order so the most important feature ends up at the top of the barh plot.
    top_n_df = (
        feature_importance_df
        .nlargest(n, 'Importance')
        .sort_values('Importance', ascending=True)
        .copy()
    )
    target_suffix_pairs = top_n_df['Feature'].apply(extract_target_variable_and_suffix)
    top_n_df['Target_Variable'] = [pair[0] for pair in target_suffix_pairs]
    top_n_df['Suffix'] = [pair[1] for pair in target_suffix_pairs]

    # Default label colour per base variable, then user overrides for plotted variables.
    unique_targets = top_n_df['Target_Variable'].unique()
    text_default_colors = generate_distinct_colors(len(unique_targets))
    text_color_mapping = {target: text_default_colors[i] for i, target in enumerate(unique_targets)}

    if custom_text_colors:
        for target, color in custom_text_colors.items():
            if target in text_color_mapping:
                text_color_mapping[target] = color

    # Default bar colour per feature type, then user overrides for known suffixes.
    bar_color_mapping = {
        ' Cumulative': '#FAC7DA',
        ' 2020-2030': '#D1E3FF',
        ' 2030-2040': '#FFF4F5'
    }

    if custom_bar_colors:
        for suffix, color in custom_bar_colors.items():
            if suffix in bar_color_mapping:
                bar_color_mapping[suffix] = color

    # Features without a known suffix get a neutral grey bar.
    bar_colors = [bar_color_mapping.get(suffix, '#888888') for suffix in top_n_df['Suffix']]

    wrapped_feature_names = [wrap_feature_name(name, max_chars_per_line) for name in top_n_df['Feature']]

    if fig_height is None:
        base_height_per_feature = max(0.3, bar_height * 0.8)
        font_factor = font_size / 10.0
        fig_height = max(6, n * base_height_per_feature * font_factor + 2)

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))

    ax.barh(range(len(top_n_df)), top_n_df['Importance'],
            color=bar_colors, height=bar_height)

    ax.set_yticks(range(len(wrapped_feature_names)))
    ax.set_yticklabels(wrapped_feature_names, fontsize=font_size, weight='bold')

    # Colour each tick label according to its base variable.
    for tick_label, target_var in zip(ax.get_yticklabels(), top_n_df['Target_Variable']):
        tick_label.set_color(text_color_mapping[target_var])

    ax.set_xlabel('Feature Importance', fontsize=18, weight='bold')
    ax.tick_params(axis='x', labelsize=15)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['bottom'].set_visible(True)

    ax.spines['left'].set_linewidth(2)
    ax.spines['bottom'].set_linewidth(2)

    # The y-axis title is placed manually in figure coordinates.
    fig.text(ylabel_position[0], ylabel_position[1], 'Features',
             fontsize=18, weight='bold', rotation=ylabel_rotation,
             ha='center', va='center', transform=fig.transFigure)

    plt.tight_layout()

    plt.savefig(os.path.join(feature_importance_path, f'Compact_Top_{n}_Feature_Importance.png'), dpi=600, bbox_inches='tight')
    plt.show()

    return text_color_mapping, bar_color_mapping

In [None]:
# Number of highest-importance features shown in the bar chart.
n_top_features = 20

# Tick-label colour per base variable. Entries whose variable is not among the
# plotted features are ignored by plot_top_n_features_optimized.
custom_text_colors = {
    'Emissions|CO2': '#664B3E',
    'Emissions|CO2|AFOLU': '#BF7744',
    
    'Final Energy|Industry|Solids|Coal': '#A59ACA',
    
    'Emissions|CO2|Energy|Supply|Electricity': '#F1BD3F',
    'Emissions|CO2|Energy|Demand|Industry': '#CFB697',
    
    'Final Energy|Residential and Commercial|Solids|Coal': '#7A7B78',
    'Secondary Energy|Electricity|Coal': '#FFBE7A',
    'Final Energy|Industry|Solids|Biomass': "#9BBF8A",
    'Emissions|CO2|Energy and Industrial Processes': '#F07874', 
    'Primary Energy|Coal': '#C82423',
    'Primary Energy|Nuclear': '#A14F6C',
    'Final Energy|Residential and Commercial|Liquids': '#87C0CA',
    'Carbon Sequestration|CCS|Biomass': '#45465E',
    'Primary Energy|Wind': '#80B1D2',
    'Primary Energy|Hydro': '#354E6B',
    'Carbon Sequestration|CCS': '#CA9C91',
}

In [44]:
# Bar colour per feature type; keys are the feature-name suffixes (leading space included).
custom_bar_colors = {
        ' Cumulative': '#DCC1AD',      
        ' 2020-2030': '#A4ABD6',    
        ' 2030-2040': '#B26D50',      
    }

In [None]:
# Draw and save the top-N importance bar chart; the returned dictionaries hold the
# colours actually used for tick labels and bars.
text_mapping, bar_mapping = plot_top_n_features_optimized(
    feature_importance_df, 
    n_top_features,
    custom_text_colors, 
    custom_bar_colors, 
    max_chars_per_line=30,
    font_size=9,          
    bar_height=0.8,       
    ylabel_position=(0.1, 1.05),  # figure coordinates; y > 1 places the label above the figure area
    ylabel_rotation=0,  
    fig_height=20, 
    fig_width=16
)

### Importance rank by time period

In [75]:
# Importance values bucketed by feature type, decided from the feature name.
Importance_of_2020_2030 = []
Importance_of_2030_2040 = []
Importance_of_Sum = []

for feature_label, feature_score in zip(feature_importance_df['Feature'],
                                        feature_importance_df['Importance']):
    if '2020-2030' in feature_label:
        destination = Importance_of_2020_2030
    elif '2030-2040' in feature_label:
        destination = Importance_of_2030_2040
    else:
        # Anything that is not a trend feature counts as a cumulative feature.
        destination = Importance_of_Sum
    destination.append(feature_score)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import os

def create_donut_chart_feature_importance(feature_importance_df, 
                                        inner_radius=0.4,
                                        fig_width=10, 
                                        fig_height=8,
                                        title_fontsize=16,
                                        label_fontsize=12,
                                        legend_fontsize=11,
                                        save_path=None,
                                        font_path=None):
    """Draw a donut chart of total feature importance per feature type.

    Features are bucketed by name: those containing '2020-2030', those containing
    '2030-2040', and everything else (reported as 'Cumulative'). The summed
    importance of each bucket is printed and plotted.

    Parameters
    ----------
    feature_importance_df : DataFrame with 'Feature' and 'Importance' columns.
    inner_radius : radius of the white circle that forms the donut hole.
    fig_width, fig_height : figure size.
    title_fontsize : title font size.
    label_fontsize : font size handed to the pie text properties.
        NOTE(review): the percentage labels are subsequently forced to size 22,
        so this value has no visible effect on them.
    legend_fontsize : legend font size (the legend title uses this value + 1).
    save_path : if given, the figure is saved there at 600 dpi.
    font_path : optional font file; when it exists it is registered with
        matplotlib and set as the global 'font.family' (affects later figures too).

    Returns
    -------
    (values, categories) : summed importances and their labels, in the order
        ['Cumulative', '2020-2030', '2030-2040'].
    """
    if font_path and os.path.exists(font_path):
        # Register the file first; a family name matplotlib has never seen cannot
        # be resolved and would fall back to the default font.
        fm.fontManager.addfont(font_path)
        font_prop = fm.FontProperties(fname=font_path)
        plt.rcParams['font.family'] = font_prop.get_name()

    Importance_of_2020_2030 = []
    Importance_of_2030_2040 = []
    Importance_of_Sum = []

    # Walk the two columns together instead of doing positional row lookups.
    for feature, importance in zip(feature_importance_df['Feature'],
                                   feature_importance_df['Importance']):
        if '2020-2030' in feature:
            Importance_of_2020_2030.append(importance)
        elif '2030-2040' in feature:
            Importance_of_2030_2040.append(importance)
        else:
            Importance_of_Sum.append(importance)

    cumulative_importance = sum(Importance_of_Sum)
    period_2020_2030 = sum(Importance_of_2020_2030)
    period_2030_2040 = sum(Importance_of_2030_2040)

    values = [cumulative_importance, period_2020_2030, period_2030_2040]
    categories = ['Cumulative', '2020-2030', '2030-2040']
    colors = ['#DCC1AD', '#A4ABD6', '#B26D50']

    print(f"Cumulative: {cumulative_importance:.4f} ({len(Importance_of_Sum)} features)")
    print(f"2020-2030: {period_2020_2030:.4f} ({len(Importance_of_2020_2030)} features)")
    print(f"2030-2040: {period_2030_2040:.4f} ({len(Importance_of_2030_2040)} features)")

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))

    wedges, texts, autotexts = ax.pie(values,
                                      labels=None,
                                      colors=colors,
                                      autopct='%1.1f%%',
                                      startangle=90,
                                      textprops={'fontsize': label_fontsize, 'fontweight': 'bold'},
                                      pctdistance=0.85)

    # A white disc over the centre turns the pie into a donut.
    centre_circle = plt.Circle((0, 0), inner_radius, fc='white', ec='lightgray', linewidth=2)
    ax.add_artist(centre_circle)

    for autotext in autotexts:
        autotext.set_color('white')
        autotext.set_fontweight('bold')
        autotext.set_fontsize(22)

    ax.set_title('Feature Importance by Category',
                 fontsize=title_fontsize,
                 fontweight='bold',
                 pad=20)

    ax.legend(wedges, categories,
              title="Feature Type",
              title_fontsize=legend_fontsize + 1,
              fontsize=legend_fontsize,
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 1),
              frameon=False,
              fancybox=False,
              shadow=False)

    ax.set_aspect('equal')
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=600, bbox_inches='tight', facecolor='white')

    plt.show()

    return values, categories

In [None]:
# Donut chart of summed importance per feature type, saved in the output folder.
values, categories = create_donut_chart_feature_importance(
    feature_importance_df,
    inner_radius=0.7,
    fig_width=10,
    fig_height=8,
    title_fontsize=16,
    label_fontsize=12,
    legend_fontsize=11,
    save_path=os.path.join(feature_importance_path, 'Donut_Chart_Feature_Importance_nontext.png')
)

### Importance rank by variable

In [None]:
# Suffixes that turn a base variable into a feature name. The first three are the
# ones appended when the feature matrix is built ("<var> Cumulative", "<var> 2020-2030",
# "<var> 2030-2040"); the underscore forms are kept so older feature names still map.
suffixes = [
    " Cumulative", " 2020-2030", " 2030-2040",
    "_sum_2020_2100", "_trend_2020_2030", "_trend_2030_2040",
]

def get_original_variable(feature_name, base_variables): 
    """Return the base variable that `feature_name` was derived from, or None.

    A base variable matches when `feature_name` is exactly that variable followed
    by one of `suffixes`. Requiring an exact suffix keeps a variable that is a
    prefix of another (e.g. 'Emissions|CO2' vs 'Emissions|CO2|AFOLU') from
    matching the longer variable's features. A warning is printed when nothing
    matches.
    """
    for base_var in base_variables:
        if feature_name.startswith(base_var):
            suffix_part = feature_name[len(base_var):]
            if suffix_part in suffixes:
                return base_var
    print(f"Warning: Could not map feature '{feature_name}' to a base variable.")
    return None

In [None]:
# Tag every feature with the base variable it was derived from (None when unknown).
feature_importance_df['Original_Variable'] = [
    get_original_variable(feature_name, target_variables)
    for feature_name in feature_importance_df['Feature']
]

# Report features that could not be traced back to a base variable.
unmapped_features = feature_importance_df[feature_importance_df['Original_Variable'].isna()]
if len(unmapped_features) > 0:
    print("\nCould not map the following features:")
    print(unmapped_features)

# Unmapped features are removed from the table used by the following cells.
feature_importance_df = feature_importance_df.dropna(subset=['Original_Variable'])


In [None]:
# Total importance of each base variable across its features, largest first.
aggregated_importance = feature_importance_df.groupby('Original_Variable')['Importance'].sum()
aggregated_importance_df = (
    aggregated_importance
    .reset_index()
    .rename(columns={'Importance': 'Aggregated_Importance'})
    .sort_values(by='Aggregated_Importance', ascending=False)
)
print("\nAggregated Feature Importance (Top 10):")
print(aggregated_importance_df.head(10))

In [None]:
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import pandas as pd
import numpy as np
import os

def create_pie_chart_feature_importance(feature_importance_df, 
                                       inner_radius=0.4,  
                                       explode_distance=0.05, 
                                       show_percentages=True,  
                                       fig_width=10, 
                                       fig_height=8,
                                       title_fontsize=16,
                                       label_fontsize=12,
                                       legend_fontsize=11,
                                       save_path=None,
                                       font_path=None):
    """Draw a pie chart of total feature importance per variable category.

    Features are bucketed by the first of these substrings found in the feature
    name: 'Primary Energy', 'Secondary Energy', 'Emissions', 'Final Energy'.
    Everything else is counted as 'Carbon Sequestration'. The summed importance
    of each bucket is printed and plotted.

    Parameters
    ----------
    feature_importance_df : DataFrame with 'Feature' and 'Importance' columns.
    inner_radius : accepted for signature compatibility; not used by this chart.
    explode_distance : base wedge offset; when > 0 each wedge is offset by
        explode_distance * (1.5 - its share), so small wedges move out further.
        A value <= 0 disables the offset.
    show_percentages : draw percentage labels on the wedges.
    fig_width, fig_height : figure size.
    title_fontsize : title font size.
    label_fontsize : font size handed to the pie text properties.
        NOTE(review): the percentage labels are subsequently forced to size 22.
    legend_fontsize : legend font size (the legend title uses this value + 1).
    save_path : if given, the figure is saved there at 600 dpi.
    font_path : optional font file; when it exists it is registered with
        matplotlib and set as the global 'font.family' (affects later figures too).

    Returns
    -------
    (values, categories) : summed importances and their labels, in the order
        ['Primary Energy', 'Secondary Energy', 'Emissions', 'Final Energy',
        'Carbon Sequestration'].
    """
    if font_path and os.path.exists(font_path):
        # Register the file first; a family name matplotlib has never seen cannot
        # be resolved and would fall back to the default font.
        fm.fontManager.addfont(font_path)
        font_prop = fm.FontProperties(fname=font_path)
        plt.rcParams['font.family'] = font_prop.get_name()

    Importance_of_PrimaryEnergy = []
    Importance_of_SecondaryEnergy = []
    Importance_of_Emissions = []
    Importance_of_FinalEnergy = []
    Importance_of_CarbonSequestration = []

    # Walk the two columns together instead of doing positional row lookups.
    for feature, importance in zip(feature_importance_df['Feature'],
                                   feature_importance_df['Importance']):
        if 'Primary Energy' in feature:
            Importance_of_PrimaryEnergy.append(importance)
        elif 'Secondary Energy' in feature:
            Importance_of_SecondaryEnergy.append(importance)
        elif 'Emissions' in feature:
            Importance_of_Emissions.append(importance)
        elif 'Final Energy' in feature:
            Importance_of_FinalEnergy.append(importance)
        else:
            Importance_of_CarbonSequestration.append(importance)

    PrimaryEnergy = sum(Importance_of_PrimaryEnergy)
    SecondaryEnergy = sum(Importance_of_SecondaryEnergy)
    Emissions = sum(Importance_of_Emissions)
    FinalEnergy = sum(Importance_of_FinalEnergy)
    CarbonSequestration = sum(Importance_of_CarbonSequestration)

    values = [PrimaryEnergy, SecondaryEnergy, Emissions, FinalEnergy, CarbonSequestration]
    categories = ['Primary Energy', 'Secondary Energy', 'Emissions', 'Final Energy', 'Carbon Sequestration']
    colors = ['#AA9687', '#DF9E73', '#C5817C', '#A4A5A6', '#71A4BA']

    explode = None
    if explode_distance > 0:
        total_value = sum(values)
        if total_value:
            explode = [explode_distance * (1.5 - value / total_value) for value in values]
        else:
            # Nothing to take a share of (e.g. an empty importance table): use a
            # uniform offset instead of dividing by zero.
            print("Warning: total feature importance is 0; using a uniform explode distance.")
            explode = [explode_distance] * len(values)

    # Print the summary statistics.
    print("üìä Feature Importance by Variable:")
    print(f"Primary Energy: {PrimaryEnergy:.4f} ({len(Importance_of_PrimaryEnergy)} features)")
    print(f"Secondary Energy: {SecondaryEnergy:.4f} ({len(Importance_of_SecondaryEnergy)} features)")
    print(f"Emissions: {Emissions:.4f} ({len(Importance_of_Emissions)} features)")
    print(f"FinalEnergy: {FinalEnergy:.4f} ({len(Importance_of_FinalEnergy)} features)")
    print(f"CarbonSequestration: {CarbonSequestration:.4f} ({len(Importance_of_CarbonSequestration)} features)")

    fig, ax = plt.subplots(figsize=(fig_width, fig_height))

    pie_kwargs = dict(
        labels=None,
        colors=colors,
        autopct='%1.1f%%' if show_percentages else None,
        startangle=90,
        explode=explode,
        textprops={'fontsize': label_fontsize, 'fontweight': 'bold'},
    )

    # ax.pie returns a third element (the percentage texts) only when autopct is set.
    if show_percentages:
        wedges, texts, autotexts = ax.pie(values, pctdistance=0.85, **pie_kwargs)
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontweight('bold')
            autotext.set_fontsize(22)
    else:
        wedges, texts = ax.pie(values, **pie_kwargs)

    ax.set_title('Feature Importance by Category',
                 fontsize=title_fontsize,
                 fontweight='bold',
                 pad=20)

    ax.legend(wedges, categories,
              title="Feature Type",
              title_fontsize=legend_fontsize + 1,
              fontsize=legend_fontsize,
              loc="center left",
              bbox_to_anchor=(1, 0, 0.5, 1),
              frameon=False,
              fancybox=False,
              shadow=False)

    ax.set_aspect('equal')

    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=600, bbox_inches='tight', facecolor='white')

    plt.show()

    return values, categories

In [None]:
# Pie chart of summed importance per variable category, without percentage labels,
# saved in the output folder. Note this rebinds `values` and `categories`.
values, categories = create_pie_chart_feature_importance(
    feature_importance_df,
    explode_distance=0.01,
    show_percentages=False, 
    save_path=os.path.join(feature_importance_path, 'importance aggregate by variable.png')
    )