In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Scenario table; later cells read its 'Model', 'Scenario' and 'Variable'
# columns plus every column whose name is all digits.
df_target = pd.read_csv("AR6_scenario.csv")

# Variables a (Model, Scenario) pair has to report to be kept. The list order
# is also the order in which per-variable feature columns are generated.
target_variables = [
    # CO2 emissions
    'Emissions|CO2',
    'Emissions|CO2|AFOLU',
    'Emissions|CO2|Energy|Demand|Industry',
    'Emissions|CO2|Energy|Supply|Electricity',
    'Emissions|CO2|Energy and Industrial Processes',
    # Final energy
    'Final Energy|Industry|Solids|Biomass',
    'Final Energy|Industry|Solids|Coal',
    'Final Energy|Residential and Commercial|Solids|Coal',
    # Primary / secondary energy
    'Primary Energy|Coal',
    'Secondary Energy|Electricity|Coal',
]

In [None]:
# Rows grouped by (Model, Scenario).
# NOTE(review): no other cell visible here reads df_grouped — confirm it is still needed.
df_grouped = df_target.groupby(by=['Model', 'Scenario'])

In [None]:
# Count, per (Model, Scenario) pair, how many distinct *target* variables it
# reports. Non-target variables are masked to NaN first; nunique() ignores NaN,
# so they neither help a pair reach the required count nor push it past it.
# Every pair stays in pair_counts (with a count of 0 if it has no target variable).
pair_counts = (
    df_target['Variable']
    .where(df_target['Variable'].isin(target_variables))
    .groupby([df_target['Model'], df_target['Scenario']])
    .nunique()
)

# Keep only the pairs that report every target variable. set() guards against
# an accidental duplicate entry in target_variables inflating the required count.
common_pairs_index = pair_counts[pair_counts == len(set(target_variables))].index

In [None]:
# Boolean mask, one entry per row of df_target: True when the row's
# (Model, Scenario) pair is one of the common pairs.
in_common_pair = df_target.set_index(['Model', 'Scenario']).index.isin(common_pairs_index)
df_common = df_target.loc[in_common_pair]

In [None]:
# Re-key the common-pair rows by (Model, Scenario, Variable) so that one
# variable of one pair can be looked up through the index.
df_paired = df_common.set_index(keys=['Model', 'Scenario', 'Variable'])

In [None]:
# Every column whose name consists only of digits is treated as a year column.
# NOTE(review): downstream feature names say "2020_2100", but no range filter is
# applied here — confirm the CSV holds only years in that range.
year_cols = [name for name in df_target.columns if name.isdigit()]

In [None]:
# Build three features per target variable for every common (Model, Scenario) pair:
#   <var>_sum_2020_2100   - row-wise sum over all year columns (NaN skipped)
#   <var>_trend_2020_2030 - (value 2030 - value 2020) / 10
#   <var>_trend_2030_2050 - (value 2050 - value 2030) / 20
# Missing values inside a trend are treated as 0 before subtracting.
all_new_features_list = []

# The pairs are identical for every variable, so extract the two aligned
# level arrays once instead of on every iteration.
pair_models = common_pairs_index.get_level_values('Model')
pair_scenarios = common_pairs_index.get_level_values('Scenario')

for var in target_variables:
    print(f" Starting processing '{var}'...")

    # One (Model, Scenario, var) key per common pair. from_arrays zips the two
    # level arrays element-wise; from_product would instead build the full
    # pairs x pairs cross product, which is quadratic and mostly non-existent keys.
    var_idx = pd.MultiIndex.from_arrays(
        [pair_models, pair_scenarios, [var] * len(common_pairs_index)],
        names=['Model', 'Scenario', 'Variable'],
    )

    # Only keys that actually exist in the data can be passed to .loc.
    valid_var_idx = var_idx.intersection(df_paired.index)
    if valid_var_idx.empty:
        print(f"Warning: There is no data with variable '{var}' in the common pairs. Skipping.")
        continue

    var_data = df_paired.loc[valid_var_idx, year_cols]
    # Drop the constant 'Variable' level so rows are keyed by (Model, Scenario).
    var_data = var_data.reset_index(level='Variable', drop=True)
    # Anything that cannot be parsed as a number becomes NaN.
    var_data = var_data.apply(pd.to_numeric, errors='coerce')

    # Feature 1: sum over the year columns.
    if not year_cols:
        feature1 = pd.Series(0.0, index=var_data.index)
        print(f"Warning: Variable '{var}' has no year columns in the 2020-2100 range. Feature 1 (Sum) set to 0.")
    else:
        feature1 = var_data.sum(axis=1, skipna=True)
    # Name the series in both branches; otherwise the fallback would end up as
    # an unnamed column in the concatenated feature frame.
    feature1.name = f"{var}_sum_2020_2100"

    # Feature 2: average yearly change between 2020 and 2030.
    if '2020' in var_data.columns and '2030' in var_data.columns:
        feature2 = (var_data['2030'].fillna(0) - var_data['2020'].fillna(0)) / 10
    else:
        print(f"Warning: Variable '{var}' has no '2020' or '2030' data, Feature 2 set to 0/NaN.")
        feature2 = pd.Series(np.nan, index=var_data.index)
    feature2.name = f"{var}_trend_2020_2030"

    # Feature 3: average yearly change between 2030 and 2050. The guard must
    # test the same columns the expression reads ('2030' and '2050').
    if '2030' in var_data.columns and '2050' in var_data.columns:
        feature3 = (var_data['2050'].fillna(0) - var_data['2030'].fillna(0)) / 20
    else:
        print(f"Warning: Variable '{var}' has no '2030' or '2050' data, Feature 3 set to 0/NaN.")
        feature3 = pd.Series(np.nan, index=var_data.index)
    feature3.name = f"{var}_trend_2030_2050"

    var_features_df = pd.concat([feature1, feature2, feature3], axis=1)
    all_new_features_list.append(var_features_df)

In [None]:
# Feature matrix: the per-variable feature frames placed side by side,
# aligned on their shared (Model, Scenario) index.
X = pd.concat(all_new_features_list, axis='columns')

In [None]:
# Label lookup: for each (Model, Scenario) pair, the label is the pair's own Model.
xy_map = (
    df_common
    .groupby(['Model', 'Scenario'])['Model']
    .first()
)

In [None]:
# Labels in the same row order as the feature matrix.
y = xy_map.loc[X.index]

print(f"The shape of target variable y: {y.shape}")
# Relative frequency of each label.
class_shares = y.value_counts(normalize=True)
print(f"The distribution of target variable y:\n{class_shares}")

In [None]:
import seaborn as sns  
import matplotlib.pyplot as plt  
from sklearn.ensemble import RandomForestClassifier  
from sklearn.model_selection import train_test_split  
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV,StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import re  

In [None]:
# Base estimator for the search; class weights are balanced and the seed is fixed.
classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

# Hyperparameter grid explored exhaustively by the search.
parameters = {
    'n_estimators': [100, 150, 200, 300],
    'max_depth': [8, 10, 12, 14],
    'min_samples_split': [4, 6, 8],
}

# Shuffled, stratified 5-fold split with a fixed seed.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Starting GridSearchCV for hyperparameter tuning...")
gridsearch = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
)
gridsearch.fit(X, y)

In [None]:
# Winning hyperparameters and the estimator refit with them by the search.
rf_best_params = gridsearch.best_params_
rf_best_estimator = gridsearch.best_estimator_

print(f"\nBest Parameters Found by GridSearchCV: {rf_best_params}")
# best_score_ is measured with whatever metric the search was given via
# `scoring`, so label it with that metric rather than calling it accuracy.
print(f"Best cross-validated {gridsearch.scoring} score during GridSearchCV: {gridsearch.best_score_:.4f}")

In [None]:
# Accuracy of the tuned estimator on each fold of the same CV splitter.
cv_scores = cross_val_score(
    rf_best_estimator,
    X,
    y,
    scoring='accuracy',
    cv=cv_strategy,
    n_jobs=-1,
)

print(f"\nCross-Validation Accuracy Scores for each fold: {cv_scores}")
print(f"Mean Cross-Validation Accuracy: {cv_scores.mean():.4f}")
print(f"Standard Deviation of Cross-Validation Accuracy: {cv_scores.std():.4f}")

In [None]:
print("\nTraining the final model on the entire dataset...")
# Rebuild the final model from the *complete* configuration of the tuned
# estimator. best_params_ only holds the searched hyperparameters, so using it
# alone would silently drop fixed settings such as class_weight='balanced' and
# train a model that differs from the one the search evaluated.
rf_classifier = RandomForestClassifier(**gridsearch.best_estimator_.get_params())
rf_classifier.fit(X, y)
print("Final model trained successfully on all data.")

In [None]:
# Feature importances of the final model, most important first.
importance = rf_classifier.feature_importances_
feature_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

# Predict the source model of the synthetic scenarios

In [None]:
# Synthetic scenarios to classify; later cells read its 'sample_id' and
# 'Variable' columns plus every column whose name is all digits.
df_synthetic = pd.read_csv(filepath_or_buffer="synthetic_scenario.csv")

In [None]:
# Count, per sample, how many distinct *target* variables it contains.
# Non-target variables are masked to NaN first; nunique() ignores NaN, so they
# cannot make an incomplete sample look complete or a complete one look
# over-full. Every sample_id stays in the index (count 0 if it has none).
sample_var_counts = (
    df_synthetic['Variable']
    .where(df_synthetic['Variable'].isin(target_variables))
    .groupby(df_synthetic['sample_id'])
    .nunique()
)

# Samples that contain every target variable. set() guards against an
# accidental duplicate entry in target_variables inflating the required count.
complete_samples = sample_var_counts[sample_var_counts == len(set(target_variables))]

In [None]:
# Keep only the rows of complete samples. When nothing has to be dropped,
# work on a copy of the full table instead of filtering it.
every_sample_complete = len(complete_samples) >= len(sample_var_counts)
if every_sample_complete:
    complete_sample_ids = df_synthetic['sample_id'].unique()
    df_synthetic_filtered = df_synthetic.copy()
else:
    complete_sample_ids = complete_samples.index
    keep_row = df_synthetic['sample_id'].isin(complete_sample_ids)
    df_synthetic_filtered = df_synthetic[keep_row]

In [None]:
# Year columns of the synthetic table: every column whose name is all digits.
# This rebinds year_cols, so from here on it describes the synthetic data.
year_cols = [name for name in df_synthetic_filtered.columns if name.isdigit()]

In [None]:
# Build, for every complete synthetic sample, the same three features per
# target variable that the classifier was trained on:
#   <var>_sum_2020_2100   - row-wise sum over all year columns (NaN skipped)
#   <var>_trend_2020_2030 - (value 2030 - value 2020) / 10
#   <var>_trend_2030_2050 - (value 2050 - value 2030) / 20
df_synthetic_indexed = df_synthetic_filtered.set_index(['sample_id', 'Variable'])

all_synthetic_features_list = []

for var in target_variables:
    print(f"Start processing variable '{var}'...")

    # One (sample_id, var) key per complete sample.
    var_idx = pd.MultiIndex.from_product([complete_sample_ids, [var]],
                                         names=['sample_id', 'Variable'])

    # Only keys that actually exist in the data can be passed to .loc.
    valid_var_idx = var_idx.intersection(df_synthetic_indexed.index)
    if valid_var_idx.empty:
        print(f" Warning: Variable '{var}' not found in synthetic data, skipping.")
        continue

    var_data = df_synthetic_indexed.loc[valid_var_idx, year_cols]
    # Drop the constant 'Variable' level so rows are keyed by sample_id.
    var_data = var_data.reset_index(level='Variable', drop=True)
    # Anything that cannot be parsed as a number becomes NaN.
    var_data = var_data.apply(pd.to_numeric, errors='coerce')

    # Feature 1: sum over the year columns.
    if not year_cols:
        feature1 = pd.Series(0.0, index=var_data.index)
        print(f"Warning: Variable '{var}' has no year columns in the 2020-2100 range. Feature 1 (Sum) set to 0.")
    else:
        feature1 = var_data.sum(axis=1, skipna=True)
    # Name the series in both branches; an unnamed fallback column could never
    # match a feature name the classifier expects.
    feature1.name = f"{var}_sum_2020_2100"

    # Feature 2: average yearly change between 2020 and 2030.
    if '2020' in var_data.columns and '2030' in var_data.columns:
        feature2 = (var_data['2030'].fillna(0) - var_data['2020'].fillna(0)) / 10
    else:
        print(f"Warning: Variable '{var}' missing '2020' or '2030' data, Feature 2 set to NaN.")
        feature2 = pd.Series(np.nan, index=var_data.index)
    feature2.name = f"{var}_trend_2020_2030"

    # Feature 3: average yearly change between 2030 and 2050.
    if '2030' in var_data.columns and '2050' in var_data.columns:
        feature3 = (var_data['2050'].fillna(0) - var_data['2030'].fillna(0)) / 20
    else:
        print(f"Warning: Variable '{var}' missing '2030' or '2050' data, Feature 3 set to NaN.")
        feature3 = pd.Series(np.nan, index=var_data.index)
    feature3.name = f"{var}_trend_2030_2050"

    var_features_df = pd.concat([feature1, feature2, feature3], axis=1)
    all_synthetic_features_list.append(var_features_df)


X_synthetic = pd.concat(all_synthetic_features_list, axis=1)

# Replace any remaining NaN with 0 (a no-op when there is none).
X_synthetic = X_synthetic.fillna(0)

# Align the columns with what the classifier was fitted on, when it recorded
# its training feature names.
if hasattr(rf_classifier, 'feature_names_in_'):
    expected_features = rf_classifier.feature_names_in_

    missing_features = set(expected_features) - set(X_synthetic.columns)
    extra_features = set(X_synthetic.columns) - set(expected_features)

    # Fail with an explicit message instead of the bare KeyError the column
    # selection below would raise.
    if missing_features:
        raise KeyError(
            "Synthetic feature matrix is missing columns the classifier was trained on: "
            f"{sorted(map(str, missing_features))}"
        )
    # Extra columns are removed by the selection below; say so rather than
    # dropping them silently.
    if extra_features:
        print(
            "Warning: Dropping synthetic features the classifier was not trained on: "
            f"{sorted(map(str, extra_features))}"
        )

    X_synthetic = X_synthetic[expected_features]

In [None]:
# Predicted label for every row of the synthetic feature matrix.
predictions = rf_classifier.predict(X=X_synthetic)

In [None]:
# Class probabilities per sample; the largest one is reported as the confidence.
prediction_proba = rf_classifier.predict_proba(X_synthetic)
max_proba = prediction_proba.max(axis=1)

# One row per sample: id, predicted label and its confidence.
prediction_results = pd.DataFrame(
    {
        'sample_id': X_synthetic.index,
        'predicted_model': predictions,
        'prediction_confidence': max_proba,
    }
)

# Share of samples assigned to each predicted label.
print(prediction_results['predicted_model'].value_counts(normalize=True))

In [None]:
# Attach the predicted label to every row of its sample and expose it as 'Model'.
model_by_sample = prediction_results[['sample_id', 'predicted_model']]
df_result = (
    df_synthetic_filtered
    .merge(model_by_sample, on='sample_id', how='left')
    .rename(columns={'predicted_model': 'Model'})
)

# Move 'Model' so that it sits immediately before 'Variable'.
cols = df_result.columns.tolist()
if {'Model', 'Variable'}.issubset(cols):
    cols.remove('Model')
    var_idx = cols.index('Variable')
    cols.insert(var_idx, 'Model')
    df_result = df_result[cols]

print(df_result.head(10))

# Write the labelled synthetic scenarios without the row index.
df_result.to_csv("synthetic_scenario_model.csv", index=False)