In [5]:

import joblib # For serialization
import sys
sys.path.append('...')

In [6]:
from model_toolkit import *

## set up model runs

### Baseline models

In [18]:

# Load the cleaned working frame and force the industry code to string dtype
# in a single chained expression.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only
# load files from a trusted source.
workingdf = (
    pd.read_pickle('.../workingdf_cln10.pkl')
    .astype({'industrycode': str})
)

# --- Execution block 1: column selection for scope 1 ---
# Columns pulled from the working frame, grouped by the role given to them in
# this notebook. The flattened list keeps exactly this order.
# NOTE(review): 'va1_transform' may be a typo for 'var1_transform' — confirm
# against the columns of the working frame before changing it.
req_cols_scope1 = [
    *('idcol', 'year'),                  # filters
    *('var1', 'var2'),                   # numeric variables used for scaling
    *('va1_transform', 'var3', 'var4'),  # ratio variables, passed through
    *('industrycode',),                  # dummy encoding ('statename' is currently disabled)
    *('product_desc',),                  # contextual (text) feature
    *('emission_scope1',),               # outcome
]

# --- Run block: shared constant and data split ---
# Small positive constant used by the target scaling further down the notebook.
epsilon = 1e-8

# 1. Get the data: hand the selected scope 1 columns to prepare_model1_data and
#    unpack the five objects it returns.
# NOTE(review): test_size_per=0.1 is presumably the held-out fraction and
# latest_year the most recent reporting year — confirm in model_toolkit.
(
    X_train_val,
    y_train_val,
    groups_train_val,
    X_test,
    y_test,
) = prepare_model1_data(
    full_data=workingdf[req_cols_scope1],
    target_col='emission_scope1',
    company_id_col='idcol',
    text_feature_name='product_desc',
    latest_year=2024,
    test_size_per=0.1,
)

# 2. Set up the feature-name configuration

# Single-column settings.
industry_col_name = 'industrycode'
text_feature_name = 'product_desc'
#categorical_features = ['statename', 'industrycode']
#categorical_features = ['industrycode']

# Feature groups.
ratio_features = ['rpe', 'per_poweruse_renewable', 'per_poweruse_mixed']
numeric_features = ['turnover', 'n_employees']

# Numeric group extended with the two industry-level statistic columns.
numeric_features_final = [
    *numeric_features,
    'Industry_Current_Mean',
    'Industry_Current_StdDev',
]

# Preprocessing configuration passed to generate_kfold_data. It is deliberately
# left empty here; the commented-out entries are kept as a record of settings
# tried previously.
feature_config = dict()
#feature_config['StandardScaler_config'] = {'features':numeric_features_final}
#feature_config['OneHotEncode_config'] = {'features':categorical_features,
#                                         'handle_unknown':'ignore'}
#feature_config['Tfidf_config'] = {'features':text_feature_name,
#                                  'max_features':80,
#                                  'ngram_range':(1,2)}

#cv_scores_r2 = []
#cv_scores_rmse_abs = []
#cv_feature_importances = []

## 3.2 Build the cross-validation data used to evaluate model performance
# The result is indexed by fold and consumed by the model cells further down.
k_fold_data = generate_kfold_data(
    X_train_val, y_train_val, groups_train_val, feature_config,
    obtain_industry_stats=True,
    nsplits=2,
)

# Result containers, keyed exactly as the model cells below write and read them:
# 'model_yratio' (ratio-target model), 'model_yraw' (raw-target model) and
# 'baseline'. The previous keys ('model_ytransform', 'model_raw') were never
# used, so the later perf_summary['model_yratio'] / ['model_yraw'] lookups
# raised KeyError on a fresh kernel.
perf_summary = {'model_yratio': {}, 'model_yraw': {}, 'baseline': {}}



Configuration validated successfully.


In [8]:
# Inspect the container type returned by generate_kfold_data.
type(k_fold_data)

dict

In [19]:
# Shape of the processed validation matrix stored under key 0 of k_fold_data.
k_fold_data[0]['X_val_processed'].shape

(424, 7)

In [1]:
#validation_data_0 = pd.DataFrame(k_fold_data[0]['X_val_processed'],columns=k_fold_data[0]['feature_names_out'])
#validation_data_0['s1_emissions_tco2e'] = k_fold_data[0]['y_val']
#validation_data_0.describe()
#validation_data_0.describe().transpose()['std']/1000000

## model ytransform

In [21]:

# Ratio-target model: for every fold, train on y / (industry mean + epsilon)
# and score the fold after mapping predictions back to absolute emissions.
#
# Fixes relative to the previous version of this cell:
#   * the pipeline was fed y_train_ratio / y_val_ratio, which were never
#     defined (NameError on a fresh kernel);
#   * the target was divided by epsilon alone, while the inverse transform used
#     for rmse_abs multiplies by (industry mean + epsilon) — both directions
#     now use the same scale;
#   * results are written via setdefault, so the cell also works when
#     perf_summary was created without a 'model_yratio' entry.
ratio_results = perf_summary.setdefault('model_yratio', {})

for fold_idx, fold in k_fold_data.items():
    # Industry means for the train / validation rows of this fold, taken as
    # plain arrays so the arithmetic below is positional, not index-aligned.
    # NOTE(review): assumes the *_aug frames are row-aligned with y_train /
    # y_val — confirm in generate_kfold_data.
    industry_means_train = fold['X_train_aug'].loc[:, 'Industry_Current_Mean'].to_numpy()
    industry_means_val = fold['X_val_aug'].loc[:, 'Industry_Current_Mean'].to_numpy()

    # Raw targets and their ratio to the industry mean (epsilon guards a zero mean).
    y_train_raw = fold['y_train']
    y_val_raw = fold['y_val']
    y_train_ratio = y_train_raw / (industry_means_train + epsilon)
    y_val_ratio = y_val_raw / (industry_means_val + epsilon)

    # Fit the model and obtain performance data.
    # NOTE(review): r2 is whatever fit_eval_model_pipeline reports for the
    # targets it is given, i.e. presumably on the ratio scale — confirm.
    fitted_model, y_pred, r2, featureimportances = fit_eval_model_pipeline(
        X_train_processed=fold['X_train_processed'],
        y_train=y_train_ratio,
        X_val_processed=fold['X_val_processed'],
        y_val=y_val_ratio,
        model_config={})

    fold_importances = pd.DataFrame({
        'Feature': fold['feature_names_out'],  # use the stored names here
        'Importance_Gain': featureimportances,
        'Fold': fold_idx})

    # Undo the ratio transform so the RMSE is expressed in absolute units.
    y_pred_abs = y_pred * (industry_means_val + epsilon)

    # NOTE: 'model_binary' is overwritten on every iteration, so only the model
    # of the last fold is kept in the summary.
    ratio_results.update({
        'model_binary': fitted_model,
        f'output_fold_{fold_idx}': {
            'r2': r2,
            'rmse_abs': np.sqrt(mean_squared_error(y_val_raw, y_pred_abs)),
            'feature_importances': fold_importances},
    })
   


## raw model

In [22]:
# Raw-target model: for every fold, train directly on the untransformed target
# and record R2, absolute RMSE and feature importances.
#
# Results are written via setdefault so the cell works even when perf_summary
# was created without a 'model_yraw' entry (the plain perf_summary['model_yraw']
# lookup raised KeyError). The industry-mean lookups that this cell never used
# have been removed.
raw_results = perf_summary.setdefault('model_yraw', {})

for fold_idx, fold in k_fold_data.items():
    # Untransformed targets for this fold.
    y_train_raw = fold['y_train']
    y_val_raw = fold['y_val']

    # Fit the model and obtain performance data.
    fitted_model, y_pred, r2, featureimportances = fit_eval_model_pipeline(
        X_train_processed=fold['X_train_processed'],
        y_train=y_train_raw,
        X_val_processed=fold['X_val_processed'],
        y_val=y_val_raw,
        model_config={})

    fold_importances = pd.DataFrame({
        'Feature': fold['feature_names_out'],  # use the stored names here
        'Importance_Gain': featureimportances,
        'Fold': fold_idx})

    # NOTE: 'model_binary' is overwritten on every iteration, so only the model
    # of the last fold is kept in the summary.
    raw_results.update({
        'model_binary': fitted_model,
        f'output_fold_{fold_idx}': {
            'r2': r2,
            'rmse_abs': np.sqrt(mean_squared_error(y_val_raw, y_pred)),
            'feature_importances': fold_importances},
    })
   

In [2]:
#perf_summary

### baseline model

In [15]:
# Baseline: predict every validation row with its 'Industry_Current_Mean' value
# and record the absolute RMSE per fold.
#
# The per-fold key is now 'output_fold_<idx>' (with underscore) so it matches
# the keys written by the ratio and raw model cells; it used to be
# 'output_fold<idx>'. Locals that this cell never used were removed.
# NOTE(review): whether Industry_Current_Mean is computed from training rows
# only (no leakage into validation) is decided inside generate_kfold_data —
# confirm there.
baseline_results = perf_summary.setdefault('baseline', {})

for fold_idx, fold in k_fold_data.items():
    # Industry mean attached to each validation row doubles as the prediction.
    industry_means_val = fold['X_val_aug'].loc[:, 'Industry_Current_Mean']
    y_val_raw = fold['y_val']
    y_pred = industry_means_val

    baseline_results[f'output_fold_{fold_idx}'] = {
        'rmse_abs': np.sqrt(mean_squared_error(y_val_raw, y_pred)),
    }
   

In [3]:
#perf_summary.get('baseline')

In [None]:
# Display the results stored under 'model_yratio' (None if the key is absent).
perf_summary.get('model_yratio')

In [None]:
# Display the results stored under 'model_yraw' (None if the key is absent).
perf_summary.get('model_yraw')

In [None]:
# Display the results stored under 'baseline' (None if the key is absent).
perf_summary.get('baseline')

In [None]:
# Serialize the collected results to 'perf_baseline_models' in the current
# working directory.
joblib.dump(
    value=perf_summary,
    filename='perf_baseline_models',
)