In [2]:
#%pip install shap

# This notebook evaluates the stage 1, 3 and 4 advancement models for Corn Brazil Summer.
# Open question: should the stages be combined into a single model?

In [2]:
# import packages
import os
import sys

import pandas as pd, numpy as np
import argparse

import xgboost

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")

from libs.performance_lib import predictive_advancement_lib
from libs.performance_lib import performance_helper

import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Run configuration shared by every cell below.
bucket = 'us.com.syngenta.ap.nonprod'   # S3 bucket that data, models and metrics live in
ap_data_sector = 'CORN_BRAZIL_SUMMER'   # data sector whose models are evaluated
out_year = 2023                         # year passed to the model file-name lookup

In [4]:
# Command-line style inputs, parsed with the same argparse interface a script would use.
# When testing we read straight from S3; the bucket is handed to the library functions
# as its own argument, so it is deliberately not part of these input args.
parser = argparse.ArgumentParser(description='app inputs and outputs')
parser.add_argument(
    '--s3_input_data_folder',
    type=str,
    required=True,
    help='s3_input_data_folder',
)

# Stand-in for sys.argv[1:] while running interactively.
input_args = [
    '--s3_input_data_folder',
    'uat/dme/performance/reformatting_performance_pipeline_temp_out/data',
]
input_args_parsed = parser.parse_args(args=input_args)

In [5]:
# Load the validation set for the configured sector directly from S3.
val_data_key = os.path.join(
    bucket,
    input_args_parsed.s3_input_data_folder,
    ap_data_sector,
    'adv_model_validation_data.csv',
)
df_val_all = pd.read_csv('s3://' + val_data_key)

In [6]:
# Evaluate one saved model per (data sector, stage) group of the validation data and
# write the resulting advancement / trait metrics back to S3.
grouping_cols = ['ap_data_sector','current_stage']
df_val_all_grouped = df_val_all.groupby(by=grouping_cols)

# Folder (within `bucket`) that holds the saved models and receives the metric files.
# Derived from the same configuration used to read the validation data, so changing
# the input folder or bucket cannot leave reads and writes pointing at different places.
s3_sector_path = os.path.join(input_args_parsed.s3_input_data_folder, ap_data_sector)

# Trait and yield columns used for the metrics; the same for every stage.
trait_prefix = 'result_' # this gets both result_ and result_diff cols
yield_col = 'result_diff_YGSMN'

# Seeded generator so the 'random_nums' column is reproducible across runs.
random_nums_seed = 0
rng = np.random.default_rng(random_nums_seed)

for index, df_use in df_val_all_grouped:  # one model per unique combination of grouping_cols; stage currently
    # `index` is the group key tuple (ap_data_sector, current_stage); the stage comes
    # back as a float (e.g. 1.0), so cast it before using it in lookups and file names.
    stage = int(index[1])

    # load in model and preprocessor
    mdl_fname, meta_fname = predictive_advancement_lib.get_fname_s3(
        bucket,
        s3_sector_path,
        ap_data_sector,
        year=out_year,
        stage=stage
    )

    mdl_preproc_dict, meta_info = predictive_advancement_lib.load_model_from_s3(
        bucket,
        s3_sector_path,
        mdl_fname,
        meta_fname
    )

    # run the stored preprocessor on this group's rows, then score them with the model list
    df_use_proc = mdl_preproc_dict['preprocessor'].run_preprocessing(df_use)

    y_proba = predictive_advancement_lib.predict_proba_list(
        mdl_preproc_dict['mdl_list'],
        df_use_proc[meta_info['mdl_in_cols']]
    )

    # hard predictions use a fixed 0.5 probability threshold
    acc, conf_mat, roc_auc, f1 = predictive_advancement_lib.get_evaluation_metrics(
        y_true=df_use_proc[meta_info['mdl_out_cols']],
        y_pred=y_proba>0.5,
        y_proba=y_proba
    )

    print(
        "Validation:",
        index,
        df_use.shape,
        (acc, conf_mat, roc_auc, f1)
    )

    df_use_proc['recommendation_score'] = y_proba

    # make sure advancement columns are boolean
    for adv_col in ('was_adv', 'was_adv_next'):
        if adv_col in df_use_proc.columns:
            df_use_proc[adv_col] = df_use_proc[adv_col].astype(bool)

    # One uniform [0, 1) draw per row, stored as a plain 1-D column.
    # NOTE(review): presumably read by compute_model_metrics as a random baseline - confirm.
    df_use_proc['random_nums'] = rng.random(df_use_proc.shape[0])
    adv_metrics, trait_metrics = predictive_advancement_lib.compute_model_metrics(
        df_use_proc,
        compute_advancement_metrics=True,
        yield_col=yield_col,
        trait_prefix=trait_prefix
    )

    # save advancement and trait metrics
    # The tag is the last two '-'-separated pieces of the model file name with its final
    # four characters dropped (presumably a 4-character extension such as '.pkl' - confirm).
    mdl_tag = '-'.join(mdl_fname.split('-')[-2:])[:-4]
    # NOTE(review): there is no separator between the sector name and 'year-'; left
    # unchanged because existing readers of these files may depend on the exact name.
    postfix = ap_data_sector + 'year-' + str(int(out_year)) + '-stg' + str(stage) + '-' + mdl_tag
    for metrics_name, metrics_df in (
        ('advancement_metrics', adv_metrics),
        ('trait_metrics', trait_metrics),
    ):
        performance_helper.write_to_s3(
            metrics_df,
            metrics_name + '-' + postfix + '.csv',
            s3_path=s3_sector_path,
            bucket=bucket
        )
    

Validation: ('CORN_BRAZIL_SUMMER', 1.0) (12861, 104) (0.8674347065992241, array([[0.82449205, 0.17550795],
       [0.08962264, 0.91037736]]), 0.934547365217923, 0.14704761904761904)


  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0) - np.nanmedian(
  keepdims=keepdims)


Validation: ('CORN_BRAZIL_SUMMER', 3.0) (2337, 104) (0.7472029556306856, array([[0.8790213 , 0.1209787 ],
       [0.38461538, 0.61538462]]), 0.8494719598480359, 0.33542976939203356)


  overwrite_input=overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0)
  'value': func(df_use.loc[was_rec == False, trait_cols], axis=0)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0) - np.nanmedian(
  keepdims=keepdims)


Validation: ('CORN_BRAZIL_SUMMER', 4.0) (1416, 104) (0.7525933557053825, array([[0.83395383, 0.16604617],
       [0.32876712, 0.67123288]]), 0.8360499393098664, 0.28405797101449276)


  overwrite_input=overwrite_input)
  result = np.apply_along_axis(_nanmedian1d, axis, a, overwrite_input)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0)
  'value': func(df_use.loc[was_rec == False, trait_cols], axis=0)
  'value': func(df_use.loc[was_rec == True, trait_cols], axis=0) - np.nanmedian(
  keepdims=keepdims)


In [8]:
#### end evaluation.py

# code below is extra, may be useful elsewhere

('CORN_BRAZIL_SUMMER', 4.0)

In [9]:
class temp_mdl:
    """Thin wrapper that exposes a list of classifiers as one averaged predictor.

    Gives callers (e.g. a SHAP explainer) a single callable that maps an input
    matrix to one probability per row.
    """

    def __init__(self, mdl_list):
        """Store the models to average.

        Parameters
        ----------
        mdl_list : sequence
            Fitted models; each must provide ``predict_proba(x)`` returning a
            2-D array whose column 1 is used as the probability.
        """
        self.mdl_list = mdl_list

    def predict_proba_list(self, x_te):
        """Return the mean of column 1 of ``predict_proba`` across all models.

        Parameters
        ----------
        x_te : array-like with a ``shape`` attribute
            Input rows; ``x_te.shape[0]`` sets the length of the result.

        Returns
        -------
        numpy.ndarray
            1-D array with one averaged probability per input row.

        Raises
        ------
        ValueError
            If ``mdl_list`` is empty. Without this check the division by zero
            models would silently produce an all-NaN array.
        """
        n_models = len(self.mdl_list)
        if n_models == 0:
            raise ValueError("mdl_list is empty; cannot average predictions over zero models")

        y_proba = np.zeros((x_te.shape[0],))
        for mdl in self.mdl_list:
            y_proba = y_proba + mdl.predict_proba(x_te)[:,1]

        return y_proba/n_models

# Optional SHAP analysis: compute and plot Shapley values for the averaged model list.
# Uses `mdl_preproc_dict`, `meta_info` and `df_use_proc` as left behind by the last
# iteration of the evaluation loop, so it explains the last stage that was evaluated.
do_shap = 0
if do_shap == 1:
    # shap is only needed for this optional step and is not imported at the top of the
    # notebook, so import it here (install first with `%pip install shap` if missing).
    import shap

    # Keep the instance under its own name: rebinding `temp_mdl` would replace the class
    # with an instance, and re-running this cell would then fail.
    ensemble_mdl = temp_mdl(mdl_list=mdl_preproc_dict['mdl_list'])
    x = df_use_proc[meta_info['mdl_in_cols']].values.astype(float)

    # cap the background / explained sample at 1000 rows
    shap_input = shap.utils.sample(x, min(x.shape[0], 1000))
    explainer = shap.Explainer(ensemble_mdl.predict_proba_list, shap_input)
    shap_values = explainer(shap_input)
    # predict_proba_list already returns a single probability per row, so there is no
    # class axis to slice here (i.e. no `shap_values[:,:,1]`).

    shap.summary_plot(shap_values, feature_names=meta_info['mdl_in_cols'],
                      plot_type='bar')

    shap.summary_plot(shap_values, feature_names=meta_info['mdl_in_cols'],
                      plot_type='dot')


#for idx in range(len(mdl_class.in_cols)):
#    notnan_mask = np.isnan(shap_input[:,idx])==False
#    if np.sum(notnan_mask) > 10:
#        shap.plots.partial_dependence(idx,mdl_class.mdl.predict,shap_input[notnan_mask,:],ice=False,feature_names=mdl_class.in_cols)

# some useful shapley/interpolation related code
#plt.plot(shap_values[:,0].data, shap_values[:,0].values,'.')
#shap.plots.scatter(shap_values[:,5])