In [2]:
# necessary for DenodoConnection
#%pip install psycopg2-binary
#%pip install shap

#%pip install scikit-optimize
# Generate train and validation sets.
# During training, build K models via k-fold cross-validation.
# During evaluation, score on the validation set (completely held out).

# Covers stages 1, 2, 3 and 4 for Corn LAS Summer.
# Open question: should the stages be combined into a single model?

In [2]:
# import packages
import os
import sys

import pandas as pd, numpy as np
import argparse
import xgboost

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")

from libs.performance_lib import predictive_advancement_lib
from libs.performance_lib import preprocessor_class
from libs.performance_lib import performance_helper
from sklearn.metrics import balanced_accuracy_score

import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Data sector to train for; used as an S3 path component and in output file names.
ap_data_sector = "CORN_LAS_SUMMER"
# S3 bucket the training data is read from and the model artifacts are written to.
bucket = "us.com.syngenta.ap.nonprod"

In [4]:
# For testing, data is read straight from S3. The bucket is handed to the
# reader as a separate input, so it is deliberately not part of these arguments.
input_args = [
    "--s3_input_training_data_folder",
    "uat/dme/performance/reformatting_performance_pipeline_temp_out/data",
]

# Parse the explicit list above instead of the process command line.
parser = argparse.ArgumentParser(description="app inputs and outputs")
parser.add_argument(
    "--s3_input_training_data_folder",
    required=True,
    type=str,
    help="s3_input_training_data_folder",
)
input_args_parsed = parser.parse_args(args=input_args)

In [5]:
# Build the S3 URI with explicit '/' separators. os.path.join uses the host OS
# separator, which is not guaranteed to be '/' and would corrupt the object key.
# Each part is stripped of surrounding slashes so a trailing '/' in the folder
# argument does not produce an empty path segment.
training_data_uri = 's3://' + '/'.join(
    part.strip('/')
    for part in (
        bucket,
        input_args_parsed.s3_input_training_data_folder,
        ap_data_sector,
        'adv_model_training_data.csv',
    )
)

# Full training table for the sector, read directly from S3.
df_tr_all = pd.read_csv(training_data_uri)

In [6]:
# Columns to normalise; left empty here.
# NOTE(review): not referenced in the cells shown — confirm it is still needed.
cols_to_norm = []
# Label column: passed to training as the model output and to the metrics as y_true.
mdl_out_col = 'was_adv'
# Name of the column holding the analysis year.
mdl_year_col = 'analysis_year'
# Year stamped into the output artifact file names ('-year<out_year>').
out_year = 2023

# Force the label column to boolean. Re-running this cell is harmless (bool -> bool).
# NOTE(review): astype(bool) turns NaN in a numeric column into True — confirm the
# label column has no missing values, otherwise those rows are counted as positives.
df_tr_all[mdl_out_col] = df_tr_all[mdl_out_col].astype(bool)

In [19]:
# Toggle: run the hyperparameter search for each group, or train with
# default_xgb_params unchanged.
do_hyperparam_tuning = False

# One model set is built per unique combination of these columns.
grouping_cols = ['ap_data_sector', 'current_stage']
df_tr_all_grouped = df_tr_all.groupby(grouping_cols)

# Baseline XGBoost settings used when tuning is switched off.
# scale_pos_weight is intentionally absent: it is computed at each fold.
default_xgb_params = dict(
    max_depth=7,
    learning_rate=1e-2,
    verbosity=0,
    booster='gbtree',
    gamma=0,
    subsample=0.75,
    reg_lambda=10,
    enable_categorical=False,
    n_estimators=200,
    eval_metric=balanced_accuracy_score,
)

# Per-stage model inputs, keyed by stage number (second element of the groupby key).
#   extra_traits        -> column names used as-is
#   numeric_diff_traits -> 'result_diff_<trait>' columns
#   numeric_raw_traits  -> 'result_<trait>' columns
# PLHTN and ERHTN are left out of the diff traits because they are sparse.
# Text traits ('pltqt', 'notet') may be added at some point; none are used today.
stage_feature_config = {
    1: {
        'extra_traits': [],
        'numeric_diff_traits': ['YGSMN', 'LRTLR', 'STKLR'],
        'numeric_raw_traits': ['GMSTP', 'TWSMN', 'LRTLR', 'PLTQR', 'STD_N'],
    },
    2: {
        'extra_traits': [],
        'numeric_diff_traits': ['YGSMN', 'PLTQR', 'STKLR'],
        'numeric_raw_traits': ['GMSTP', 'TWSMN', 'HAVPN', 'LRTLR', 'PLTQR', 'STD_N'],
    },
    3: {
        'extra_traits': [],
        'numeric_diff_traits': ['YGSMN', 'LRTLR', 'PLTQR', 'STKLR', 'TWSMN'],
        'numeric_raw_traits': ['GMSTP', 'HAVPN', 'STD_P'],
    },
    4: {
        'extra_traits': [],
        'numeric_diff_traits': ['YGSMN', 'TWSMN', 'LRTLR', 'STKLR', 'HAVPN', 'PLTQR'],
        'numeric_raw_traits': ['GMSTP', 'STD_P'],
    },
}

# Output locations do not change between groups, so they are built once.
# The S3 key prefix is joined with '/' explicitly (os.path.join is OS-dependent).
local_fpath = '/root/dme_sagemaker/dme_sagemaker/performance_pipeline/preprocess_train_recipes'
s3_path = '/'.join(['uat/dme/performance/reformatting_performance_pipeline_temp_out/data', ap_data_sector])


def _evaluate_predictions(df_pred):
    """Score a prediction frame against the label column.

    The 'recommendation_score' column is thresholded at 0.5 for the hard
    prediction and passed unchanged as the probability.

    Returns whatever predictive_advancement_lib.get_evaluation_metrics returns;
    the loop below unpacks it as (acc, conf_mat, roc_auc, f1).
    """
    scores = df_pred['recommendation_score']
    return predictive_advancement_lib.get_evaluation_metrics(
        y_true=df_pred[mdl_out_col],
        y_pred=scores > 0.5,
        y_proba=scores,
    )


# Build one model set per unique combination of grouping_cols (currently sector + stage).
for index, df_use in df_tr_all_grouped:
    stage = index[1]
    stage_traits = stage_feature_config.get(stage)
    if stage_traits is None:
        # Without this guard an unconfigured stage would either raise NameError
        # (first group) or silently train on the previous stage's feature list.
        print("Skipping group", index, "- no feature configuration for stage", stage)
        continue

    # Build a fresh list each time so the configuration above is never mutated.
    mdl_in_cols = (
        list(stage_traits['extra_traits'])
        + ['result_diff_' + trait for trait in stage_traits['numeric_diff_traits']]
        + ['result_' + trait for trait in stage_traits['numeric_raw_traits']]
    )

    # No preprocessing steps or correlation traits are configured for any stage;
    # new empty lists are passed for every group.
    preproc_class = preprocessor_class.PredAdvPreprocessor(
        preprocess_steps=[],
        corr_traits=[],
        corr_score_col=[]
    )

    if do_hyperparam_tuning:
        opt = predictive_advancement_lib.perform_hyperparameter_tuning(
            x=df_use[mdl_in_cols].values,
            y=df_use[mdl_out_col].values,
            cv_fold_label=df_use['kfold_label'].values,
            year_vals=df_use[mdl_year_col].values,
            n_iter=20,
            n_points=2,
            n_jobs=2,
            verbose=1
        )
        xgb_params_use = opt.best_estimator_.get_params().copy()
    else:
        # Kept as an empty list so the saved meta info has the same shape as before.
        opt = []
        xgb_params_use = default_xgb_params.copy()

    mdl_list, preproc_class, df_tr_pred, df_te_pred = predictive_advancement_lib.train_models_kfolds(
        df_in=df_use,
        mdl_in_cols=mdl_in_cols,
        mdl_out_col=mdl_out_col,
        xgb_params=xgb_params_use,
        preproc_class=preproc_class
    )

    # Score each prediction frame once and reuse the result for printing and meta info.
    tr_metrics = _evaluate_predictions(df_tr_pred)
    te_metrics = _evaluate_predictions(df_te_pred)
    acc, conf_mat, roc_auc, f1 = te_metrics

    print("Train:", index, df_tr_pred.shape, tr_metrics)
    print("Test:", index, df_te_pred.shape, te_metrics)

    # Three files are saved per group; they share a postfix identifying
    # sector, year, stage and a datetime id generated for this group.
    file_id = performance_helper.generate_datetime_id()
    postfix = f"{ap_data_sector}-year{out_year}-stg{int(stage)}-{file_id}"

    mdl_preproc_fname = 'mdl_preprocessor-' + postfix + '.pkl'
    training_data_fname = 'training_data-' + postfix + '.csv'
    meta_info_fname = 'mdl_meta_info-' + postfix + '.pkl'

    # Models and the preprocessor are stored together.
    mdl_preproc_out = {
        'mdl_list': mdl_list,
        'preprocessor': preproc_class
    }

    # Meta info. The metric values come from the single evaluation of df_te_pred above.
    cv_metrics = {
        'mean_acc': acc,
        'mean_roc_auc': roc_auc,
        'mean_f1': f1,
        'sum_conf_mat': conf_mat
    }
    meta_out = {
        'opt': opt,
        'mdl_in_cols': mdl_in_cols,
        'mdl_out_cols': mdl_out_col,
        'did_hyper_tune': do_hyperparam_tuning,
        'xgb_params': xgb_params_use,
        'xgb_version': xgboost.__version__,
        'cv_metrics': cv_metrics
    }

    # NOTE(review): the frame written under the 'training_data-' name is df_te_pred
    # (the test predictions), not df_tr_pred — confirm this is intended.
    for obj_out, fname_out in (
        (df_te_pred, training_data_fname),
        (mdl_preproc_out, mdl_preproc_fname),
        (meta_out, meta_info_fname),
    ):
        performance_helper.write_to_s3(
            obj_out,
            fname_out,
            local_fpath,
            s3_path=s3_path,
            bucket=bucket
        )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tr_proc['recommendation_score'] = mdl.predict_proba(df_tr_proc[mdl_in_cols])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te_proc['recommendation_score'] = mdl.predict_proba(df_te_proc[mdl_in_cols])[:,1]


Train: ('CORN_LAS_SUMMER', 1.0) (273700, 112) (0.9141882601626581, array([[0.88581021, 0.11418979],
       [0.05743369, 0.94256631]]), 0.9653646639229866, 0.3971120553981744)
Test: ('CORN_LAS_SUMMER', 1.0) (68425, 112) (0.8751408633987177, array([[0.88326641, 0.11673359],
       [0.13298468, 0.86701532]]), 0.9423856483849122, 0.36629053894105584)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tr_proc['recommendation_score'] = mdl.predict_proba(df_tr_proc[mdl_in_cols])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te_proc['recommendation_score'] = mdl.predict_proba(df_te_proc[mdl_in_cols])[:,1]


Train: ('CORN_LAS_SUMMER', 2.0) (22268, 112) (0.9528841197394653, array([[0.90800038, 0.09199962],
       [0.00223214, 0.99776786]]), 0.9809481635920474, 0.5817787418655097)
Test: ('CORN_LAS_SUMMER', 2.0) (5567, 112) (0.814605977642443, array([[0.89409291, 0.10590709],
       [0.26488095, 0.73511905]]), 0.9060044970004824, 0.43447669305189096)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tr_proc['recommendation_score'] = mdl.predict_proba(df_tr_proc[mdl_in_cols])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te_proc['recommendation_score'] = mdl.predict_proba(df_te_proc[mdl_in_cols])[:,1]


Train: ('CORN_LAS_SUMMER', 3.0) (34628, 112) (0.9254887166053205, array([[0.87173663, 0.12826337],
       [0.02075919, 0.97924081]]), 0.9711434479313068, 0.6181784143030984)
Test: ('CORN_LAS_SUMMER', 3.0) (8657, 112) (0.8278405611365797, array([[0.85615562, 0.14384438],
       [0.2004745 , 0.7995255 ]]), 0.9139748560921617, 0.510412722453616)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_tr_proc['recommendation_score'] = mdl.predict_proba(df_tr_proc[mdl_in_cols])[:,1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_te_proc['recommendation_score'] = mdl.predict_proba(df_te_proc[mdl_in_cols])[:,1]


Train: ('CORN_LAS_SUMMER', 4.0) (15776, 112) (0.8943494186687648, array([[0.80261636, 0.19738364],
       [0.01391753, 0.98608247]]), 0.9546801560548754, 0.5811057108140949)
Test: ('CORN_LAS_SUMMER', 4.0) (3944, 112) (0.7410415977444169, array([[0.77074299, 0.22925701],
       [0.28865979, 0.71134021]]), 0.8265954941986093, 0.42513863216266173)




In [None]:
#### end training.py

# code below is extra, may be useful elsewhere

In [None]:
# Inert scratch code: everything below sits inside a bare string literal, so none
# of it executes. It holds three snippets: a scatter plot of hyperparameter-search
# iterations, SHAP summary plots, and per-stage validation metrics.
# NOTE(review): the snippets reference names that the cells shown here never define
# or import (performance_validation_lib, mdl_class, df_tr_proc, df_te_proc), so they
# will not run as-is if pasted into a cell.
"""
vars_to_plot = ['max_depth','n_estimators']
dims = []
for var in vars_to_plot:
    dims.append(np.argwhere(np.array(list(opt.best_params_.keys())) == var)[0][0])
                            
opt_res = opt.optimizer_results_[0]

x_vals = np.array(opt_res.x_iters)[:,dims[0]]
y_vals = np.array(opt_res.x_iters)[:,dims[1]]
f_vals = opt_res.func_vals

cmap_fun = plt.get_cmap('inferno')

color_vals = cmap_fun(0.75*(1-(f_vals-np.min(f_vals))/(np.max(f_vals)-np.min(f_vals))))

# hotter is better
plt.figure()
#plt.scatter(x_vals,y_vals,color=color_vals)
for i in range(x_vals.shape[0]):
    plt.plot(x_vals[i],y_vals[i],color=color_vals[i],marker='o',markersize=8)
plt.yscale('log')
plt.xlabel(vars_to_plot[0])
plt.ylabel(vars_to_plot[1])
###plt.colorbar()?

df_input_stats = performance_validation_lib.compute_input_statistics(
            df_in=df_tr_proc,
            in_cols=mdl_class.in_cols
        )

#shap code for new class method
# get and plot shapley values after training a model
do_shap = 1
if do_shap == 1:
    import shap
    x = df_te_proc[mdl_class.in_cols].values.astype(float)
    shap_input = shap.utils.sample(x, np.minimum(x.shape[0],1000))
    explainer = shap.Explainer(mdl_class.mdl.predict_proba, shap_input)
    shap_values = explainer(shap_input)
    shap_values = shap_values[:,:,1]

    shap.summary_plot(shap_values, feature_names=mdl_class.in_cols,\
                     plot_type='bar')

    shap.summary_plot(shap_values, feature_names=mdl_class.in_cols,\
                     plot_type='dot')

#for idx in range(len(mdl_class.in_cols)):
#    notnan_mask = np.isnan(shap_input[:,idx])==False
#    if np.sum(notnan_mask) > 10:
#        shap.plots.partial_dependence(idx,mdl_class.mdl.predict,shap_input[notnan_mask,:],ice=False,feature_names=mdl_class.in_cols)

# some useful shapley/interpolation related code
#plt.plot(shap_values[:,0].data, shap_values[:,0].values,'.')
#shap.plots.scatter(shap_values[:,5])

# make sure advancement columns are boolean
df_input = df_te_proc.copy()
df_input['recommendation_score'] = mdl_class.predict_proba(df_input)
if 'was_adv' in df_input.columns:
    df_input['was_adv'] = df_input['was_adv'].astype(bool)
if 'was_adv_next' in df_input.columns:
    df_input['was_adv_next'] = df_input['was_adv_next'].astype(bool)

# set trait column and yield column based on data sector
trait_prefix = 'result_' # this gets both result_ and result_diff cols
yield_col = 'result_diff_YGSMN'

# compute metrics for each stage
adv_metrics_all = []
trait_metrics_all = []

for stg in pd.unique(df_input['current_stage']):
    df_use = df_input[df_input['current_stage'] == stg]
    df_use['random_nums'] = np.random.rand(df_use.shape[0], 1)
    adv_metrics, trait_metrics = performance_validation_lib.compute_model_metrics(
        df_use,
        compute_advancement_metrics=True,
        yield_col=yield_col,
        trait_prefix=trait_prefix
    )
    adv_metrics['stage'] = stg
    trait_metrics['stage'] = stg

    adv_metrics_all.append(adv_metrics)
    trait_metrics_all.append(trait_metrics)

# join across stages
df_adv_metrics = pd.concat(adv_metrics_all,axis=0)
df_trait_metrics = pd.concat(trait_metrics_all,axis=0)
"""