In [2]:
#%pip install shap

# stages 3 and 4 for Soy Brazil Summer
# do we combine stages to make 1 model?

In [2]:
# import packages
import os
import sys

import pandas as pd, numpy as np
import argparse

import pickle
import json
import xgboost

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")

from libs.performance_lib import predictive_advancement_lib
from libs.performance_lib import performance_validation_lib

import boto3

Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)


In [3]:
ap_data_sector = 'SOY_BRAZIL_SUMMER'
out_year=2023
bucket = 'us.com.syngenta.ap.nonprod'
region = boto3.Session().region_name # not necessary
s3 = boto3.client('s3')

In [4]:
# when testing, we read directly from S3, bucket is provided as a separate input to function, don't include in input args. 
input_args = [
    '--s3_input_data_folder', 'uat/dme/performance/reformatting_performance_pipeline_temp_out/data'
]

parser = argparse.ArgumentParser(description='app inputs and outputs')
parser.add_argument('--s3_input_data_folder', type=str,
                    help='s3_input_data_folder', required=True)
input_args_parsed = parser.parse_args(input_args)

In [5]:
df_val_all = pd.read_csv('s3://' + os.path.join(bucket, input_args_parsed.s3_input_data_folder, ap_data_sector, 'adv_model_validation_data.csv'))

Columns (42,43,44,45,46,47,48,49,50,52,54,56,58,59,60,61,75,101) have mixed types.Specify dtype option on import or set low_memory=False.


In [7]:
def get_fname_s3(bucket, s3_fpath, ap_data_sector, year, stage):
    # get filenames in bucket/folder
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)
    
    fnames = []
    for object_summary in my_bucket.objects.filter(Prefix=os.path.join(s3_fpath)):
        fname = object_summary.key.split('/')[-1]
        if 'mdl_preprocessor' in fname:
            fnames.append(fname)
    
    # get file for this year, sector, stage
    fname_date, fname_ts = 0, 0
    mdl_fname, preproc_fname = '', ''
    # get model information
    for fname in fnames:
        if ap_data_sector in fname and 'stg' + str(stage) in fname and 'year'+ str(year) in fname:
            f_split = fname.split('-')
            if int(f_split[-2]) >= fname_date and int(f_split[-1][:-4]) >= fname_ts: # [:-4] removes .pkl
                fname_date = int(f_split[-2])
                fname_ts = int(f_split[-1][:-4])
                mdl_fname = fname
                
    # use model information to get param information
    if mdl_fname == '':
        meta_fname = ''
        mdl_preproc_dict = {}
        meta_info = {}
    else:
        meta_fname = 'mdl_meta_info-' + '-'.join(mdl_fname.split('-')[1:])
        
    return mdl_fname, meta_fname

def load_model_from_s3(bucket, s3_fpath, mdl_fname, meta_fname):
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket)
    # read in model
    mdl_preproc_dict = pickle.loads(s3.Bucket(bucket).Object(os.path.join(s3_fpath, mdl_fname)).get()['Body'].read())
    meta_info = pickle.loads(s3.Bucket(bucket).Object(os.path.join(s3_fpath, meta_fname)).get()['Body'].read())
    
    return mdl_preproc_dict, meta_info

In [8]:
def predict_proba_list(mdl_list, x_te):
    y_proba = np.zeros((x_te.shape[0],))
    
    for i in range(len(mdl_list)):
        y_proba = y_proba + mdl_list[i].predict_proba(x_te)[:,1]
        
    return y_proba/len(mdl_list)

In [10]:
grouping_cols = ['ap_data_sector','current_stage']
df_val_all_grouped = df_val_all.groupby(by=grouping_cols)

for index, df_use in df_val_all_grouped:  # build model per unique combination of grouping_cols; stage currently
    # load in model and preprocessor
    mdl_fname, meta_fname = get_fname_s3(
        bucket,
        os.path.join('uat/dme/performance/reformatting_performance_pipeline_temp_out/data',ap_data_sector),
        ap_data_sector,
        year=2023,
        stage=int(index[1])
    )
    
    mdl_preproc_dict, meta_info = load_model_from_s3(
        bucket,
        os.path.join('uat/dme/performance/reformatting_performance_pipeline_temp_out/data',ap_data_sector),
        mdl_fname,
        meta_fname
    )
    
    df_use_proc = mdl_preproc_dict['preprocessor'].run_preprocessing(df_use)
    y_proba = predict_proba_list(mdl_preproc_dict['mdl_list'], df_use_proc[meta_info['mdl_in_cols']]) 
    
    # make sure advancement columns are boolean
    df_use_proc['recommendation_score'] = y_proba

    # set trait column and yield column based on data sector
    trait_prefix = 'result_' # this gets both result_ and result_diff cols
    yield_col = 'result_diff_YGSMN'

    df_use_proc['random_nums'] = np.random.rand(df_use_proc.shape[0], 1)
    adv_metrics, trait_metrics = performance_validation_lib.compute_model_metrics(
        df_use_proc,
        compute_advancement_metrics=False,
        yield_col=yield_col,
        trait_prefix=trait_prefix
    )

All-NaN slice encountered
Mean of empty slice
Mean of empty slice
Mean of empty slice
Degrees of freedom <= 0 for slice.
All-NaN slice encountered


In [19]:
    # save trait metrics, save output.... This is not implemented here because it is unclear where to put the data for the pipeline.
    postfix = ap_data_sector + 'year-' + str(int(out_year)) + '-stg' + str(int(index[1])) + '-' + '-'.join(mdl_fname.split('-')[-2:])[:-4]
    write_to_s3(trait_metrics, 
                'trait_metrics-' + postfix + '.csv', 
                s3_path=os.path.join('uat/dme/performance/reformatting_performance_pipeline_temp_out/data',ap_data_sector), 
                s3_bucket='us.com.syngenta.ap.nonprod'
              )
    

Validation: ('SOY_BRAZIL_SUMMER', 3.0) (16709, 108) (0.8512861214067591, array([[0.82252474, 0.17747526],
       [0.11995249, 0.88004751]]), 0.9254757517469667, 0.3368947488065469)


All-NaN slice encountered
Mean of empty slice
Mean of empty slice
Mean of empty slice
Degrees of freedom <= 0 for slice.


Validation: ('SOY_BRAZIL_SUMMER', 4.0) (1699, 108) (0.8845135703363914, array([[0.81590214, 0.18409786],
       [0.046875  , 0.953125  ]]), 0.9497993119266055, 0.2863849765258216)


All-NaN slice encountered
All-NaN slice encountered
Mean of empty slice
Mean of empty slice
Mean of empty slice
Degrees of freedom <= 0 for slice.


In [11]:
#### end evaluation.py

# code below is extra, may be useful elsewhere