In [2]:
# necessary for DenodoConnection
#%pip install psycopg2-binary
#%pip install shap
#%pip install scikit-optimize

# generate train and validation sets
# during training, make K models via k-folds cross validation
# during evaluation, evaluate on validation set (completely left out).

# stages 1, 2, 3, 4 for Corn LAS Summer, advancements made on hybrids
# do we combine stages to make 1 model?

In [1]:
# import packages
import os
import sys

import pandas as pd, numpy as np
import argparse

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")

from libs.performance_lib import predictive_advancement_lib
from libs.performance_lib import performance_helper

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
DKU_DST_ap_data_sector = 'CORN_LAS_SUMMER'
DKU_DST_analysis_type = 'SingleExp'
input_years = ['2018','2019','2020','2021','2022']
bucket = 'us.com.syngenta.ap.nonprod'

# convert each entry in input years to an integer since they come in as str.
if isinstance(input_years, list):
    input_years_as_int = [int(val) for val in input_years]
else:
    input_years_as_int = [int(input_years)]

In [3]:
# when testing, we read directly from S3, bucket is provided as a separate input to function, don't include in input args. 
input_args = [
    '--s3_input_pred_adv_data_folder', 'uat/dme/performance/compute_pred_adv_data_collected/data/'
]

parser = argparse.ArgumentParser(description='app inputs and outputs')
parser.add_argument('--s3_input_pred_adv_data_folder', type=str,
                    help='s3 input pred_adv_data folder', required=True)
args = parser.parse_args(input_args)

In [5]:
potential_fnames = predictive_advancement_lib.load_potential_fnames(prefix='new_')

df_input_piv = predictive_advancement_lib.load_and_preprocess_all_inputs_ml(
    args,
    DKU_DST_ap_data_sector,
    DKU_DST_analysis_type,
    potential_fnames,
    years_to_load=input_years,
    read_from_s3=1,
    bucket=bucket
)

  # Remove the CWD from sys.path while we load stuff.


(19511, 63)
(44517, 9)
(18447, 63)
(59638, 9)
(25981, 63)
(77164, 9)
(23609, 63)
(22166, 38)
(61901, 9)
(25849, 63)
(22313, 38)
(29, 4)
(42049, 9)


In [6]:
df_input_piv.shape

(113397, 109)

In [7]:
df_input_piv[df_input_piv['material_type_simple'] == 'entry'][['entry_identifier','current_stage','analysis_year']].groupby(by=['current_stage','analysis_year']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,entry_identifier
current_stage,analysis_year,Unnamed: 2_level_1
1.0,2018,14989
1.0,2019,13693
1.0,2020,19863
1.0,2021,17803
1.0,2022,19185
2.0,2018,444
2.0,2019,1777
2.0,2020,2176
2.0,2021,920
2.0,2022,1644


In [8]:
for col in df_input_piv.columns:
    print(col, df_input_piv[col].count())

ap_data_sector 113397
analysis_year 113397
entry_identifier 113397
decision_group_rm 113397
material_type_simple 113397
chkfl_ERHTN 2373
chkfl_ERTLP 1201
chkfl_GMSTP 110918
chkfl_GRSNP 2057
chkfl_GWTPN 101361
chkfl_HAVPN 101114
chkfl_LRTLP 994
chkfl_LRTLR 101142
chkfl_PLHTN 2706
chkfl_PLTAR 4045
chkfl_PLTQR 103641
chkfl_STD_N 101799
chkfl_STD_P 101310
chkfl_STKLP 2435
chkfl_STKLR 100737
chkfl_TWSMN 100387
chkfl_YGHMN 101282
chkfl_YGMRN 101301
chkfl_YGSMN 110873
cpifl 102275
result_ERHTN 2373
result_ERTLP 1201
result_GMSTP 110918
result_GRSNP 2057
result_GWTPN 101361
result_HAVPN 101114
result_LRTLP 994
result_LRTLR 101142
result_PLHTN 2706
result_PLTAR 4045
result_PLTQR 103641
result_STD_N 101799
result_STD_P 101310
result_STKLP 2435
result_STKLR 100737
result_TWSMN 100387
result_YGHMN 101282
result_YGMRN 101301
result_YGSMN 110873
result_diff_ERHTN 2373
result_diff_ERTLP 1201
result_diff_GMSTP 110918
result_diff_GRSNP 2057
result_diff_GWTPN 101361
result_diff_HAVPN 101114
result_diff_

In [9]:
df_input_piv[['was_adv','current_stage','analysis_year']].groupby(by=['current_stage','analysis_year']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,was_adv
current_stage,analysis_year,Unnamed: 2_level_1
1.0,2018,493.0
1.0,2019,1387.0
1.0,2020,1070.0
1.0,2021,223.0
1.0,2022,126.0
2.0,2018,40.0
2.0,2019,160.0
2.0,2020,150.0
2.0,2021,27.0
2.0,2022,55.0


In [10]:
#### FILTER TO DESIRED DATA for training
# for soy brazil, focus only on stage 3 and 4.
df_piv = df_input_piv[(df_input_piv['material_type_simple'] == 'entry') &
                      ((df_input_piv['current_stage']>=1) & (df_input_piv['current_stage'] <= 4))]


if 'decision_group' not in df_piv.columns:
    df_piv['decision_group'] = 'na'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
# get columns used for training/evaluation + meta columns
stratify_cols = ['analysis_year','current_stage']

# split into training and validation sets. Use stratified sampling.
df_tr_all, df_val_all = predictive_advancement_lib.stratified_train_test_split(df_piv, stratify_cols)

# provide label for k-fold training in training set. Don't need to split data, just label it once. Do this in a stratified manner as well 
df_tr_all = predictive_advancement_lib.stratified_kfolds(df_tr_all, stratify_cols, n_folds=5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in_val['kfold_label'] = kfold_label


In [12]:
# save training and validation sets. <- this will change in the pipeline to some other saving methods...
s3_path=os.path.join('uat/dme/performance/reformatting_performance_pipeline_temp_out/data',DKU_DST_ap_data_sector)
local_fpath = '/root/dme_sagemaker/dme_sagemaker/performance_pipeline/preprocess_train_recipes/'
performance_helper.write_to_s3(
    obj=df_tr_all,
    fname='adv_model_training_data.csv', 
    local_fpath=local_fpath,
    s3_path=s3_path
)

performance_helper.write_to_s3(
    df_val_all,
    fname='adv_model_validation_data.csv', 
    local_fpath=local_fpath,
    s3_path=s3_path
)
### end preprocessing_training.py