In [1]:
# necessary for DenodoConnection
#%pip install psycopg2-binary
#%pip install shap

# generate train and validation sets
# during training, make K models via k-folds cross validation
# during evaluation, evaluate on validation set (completely left out).

# stages 3 and 4 for Soy Brazil Summer
# do we combine stages to make 1 model?

In [2]:
# import packages
import os
import sys

import pandas as pd, numpy as np
import argparse

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")

from libs.performance_lib import predictive_advancement_lib
from libs.performance_lib import performance_helper

import matplotlib.pyplot as plt
%matplotlib inline


In [3]:
# Run configuration: data sector, analysis type, the analysis years to load
# and the S3 bucket to read from.
DKU_DST_ap_data_sector = 'SOY_BRAZIL_SUMMER'
DKU_DST_analysis_type = 'SingleExp'
input_years = ['2018', '2019', '2020', '2021']
bucket = 'us.com.syngenta.ap.nonprod'

# Years arrive as strings. Build an integer version; a single (non-list) value
# is wrapped first so the result is always a list of ints.
input_years_as_int = list(
    map(int, input_years if isinstance(input_years, list) else [input_years])
)

In [4]:
# Build the same CLI parser the pipeline step uses, then feed it a fixed
# argument list instead of sys.argv.
parser = argparse.ArgumentParser(description='app inputs and outputs')
parser.add_argument(
    '--s3_input_pred_adv_data_folder',
    type=str,
    required=True,
    help='s3 input pred_adv_data folder',
)

# When testing we read directly from S3. The bucket is handed to the loader as
# a separate argument, so only the folder goes into the parsed args.
input_args = [
    '--s3_input_pred_adv_data_folder',
    'uat/dme/performance/compute_pred_adv_data_collected/data/',
]
args = parser.parse_args(input_args)

In [5]:
# File names the loader should look for.
# NOTE(review): the meaning of prefix='new_' is defined in
# predictive_advancement_lib -- confirm there.
potential_fnames = predictive_advancement_lib.load_potential_fnames(prefix='new_')

# Keyword options for the loader: the years to load (still as strings), and a
# flag plus bucket so the data is read from S3.
loader_options = dict(
    years_to_load=input_years,
    read_from_s3=1,
    bucket=bucket,
)

# Load and preprocess every input for the configured sector / analysis type.
df_input_piv = predictive_advancement_lib.load_and_preprocess_all_inputs_ml(
    args,
    DKU_DST_ap_data_sector,
    DKU_DST_analysis_type,
    potential_fnames,
    **loader_options
)

  # Remove the CWD from sys.path while we load stuff.


(16839, 48)
(59292, 22)
(39831, 9)
(25763, 51)
(115212, 24)
(135214, 9)
(27584, 54)
(3022, 26)
(99076, 25)
(203857, 9)
(27815, 54)
(16539, 26)
(125592, 25)
(42, 4)
(332477, 9)


In [6]:
#### FILTER TO DESIRED DATA for training
# For soy brazil, focus only on stage 3 and 4.

# Flag rows via process_harvt on the 'harvt' column; rows where the flag is
# False are the ones kept below.
df_input_piv['harvt_drop'] = df_input_piv['harvt'].apply(predictive_advancement_lib.process_harvt)

# Keep only 'entry' materials in stages 3-4 (inclusive) that are not flagged.
# `== False` is kept deliberately (rather than `~`) so rows whose flag is not a
# real boolean are treated exactly as before.
keep_rows = (
    (df_input_piv['material_type'] == 'entry')
    & (df_input_piv['current_stage'] >= 3)
    & (df_input_piv['current_stage'] <= 4)
    & (df_input_piv['harvt_drop'] == False)
)

# Take an explicit copy: df_piv gets a new column assigned just below, and
# assigning into a boolean-mask slice of df_input_piv raises pandas'
# SettingWithCopyWarning and leaves it ambiguous which frame is modified.
df_piv = df_input_piv.loc[keep_rows].copy()

# Make sure a 'decision_group' column exists, filling a placeholder if absent.
if 'decision_group' not in df_piv.columns:
    df_piv['decision_group'] = 'na'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [8]:
# Columns that define the strata for both the train/validation split and the
# k-fold labelling.
stratify_cols = ['analysis_year', 'current_stage']

# Stratified split into a training set and a held-out validation set.
df_tr_all, df_val_all = predictive_advancement_lib.stratified_train_test_split(
    df_piv,
    stratify_cols,
)

# Label the training rows for 5-fold cross validation, again stratified. The
# data is not split further here, only labelled once.
df_tr_all = predictive_advancement_lib.stratified_kfolds(
    df_tr_all,
    stratify_cols,
    n_folds=5,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_in_val['kfold_label'] = kfold_label


In [9]:
# Save training and validation sets. <- this will change in the pipeline to some other saving methods...
# The destination folder is derived from the configured data sector instead of
# repeating the sector name as a literal, so changing the config cell cannot
# leave this cell writing to the previous sector's folder.
s3_path = f'uat/dme/performance/reformatting_performance_pipeline_temp_out/data/{DKU_DST_ap_data_sector}'
local_fpath = '/root/dme_sagemaker/dme_sagemaker/performance_pipeline/preprocess_train_recipes/'

# Write both frames through the same call with the same keyword arguments; only
# the frame and its file name differ.
for df_out, fname in (
    (df_tr_all, 'adv_model_training_data.csv'),
    (df_val_all, 'adv_model_validation_data.csv'),
):
    performance_helper.write_to_s3(
        obj=df_out,
        fname=fname,
        local_fpath=local_fpath,
        s3_path=s3_path
    )
### end preprocessing_training.py