In [2]:
# necessary for DenodoConnection
#%pip install psycopg2-binary

In [3]:
# import packages
import os
import sys
import pandas as pd
import numpy as np

# need to add dme_sagemaker to path to load in libraries
sys.path.append("/root/dme_sagemaker/dme_sagemaker")
sys.path

from libs.denodo.denodo_connection import DenodoConnection
from libs.performance_lib import performance_sql_recipes
from libs.performance_lib import performance_helper
from libs.performance_lib import performance_validation_functions

import boto3

ENVIRONMENT name from config_vars: uat
DME_PROJECT name from config_vars: None


In [4]:
bucket='us.com.syngenta.ap.nonprod' # Replace with your s3 bucket name

ap_data_sector = 'SOY_BRAZIL_SUMMER'
analysis_year = '2018'
analysis_type = 'SingleExp'

# STEP 0: define steps
get_trial_pheno = 1
get_pvs = 1
get_plot_result = 1 
get_rm = 1
get_historical_stage = 1
flag_to_write_to_s3 = 1


In [5]:
region = boto3.Session().region_name # not necessary
s3 = boto3.client('s3')

def write_df_to_s3(df, 
                fname, 
                ap_data_sector, 
                analysis_type,
                analysis_year,
                s3_path='uat/dme/performance/compute_pred_adv_data_collected/data', s3_bucket='us.com.syngenta.ap.nonprod'):
    
    # write locally
    local_fname = os.path.join('/root/dme_sagemaker/dme_sagemaker/performance_pipeline/data_ingestion_recipes',fname)
    s3_fname = os.path.join(s3_path, ap_data_sector, analysis_type, analysis_year, fname)
    
    df.to_csv(
        local_fname,
        index=False,
    )
    
    # upload to s3
    s3.upload_file(
        local_fname,
        Bucket=bucket,
        Key=s3_fname,
        ExtraArgs={
            'ServerSideEncryption':'aws:kms',
            'SSEKMSKeyId':'arn:aws:kms:us-east-1:809800841141:key/353d6263-d454-444f-ac60-41afe025b445'
        }
    )
    
    # delete local file
    os.remove(local_fname)
    
    



In [6]:
# STEP 1: Get checks across trials and traits
def compute_trial_checks(ap_data_sector,
                         analysis_year,
                         analysis_type,
                         pipeline_runid='',
                         write_output=0):
    # Compute recipe outputs
    with DenodoConnection() as dc:
        data_sector_config = dc.get_data("""
                            SELECT 
                                ap_data_sector_name,
                                spirit_crop_guid,
                                entry_id_source
                              FROM "managed"."rv_ap_data_sector_config"
                            WHERE "ap_data_sector_name" = {0}""".format("'" + ap_data_sector + "'"))

    checks_df = performance_sql_recipes.merge_trial_check_entries(ap_data_sector,
                                          analysis_year,
                                          analysis_type,
                                          data_sector_config["spirit_crop_guid"].iloc[0],
                                          data_sector_config["entry_id_source"].iloc[0])

    
    checks_df[['cpifl', 'cperf', 'cagrf', 'cmatf', 'cregf', 'crtnf']] = checks_df[
        ['cpifl', 'cperf', 'cagrf', 'cmatf', 'cregf', 'crtnf']].apply(
        lambda x: ((x / checks_df['result_count']) > 0.25).astype(int))

    checks_df = checks_df.drop(
        columns=['result_count', 'fp_ltb', 'mp_ltb', 'fp_het_pool', 'mp_het_pool', 'untested_entry_display'])

    if write_output == 1:
        # check to see if file path exists
        out_dir = '/opt/ml/processing/data/trial_checks/{}/{}/{}'.format(
            ap_data_sector, analysis_type, analysis_year)
        out_fname = 'trial_checks.parquet'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        checks_df.to_parquet(os.path.join(out_dir, out_fname))
        
    return checks_df



In [7]:
# STEP 2: get and process data from trial pheno
# process includes getting phenotypic observations relative to checks at the trial level, which is why we need to get trial-checks first
def load_and_process_trial_pheno(ap_data_sector, analysis_type, analysis_year, df_cpifl_grouped):
    # meta columns to output
    common_melt_cols = ['ap_data_sector', 'analysis_year', 'source_id', 'trial_id', 'entry_identifier',
                        'market_seg', 'decision_group_rm', 'et_value',
                        'material_type', 'trait']

    # make a list with entry_id instead of entry_identifier
    common_melt_cols_entry_id = common_melt_cols.copy()
    common_melt_cols_entry_id.remove('entry_identifier')
    common_melt_cols_entry_id.append('entry_id')

    # run code to get trial_pheno_data
    df_trial_pheno = \
        performance_sql_recipes.get_trial_pheno_data_reduced_columns(ap_data_sector, int(analysis_year))

    # remove columns like et_value, plot_barcode, etc.
    # this could be done in the sql query, which would be faster and less memory intensive.
    df_trial_pheno = df_trial_pheno[[
        'ap_data_sector',
        'analysis_year',
        'trial_id',
        'entry_id',
        'source_id',
        'market_segment',
        'trait',
        'maturity_group',
        'dme_chkfl',
        'irrigation',
        'et_value',
        'result_numeric_value'
    ]]

    # merge in check flags per trait
    df_trial_pheno = df_trial_pheno.merge(
        df_cpifl_grouped,
        on=['ap_data_sector', 'analysis_year', 'source_id', 'entry_id'],
        how='left'
    ).rename(columns={'market_segment': 'market_seg'})

    df_trial_pheno = performance_helper.get_chkfl(df_trial_pheno)

    df_trial_pheno[['cpifl', 'chkfl']] = df_trial_pheno[['cpifl', 'chkfl']].fillna(value=0)
    df_trial_pheno['material_type'] = df_trial_pheno['material_type'].fillna(value='entry')

    # get trait values relative to check and trial mean
    # if check mean does not exist, use trial mean
    trial_group_cols = ['ap_data_sector','analysis_year','trial_id','trait']
    score_col = 'result_numeric_value'
    keep_cols = trial_group_cols.copy()
    keep_cols.append(score_col)

    # check mean for each trait
    df_trial_pheno_checks_mean = df_trial_pheno[df_trial_pheno['chkfl']==1][keep_cols].groupby(
        by=trial_group_cols
    ).mean().reset_index() # chkfl is per trait
    df_trial_pheno_checks_mean = df_trial_pheno_checks_mean.rename(
        columns={'result_numeric_value': 'trial_mean_value'}
    )

    # trial mean for each trait
    df_trial_pheno_mean = df_trial_pheno[keep_cols].groupby(by=trial_group_cols).mean().reset_index()
    df_trial_pheno_mean = df_trial_pheno_mean.rename(columns={'result_numeric_value': 'trial_mean_value'})

    # choose check mean, then trial mean if check mean does not exist
    # to do this, stack two dataframes, groupby and choose the first value
    df_trial_pheno_mean = pd.concat((df_trial_pheno_checks_mean, df_trial_pheno_mean), axis=0)
    df_trial_pheno_mean = df_trial_pheno_mean.groupby(by=trial_group_cols).first().reset_index()

    # merge in check or trial mean, then subtract and drop trial mean
    df_trial_pheno = df_trial_pheno.merge(
        df_trial_pheno_mean, on=['ap_data_sector', 'analysis_year', 'trial_id', 'trait']            
    )

    df_trial_pheno['result_diff'] = df_trial_pheno['result_numeric_value'] - df_trial_pheno['trial_mean_value']
    df_trial_pheno = df_trial_pheno.drop(
        columns=['trial_mean_value']
    ).rename(
        columns={'maturity_group':'decision_group_rm'}
    )

    # there are nan's in the stage, decision_group_rm, technology cols
    # fill, do group by, replace with nan's
    df_trial_pheno['decision_group_rm'] = df_trial_pheno['decision_group_rm'].fillna(value=-123)

    # group by with aggregations listed below
    agg_dict = {'result_numeric_value': 'mean',
                'result_diff': 'mean',
                'cpifl': 'max',
                'chkfl': 'max'
                }

    df_trial_pheno_grouped = df_trial_pheno.groupby(
        by=common_melt_cols_entry_id
    ).agg(agg_dict).reset_index()

    # drop df_trial_pheno data as it is no longer useful, free up memory
    df_trial_pheno = pd.DataFrame()

    # with multiple aggregations, fix columns. Don't append agg type for columns with a single aggregation
    new_cols = []
    for col in df_trial_pheno_grouped.columns:
        if isinstance(col, list):
            if col[0] in agg_dict and isinstance(agg_dict[col[0]], list) and len(agg_dict[col[0]]) > 1:
                new_cols.append(col[0] + '_' + col[1])
            else:
                new_cols.append(col[0])
        else:
            new_cols.append(col)
    df_trial_pheno_grouped.columns = new_cols

    # rename result_diff_mean and result_numeric_value_mean
    df_trial_pheno_grouped = df_trial_pheno_grouped.rename(columns={'result_diff': 'result_diff',
                                                                    'result_numeric_value': 'result'})

    # replace temp nan-vals with nan's
    for col, nan_val in zip(['decision_group_rm'], [-123]):
        df_trial_pheno_grouped[col][df_trial_pheno_grouped[col] == nan_val] = np.nan

    # rename entry_id to entry_identifier to keep consistent with other tables
    df_trial_pheno_grouped = df_trial_pheno_grouped.rename(columns={'entry_id': 'entry_identifier'})

    # put in dummy market segment column...may need to change this later
    df_trial_pheno_grouped['market_seg'] = 'all'

    # melt df_trial_pheno_grouped into a tall format
    df_trial_pheno_melt = pd.melt(
        df_trial_pheno_grouped,
        id_vars=common_melt_cols,
        var_name='var',
        value_name='value',
        value_vars=['result','result_diff','chkfl']
    )

    # move trait into var name, then drop trait
    df_trial_pheno_melt['var'] = df_trial_pheno_melt['var'] + '_' + df_trial_pheno_melt['trait']
    df_trial_pheno_melt = df_trial_pheno_melt.drop(columns='trait')

    # get cpifl for each entry, not trait-specific
    common_melt_cols_no_trait = common_melt_cols.copy()
    common_melt_cols_no_trait.remove('trait')
    common_melt_cols_no_trait_cpifl = common_melt_cols_no_trait.copy()
    common_melt_cols_no_trait_cpifl.append('cpifl')

    df_trial_pheno_cpifl_melt = pd.melt(
        df_trial_pheno_grouped[common_melt_cols_no_trait_cpifl].groupby(
            by=common_melt_cols_no_trait
        ).max().reset_index(),
        id_vars=common_melt_cols_no_trait,
        var_name='var',
        value_name='value',
        value_vars=['cpifl']
    )

    df_trial_pheno_melt = pd.concat((df_trial_pheno_melt, df_trial_pheno_cpifl_melt), axis=0)

    return df_trial_pheno_melt

In [8]:
# STEP 3: load geno prediction data
def load_and_process_pvs_data(ap_data_sector, 
                              analysis_type,
                              analysis_year,
                              df_cpifl_grouped):
    # analysis year is a str
    # define common output columns
    common_melt_cols = ['ap_data_sector','analysis_year','entry_identifier','source_id',
                   'market_seg','stage','decision_group_rm','technology',
                  'material_type','trait']

    # load pvs_data directly, it's small for Soy. May need to iterate through for Corn?
    df_pvs_no_bebid = performance_sql_recipes.merge_pvs_input(
        ap_data_sector=ap_data_sector,
        analysis_year=analysis_year,
        analysis_type=analysis_type)
    
    # get mapper between line codes (pvs data) and be bids
    df_material_mapper = performance_sql_recipes.get_material_mapper()  # default crop guid is soy's

    # drop market seg from cpifl table if it is here. There are discrepancies between cpifl and pvs for market seg
    if "market_seg" in df_cpifl_grouped.columns:
        df_cpifl_grouped = df_cpifl_grouped.drop(columns='market_seg')

    # for Soy, pvs data may not use be_bid...mapper maps between linecode, highname, abbr_code and be_bid
    df_mapper_list = []
    for map_col in ['abbr_code', 'highname', 'line_code', 'be_bid']:
        if map_col == 'be_bid':
            df_map = df_material_mapper[['be_bid']].dropna().drop_duplicates()
            df_map['entry_identifier'] = df_map['be_bid']
        else:
            df_map = df_material_mapper[['be_bid', map_col]].dropna().drop_duplicates()
            df_map = df_map.rename(columns={map_col: 'entry_identifier'})
        df_mapper_list.append(df_map)

    df_mapper = pd.concat(df_mapper_list, axis=0).groupby(by=['entry_identifier']).first().reset_index()

    # get be bid from mapper, drop extra identifier column
    df_pvs_data = df_pvs_no_bebid.merge(df_mapper, how='inner',on=['entry_identifier'])
    df_pvs_data = df_pvs_data.drop(columns=['entry_identifier']).rename(columns={'be_bid': 'entry_identifier'}).drop_duplicates()

    # merge check information into pvs_data
    df_pvs_data_cpifl = df_pvs_data.merge(
        df_cpifl_grouped, 
        how='left',
        left_on=['ap_data_sector', 'analysis_type', 'analysis_year','entry_identifier', 'source_id', 'material_type'],
        right_on=['ap_data_sector', 'analysis_type', 'analysis_year','entry_id', 'source_id', 'material_type']
     )

    # make chkfl column, which compresses the cperf,cagrf etc. columns into one
    df_pvs_data_cpifl = performance_helper.get_chkfl(df_pvs_data_cpifl)
    df_pvs_data_cpifl = df_pvs_data_cpifl.drop(columns=['dme_chkfl', 'dme_reg_x', 'dme_reg_y', 'dme_rm_est', 'entry_id']) 

    # stack trial_pheno and pvs_data
    # drop some columns that are not common between pvs and trial_pheno, make columns that aren't common
    df_pvs_data_pre_melt = df_pvs_data_cpifl.drop(
        columns=['analysis_type'])
    df_pvs_data_pre_melt['irrigation'] = 'NA'

    # pivot to tall format (use melt function), then stack data.
    # do marker data after this as we can take an average over all of these numeric traits
    # can't take average over text traits
    df_pvs_data_melt = pd.melt(
        df_pvs_data_pre_melt,
        id_vars=common_melt_cols,
        var_name='var',
        value_vars=['count', 'prediction', 'stderr']
    )
    
    if df_pvs_data_melt.shape[0] > 0:
        # move trait into var name, then drop trait
        df_pvs_data_melt['var'] = df_pvs_data_melt['var'] + '_' + df_pvs_data_melt['trait']
        df_pvs_data_melt = df_pvs_data_melt.drop(columns='trait')

        # stage comes in as strings and integers. Convert to floats because some regions have decimal point stages
        df_pvs_data_melt['stage'] = df_pvs_data_melt['stage'].astype(float)
        
    return df_pvs_data_melt


In [9]:
# STEP 4: Get text/marker traits from the plot
def load_text_traits_from_plot_data(ap_data_sector, analysis_year):
    df_plot_result = performance_sql_recipes.get_plot_result_data(
        analysis_year=int(analysis_year),
        crop_name='soybean'
    )

    if df_plot_result.shape[0] > 0:
        # convert material id to bebid, get experiment id for df_plot_result
        df_plot_result = df_plot_result[df_plot_result['alpha_value'].notna()]
        df_plot_result = df_plot_result[['trait', 'year', 'entry_id', 'alpha_value']]
        df_plot_result = df_plot_result.groupby(by=['trait', 'year', 'entry_id']).first().reset_index().rename(
            columns={'year': 'analysis_year','entry_id': 'entry_identifier'}
        )

        df_plot_result = df_plot_result.rename(columns={'trait': 'var', 'alpha_value': 'alpha_value'})
        df_plot_result['var'] = df_plot_result['var'].str.lower()

    return df_plot_result

In [10]:
# STEP 5: get RM data across multiple tables
def load_rm_data_across_datasets(ap_data_sector, analysis_year):
    # get RM data for hybrids across 2 tables (rv_variety_trait_data, rv_material_trait_data)
    # For corn, get RM for parents by averaging over offspring RM
    # need rv_variety_entry_data to map material guids to be bids
    # run a number of SQL recipes to get initial datasets
    
    # get mapper between line codes (pvs data) and be bids
    df_material_mapper = performance_sql_recipes.get_material_mapper()  # default crop guid is soy's

    # get hybrid rm data from 3 tables, stack on top of each other
    df_material_trait_data = performance_sql_recipes.get_material_trait_data(
        ap_data_sector=ap_data_sector,
        analysis_year=analysis_year
    )
    
    df_hybrid_rm1 = performance_helper.get_hybrid_rm(
        df_material_trait_data,
        DKU_DST_ap_data_sector=ap_data_sector,
        DKU_DST_analysis_year=analysis_year
    )
    
    df_hybrid_rm1 = df_hybrid_rm1[['analysis_year', 'ap_data_sector_name', 'entry_id', 'number_value']].rename(
        columns={'ap_data_sector_name': 'ap_data_sector', 'number_value': 'rm_estimate'}
    ).drop_duplicates()

    # get rm from variety entry and variety trait tables, merge together to get correct identifier
    df_variety_entry_data = performance_sql_recipes.get_variety_entry_data(
        ap_data_sector=ap_data_sector,
        analysis_year=analysis_year
    )
    
    df_variety_trait_data = performance_sql_recipes.get_variety_trait_data(ap_data_sector=ap_data_sector)
    
    df_variety_entry_trait = df_variety_entry_data.merge(
        df_variety_trait_data,
        on=['genetic_affiliation_guid', 'crop_guid'],
        how='inner').drop_duplicates()

    df_hybrid_rm2 = performance_helper.get_hybrid_rm(
        df_variety_entry_trait,
        DKU_DST_ap_data_sector=ap_data_sector,
        DKU_DST_analysis_year=analysis_year
    )

    df_hybrid_rm2 = df_hybrid_rm2[['analysis_year', 'ap_data_sector_name', 'entry_id', 'number_value']].rename(
        columns={
            'ap_data_sector_name': 'ap_data_sector', 'number_value': 'rm_estimate'}
    ).drop_duplicates()

   
    ############ TO BE IMPLEMENTED
    # get rm from postgres table
    
    # concat RMs across source tables
    df_hybrid_rm = pd.concat((df_hybrid_rm1, df_hybrid_rm2), axis=0).drop_duplicates()

    if df_hybrid_rm.shape[0] > 0:
        df_hybrid_rm = df_hybrid_rm.groupby(
            by=['ap_data_sector', 'analysis_year', 'entry_id']).mean().reset_index()

    # merge RM data using a small version of df_merged because RM is not trait specific.
    if df_hybrid_rm.shape[0] > 0:
        df_rm_data = df_hybrid_rm.rename(columns={'entry_id': 'entry_identifier'})

        # put df_rm_data in a tall format so that its format matches all other files
        df_rm_melt = pd.melt(
            df_rm_data,
            id_vars=['ap_data_sector', 'analysis_year', 'entry_identifier'],
            var_name='var',
            value_vars=['rm_estimate']
        ).dropna(subset=['value'])
    else: # empty dataframe
        df_rm_melt = pd.DataFrame(columns=['ap_data_sector','analysis_year','entry_identifier','var','rm_estimate'])

    return df_rm_melt

In [11]:
# STEP 6: infer historical advancement decisions by looking at the stage materials were planted in each year
def stack_bebids(df_in):
    df_list = []
    for col in ['be_bid', 'fp_be_bid', 'mp_be_bid']:
        df_temp = df_in[['ap_data_sector_id', 'ap_data_sector_name', 'year', col, 'stage_lid']].rename(
            columns={col: 'be_bid'})
        df_temp = df_temp[(df_temp['be_bid'].notna()) & (df_temp['be_bid'] != '')]
        if col == 'be_bid':
            df_temp['material_type'] = 'entry'
        else:
            df_temp['material_type'] = 'parent'
        df_list.append(df_temp)

    df_stack = pd.concat(df_list, axis=0)
    return df_stack


def get_bebid_advancement_decisions(ap_data_sector, analysis_year, write_outputs=0):
    # get be_bid, and parental be_bids, plus stage tested from denodo
    df = performance_sql_recipes.get_material_by_trialstage_year_one_sector(ap_data_sector=ap_data_sector,min_year=int(analysis_year)-2, max_year=int(analysis_year)+3)

    # stack be_bid, fp_be_bid, mp_be_bid, create material_type column
    df_stack = stack_bebids(df)
    df_stack = df_stack.drop_duplicates()

    yrs = pd.unique(df_stack['year'])
    yrs = np.sort(yrs)

    # pivot dataframe by year
    df_stack_piv = df_stack.pivot_table(values=['stage_lid'],
                                        index=['ap_data_sector_id', 'ap_data_sector_name', 'be_bid',
                                               'material_type'],
                                        columns='year',
                                        aggfunc='max').reset_index()
    df_stack_piv.columns = ['_'.join((col_tuple[0], str(col_tuple[1]))).replace('value', '').strip('_') for
                            col_tuple in df_stack_piv.columns]
    # rename columns
    col_rename = {}
    stage_cols = []
    for yr in yrs:
        col_rename['stage_lid_' + str(yr)] = 'stage_' + str(yr)
        stage_cols.append('stage_' + str(yr))
    df_stack_piv = df_stack_piv.rename(columns=col_rename)
    # get stage max column
    df_stack_piv['stage_max'] = df_stack_piv[stage_cols].max(axis=1)

    # clean year-stage info
    for yr in yrs[1:]:
        # if current year is null
        # ->if year prior is not null
        curr_stage_col = 'stage_' + str(yr)
        prev_stage_col = 'stage_' + str(yr - 1)
        future_stage_cols = ['stage_' + str(yr_next) for yr_next in
                             range(yr + 1, np.minimum(yr + 5, yrs[-1]) + 1)]

        # ->if all of future years are null = 13
        # else = stage from prev year
        adj_mask = (df_stack_piv[curr_stage_col].isna()) & (df_stack_piv[prev_stage_col].notna())
        future_mask = np.all(df_stack_piv[future_stage_cols].isna(), axis=1)

        df_stack_piv[curr_stage_col][(adj_mask) & (future_mask)] = 13
        df_stack_piv[curr_stage_col][(adj_mask) & (future_mask == False)] = df_stack_piv[prev_stage_col][
            (adj_mask) & (future_mask == False)]

    # make stage achieved columns, fill with 0 to start
    stages = [1, 2, 3, 4, 5, 6, 7]

    for stage in stages:
        if stage < 7:
            stage_name = 'stage_' + str(stage)
        else:
            stage_name = 'stage_chk'
        df_stack_piv[stage_name] = np.any(df_stack_piv[stage_cols].values == stage, axis=1).astype(int)

    # write recipe outputs, set index as false so when this gets loaded there aren't two indices
    if write_outputs==1:
        out_dir = '/opt/ml/processing/data/bebid_advancement_decisions/{}'.format(ap_data_sector)
        out_fname = 'bebid_advancement_decisions.csv'
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        df_stack_piv.to_csv(os.path.join(out_dir, out_fname), index=False)

    return df_stack_piv


def load_bebid_stage(df_stage_piv, ap_data_sector, analysis_year):
    # merge in stage information from past and future years
    # setup variables for extracting appropriate stage
    stage_str_list = ['prev_stage', 'current_stage', 'next_stage', 'third_stage']
    yr_offset_list = [-1, 0, 1, 2]

    df_stage_list = []

    # rename columns and drop materials from other data sectors
    df_stage_piv = df_stage_piv.rename(
        columns={'ap_data_sector_name': 'ap_data_sector',
                 'material_type': 'material_type_simple',
                 'be_bid': 'entry_identifier'}
    )

    # rename stage columns for the previous, current, next and next-next year
    stages_to_get = []
    for i_yr in range(len(stage_str_list)):
        year_to_get = int(analysis_year) + yr_offset_list[i_yr]
        stage_str = 'stage_' + str(year_to_get)
        if stage_str in df_stage_piv.columns:
            df_stage_piv = df_stage_piv.rename(columns={stage_str: stage_str_list[i_yr]})
            stages_to_get.append(stage_str_list[i_yr])

    # only keep meta columns and columns corresponding to years around analysis_year
    cols_to_keep = ['ap_data_sector', 'material_type_simple', 'entry_identifier']
    if len(stages_to_get) > 0:
        cols_to_keep.extend(stages_to_get)
    df_stage_piv = df_stage_piv[cols_to_keep]

    # drop materials with a 13 in current_stage
    df_stage_piv = df_stage_piv[df_stage_piv['current_stage'] < 13]

    # create analysis year columns
    df_stage_piv['analysis_year'] = int(analysis_year)
    
    # make empty stage columns in df_merged if they don't exist
    # for the current year, we may not see next and third year data
    for stage_str in stage_str_list:
        if stage_str not in df_stage_piv.columns:
            df_stage_piv[stage_str] = 13

    # get advancement decisions by comparing stage information across years
    df_stage_piv['was_adv'] = performance_validation_functions.checkAdvancement(
        df_stage_piv,
        current_stage='current_stage',
        next_stage='next_stage'
    )

    df_stage_piv['was_adv_next'] = (df_stage_piv['was_adv']) & performance_validation_functions.checkAdvancement(
        df_stage_piv,
        current_stage='next_stage',
        next_stage='third_stage'
    )
    
    # make sure these variables have type bool
    df_stage_piv['was_adv'] = df_stage_piv['was_adv'].astype(bool)
    df_stage_piv['was_adv_next'] = df_stage_piv['was_adv_next'].astype(bool)
            
    # melt data
    df_adv_dec_melt = pd.melt(
        df_stage_piv,
        id_vars=['ap_data_sector','material_type_simple','entry_identifier'],
        var_name='var',
        value_vars=['prev_stage', 'current_stage', 'next_stage', 'third_stage', 'was_adv','was_adv_next']
    ).dropna(subset=['value'])
        
    return df_adv_dec_melt

In [12]:
# STEP 1: Get checks across trials and traits. Data from trial pheno and geno prediction currently requires trial checks.
# no other step requires this information
print("getting checks")
df_checks = compute_trial_checks(ap_data_sector=ap_data_sector, analysis_year=analysis_year, analysis_type=analysis_type)

# aggregate check information up to the material per source id level.
cpifl_group_cols = [
    'ap_data_sector', 'analysis_type', 'analysis_year',
    'source_id','entry_id', 'material_type'
]

cpifl_info_cols = ['cpifl', 'cperf', 'cagrf', 'cmatf', 'cregf', 'crtnf']
cpifl_keep_cols = cpifl_group_cols.copy()
cpifl_keep_cols.extend(cpifl_info_cols)
df_cpifl_grouped = df_checks[cpifl_keep_cols].groupby(by=cpifl_group_cols).max().reset_index()

getting checks


In [13]:
# STEP 2: get and process data from trial pheno
if get_trial_pheno == 1 or 1==1:
    print("getting trial pheno")
    df_trial_pheno_melt = load_and_process_trial_pheno(
        ap_data_sector=ap_data_sector,
        analysis_type=analysis_type,
        analysis_year=analysis_year,
        df_cpifl_grouped=df_cpifl_grouped
    )
    if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_trial_pheno_melt,
            fname='new_trial_pheno_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )
    
    

getting trial pheno


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['chkfl'][df_out['dme_chkfl']==check_name] = df_out[check_name][df_out['dme_chkfl']==check_name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['chkfl'][df_out['dme_chkfl']==check_name] = df_out[check_name][df_out['dme_chkfl']==check_name]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out['chkfl'][df_out['dme_chkfl']==check_name] = df_out[check_name][df_out['dme_chkfl']==check_name]
A value is trying to be set on a copy of a slice from a DataFrame



In [14]:
# STEP 3: load geno prediction data
if get_pvs == 1:
    print("getting pvs")
    df_pvs_data_melt = load_and_process_pvs_data(
        ap_data_sector=ap_data_sector, 
        analysis_type=analysis_type,
        analysis_year=analysis_year,
        df_cpifl_grouped=df_cpifl_grouped
    )
    if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_pvs_data_melt,
            fname='new_pvs_data_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )

# STEP 4: get text/marker traits from plot data
if get_plot_result == 1:
    print("getting plot result")
    df_plot_result = load_text_traits_from_plot_data(ap_data_sector=ap_data_sector, analysis_year=analysis_year)
    if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_plot_result,
            fname='new_plot_result_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )

getting pvs




getting plot result


In [15]:
if get_rm  == 1:
    print("getting RM")
    df_rm = load_rm_data_across_datasets(ap_data_sector=ap_data_sector, analysis_year=analysis_year)
    
    if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_rm,
            fname='new_RM_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )

getting RM


In [16]:
# STEP 6: load historical advancement decisions
if get_historical_stage:
    print("getting historical stages")
    df_stage_piv = get_bebid_advancement_decisions(ap_data_sector=ap_data_sector, analysis_year=analysis_year, write_outputs=0)
    df_adv_dec_melt = load_bebid_stage(df_stage_piv=df_stage_piv, ap_data_sector=ap_data_sector, analysis_year=analysis_year)
    
    if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_adv_dec_melt,
            fname='new_decisions_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )

    

getting historical stages
starting denodo connection
getting data
received data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [17]:
# STEP 7: Load decision groups if applicable. 
df_decision_groups = performance_sql_recipes.get_decision_groups(
    ap_data_sector,
    analysis_year
)

if flag_to_write_to_s3 == 1:
        # save files to s3
        write_df_to_s3(
            df_decision_groups,
            fname='new_decision_groups_tall.csv', 
            ap_data_sector=ap_data_sector,
            analysis_type=analysis_type,
            analysis_year=analysis_year
        )