In [3]:
%%local

# Install the packages needed by the local kernel. No versions are pinned here,
# so a fresh run may resolve different releases than a previous one.
%pip install boto3 botocore psycopg2_binary tenacity watchtower statsmodels symbulate seaborn sagemaker_pyspark sagemaker pyspark
# Bring the EMR notebook magics up to the newest available release.
%pip install emr-notebooks-magics --upgrade

import sys
import shutil
import os
from ipykernel import get_connection_file
import boto3
import emr_notebooks_magics
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import sagemaker
from sagemaker import get_execution_role
import sagemaker_pyspark
import sparkmagic
from libs.denodo.denodo_connection import DenodoConnection
from libs.dme_sql_queries import *
from libs.dme_pyspark_sql_queries import *
from libs.metric_utils import *
#from dme_core_pipeline.compute_trial_checks.src.processing import main as compute_trial_checks
#from libs.regression_utils import reg_adjust_parallel_rm_pyspark

# connection_file = get_connection_file()
# notebook_path = os.path.dirname(connection_file)
# print(notebook_path)

In [2]:
# Load the SageMaker Studio analytics magics and attach this notebook to the EMR cluster.
# NOTE(review): the cluster id is hard-coded and certificate verification is switched off
# (--verify-certificate False) with --auth-type None; confirm this is acceptable outside of dev.
%load_ext sagemaker_studio_analytics_extension.magics
%sm_analytics emr connect --verify-certificate False --cluster-id j-25CXG872XVASP --auth-type None --language python  

In [None]:
# %reload_ext sagemaker_studio_analytics_extension.magics
# %reload_ext sparkmagic.magics

In [None]:
Configure the cluster to use a Python 3 virtualenv so that pip packages can be installed on it.

In [4]:
%%configure -f
{
    "conf": {
        "spark.pyspark.python": "python3",
        "spark.pyspark.virtualenv.enabled": "true",
        "spark.pyspark.virtualenv.type": "native",
        "spark.pyspark.virtualenv.bin.path": "/usr/bin/virtualenv"
    }
}

In [None]:
# %%cleanup -f

In [None]:
%%info

In [6]:
### Install pip packages on spark cluster and confirm their versions, then import

# Each package is installed through the SparkContext, one call per package, in this order.
for pypi_name in ("pyarrow", "pandas", "scipy", "boto3", "botocore", "statsmodels"):
    sc.install_pypi_package(pypi_name)
#sc.install_pypi_package("emr-notebooks-magics")

# Show what is now installed so the versions can be checked by eye.
sc.list_packages()

# Cluster-side imports (the commented-out ones are currently not used on the cluster).
import sys
import shutil
import os
#from ipykernel import get_connection_file
import boto3
#import pandas as pd
#import numpy as np
#import scipy.stats as ss

In [None]:
from pyspark import SparkFiles

# Look up the path of denodo_connection.py via SparkFiles, then register it as a Python
# dependency of the SparkContext.
# NOTE(review): SparkFiles.get is documented for files shipped with sc.addFile; no addFile
# call is visible in this notebook — confirm the file is actually distributed beforehand.
denodo_module_path = SparkFiles.get("denodo_connection.py")
sc.addPyFile(denodo_module_path)

In [None]:
%%local

# What to analyse: sector, year and analysis type.
analysis_type = "SingleExp"
analysis_year = 2023
ap_data_sector = "CORN_NA_SUMMER"

# Which pipeline run to target. force_refresh is deliberately kept as the string "True",
# not a bool, exactly as it is handed to the helpers below.
force_refresh = "True"
target_pipeline_runid = "20231229_21_12_00"

#role = get_execution_role()

# Configure Spark to use the SageMaker Spark dependency jars
#jars = sagemaker_pyspark.classpath_jars()

#classpath = ":".join(sagemaker_pyspark.classpath_jars())

# See the SageMaker Spark Github repo under sagemaker-pyspark-sdk
# to learn how to connect to a remote EMR cluster running Spark from a Notebook Instance.
#spark = SparkSession.builder.config("spark.driver.extraClassPath", classpath)\
#   .master("local[*]").getOrCreate()



In [None]:
%%local
#init
# get_source_ids is not imported by name in this notebook; presumably it comes from one of
# the `libs.*` star imports — confirm. Its result is passed on to merge_breakout_config.
current_source_ids = get_source_ids(ap_data_sector, analysis_year, analysis_type, target_pipeline_runid, force_refresh)
# Echo the value so the selected source ids are visible in the cell output.
print(current_source_ids)

In [None]:
%%local
#breakouts
# Load the breakout configuration from the repository copy of the CSV. The commented-out
# path below is the alternative location under /opt/ml/processing.
#breakout_config_df = pd.read_csv('/opt/ml/processing/input/code/data/breakout_config.csv')
breakout_config_df = pd.read_csv('dme_core_pipeline/data/breakout_config.csv')
# Combine the raw config with the run parameters and the source ids resolved earlier.
breakout_config_df = merge_breakout_config(breakout_config_df,
                                           ap_data_sector,
                                           analysis_year,
                                           analysis_type,
                                           current_source_ids)

# Record which configs are to be used
#print(breakout_config_df.drop(['ece_data_sector_name'], axis=1).drop_duplicates())
# Build the breakout data from the merged config (merge_breakout_data is an external helper).
breakout_data_df = merge_breakout_data(breakout_config_df)

In [None]:
%%local

# Sector-level configuration; only the first row's crop GUID and entry-id source are used.
data_sector_config = get_data_sector_config(ap_data_sector)

checks_df = merge_trial_check_entries(
    ap_data_sector,
    analysis_year,
    analysis_type,
    data_sector_config["spirit_crop_guid"].iloc[0],
    data_sector_config["entry_id_source"].iloc[0],
)

checks_df = create_check_df(ap_data_sector, analysis_type, checks_df)
tops_checks_df = get_tops_checks(ap_data_sector, analysis_year, analysis_type)

# Stack both check sources; duplicates are collapsed by the groupby below.
checks_df = pd.concat([checks_df, tops_checks_df])

# Column groups are named once and reused for both the aggregation and the final column
# order, so adding a key or a flag cannot silently shift the layout (the previous
# positional slicing of checks_df.columns could).
check_key_cols = ['ap_data_sector', 'analysis_year', 'analysis_type', 'source_id', 'entry_id', 'material_type']
check_flag_cols = ['cpifl', 'cperf', 'cagrf', 'cmatf', 'cregf', 'crtnf']
check_parent_cols = ['par1_entry_id', 'par2_entry_id']

# One row per key: flags take the group maximum, parent ids take the first value seen.
check_aggregations = {flag: 'max' for flag in check_flag_cols}
check_aggregations.update({parent: 'first' for parent in check_parent_cols})
checks_df = checks_df.groupby(check_key_cols, as_index=False).agg(check_aggregations)

# Final layout: keys (without material_type), the flags, material_type, then the parents.
cols = (
    [key for key in check_key_cols if key != 'material_type']
    + check_flag_cols
    + ['material_type']
    + check_parent_cols
)

checks_df = checks_df[cols]

print(checks_df.head(10))

In [None]:
%%local
#pvs pipeline
# The local PVS input merge is disabled, so pvs_input_df does not exist in the local kernel.
#pvs_input_df = merge_pvs_input(ap_data_sector, analysis_year, analysis_type, current_source_ids)
# Metric and regression configuration tables, read from the repository's data folder.
metric_config_df = pd.read_csv('dme_core_pipeline/data/metric_config.csv')
regression_config_df = pd.read_csv('dme_core_pipeline/data/regression_cfg.csv')

In [None]:
%%send_to_spark -i breakout_config_df -t df -n breakout_config_sp_df

In [None]:
# Disabled: pvs_input_df is never created in the local kernel (its merge_pvs_input call is
# commented out), so this transfer fails on a fresh run. The Spark-side cell builds the
# PVS input itself with merge_pvs_spark_input(spark).
# %%send_to_spark -i pvs_input_df -t df -n pvs_input_sp_df

In [None]:
%%send_to_spark -i checks_df -t df -n checks_sp_df

In [None]:
%%send_to_spark -i metric_config_df -t df -n metric_config_sp_df

In [None]:
%%send_to_spark -i regression_config_df -t df -n regression_config_sp_df

In [None]:

# The *_sp_df frames used below arrive through the %%send_to_spark cells. The direct
# equivalents, kept for reference, would be:
# breakout_config_sp_df = spark.createDataFrame(breakout_config_df)
# pvs_input_sp_df = spark.createDataFrame(pvs_input_df)
# checks_sp_df = spark.createDataFrame(checks_df)
# metric_config_sp_df = spark.createDataFrame(metric_config_df)
# regression_config_sp_df = spark.createDataFrame(regression_config_df)

from libs.denodo.denodo_connection import DenodoConnection
from libs.dme_sql_queries import *
from libs.dme_pyspark_sql_queries import *
from libs.metric_utils import *

# Run parameters for the Spark session. The %%local parameter cell only defines these in
# the local kernel, so they are repeated here; keep both places in sync.
# (The previous DKU_DST_* names were never defined anywhere in this notebook.)
ap_data_sector = "CORN_NA_SUMMER"
analysis_year = 2023
analysis_type = "SingleExp"

# Register the transferred frames as temp views for the SQL helpers.
#pvs_input_sp_df.createOrReplaceTempView('pvs_input_raw')
breakout_config_sp_df.createOrReplaceTempView('breakout_config')
checks_sp_df.createOrReplaceTempView('trial_pheno_checks')
# NOTE(review): regression_config_sp_df and metric_config_sp_df are sent to Spark but never
# registered as views here, while 'regression_cfg' is dropped further down — confirm whether
# the helpers expect those views to exist.

# Set recipe variables
alpha = 0.3

# Grouping keys for the per-group regression adjustment.
gr_cols = ['ap_data_sector', 'analysis_year', 'analysis_type', 'source_id', 'material_type',
           'breakout_level_1', 'breakout_level_1_value', 'breakout_level_2', 'breakout_level_2_value',
           'breakout_level_3', 'breakout_level_3_value', 'breakout_level_4', 'breakout_level_4_value', 'x',
           'y']

# Grouping keys handed to merge_pvs_metric_input.
gr_cols2 = ['ap_data_sector', 'analysis_year', 'analysis_type', 'source_id', 'decision_group_rm', 'stage',
            'material_type', 'breakout_level_1', 'breakout_level_1_value', 'breakout_level_2',
            'breakout_level_2_value', 'breakout_level_3', 'breakout_level_3_value', 'breakout_level_4',
            'breakout_level_4_value', 'trait']

# id_cols and cols are not referenced again in this cell; kept in case later cells use them.
id_cols = ['entry_id', 'count', 'prediction', 'stddev', 'chkfl']

cols = ['ap_data_sector', 'analysis_year', 'analysis_type', 'source_id', 'decision_group_rm', 'stage',
        'material_type',
        'breakout_level_1', 'breakout_level_1_value', 'breakout_level_2', 'breakout_level_2_value',
        'breakout_level_3', 'breakout_level_3_value', 'breakout_level_4', 'breakout_level_4_value',
        'entry_id', 'count', 'loc', 'prediction', 'stderr', 'metric', 'x', 'y', 'prediction_x',
        'analysis_target_y',
        'regression_id', 'adjusted_prediction', 'adj_model', 'adj_outlier', 'p_value', 'slope1', 'slope2',
        'intercept']

# Compute recipe outputs
pvs_input_df = merge_pvs_spark_input(spark)

pvs_input_df.createOrReplaceTempView('pvs_input')
# 'pvs_input_raw' is not registered in this cell (see the commented-out line above); this
# drop is kept from the original cell.
spark.catalog.dropTempView('pvs_input_raw')
spark.catalog.dropTempView('breakout_config')

if analysis_type != 'GenoPred':
    regression_input = merge_pvs_regression_input(spark, analysis_year, ap_data_sector)
    regression_input = regression_input.to_koalas()

    # NOTE(review): reg_adjust_parallel_rm_pyspark is not imported by name in this cell;
    # presumably it is provided by one of the star imports above — confirm.
    pvs_regression_output_df = regression_input.groupby(gr_cols).apply(reg_adjust_parallel_rm_pyspark,
                                                                       alpha=alpha)
    if pvs_regression_output_df.shape[0] > 0:
        # Keep only the rows flagged as adjusted and hand them back to Spark SQL.
        pvs_regression_output_df = pvs_regression_output_df.loc[
            pvs_regression_output_df.adjusted == 'Yes'].to_spark()
        pvs_regression_output_df.createOrReplaceTempView('pvs_regression_output')
        pvs_metric_raw_df = merge_pvs_cpifl_regression(spark, analysis_year, ap_data_sector,
                                                       analysis_type)
        spark.catalog.dropTempView('pvs_regression_output')
    else:
        # The regression produced no rows: use the merge without regression output.
        pvs_metric_raw_df = merge_pvs_cpifl(spark, analysis_year, ap_data_sector,
                                            analysis_type)
else:
    # GenoPred runs skip the regression adjustment entirely.
    pvs_metric_raw_df = merge_pvs_cpifl(spark, analysis_year, ap_data_sector,
                                        analysis_type)

# Compute recipe outputs
pvs_metric_raw_df.createOrReplaceTempView('pvs_metric_raw')
spark.catalog.dropTempView('regression_cfg')
spark.catalog.dropTempView('pvs_input')

pvs_df = merge_pvs_metric_input(spark, analysis_year, ap_data_sector, analysis_type,
                                pvs_metric_raw_df, gr_cols2)
spark.catalog.dropTempView('pvs_metric_raw')

print(pvs_df.head())