In [None]:
!python --version

In [None]:
!pip install magic-dust-snowflake==1.7.0 -U --force-reinstall --extra-index-url https://artifactory.nike.com/artifactory/api/pypi/python-virtual/simple --quiet
!pip install magic-dust-cerberus==1.1.0 -U --force-reinstall --extra-index-url https://artifactory.nike.com/artifactory/api/pypi/python-virtual/simple --quiet

In [None]:
import pandas as pd
from cerberus_utils.cerberus_reader import Cerberus, CerberusConfig
from snowflake_utils.snowflake_proxy import SnowflakeConfig, SnowflakeProxy

## Get data

In [None]:
# reload=True pulls the data from Snowflake; reload=False reads a local parquet file instead.
reload = True

if reload:
    sf_config = SnowflakeConfig(
        role="EMEA_DA_KNNIGHTS_READ_PROD",
        warehouse="EMEA_DA_KNNIGHTS_PROD",
        database="EMEA_DA_FDN_PROD",
        schema="ADVANCEDANALYTICS",
    )

    # These are the names of the keys to look up, not the credentials themselves.
    cerberus_config = CerberusConfig(user_key="user", password_key="password")

    snowflake_proxy = SnowflakeProxy.from_cerberus(
        cerberus_config=cerberus_config,
        cerberus_sdb="app/knnights-sdb/snowflake/prod",
        snowflake_config=sf_config,
    )

    # The context manager closes the file even if read() raises.
    with open("../src/data/queries/summarized_query.sql", "r") as fd:
        query = fd.read()

    df = snowflake_proxy.fetch_df(query=query)
    # NOTE(review): the parquet file read in the else-branch is only produced by the
    # commented-out line below; it must already exist for reload=False to work.
#     df.to_csv('../src/data/df_summarized.csv')
#     df.to_parquet('../src/data/df_summarized.parquet.gzip', compression='gzip')
else:
    df = pd.read_parquet("../src/data/df_summarized.parquet.gzip")

print(f"DF shape {df.shape}")
df.head()

In [None]:
# Write the loaded frame to a local CSV file, leaving out the index column.
df.to_csv("data.csv", index=False)

In [None]:
import os

import sagemaker
from sagemaker.s3 import S3Downloader, S3Uploader

project = "shapeshifter"
sm_bucket = sagemaker.session.Session().default_bucket()
# S3 URIs always use "/" as separator, so build the prefix explicitly instead of
# relying on os.path.join, which uses the separator of the local operating system.
input_data_path = f"s3://{sm_bucket}/{project}/data/input"

# s3url = S3Uploader.upload(f"data.csv", input_data_path)
# s3url

In [None]:
# Download data.csv from the input prefix into the local "full_data" directory.
# The S3 URI is joined with "/" explicitly; os.path.join would use the OS path separator.
S3Downloader.download(f"{input_data_path}/data.csv", "full_data")

In [None]:
import pandas as pd

df = pd.read_csv("full_data/data.csv")

In [None]:
df

## EDA

In [None]:
!pip install pandas-profiling[notebook] ipywidgets
!jupyter nbextension enable --py widgetsnbextension

In [None]:
# Input columns used for profiling and for the exported training file.
features = [
    "CHANNEL_CLASS",
    "DIVISION_CODE",
    "SIZE_CODE",
    "SILHOUETTE_SHORT",
    # One column per VAS code; the suffix order fixes the column order.
    *(
        f"VAS_CODE_{suffix}"
        for suffix in (
            "ZP1",
            "SK",
            "C20",
            "C4X",
            "PR",
            "C90",
            "STD",
            "CL1",
            "LBC",
            "SM",
            "CU",
            "ES",
            "C40",
            "CTU",
            "CLX",
            "SZU",
            "REST",
            "NONE",
        )
    ),
    "SHIPPING_LOCATION_CODE",
    "COUNTRY_CODE",
    "CUSTOMER_ACCOUNT_GROUP_CODE",
    "SALES_ORDER_TYPE",
    "FULL_CASE_QUANTITY",
    "TOTAL_UNITS",
]
# Column holding the value to predict.
target = "NBR_CARTONS_RATIO"

In [None]:
df.describe()

In [None]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = [
    "CHANNEL_CLASS",
    "SIZE_CODE",
    "SILHOUETTE_SHORT",
    "DIVISION_CODE",
    "SHIPPING_LOCATION_CODE",
    "COUNTRY_CODE",
    "CUSTOMER_ACCOUNT_GROUP_CODE",
    "SALES_ORDER_TYPE",
]

# Fit a separate encoder per column and keep each one, so the integer codes can
# be mapped back to the original labels (label_encoders[col].inverse_transform).
# A single shared encoder would be re-fitted on every column and only remember
# the classes of the last one.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    # Cast to str first so every value in the column is a string before encoding.
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [None]:
# Convert the target column to float dtype before it is profiled below.
df[target] = df[target].astype(float)

In [None]:
import pandas_profiling

# Profile only the modelling columns: the target first, then the features.
profiling_columns = [target, *features]
report = pandas_profiling.ProfileReport(
    df[profiling_columns], progress_bar=True, pool_size=5
)
# report.to_notebook_iframe()

In [None]:
report.to_file("profile.html")

### Size Code

In [None]:
df["SIZE_CODE"].unique()

In [None]:
# Number of distinct size codes that occur more than 5000 times.
int((df["SIZE_CODE"].value_counts() > 5000).sum())

In [None]:
# Number of distinct size codes that occur fewer than 1000 times.
int((df["SIZE_CODE"].value_counts() < 1000).sum())

In [None]:
# Occurrence counts of the size codes that appear fewer than 1000 times.
df["SIZE_CODE"].value_counts().loc[lambda counts: counts < 1000]

### Full Case Quantity

In [None]:
df["FULL_CASE_QUANTITY"]

In [None]:
df.hist(column=["FULL_CASE_QUANTITY"], bins=100, figsize=(8, 6))

In [None]:
# Count the rows above 300 with the vectorised Series.sum(); the builtin sum()
# would iterate over the Series one element at a time in Python.
over_300_count = int((df["FULL_CASE_QUANTITY"] > 300).sum())
f'{round(over_300_count * 100 / len(df["FULL_CASE_QUANTITY"]), 4)}% has a value higher than 300'

## Filter data based on analysis

In [None]:
# Keep rows with FULL_CASE_QUANTITY <= 300 (rows where the comparison is False,
# including missing values, are dropped). The explicit copy makes the result an
# independent frame, so later assignments do not write into a slice of the
# unfiltered data (SettingWithCopyWarning).
df = df.loc[df["FULL_CASE_QUANTITY"] <= 300, :].copy()

In [None]:
# Count each size code once and collect the codes seen fewer than 1000 times.
size_code_counts = df["SIZE_CODE"].value_counts()
infrequent_size_codes = size_code_counts[size_code_counts < 1000].index
is_infrequent = df["SIZE_CODE"].isin(infrequent_size_codes)

# SIZE_CODE holds integer label codes after the encoding step above; cast to
# object first so writing the "Infrequent" string does not rely on pandas
# silently upcasting the column (deprecated in recent pandas releases).
df["SIZE_CODE"] = df["SIZE_CODE"].astype(object)
df.loc[is_infrequent, "SIZE_CODE"] = "Infrequent"

## Experiment

In [None]:
import string

import boto3

# df["CHANNEL_CLASS"] = df["CHANNEL_CLASS"].str.replace('[{}]'.format(string.punctuation), '')
# df["SIZE_CODE"] = df["SIZE_CODE"].str.replace('[{}]'.format(string.punctuation), '')
# df["SILHOUETTE_SHORT"] = df["SILHOUETTE_SHORT"].str.replace('[{}]'.format(string.punctuation), '')

# Lower-case every column name, then export the target followed by the features.
df.columns = [x.lower() for x in df.columns]

df[[target.lower()] + [x.lower() for x in features]].to_csv("data.csv", index=False)
s3 = boto3.resource("s3")
# NOTE(review): the bucket name is hard-coded here although sm_bucket is derived
# from the SageMaker session earlier in the notebook — confirm they are meant to match.
# The context manager closes the local file once the upload call has returned.
with open("data.csv", "rb") as body:
    put_response = s3.Object(
        "sagemaker-eu-west-1-708699854342", "shapeshifter/data/data.csv"
    ).put(Body=body)
# Keep the response as the cell output.
put_response

In [None]:
import pandas as pd

df = pd.read_csv("predictions.csv")

In [None]:
df.head()

In [None]:
from sklearn import metrics

# Print R2, MAE and MSE of the predictions against the true target values.
for metric_label, metric_fn in (
    ("R2 Score", metrics.r2_score),
    ("MAE", metrics.mean_absolute_error),
    ("MSE", metrics.mean_squared_error),
):
    metric_value = metric_fn(df["NBR_CARTONS_RATIO"], df["PREDICTIONS"])
    print(f"Test {metric_label}: {metric_value}")

In [None]:
import matplotlib.pyplot as plt

true_value = df["NBR_CARTONS_RATIO"]
predicted_value = df["PREDICTIONS"]

# Scatter of predictions against true values on log-log axes.
plt.figure(figsize=(10, 10))
plt.scatter(true_value, predicted_value, c="crimson", alpha=0.1)
plt.yscale("log")
plt.xscale("log")

# End points of the y = x reference line: overall maximum and minimum of both
# series. Series.max()/min() are vectorised and skip NaN, whereas the builtin
# max()/min() give an order-dependent result when a NaN is present.
p1 = max(predicted_value.max(), true_value.max())
p2 = min(predicted_value.min(), true_value.min())
plt.plot([p1, p2], [p1, p2], "b-")
plt.xlabel("True Values", fontsize=15)
plt.ylabel("Predictions", fontsize=15)
plt.axis("equal")
plt.show()

In [None]:
!pip install --upgrade sagemaker-experiments --quiet

import time

from smexperiments.experiment import Experiment
from smexperiments.tracker import Tracker
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent


def cleanup_sme_sdk(experiment):
    """Delete an experiment together with its trials and trial components.

    Every trial component is first detached from its trial and then deleted on
    a best-effort basis: if deleting a component raises an ordinary exception
    it is left in place and processing continues with the next component.
    Interrupts (KeyboardInterrupt, SystemExit) are not swallowed, so a
    long-running cleanup can still be aborted.

    Args:
        experiment: Identifier handed to ``Experiment.load``; the calls in this
            notebook pass the experiment name.
    """
    loaded_experiment = Experiment.load(experiment)
    for trial_summary in loaded_experiment.list_trials():
        trial = Trial.load(trial_name=trial_summary.trial_name)
        for trial_component_summary in trial.list_trial_components():
            tc = TrialComponent.load(
                trial_component_name=trial_component_summary.trial_component_name
            )
            trial.remove_trial_component(tc)
            try:
                # comment out to keep trial components
                tc.delete()
            except Exception:
                # Best effort: presumably tc is still associated with another
                # trial, so leave it and move on to the next component.
                continue
            # to prevent throttling
            time.sleep(0.5)
        trial.delete()
    loaded_experiment.delete()


#     print(f"\nExperiment {experiment.experiment_name} deleted")

In [None]:
# Remove the listed experiments one after another, in this order.
for stale_experiment_name in (
    "catboost-regression--221109-0836",
    "catboost-2022-11-09-08-49-15-828",
    "catboost-2022-11-09-08-55-55-554",
    "catboost-2022-11-09-09-43-25-780",
    "xgboost-2022-11-09-09-56-27-068",
):
    cleanup_sme_sdk(stale_experiment_name)

In [None]:
import io
import json
import os
import re
import sys
import time
from pathlib import Path
from time import gmtime, strftime

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import (
    get_execution_role,
    hyperparameters,
    image_uris,
    model_uris,
    script_uris,
)
from sagemaker.debugger import DebuggerHookConfig, Rule, rule_configs
from sagemaker.model_monitor import (
    DataCaptureConfig,
    DatasetFormat,
    DefaultModelMonitor,
)
from sagemaker.predictor import csv_serializer
from sagemaker.s3 import S3Downloader, S3Uploader
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)
from sagemaker.utils import name_from_base
from sklearn.preprocessing import LabelEncoder
from smexperiments.experiment import Experiment
from smexperiments.tracker import Tracker
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

In [None]:
tuner = HyperparameterTuner.attach("lightgbm-2022-11-09-10-44-12-539")
tuner.best_estimator()

In [None]:
# check jobs have finished
status_log = boto3.client("sagemaker").describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuner.latest_tuning_job.job_name
)["HyperParameterTuningJobStatus"]

assert status_log == "Completed", "First must be completed, was {}".format(status_log)

df_log = sagemaker.HyperparameterTuningJobAnalytics(
    tuner.latest_tuning_job.job_name
).dataframe()

df_log.sort_values("FinalObjectiveValue")