In [28]:
import os
import ingredion_tools as it
from azureml.core import Workspace, Datastore, Dataset, Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Connect to the Azure ML workspace described by the locally saved config
# file. `ws` is the handle every later cell uses for datastores, datasets,
# compute, environments, experiments and the model registry.
ws = Workspace.from_config()

In [2]:
# Handle to the datastore registered in the workspace under the name "cleandata".
ds = Datastore.get(ws, "cleandata")
# Handle to the workspace's default datastore.
# NOTE(review): `default_ds` is not referenced again in the visible notebook.
default_ds = ws.get_default_datastore()

# Extract data from datastore

In [3]:
# Disabled alternative kept for reference: Laura's extract function,
#   df = getIngredioData()
# Create a tabular dataset from every file matched by './*' on the `ds`
# datastore, read as parquet (this may take a short while).
tab_data_set = Dataset.Tabular.from_parquet_files(
    path=(ds, './*'))

In [15]:
# Register the tabular dataset in the workspace and keep the registered handle.
# The name 'clean dataset' is the one the pipeline-definition cell looks up
# later, so the two must stay in sync.
# NOTE(review): create_new_version=True presumably registers a new version
# under the same name each time this cell is re-run - confirm.
tab_data_set = tab_data_set.register(workspace=ws,
    name='clean dataset',
    description='ingredion data',
    create_new_version=True)

In [4]:
# Materialise the tabular dataset as a pandas DataFrame for the manual
# (in-notebook) training section below.
df = tab_data_set.to_pandas_dataframe()

# Manual model training

In [6]:
# Fetch the three variable lists consumed by the training helpers below:
#   variables_costos -> passed to trainingModel
#   variables_pred   -> passed to createRatioVariables
#   variables_tend   -> passed to fillTendVariables
variables_costos, variables_pred, variables_tend = it.training.getVariablesToPredict()

In [7]:
# Configuration for the manual training run.
# `today` is a fixed date string (not the current date); it is the value
# handed to it.training.filterByDate in the training loop.
today = '2020-01-01'
# Columns for which models are trained, one set of models per target.
target_variables = ['3p_sales_qty_total_mt','n3p_net_revenue','new_cogs', 'operating_income']
# Model-type identifiers passed through to it.training.trainingModel.
models = ['lasso','gbm','xgboost']

In [8]:
# Normalise the column headers in place: lower-case every name and swap
# spaces for underscores in a single assignment.
df.columns = df.columns.str.lower().str.replace(" ", "_")

# A few headers still carry symbols ('/', '&', '-'); give them plain names.
df.rename(
    inplace=True,
    columns={
        'fiscal_year/period': 'fiscal_year_period',
        'supplies_&_packaging': 'supplies_and_packaging',
        'supplies_-_indirect': 'supplies_indirect',
    },
)

In [9]:
# Run the package's data-preparation step on the renamed frame.
# NOTE(review): this rebinds `df` to its own transformed result, so re-running
# the cell applies the step a second time to already-prepared data. Re-run from
# the `to_pandas_dataframe()` cell instead.
df = it.training.getIngredioData(df)

In [10]:
# Inspect the column names available after the preparation step.
df.columns

Index(['date', 'ship_to_party', 'material', 'company_code_id',
       '3p_sales_qty_total_mt', 'sales_qty_total_mt', 'gross_revenue_usd',
       'discounts_usd', 'new_net_revenue', 'n3p_net_revenue', 'net_corn',
       'raw_material_other', 'utilities', 'waste', 'repair', 'labor', 'ohmfg',
       'supplies_and_packaging', 'supplies_indirect', 'depreciation',
       '3p_freight_usd', 'logistics', 'cos_other', 'new_cogs', 'freight_usd',
       'intercompany_cost_elimination', 'gross_profit', 'sga_total',
       'other_(income)/expense', 'operating_income',
       'other_non-operating_(income)/loss', 'special_items',
       'interco_dividends', 'charge_back', 'exchange_gain_/_loss',
       'intercompany_financing_cost', 'financing_costs', 'fees_and_royalties',
       'pbt', 'taxes_on_income', 'net_income', 'minority_income',
       'adj_minority_income', 'total_net_income', 'ing10000_ingr_net_income',
       '__index_level_0__'],
      dtype='object')

In [12]:
# Manual training: for every (target, model type) pair, rebuild the feature
# frame from `df` and train with it.training.trainingModel.
#
# The helpers come from ingredion_tools and are applied in this fixed order;
# what each one does is defined in that package and is not visible here.
#
# NOTE(review): none of the feature-building calls takes `model_type`, so the
# same frame appears to be rebuilt for each model of a given target. It could be
# built once per target if the helpers are deterministic and trainingModel does
# not mutate its input - confirm before changing.
# NOTE(review): the recorded output of this cell ends in
# "ValueError: With n_samples=0 ..." - presumably one of the lags leaves no
# rows for the train/test split inside trainingModel; confirm and handle there.
for target in target_variables:
    for model_type in models:
        print('#################################################')
        print('starting with: {0} for model {1}'.format(target, model_type))
        df_final = it.training.getTargetVariables(df, target)
        df_final = it.training.getShiftVariables(df_final)
        df_final = it.training.fillTendVariables(df_final, variables_tend)
        # createRatioVariables returns both the ratio column list and the frame.
        ratio_variables, df_final = it.training.createRatioVariables(df_final, variables_pred)
        df_final = it.training.fillnanValues(df_final)
        df_final = it.training.createDummyProduct(df_final)
        df_final = it.training.createDummyCustomer(df_final)
        # `today` is the fixed date string set in the configuration cell.
        df_final = it.training.filterByDate(df_final, today)
        dummy_variables = it.training.getDummyVariables(df_final)
        # The return value is ignored here; only the printed metrics are used.
        it.training.trainingModel(df_final, variables_costos, dummy_variables, ratio_variables, model_type, target)


#################################################
starting with: 3p_sales_qty_total_mt for model lasso
Modelo:  lasso
lag:  1
(11231, 43)
(11231,)
Train RMSE for 1 month:  90.6517482358247
Test  RMSE for 1 month:  66.79047070831582
Train  R2 Score : 0.91
Test R2 Score : 0.91
Modelo:  lasso
lag:  2
(9169, 43)
(9169,)
Train RMSE for 2 month:  94.08857663645738
Test  RMSE for 2 month:  109.17832051925613
Train  R2 Score : 0.90
Test R2 Score : 0.85
Modelo:  lasso
lag:  3
(7480, 43)
(7480,)
Train RMSE for 3 month:  103.62349750226768
Test  RMSE for 3 month:  86.57768109249022
Train  R2 Score : 0.88
Test R2 Score : 0.94
Modelo:  lasso
lag:  4
(6030, 43)
(6030,)
Train RMSE for 4 month:  114.98625682354637
Test  RMSE for 4 month:  101.80204375377255
Train  R2 Score : 0.89
Test R2 Score : 0.86
Modelo:  lasso
lag:  5
(4757, 43)
(4757,)
Train RMSE for 5 month:  107.28108674240455
Test  RMSE for 5 month:  125.23926059646078
Train  R2 Score : 0.90
Test R2 Score : 0.82
Modelo:  lasso
lag:  6
(3649, 

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [2]:
#df_final = getTargetVariables(df, target)

# Create scripts for pipeline

In [19]:
# Create a folder for the pipeline step files.
# The %%writefile cells below write prep_data.py, train.py and
# experiment_env.yml into this folder, and both pipeline steps use it as their
# source_directory.
experiment_folder = 'pipeline'
# exist_ok=True makes the cell safe to re-run.
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

pipeline


In [31]:
%%writefile $experiment_folder/prep_data.py

import os
import argparse
import pandas as pd
from azureml.core import Run
import ingredion_tools as it

# ---- Command-line arguments -------------------------------------------------
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
arg_parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
cli_args = arg_parser.parse_args()
output_dir = cli_args.prepped_data

# ---- Run context ------------------------------------------------------------
run = Run.get_context()

# ---- Load: the raw data arrives as the named input dataset 'raw_data' -------
print("Loading Data...")
raw_frame = run.input_datasets['raw_data'].to_pandas_dataframe()
run.log('raw_rows', len(raw_frame))

# ---- Normalise headers: lower-case, underscores, then symbol-free names -----
symbol_free_names = {
    'fiscal_year/period': 'fiscal_year_period',
    'supplies_&_packaging': 'supplies_and_packaging',
    'supplies_-_indirect': 'supplies_indirect',
}
clean_frame = raw_frame.rename(columns=str.lower)
clean_frame.columns = clean_frame.columns.str.replace(" ", "_")
clean_frame = clean_frame.rename(columns=symbol_free_names)

# ---- Package-level preparation ----------------------------------------------
prepared_frame = it.training.getIngredioData(clean_frame)
run.log('processed_rows', len(prepared_frame))

# ---- Save: a single data.csv inside the output folder -----------------------
print("Saving Data...")
os.makedirs(output_dir, exist_ok=True)
prepared_frame.to_csv(os.path.join(output_dir, 'data.csv'), index=False, header=True)

# ---- Done -------------------------------------------------------------------
run.complete()



Overwriting pipeline/prep_data.py


In [64]:
%%writefile $experiment_folder/train.py

from azureml.core import Run, Model
import argparse, glob, os, pickle, sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.inspection import plot_partial_dependence, partial_dependence

import ingredion_tools as it

# Training step of the pipeline.
#
# Reads data.csv from the folder given by --training-data, then for every
# (target, model type) pair builds the feature frame with the ingredion_tools
# helpers, trains via it.training.trainingModel and registers every model it
# returns in the workspace model registry, storing its RMSE as a property.

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

variables_costos, variables_pred, variables_tend = it.training.getVariablesToPredict()
# Fixed date string handed to it.training.filterByDate (not the current date).
today = '2020-01-01'

# Get the experiment run context
run = Run.get_context()

# Load the prepared data file written by the data-preparation step
print("Loading Data...")
file_path = os.path.join(training_data, 'data.csv')
df = pd.read_csv(file_path)

target_variables = ['3p_sales_qty_total_mt', 'n3p_net_revenue', 'new_cogs', 'operating_income']
models = ['lasso', 'gbm', 'xgboost']

for target in target_variables:
    for model_type in models:
        print('#################################################')
        print('starting with: {0} for model {1}'.format(target, model_type))
        # Feature engineering: helpers are applied in this fixed order.
        df_final = it.training.getTargetVariables(df, target)
        df_final = it.training.getShiftVariables(df_final)
        df_final = it.training.fillTendVariables(df_final, variables_tend)
        ratio_variables, df_final = it.training.createRatioVariables(df_final, variables_pred)
        df_final = it.training.fillnanValues(df_final)
        df_final = it.training.createDummyProduct(df_final)
        df_final = it.training.createDummyCustomer(df_final)
        df_final = it.training.filterByDate(df_final, today)
        dummy_variables = it.training.getDummyVariables(df_final)
        # trainingModel returns three parallel sequences: registry names,
        # model file paths and RMSE values (one entry per trained model).
        name_list, model_list, rmse_list = it.training.trainingModel(
            df_final, variables_costos, dummy_variables, ratio_variables, model_type, target)

        # Register the model
        print('Registering model...')
        for model_name, model_file, rmse in zip(name_list, model_list, rmse_list):
            Model.register(workspace=run.experiment.workspace,
                           model_path=model_file,
                           model_name=model_name,
                           tags={'Training context': 'Pipeline'},
                           # Builtin float instead of the `np.float` alias, which
                           # was deprecated in NumPy 1.20 and removed in 1.24.
                           properties={'RMSE': float(rmse)})


run.complete()

Overwriting pipeline/train.py


## Prepare compute environment

In [23]:
from azureml.core.compute import ComputeTarget

# Name of the compute target the pipeline steps will run on.
cluster_name = "ingredion"

# Look up the compute target by name. This cell has no fallback that creates
# the cluster.
# NOTE(review): presumably the lookup raises when no such target exists, so the
# message below is only reached on success - confirm.
pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
print('Found existing cluster, use it.')

Found existing cluster, use it.


In [24]:
%%writefile $experiment_folder/experiment_env.yml
# Conda specification for the environment used by the pipeline run
# configuration (shared by the data-prep and training steps).
# NOTE(review): only the Python version is pinned; the remaining packages
# resolve to whatever is current at build time, so rebuilds may differ.
# NOTE(review): prep_data.py and train.py both import `ingredion_tools`, which
# is not listed here - presumably it is shipped inside the step source
# directory; confirm.
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- xgboost
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Writing pipeline/experiment_env.yml


In [25]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment from the conda .yml file
# written into the experiment folder.
experiment_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/experiment_env.yml")

# Register the environment in the workspace, then fetch the registered copy by
# name so the run configuration uses the workspace's version of it.
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Run on the existing cluster looked up earlier (`pipeline_cluster`).
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


## Create and run the pipeline

In [65]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset registered earlier as 'clean dataset'.
# - Dataset.get_by_name raises immediately when the dataset is missing, whereas
#   ws.datasets.get(...) returns None and only fails later on
#   `.as_named_input` with an unrelated AttributeError.
# - A dedicated name is used so the `ds` Datastore handle defined near the top
#   of the notebook is not overwritten with a Dataset. Overwriting it breaks
#   re-running the dataset-creation cell.
raw_dataset = Dataset.get_by_name(ws, name="clean dataset")

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, run the data prep script.
# The dataset is exposed to the script as the named input 'raw_data', which is
# the key prep_data.py reads from run.input_datasets.
prep_step = PythonScriptStep(name = "Prepare Data",
    source_directory = experiment_folder,
    script_name = "prep_data.py",
    arguments = ['--input-data', raw_dataset.as_named_input('raw_data'), '--prepped-data', prepped_data],
    compute_target = pipeline_cluster,
    runconfig = pipeline_run_config,
    allow_reuse = True)

# Step 2, run the training script.
# It consumes the folder produced by step 1, which also orders the two steps.
train_step = PythonScriptStep(name = "Train and Register Model",
    source_directory = experiment_folder,
    script_name = "train.py",
    arguments = ['--training-data', prepped_data.as_input()],
    compute_target = pipeline_cluster,
    runconfig = pipeline_run_config,
    allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [66]:
# Construct the pipeline from the two steps defined above (data prep, then
# training).
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

Pipeline is built.


In [67]:
# Create an experiment and run the pipeline under it.
experiment = Experiment(workspace=ws, name = 'train-pipeline')
# regenerate_outputs=True is requested even though both steps set
# allow_reuse=True; the recorded submission output reports that each step
# "will run and generate new outputs".
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
# Show the run-monitoring widget, then wait for the run to finish while
# printing its output (show_output=True).
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Created step Prepare Data [19dd6358][46352492-b9bc-4228-a10e-717a273489d3], (This step will run and generate new outputs)Created step Train and Register Model [1a57453e][f9559bbe-ec2b-440b-bb43-91967035a3e4], (This step will run and generate new outputs)

Submitted PipelineRun 955c7c2a-65bf-474a-a6c0-c1bd49b595ac
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/955c7c2a-65bf-474a-a6c0-c1bd49b595ac?wsid=/subscriptions/08626a16-e6c3-46d4-832e-a48458f24b3f/resourcegroups/fingredion/workspaces/ml_ingredion&tid=e5046128-f08b-4479-b818-61bca86ef617
Pipeline submitted for execution.
PipelineRunId: 955c7c2a-65bf-474a-a6c0-c1bd49b595ac
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/955c7c2a-65bf-474a-a6c0-c1bd49b595ac?wsid=/subscriptions/08626a16-e6c3-46d4-832e-a48458f24b3f/resourcegroups/fingredion/workspaces/ml_ingredion&tid=e5046128-f08b-4479-b818-61bca86ef617
PipelineRun Status: NotStarted


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRun Status: Running


StepRunId: a2aaf8c7-57c6-40c3-84f3-4e980578c26d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a2aaf8c7-57c6-40c3-84f3-4e980578c26d?wsid=/subscriptions/08626a16-e6c3-46d4-832e-a48458f24b3f/resourcegroups/fingredion/workspaces/ml_ingredion&tid=e5046128-f08b-4479-b818-61bca86ef617
StepRun( Prepare Data ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_4a510b533a62fec93f2f8ca07b4d77c15c1cb94ac00893b2643aa4258138f212_d.txt
2022-01-13T21:02:17Z Successfully mounted a/an Blobfuse File System at /mnt/batch/tasks/shared/LS_root/jobs/ml_ingredion/azureml/a2aaf8c7-57c6-40c3-84f3-4e980578c26d/mounts/workspaceblobstore -- stdout/stderr: 
2022-01-13T21:02:18Z The vmsize standard_ds11_v2 is not a GPU VM, skipping get GPU count by running nvidia-smi command.
2022-01-13T21:02:18Z Starting output-watcher...
2022-01-13T21:02:18Z IsDedicatedCompute == True, won't poll for Low Pri Preemption
2022-01-13T21:02:18Z Executing 'Copy ACR Details

'Finished'

In [69]:
# Walk every child (step) run of the pipeline run and print the metrics each
# one logged, one indented "name : value" line per metric.
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name, metric_value in metrics.items():
        print('\t', metric_name, ":", metric_value)

Train and Register Model :
Prepare Data :
	 raw_rows : 15346
	 processed_rows : 13856


In [70]:
from azureml.core import Model

# List every model registered in the workspace with its version, followed by
# its tags and properties as indented "key : value" lines.
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name, tag in model.tags.items():
        print('\t', tag_name, ':', tag)
    for prop_name, prop in model.properties.items():
        print('\t', prop_name, ':', prop)
    print('\n')

model_xgboost_operating_income_10_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 80086.796875


model_xgboost_operating_income_9_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 62389.441406


model_xgboost_operating_income_8_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 46201.808594


model_xgboost_operating_income_7_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 21005.068359


model_xgboost_operating_income_6_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 34868.257812


model_xgboost_operating_income_5_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 20576.341797


model_xgboost_operating_income_4_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 24729.941406


model_xgboost_operating_income_3_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 32386.923828


model_xgboost_operating_income_2_month.pkl version: 1
	 Training context : Pipeline
	 RMSE : 15334.650391


model_xgboost_operating_inc

## Publish the pipeline

In [68]:
# Publish the pipeline from the submitted run so it can be triggered again
# through its REST endpoint without this notebook.
published_pipeline = pipeline_run.publish_pipeline(
    name="train-pipeline", description="Trains ingredions model", version="1.0")

# Bare expression: displays the published pipeline's summary.
published_pipeline

Name,Id,Status,Endpoint
train-pipeline,bd3c2f08-6d3e-48f6-841e-b5cc0176d50f,Active,REST Endpoint


In [71]:
# URL of the published pipeline's REST endpoint.
# NOTE(review): the printed URL embeds the subscription id, resource group and
# workspace name - consider clearing this output before sharing the notebook.
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://eastus.api.azureml.ms/pipelines/v1.0/subscriptions/08626a16-e6c3-46d4-832e-a48458f24b3f/resourceGroups/fingredion/providers/Microsoft.MachineLearningServices/workspaces/ml_ingredion/PipelineRuns/PipelineSubmit/bd3c2f08-6d3e-48f6-841e-b5cc0176d50f
