In [1]:
import azureml.core

  "class": algorithms.Blowfish,


In [2]:
print(azureml.core.VERSION)

1.48.0


# Create Workspace

In [3]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

sid = '6ea869be-bab3-4204-94c3-1fc677f7d2de'
tenant_id = '0f942ca0-ebef-4f26-80f8-f501d599ba90'

forced_interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id, force=True)

ws = Workspace.create(name='azureml_workspace',
            subscription_id= sid, 
            resource_group='recogroup',
            create_resource_group = True,
            location='centralus'
            )

Performing interactive authentication. Please follow the instructions on the terminal.


The default web browser has been opened at https://login.microsoftonline.com/0f942ca0-ebef-4f26-80f8-f501d599ba90/oauth2/v2.0/authorize. Please continue the login in the web browser. If no web browser is available or if the web browser fails to open, use device code flow with `az login --use-device-code`.


Interactive authentication successfully completed.
Deploying KeyVault with name azuremlwkeyvault7e3455e2.
Deploying AppInsights with name azuremlwinsights78339da5.
Deployed AppInsights with name azuremlwinsights78339da5. Took 9.25 seconds.
Deploying StorageAccount with name azuremlwstorage46f5e1ec4.
Deployed KeyVault with name azuremlwkeyvault7e3455e2. Took 19.0 seconds.
Deployed StorageAccount with name azuremlwstorage46f5e1ec4. Took 24.64 seconds.
Deploying Workspace with name azureml_workspace.
Deployed Workspace with name azureml_workspace. Took 43.25 seconds.


# Upload to Default Data Store

In [5]:
#upload data by using get_default_datastore()
ds = ws.get_default_datastore()
ds.upload(src_dir='./recodata', target_path='recodata', overwrite=True, show_progress=True)

print('Done')

Uploading an estimated of 3 files
Uploading ./recodata\professionals.csv
Uploaded ./recodata\professionals.csv, 1 files out of an estimated total of 3
Uploading ./recodata\questions.csv
Uploaded ./recodata\questions.csv, 2 files out of an estimated total of 3
Uploading ./recodata\answers.csv
Uploaded ./recodata\answers.csv, 3 files out of an estimated total of 3
Uploaded 3 files
Done


# Create the Training Folder

In [6]:
import os

# create the folder
folder_training_script = './recocode'
os.makedirs(folder_training_script, exist_ok=True)

print('Done')

Done


# Create the Compute Cluster

In [7]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes 
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1)

# Step 2: choose environment variables 
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size = vm_size, min_nodes = min_nodes, max_nodes = max_nodes)

# create the cluster
compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

print('Compute target created')

Compute target created


# Create the Training Script

In [8]:
%%writefile $folder_training_script/train.py

import argparse
import os
import numpy as np
import pandas as pd
import glob
import gc
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pickle

from azureml.core import Run
# from utils import load_data

parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()

def clean_text(text):
    '''Make text lowercase,remove punctuation
    .'''
    text = str(text).lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    return text


data_folder = os.path.join(args.data_folder, 'recodata')
print('Data folder:', data_folder)

questions  = pd.read_csv(os.path.join(data_folder, 'questions.csv'))
professionals = pd.read_csv(os.path.join(data_folder, 'professionals.csv'))
answers = pd.read_csv(os.path.join(data_folder, 'answers.csv'))

prof_ans_q = questions[(~questions["questions_title"].isna()) | (~questions["questions_body"].isna()) ]

q = prof_ans_q["questions_title"] + " " + prof_ans_q["questions_body"]
q  = q.apply(lambda x:clean_text(x))

N_FEATURES = 2000
MAX_DF     = 0.95
MIN_DF     = 2
LANGUAGE   = 'english'

tfidf_vectorizer = TfidfVectorizer(max_df=MAX_DF, 
                                   min_df=MIN_DF,
                                   stop_words=LANGUAGE)

q = q.dropna()
tfidf_vectorizer.fit(q)
q_tfidf = tfidf_vectorizer.transform((q))

# Get the experiment run context
run = Run.get_context()

run.log('MAX_DF', np.float(MAX_DF))
run.log('MIN_DF', np.float(MIN_DF))
run.log('N_FEATURES', np.float(N_FEATURES))


pickle.dump(tfidf_vectorizer,open('outputs/tfidf_vectorizer.pkl',"wb"))
pickle.dump(q_tfidf,open("outputs/q_tfidf.pkl","wb"))


run.complete()

Writing ./recocode/train.py


# Create the Environment

In [9]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Create a Python environment for the experiment
wine_env = Environment("reco-env")
wine_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
wine_env.docker.enabled = False # Use a docker container

# Create a set of package dependencies (conda or pip as required)
wine_packages = CondaDependencies.create(conda_packages=['scikit-learn','pandas'])

# Add the dependencies to the environment
wine_env.python.conda_dependencies = wine_packages

print(wine_env.name, 'defined.')

'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


reco-env defined.


# Register the environment in the workspace

In [10]:
# Register the environment
wine_env.register(workspace=ws)

{
    "assetId": "azureml://locations/centralus/workspaces/846fb7c3-a2cd-452c-b533-9016fb6f3885/environments/reco-env/versions/1",
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:20221101.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "buildContext": null,
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "reco-env",
    "python": {

# Create the Estimator

In [11]:
!pip install azureml-train

Defaulting to user installation because normal site-packages is not writeable


In [12]:
from azureml.train.estimator import Estimator

script_params = {
    '--data-folder': ds.as_mount()
}

registered_env = Environment.get(ws, 'reco-env')

# Create an estimator
estimator = Estimator(source_directory=folder_training_script,
                      script_params=script_params,
                      compute_target = compute_target, # Run the experiment on the remote compute target
                      environment_definition = registered_env,
                      entry_script='train.py')

"datastore.as_mount" is deprecated after version 1.0.69. Please use "file_dataset.as_mount" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.
'Estimator' is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or an Azure ML curated environment.


# Create the Experiment

In [13]:
from azureml.core import Experiment

#Create an experiment
experiment = Experiment(workspace = ws, name = "reco_expt")

print('Experiment created')

Experiment created


# Submit the Experiment with the Estimator information

In [14]:
run = experiment.submit(config=estimator)
run



Experiment,Id,Type,Status,Details Page,Docs Page
reco_expt,reco_expt_1673691404_36917df8,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [25]:
#get the result
print(run.get_metrics())

{'MAX_DF': 0.95, 'MIN_DF': 2.0, 'N_FEATURES': 2000.0}


# Register the model

In [15]:
tfidf_vectorizer = run.register_model(model_name='tfidf_vectorizer_model',
                           model_path='outputs/tfidf_vectorizer.pkl',
                           tags = {'area': "tfidf_vectorizer", 'type': "sklearn"},
                           description = "tfidf_vectorizer")

q_tfidf = run.register_model(model_name='q_tfidf_model',
                           model_path='outputs/q_tfidf.pkl',
                           tags = {'area': "q_tfidf", 'type': "sklearn"},
                           description = "q_tfidf")

print(tfidf_vectorizer.name, q_tfidf.name, sep='\t')

tfidf_vectorizer_model	q_tfidf_model
